def create_result_directory(fq_fa_path, outdir_path):
    """Create (if needed) a result directory named after the source file.

    :param fq_fa_path: path to source FASTQ or FASTA file;
    :type fq_fa_path: str;
    :param outdir_path: path to directory in which result_directory will be created;
    :type outdir_path: str;

    Returns 'str' path to the result directory.
    """
    # Name the result directory after the input file, minus its extension
    basename = os.path.basename(fq_fa_path)
    new_dpath = os.path.join(outdir_path, basename)  # get rid of absolute path
    new_dpath = re.search(r"(.*)\.(m)?f(ast)?(a|q)(\.gz)?$",
                          new_dpath).group(1)  # get rid of extention

    if not os.path.exists(new_dpath):
        try:
            os.makedirs(new_dpath)
        except OSError as oserr:
            printlog_error_time(
                "Error: can't create result directory: `{}`".format(new_dpath))
            printlog_error(str(oserr))
            platf_depend_exit(1)
        # end try
    # end if
    return new_dpath
def provide_open_funcs(fpaths):
    """Return opening function(s) for the given input file(s).

    :param fpaths: collection of paths to input files;
    :type fpaths: list<str>;
    """
    open_funcs = list()
    try:
        for fpath in fpaths:
            # Pick an opener depending on how the file is compressed
            if _is_gzipped(fpath):
                opener = functools.partial(gzip.open, mode='rt', encoding='utf-8')
            elif _is_bzipped(fpath):
                opener = functools.partial(bz2.open, mode='rt', encoding='utf-8')
            elif _is_plain_text(fpath):
                opener = functools.partial(open, mode='r', encoding='utf-8')
            else:
                # Unreadable file -- raise a super terrifying exception
                raise _InvalidFileError('Error: cannot read file `{}`: \
it is neither plain text file, nor gzipped, nor bzipped2.'.format(fpath))
            # end if
            open_funcs.append(opener)
        # end for
    except _InvalidFileError as err:
        printlog_error(str(err))
        platf_depend_exit(1)
    # end try
    return open_funcs
def get_res_tsv_fpath(new_dpath):
    """Return path to the current TSV classification file.

    Binning will be performed according to this file.

    :param new_dpath: current result directory;
    :type new_dpath: str;

    Returns 'str' path to `classification.tsv` inside `new_dpath`.
    """
    is_similar_to_tsv_res = lambda f: f == "classification.tsv"

    if not os.path.exists(new_dpath):
        printlog_error_time(
            "Error: directory `{}` does not exist!".format(new_dpath))
        printlog_error("Please make sure you have performed taxonomic \
annotation of the following file: `{}` \
with `barapost-prober.py` and/or `barapost-local.py`".format(
            os.path.basename(new_dpath)))
        printlog_error(
            "Also this error might occur if you forget to specify result directory \
generated by `barapost-prober.py` with `-r` option.")
        platf_depend_exit(0)
    # end if

    # Recent file will be the first in sorted list
    tsv_candidates = list(
        filter(is_similar_to_tsv_res, sorted(os.listdir(new_dpath))))

    # Fix: report a clear error instead of raising a bare IndexError
    # when the classification file is missing from the directory.
    if len(tsv_candidates) == 0:
        printlog_error_time(
            "Error: cannot find classification file in directory `{}`".format(
                new_dpath))
        platf_depend_exit(1)
    # end if

    return os.path.join(new_dpath, tsv_candidates[0])
def gzip_outfiles(outdir):
    """Gzip all fastq files in directory `outdir`.

    :param outdir: path to outdir;
    :type outdir: str;
    """
    # Get gzipping function
    gzip_func = _get_gzip_func()

    print()
    printlog_info_time('Gzipping output files...')

    # Collect fastq files residing in `outdir`
    fastq_pattern = re.compile(r'.+\.f(ast)?q$')
    fq_fpaths = (fpath for fpath in glob.iglob(os.path.join(outdir, '*'))
                 if fastq_pattern.match(fpath) is not None)

    # Gzip them one by one
    for fpath in fq_fpaths:
        try:
            gzip_func(fpath)
        except OSError as err:
            printlog_info('Error: cannot gzip file `{}`: {}.'.format(
                fpath, err))
            platf_depend_exit(1)
        # end try
    # end for

    printlog_info_time('Output files are gzipped.')
def recover_taxonomy(acc, hit_def, taxonomy_path):
    # Function recovers missing taxonomy by given accession.
    #
    # :param acc: accession of taxonomy entry to recover;
    # :type acc: str;
    # :param hit_def: name of this sequence;
    # :type hit_def: str;
    # :param taxonomy_path: path to TSV file with taxonomy;
    # :type taxonomy_path: str;

    if acc == "LAMBDA":
        # If we are missing lambda phage taxonomy -- just add it
        save_taxonomy_directly(taxonomy_path, acc,
                               "Lambda-phage-nanopore-control")
    elif acc.startswith("OWN_SEQ_"):
        # If sequence is an "own seq" -- recover its name from the fasta file
        # of the local database.

        # Get necessary title line from `local_seq_set.fasta`.
        # Firstly find the fasta file (it may be compressed).
        classif_dir = os.path.dirname(os.path.dirname(taxonomy_path))
        db_dir = os.path.join(classif_dir, "local_database")
        db_files = glob.glob("{}{}*".format(db_dir, os.sep))
        try:
            local_fasta = next(iter(filter(is_fasta, db_files)))
        except StopIteration:
            # No fasta file found -- ask the user to add the taxonomy manually
            printlog_error_time(
                "Error: cannot recover taxonomy for following sequence:")
            printlog_error(" `{} - {}`.".format(acc, hit_def))
            printlog_error(
                "You can solve this problem by yourself (it's pretty simple).")
            printlog_error("Just add taxonomy line for {} to file `{}`".format(
                acc, taxonomy_path))
            printlog_error(" and run the program again.")
            platf_depend_exit(1)
        # end try

        # Find the title line starting with `acc`.
        # If the fasta is gzipped, it is read as bytes, so the search
        # pattern must be bytes as well.
        how_to_open = OPEN_FUNCS[is_gzipped(local_fasta)]
        fmt_func = FORMATTING_FUNCS[is_gzipped(local_fasta)]
        if is_gzipped(local_fasta):
            search_for = b">" + bytes(acc, 'ascii') + b" "
        else:
            search_for = ">{} ".format(acc)
        # end if

        with how_to_open(local_fasta) as fasta_file:
            for line in fasta_file:
                if line.startswith(search_for):
                    seq_name = fmt_func(line).partition(' ')[2]  # get name of the sequence
                    save_taxonomy_directly(taxonomy_path, acc, seq_name)
                    break
                # end if
            # end for
        # end with
    else:
        # Try to find taxonomy in NCBI
        download_taxonomy(acc, hit_def, taxonomy_path)
def _is_redundant(nc_acc, accs):
    # Function checks if "NC-or-NW"-record is redundant, i.e. if its
    # non-RefSeq copy already exists in acc_dict.
    #
    # :param nc_acc: accession number of NC-record;
    # :type nc_acc: str;
    # :param accs: tuple of accession numbers;
    # :type accs: tuple<str>;
    #
    # Returns tuple: (identical GenBank accession, bool "is redundant").

    # NOTE(review): the result of this request is never used below --
    # presumably left over from an older version or kept for its side
    # effect on the NCBI session; confirm before removing.
    summary = lingering_https_get_request(
        "www.ncbi.nlm.nih.gov",
        "/nuccore/{}?report=genbank&log$=seqview".format(nc_acc), "summary",
        nc_acc)

    try:
        # Find link to Identical GenBank Record

        # Firstly, get GI number of NC seqeunce:
        get_gi_url = "/nuccore/{}?report=gilist&log$=seqview&format=text".format(
            nc_acc)
        nc_gi_text = lingering_https_get_request("www.ncbi.nlm.nih.gov",
                                                 get_gi_url,
                                                 "GI of {}".format(nc_acc),
                                                 nc_acc)
        nc_gi_text = nc_gi_text.replace('\n', '')
        nc_gi_re = re.search(r"\<pre\>([0-9]+).*\</pre\>", nc_gi_text)
        if nc_gi_re is None:
            raise _NoIdentLabelError(
                "Error 771. Accession: {}. Please, contact the developer.".
                format(nc_acc))
        # end if
        nc_gi = nc_gi_re.group(1)

        # Retrieve identical GenBank sequence accession number.
        # NCBI redirects these requests and provides necessary location in headers.
        # So, we'll follow this link.
        identical_gb_link = "/nuccore?LinkName=nuccore_nuccore_rsgb&from_uid={}".format(
            nc_gi)
        redirect_text = _ling_https_getreq_handl_301(
            "www.ncbi.nlm.nih.gov", identical_gb_link,
            "link to identical genbank sequence", nc_acc)

        # Get accession number from the response text
        pattern = r"\<pre\>(.*).*\</pre\>"
        ident_acc_re = re.search(pattern, redirect_text.replace('\n', ''))
        if ident_acc_re is None:
            raise _NoIdentLabelError(
                "Error 773. Accession: {}. Please, contact the developer.".
                format(nc_acc))
        # end if
        # Drop the version suffix (e.g. ".1") from the accession
        ident_acc = ident_acc_re.group(1).partition('.')[0]
    except (_NoIdentLabelError, _NoLinkError, _NoAccError) as err:
        printlog_error_time("Error: {}".format(err))
        platf_depend_exit(1)
    else:
        return ident_acc, ident_acc in accs
def create_or_emply_file(file_path):
    """Create file `file_path`, truncating it to zero length if it exists."""
    try:
        # Opening in 'wt' mode creates the file and discards any old content
        open(file_path, 'wt').close()
    except OSError as err:
        print(f'\nError: cannot create file {file_path}')
        print(str(err))
        platf_depend_exit(1)
def make_outdir(outdpath: str) -> None:
    """Create output directory `outdpath` unless it already exists."""
    if os.path.exists(outdpath):
        return
    # end if
    try:
        os.makedirs(outdpath)
    except OSError as err:
        print('Error: cannot create output directory `{}`.'.format(outdpath))
        print(str(err))
        platf_depend_exit(1)
    # end try
def rename_file_verbosely(file):
    # Function verbosely renames file (as well as directory) given to it.
    # Appends an `_old_<number>` suffix (or bumps an existing number) so the
    # old file is kept out of the way instead of being overwritten.
    #
    # :param file: path to file (directory) meant to be renamed;
    # :type file: str;
    #
    # Returns the new path, or None if `file` does not exist.

    if not os.path.exists(file):
        return None
    # end if

    # Path to "file's" parent directory
    pardir = os.path.abspath(os.path.dirname(file))

    # Function can rename directories
    if os.path.isdir(file):
        # NOTE(review): the basename is interpolated into the regex without
        # re.escape -- names containing regex metacharacters may misbehave;
        # confirm callers only pass plain names.
        is_analog = lambda f: not re.search(r"{}.*(_old_[0-9]+)?$"\
            .format(os.path.basename(file)), f) is None
        word = "directory"
        name_itself = file
        ext = ""
    else:
        # For files, an "analog" is any entry containing the stem of `file`
        is_analog = lambda f: re.search(r"(.*)\..*$", os.path.basename(file)
                                        ).group(1) in f
        word = "file"
        name_itself = re.search(r"(.*)\..*$", file).group(1)
        ext = re.search(r".*(\..*)$", file).group(1)
    # end if

    # Count files in 'pardir' that have analogous names as 'file' has:
    num_analog_files = len(list(filter(is_analog, os.listdir(pardir))))

    if re.search(r"_old_[0-9]+", file) is None:
        # Append "_old_<number>"
        new_name = name_itself + "_old_" + str(num_analog_files) + ext
    else:
        # Merely substitute new number.
        # NOTE(review): str.replace substitutes the first occurrence of this
        # number anywhere in the path -- verify the counter cannot also
        # appear earlier in the path.
        new_name = file.replace(
            re.search(r"_old_([0-9]+)", file).group(1),
            str(num_analog_files + 1))
    # end if

    try:
        print()
        printlog_info(" - Renaming old {}:".format(word))
        printlog_info(" `{}` --> `{}`".format(file, new_name))
        os.rename(file, new_name)
    except OSError as err:
        printlog_error_time("Error: {} `{}` cannot be renamed:".format(
            word, str(file)))
        printlog_error(str(err))
        platf_depend_exit(1)
    # end try

    return new_name
def remove_tmp_files(*paths):
    """Remove the files whose paths are passed.

    :param paths: an array-like collection of paths of files;
    :type paths: list<str>;
    """
    for path in paths:
        if os.path.exists(path):
            try:
                os.unlink(path)
            except OSError as oserr:
                # Bug fix: `.format(path)` used to be applied to the
                # return value of `printlog_error_time` (likely None)
                # instead of the message string itself.
                printlog_error_time(
                    "Error: cannot remove file `{}`".format(path))
                printlog_error(str(oserr))
                platf_depend_exit(1)
            # end try
        # end if
    # end for
def update_file_dict(srt_file_dict, new_fpath):
    """Register `new_fpath` in the dictionary of sorted-output files.

    Opens the file in append mode; a `None` path maps to `None`
    (the "no trash" case). Returns the updated dictionary.
    """
    try:
        if new_fpath is None:
            srt_file_dict[new_fpath] = None  # handle no_trash
        else:
            srt_file_dict[sys.intern(new_fpath)] = open(new_fpath, 'a')
        # end if
    except OSError as oserr:
        printlog_error_time("Error occured while opening one of result files")
        printlog_error("Errorneous file: `{}`".format(new_fpath))
        printlog_error(str(oserr))
        platf_depend_exit(1)
    # end try
    return srt_file_dict
def _fasta_generator(infpath: str) -> Generator[Tuple[str, str], None, None]:
    # Generator yields "fasta-tuples": 0-th element of such a tuple is sequence name
    # (without the leading '>'), and 1-st element is the sequence itself (uppercased).
    #
    # :param infpath: path to input fasta file (may be gzipped, judging by `.gz`);

    curr_seq_name: str = ''  # current sequence name
    curr_seq: str = ''  # current sequence

    open_func: Callable[[str], ContextManager]  # function for opening input file

    # Choose `open_func` depending on whether the file is gzipped
    if infpath.endswith('.gz'):
        open_func = partial(gzip.open, mode='rt', encoding='utf-8')
    else:
        open_func = partial(open, mode='rt', encoding='utf-8')
    # end if

    infile: TextIO
    with open_func(infpath) as infile:
        eof: bool = False  # indicates if End Of File is reached

        # Get the first sequence name
        curr_seq_name = infile.readline().strip()

        while not eof:
            # Get next line whatever it is
            line: str = infile.readline().strip()

            if line.startswith('>') or line == '':
                # We reached the end of the current sequence

                # Validate parsed sequence; `_validate_fasta` is expected
                # to raise ValueError on a malformed record
                try:
                    _validate_fasta(curr_seq_name, curr_seq)
                except ValueError:
                    platf_depend_exit(1)
                # end try

                yield curr_seq_name[1:], curr_seq  # yield current sequence
                curr_seq_name = line  # read next header
                curr_seq = ''  # empty sequence

                if line == '':
                    # no more sequences -- end of file
                    eof = True
                # end if
            else:
                curr_seq += line.upper()  # new line is a sequence -- append it to `curr_seq`
def _create_outdir_from_outfile(outfpath: str) -> None: # Function creates output directory # :param outfpath: path to output file; outdpath = os.path.dirname(outfpath) # Create directory if it does not exist if not os.path.isdir(outdpath): try: os.makedirs(outdpath) except OSError as err: print(f'Error! Cannot create output directory `{outdpath}`.') print(str(err)) platf_depend_exit(1)
def fastq_generator(fq_fpaths):
    """Yield fastq records from the input files in lockstep.

    It does not create new FastqRecord objects each time: a single list
    of records is reused and merely updated in place.

    :param fq_fpaths: list ot paths to input fastq files;
    :type fq_fpaths: list<str>, tuple<str>;

    Yields list of FastqRecord-s, list<FastqRecord>.
    """
    # Get open funtions for both files
    open_funcs = src.compression.provide_open_funcs(fq_fpaths)

    # Open input files and create one reusable FastqRecord per file
    pairs = list(zip(fq_fpaths, open_funcs))
    fq_files = [open_func(fpath) for fpath, open_func in pairs]
    fq_records = [FastqRecord(None, None, None, None) for _ in pairs]

    eof = False
    while not eof:
        # Read the next 4-line fastq record from every file
        for fq_record, fq_file in zip(fq_records, fq_files):
            fq_record.update_record(fq_file.readline().strip(),
                                    fq_file.readline().strip(),
                                    fq_file.readline().strip(),
                                    fq_file.readline().strip())
        # end for

        if fq_records[0].read_name == '':
            eof = True  # end of file
        else:
            # Validate fastq record(s) before yielding them
            for fq_record in fq_records:
                error_response = fq_record.validate_fastq()
                if error_response is not None:
                    printlog_error('Fastq error: {}'.format(error_response))
                    platf_depend_exit(1)
                # end if
            # end for
            yield fq_records
        # end if
    # end while

    # Close input files.
    for fq_file in fq_files:
        fq_file.close()
def _bname_no_fasta_ext(fpath: str) -> str: # Function removes fasta extention (with `.gz` one, if it it present) # Find the extention ext_match_obj: re.Match = re.search(r'.+(\.f(ast)?a(\.gz)?)$', os.path.basename(fpath)) # Remove it bname_no_ext: str if ext_match_obj is None: print('Error 12: please, contact the developer.') platf_depend_exit(12) else: bname_no_ext = os.path.basename(fpath).replace(ext_match_obj.group(1), '') # end if return bname_no_ext
def verify_taxids(taxid_list):
    """Verify TaxIDs passed to prober with the `-g` option.

    Requests NCBI Taxonomy Browser and parses the organism name from the
    HTML response. Also configures the `organisms` list that will be
    included into BLAST submissions.

    :param taxid_list: list of TaxIDs. TaxIDs are strings, but they are
        verified to be integers during CL argument parsing;
    :type taxid_list: list<str>;

    Returns list of strings of the following format:
    "<tax_name> (taxid:<TaxID>)>"
    """
    organisms = list()
    if len(taxid_list) > 0:
        printlog_info("Verifying TaxIDs:")
        for taxid in taxid_list:
            printn("   {} - ".format(taxid))
            try:
                tax_url = "/Taxonomy/Browser/wwwtax.cgi?mode=Info&id={}".format(
                    taxid)
                tax_resp = lingering_https_get_request(
                    "www.ncbi.nlm.nih.gov", tax_url, "taxonomy")
                tax_name = re.search(r"Taxonomy browser \((.+?)\)",
                                     tax_resp).group(1)
            except AttributeError:
                # Name not found in the HTML response -- invalid TaxID
                printlog_error("\aError: TaxID not found")
                printlog_error(
                    "Please, check your TaxID: https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi"
                )
                platf_depend_exit(1)
            except OSError as oserr:
                printlog_error("Something is wrong with connection:")
                printlog_error(str(oserr))
                platf_depend_exit(-2)
            else:
                print(tax_name)
                log_info("{} - {}".format(taxid, tax_name))
                organisms.append("{} (taxid:{})".format(tax_name, taxid))
            # end try
        # end for
        print('-' * 30 + '\n')
    # end if
    return organisms
def search_for_related_replicons(acc_dict):
    """Search for replicons related to the hits in 'hits_to_download.tsv'
    or those specified with the '-s' option.

    :param acc_dict: dictionary containing accession data of hits;
    :type acc_dict: dict<str: tuple<str, str, int>>;

    Modifies `acc_dict` in place: related replicons are added to it.
    """
    print()
    printlog_info_time("Searching for related replicons...")

    # Accessions which were "discovered" by prober
    start_accs = tuple(acc_dict.keys())

    for i, acc in enumerate(start_accs):
        printlog_info("{}. {} ({}):".format(i + 1, acc, acc_dict[acc]))

        # Search for related replicons:
        try:
            related_repls = _get_related_replicons(acc, acc_dict)
        except AttributeError:
            # Bug fix: this branch used to call the misspelled name
            # `printlog_errot_time`, which raised NameError instead of
            # reporting the parsing error.
            printlog_error_time(
                "Parsing error: cannot find replicons related to {}.".format(
                    acc))
            printlog_error("Please, contact the developer")
            platf_depend_exit(1)
        else:
            related_repls = _deduplicate_replicons(related_repls, acc)
        # end try
        for rel_acc, rel_def in related_repls:
            acc_dict[rel_acc] = rel_def
        # end for
    # end for

    print()
    if len(start_accs) != len(acc_dict):  # there are some new replicons
        printlog_info_time("{} related replicons have been found.".\
            format(len(acc_dict) - len(start_accs)))
    else:
        printlog_info_time("No related replicons found.")
    # end if
    print()
# end def search_for_related_replicons
def whether_to_build_index(index_dirpath):
    # Function checks if there are any files in index directory.
    # If there are any, it asks a user whether to create a new index
    # or to use the old one.
    #
    # :param index_dirpath: path to index directory;
    # :type index_dirpath: str;
    #
    # Returns True if the old index should be used, False otherwise.

    use_old_index = False

    if len(os.listdir(index_dirpath)) != 0:
        printlog_info(
            "Index file created by `-u` option already exists (left from previous run)."
        )

        error = True
        while error:
            # Keep prompting until a valid reply is given
            reply = input("""
Press ENTER to make new index file
or enter 'u' to use old index file:>>""")
            if reply == "":
                # Make a new index: remove all old index files first
                try:
                    for path in glob(os.path.join(index_dirpath, '*')):
                        os.unlink(path)
                    # end for
                except OSError as oserr:
                    printlog_error_time(
                        "Error: cannot remove old index files!")
                    printlog_error(str(oserr))
                    platf_depend_exit(1)
                # end try
                error = False
            elif reply == 'u':
                use_old_index = True
                error = False
            else:
                print("Invalid reply!\n")
            # end if
        # end while
        printlog_info("You have chosen to {} index file.".format(
            "use old" if use_old_index else "make new"))
        print()
    # end if
    return use_old_index
def launch_blastn(packet, blast_algorithm, use_index, queries_tmp_dir, db_path): """ Function launches 'blastn' utility from "BLAST+" toolkit and returns it's response. :param pacekt: FASTA data meant to be processend by 'blastn'; :type packet: str; :param blast_algorithm: blastn algorithm to use; :type blast_algorithm: str; :param use_index: logic value inddicating whether to use index; :type use_index: bool: :param queries_tmp_dir: path to directory with query files; :type queries_tmp_dir: str: :param db_path: path to database; :type db_path: str: """ # PID of current process won't change, so we can use it to mark query files. # 'paket's are too large to pass them to 'subprocess.Popen' as stdin, # therefore we need to use these query files. query_path = os.path.join(queries_tmp_dir, "query{}_tmp.fasta".format(os.getpid())) with open(query_path, 'w') as query_file: query_file.write(packet) # end with # Configure command line blast_cmd = "blastn -query {} -db {} -outfmt 5 -task {} -max_target_seqs 10 -max_hsps 1 -use_index {}"\ .format(query_path, db_path, blast_algorithm, use_index) pipe = sp.Popen(blast_cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE) stdout_stderr = pipe.communicate() if pipe.returncode != 0: printlog_error_time( "Error occured while aligning a sequence against local database") printlog_error(stdout_stderr[1].decode("utf-8")) platf_depend_exit(pipe.returncode) # end if return stdout_stderr[0].decode("utf-8")
def make_outdir(outdir):
    """Make output directory.

    Warns the user if `outdir` is not empty and offers to empty it.

    :param outdir: path to outdir;
    :type outdir: str;
    """
    if not os.path.exists(outdir):
        # Create outdir if it doesn't exist.
        try:
            os.makedirs(outdir)
        except OSError as err:
            print('Cannot create output directory: {}'.format(err))
            platf_depend_exit(1)
        # end try
        return
    # end if

    if len(os.listdir(outdir)) == 0:
        return  # outdir exists and is empty -- nothing to do
    # end if

    # Outdir is not empty -- warn user and ask if he/she wants to empty it now.
    print('\nOutput directory `{}` is not empty.'.format(outdir))
    error = True
    while error:
        reply = input("""Press ENTER to remove all files in it and proceed
or enter `q` to exit\n
>> """)
        if reply == '':
            error = False
            for fpath in glob.iglob(os.path.join(outdir, '*')):
                print('Removing `{}`'.format(fpath))
                try:
                    os.unlink(fpath)
                except OSError as err:
                    print('Error. Cannot remove file `{}`: {}'\
                        .format(fpath, err))
                # end try
            # end for
        elif reply.lower() == 'q':
            # Just exit
            sys.exit(0)
        else:
            print('Invalid reply: `{}`'.format(reply))
        # end if
    # end while
def _select_get_matches(
    term: str
) -> Callable[[MutableSequence[Overlap]], Collection[Overlap]]:
    """Return the match-extraction function for the given terminus.

    's' selects `_get_start_matches`; 'e' selects `_get_end_matches`.

    :param term: string 's' (start) or 'e' (end);
    """
    if term not in ('s', 'e'):
        # Invalid terminus -- report and exit
        print('Fatal error: invalid value passed to function \
`_get_overlaps_str_for_table` with argument `term`: `{}`'.format(term))
        print('Please, contact the developer.')
        platf_depend_exit(1)
    # end if
    return _get_start_matches if term == 's' else _get_end_matches
def parse_args(version: str, last_update_date: str) -> Tuple[Sequence[str], Mapping[str, Any]]:
    """Parse command line arguments.

    Returns two values:
      1. Collection of paths to input files.
      2. Dictionary of parameters (see function _parse_options).
    """
    argv = sys.argv[1:]

    # Print help message and exit if required
    if '-h' in argv or '--help' in argv:
        print_help(version, last_update_date)
        platf_depend_exit()
    # end if

    # Print version and exit if required
    if '-v' in argv or '--version' in argv:
        print(version)
        platf_depend_exit()
    # end if

    # Parse arguments woth getopt
    opts: List[List[str]]
    args: List[str]
    try:
        opts, args = getopt.gnu_getopt(
            argv, 'hvk:i:a:o:',
            ['help', 'version', 'k-mer=', 'mink=', 'maxk=', 'outdir='])
    except getopt.GetoptError as err:
        print(str(err))
        platf_depend_exit(2)
    # end try

    # Extract paths to input files from parsed arguments
    contigs_fpaths: Sequence[str] = _get_input_fpaths(args)
    # Extract optional parameters from parsed arguments
    params: Dict[str, Any] = _parse_options(opts)

    # Verify mink and maxk: reconcile them when only one was given explicitly
    if params['i'] > params['a']:
        mink_given = '-i' in argv or '--mink' in argv
        maxk_given = '-a' in argv or '--maxk' in argv
        if not mink_given:
            params['i'] = params['a']
        elif not maxk_given:
            params['a'] = params['i']
        else:
            print('Error: minimum length of a k-mer is greater than maximum length of a k-mer.')
            print('Values specified by you:')
            print('Minimum length of a k-mer: {}.'.format(params['i']))
            print('Maximum length of a k-mer: {}.'.format(params['a']))
            platf_depend_exit(1)
        # end if
    # end if

    return contigs_fpaths, params
def add_lambda_phage(local_fasta, taxonomy_path):
    # Function adds control sequence of nanopore lambda phase DNA-CS
    # to 'local_fasta'.
    #
    # :param local_fasta: path to file with reference sequences to be included in database;
    # :type local_fasta: str;
    # :param taxonomy_path: path to taxonomy file;
    # :type taxonomy_path: str;

    print()
    printlog_info_time("Adding lambda phage control sequence...")

    # sys.path[0] is directory containing the script that was used to invoke the Python interpreter.
    # We will use it to get path to file with lambda's sequence.
    lambda_fpath = os.path.join(os.path.dirname(sys.path[0]),
                                "lambda_control",
                                "nanopore_lambda_DNA-CS_control.fasta.gz")

    # Check file existance
    if not os.path.exists(lambda_fpath):
        printlog_error_time(
            "Error: cannot find lambda phage control sequence: '{}'".format(
                lambda_fpath))
        platf_depend_exit(1)
    # end if

    # Read lambda's sequence
    with open_as_gzip(lambda_fpath, 'rb') as lambda_file:
        lambda_fasta = lambda_file.read()
    # end with

    # Write it to db fasta file.
    # NOTE(review): mode 'wb' truncates `local_fasta` -- the lambda
    # sequence replaces any existing content rather than being appended;
    # confirm callers always invoke this before adding other references.
    with open(local_fasta, 'wb') as db_fasta_file:
        db_fasta_file.write(lambda_fasta)
    # end with

    # Save lambda's taxonomy
    taxonomy.save_taxonomy_directly(taxonomy_path, "LAMBDA",
                                    "Lambda-phage-nanopore-control")
    printlog_info_time(" ok")
def check_depencencies() -> None:
    """Check all necessary dependencies for the program.

    Prints the version of each dependency; collects all error messages
    and exits after reporting them if any check failed.
    """
    version: str = None
    err_msg: str = None
    err_msg_list: List = list()

    # Dependency names paired with the functions that check them
    dependencies: Sequence[str] = ('Biopython', 'samtools')
    check_funcitons: Sequence[Callable[[], Tuple[str, str]]] = (
        _check_biopython,
        _check_samtools,
    )

    print('\nDependencies:')
    for dep_name, chech_func in zip(dependencies, check_funcitons):
        print(f'{dep_name}:', end='')
        version, err_msg = chech_func()  # check the dependence
        if err_msg is not None:
            # Collect the error; keep checking the remaining dependencies
            err_msg_list.append(err_msg)
        # end if
        print(f' version {version}')
    # end for

    # Print errors, if they occured
    if len(err_msg_list) != 0:
        print('Dependencies errors:')
        for err_msg in err_msg_list:
            print(f' - {err_msg}')
        # end for
        platf_depend_exit(1)
    # end if

    print('All dependencies are satisfied.\n')
def verify_cl_accessions(accs_to_download, acc_dict):
    # Function checks existance of GenBank records that correspond to accessions
    # specified with '-s' option. After checking, the function fulfills `acc_dict`.
    #
    # :param accs_to_download: list of accessions from command line ('-s');
    # :type accs_to_download: list<str>;
    # :param acc_dict: dictionary {<ACCESSION>: <HIT_DEFINITION>};
    # :type acc_dict: dict<str: tuple<str>>;

    check_connection("https://www.ncbi.nlm.nih.gov/")

    printlog_info_time("Verifying `-s` accessions...")
    sys.stdout.write("0/{}".format(len(accs_to_download)))

    for i, acc in enumerate(accs_to_download):
        # Request a record summary from NCBI E-utilities
        server = "eutils.ncbi.nlm.nih.gov"
        url = "/entrez/eutils/esummary.fcgi?db=nuccore&id={}".format(acc)
        text = lingering_https_get_request(server, url, "record's name", acc)

        # Parse the record title out of the XML summary
        name = re.search(
            r"\<Item Name=\"Title\" Type=\"String\"\>(.+)\</Item\>", text)

        if name is None:
            printlog_info(
                "Cannot find GenBank record with accession '{}'".format(acc))
            platf_depend_exit(1)
        else:
            name = name.group(1)
        # end if
        acc_dict[acc] = name
        # Progress indicator
        sys.stdout.write("\r{}/{}".format(i + 1, len(accs_to_download)))
    # end for
    print()
    printlog_info_time("OK.")
def get_primers_seqs(primers_fpath):
    """Obtain primer sequence(s).

    If `primers_fpath` is None, default Illumina 16S V3-V4 primers are
    returned. Otherwise primers are parsed from the provided fasta file.

    :param primers_fpath: path to fasta file with primers, or None;

    Returns a collection of 1 or 2 primer sequences (str).
    """
    # Use Illumina V3-V4 primers by default
    if primers_fpath is None:
        return ('CCTACGGGNGGCWGCAG', 'GACTACHVGGGTATCTAATCC')
    # end if

    primers = list()

    # Get lines
    try:
        with open(primers_fpath, 'r') as primers_file:
            lines = primers_file.readlines()
        # end with
    except OSError as oserror:
        printlog_error('Error while reading file of primers: {}'\
            .format(oserror))
        platf_depend_exit(1)
    # end try

    # Remove blank lines.
    # Bug fix: `readlines` keeps trailing newline characters, so blank
    # lines are '\n', not '' -- strip before comparing, otherwise blank
    # lines were never filtered out.
    lines = list(filter(lambda x: x.strip() != '', lines))

    # There must be 1 or 2 primers in primers file.
    if len(lines) not in (2, 4):
        printlog_error('Error: invalid format of primers file.\
It should be single (2 lines at all) or "double" (4 lines at all) fasta file.\
Bu there are {} lines in your file.'.format(len(lines)))
        platf_depend_exit(1)
    # end if

    bases = 'AGCTUTYSWKMBDHVN'

    # Validate sequence(s).
    for i in range(1, len(lines), 2):
        seq = lines[i].strip().upper()
        # Bug fix: `re.match` only anchors at the start, so an invalid
        # character after a valid prefix went unnoticed. `fullmatch`
        # requires the whole sequence to consist of permitted bases.
        if re.fullmatch(r'[{}]+'.format(bases), seq) is None:
            printlog_error('Error: invalid character in primer sequence.\
Here is invalid primer sequence: `{}`. Permitted characters: `{}`'\
                .format(seq, bases))
            platf_depend_exit(1)
        # end if
        primers.append(seq)
    # end for

    return primers
def wait_for_align(rid, rtoe, pack_to_send, filename):
    # Function waits untill BLAST server accomplishes the request.
    #
    # :param rid: Request ID to wait for;
    # :type rid: str;
    # :param rtoe: time in seconds estimated by BLAST server needed to accomplish the request;
    # :type rtoe: int;
    # :param pack_to_send: current packet (id) number to send;
    # :type pack_to_send: int;
    # :param filename: basename of current FASTA file;
    # :type filename: str
    #
    # Returns a tuple (XML response 'str' or None, BlastError).

    print()
    print("Requesting for current query status. Request ID: {}".format(rid))
    print(" `{}`; Submission #{}".format(filename, pack_to_send[0]))
    log_info("Requesting for current query status.")
    log_info("Request ID: {}; `{}`; Submission #{}".format(
        rid,
        filename,
        pack_to_send[0],
    ))

    # RTOE can be zero at the very beginning of resumption
    if rtoe > 0:
        printlog_info_time(
            "BLAST server estimates that alignment will be accomplished in {} seconds"
            .format(rtoe))
        printlog_info_time(
            "Waiting for {}+3 (+3 extra) seconds...".format(rtoe))
        # Server migth be wrong -- we will give it 3 extra seconds
        sleep(rtoe + 3)
        printlog_info_time(
            "{} seconds have passed. Checking if alignment is accomplished...".
            format(rtoe + 3))
    # end if

    server = "blast.ncbi.nlm.nih.gov"
    wait_url = "/blast/Blast.cgi?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=" + rid

    whtspc_len = 6 + len("(requesting)")

    while True:
        resp_content = lingering_https_get_request(server, wait_url,
                                                   "BLAST response")

        # if server asks to wait
        if "Status=WAITING" in resp_content:
            printn("\r{} - The request is being processed. Waiting{}{}".format(
                getwt(), ' ' * whtspc_len, "\033[%dD" % whtspc_len))
            # indicate each 10 seconds with a dot (6 dots, then re-request)
            for i in range(1, 7):
                sleep(10)
                printn(
                    "\r{} - The request is being processed. Waiting{}".format(
                        getwt(), '.' * i))
            # end for
            printn("(requesting)")
            continue
        elif "Status=FAILED" in resp_content:
            # if job failed
            print()
            printlog_info_time("Job failed\a\n")
            printlog_info("Resending this packet.")
            return None, BlastError(2)
        elif "Status=UNKNOWN" in resp_content:
            # if job expired
            print()
            printlog_info_time("Job expired\a\n")
            printlog_info("Resending this packet.")
            return None, BlastError(1)
        # if results are ready
        elif "Status=READY" in resp_content:
            print()
            printlog_info("Result for query `{}` #{} is ready!".format(
                filename, pack_to_send[0]))
            # if there are hits
            if "ThereAreHits=yes" in resp_content:
                for i in range(15, 0, -5):
                    print('-' * i)
                # end for
                print("-\nRetrieving results...")

                # Retrieve human-readable text and put it into result directory
                retrieve_text_url = "/Blast.cgi?CMD=Get&FORMAT_TYPE=Text&DESCRIPTIONS=1&ALIGNMENTS=1&RID=" + rid
                txt_align_res = lingering_https_get_request(
                    server, retrieve_text_url,
                    "text version of BLAST response")

                # Count already existing plain text files in outdir:
                is_txt_response = lambda f: not re.search(
                    r"prober_blast_response_[0-9]+\.txt", f) is None
                # NOTE(review): the output directory is recovered from the
                # path of the root logger's first file handler -- relies on
                # logging being configured with a file handler in outdir.
                outdir_path = os.path.dirname(logging.getLoggerClass(
                ).root.handlers[0].baseFilename)  # tricky trick
                response_num = len(
                    tuple(filter(is_txt_response, os.listdir(outdir_path))))

                # Curent txt response file will have number `response_num+1`
                txt_hpath = os.path.join(
                    outdir_path,
                    "prober_blast_response_{}.txt".format(response_num + 1))
                # Write text result for a human to read
                with open(txt_hpath, 'w') as txt_file:
                    txt_file.write(txt_align_res)
                # end with
            elif "ThereAreHits=no" in resp_content:
                # if there are no hits
                printlog_info_time("There are no hits. It happens.\n")
            else:
                # probably, job is failed if execution reaches here
                print()
                printlog_info_time("Job failed\a\n")
                printlog_info("Resending this packet.")
                return None, BlastError(2)
            # end if
            break
        # end if
        # Execution should not reach here
        printlog_error_time(
            "Fatal error (-122). Please contact the developer.\a\n")
        platf_depend_exit(-122)
    # end while

    # Retrieve XML result
    retrieve_xml_url = "/Blast.cgi?CMD=Get&FORMAT_TYPE=XML&ALIGNMENTS=1&RID=" + rid
    xml_text = lingering_https_get_request(server, retrieve_xml_url,
                                           "XML BLAST response")

    if "Bad Gateway" in xml_text:
        print()
        printlog_info_time("Bad Gateway. Data from last packet has been lost.")
        printlog_info("Resending this packet.")
        return None, BlastError(1)
    elif "Status=FAILED" in xml_text:
        print()
        printlog_info_time("BLAST error: request failed")
        printlog_info("Resending this packet.")
        return None, BlastError(2)
    elif "to start it again" in xml_text:
        print()
        printlog_info_time("BLAST error")
        printlog_info("Resending this packet.")
        return None, BlastError(2)
    elif "[blastsrv4.REAL]" in xml_text:
        blastsrv4_match = re.search(r"(\[blastsrv4\.REAL\].*\))", xml_text)
        blastsrv4_str = "" if blastsrv4_match is None else ": {}".format(
            blastsrv4_match.group(1))
        printlog_info_time("BLAST server error{}".format(blastsrv4_str))
        # Error code 2 indicated that we need to split packet and resubmit
        return None, BlastError(2)
    # end if

    return xml_text, BlastError(0)
def send_request(request, pack_to_send, packet_size, packet_mode, filename, tmp_fpath):
    # Function sends a request to "blast.ncbi.nlm.nih.gov/blast/Blast.cgi"
    # and then waits for satisfaction of the request and retrieves response text.
    #
    # :param request: request data (the dict that `configure_request()` function returns);
    # :type request: dict<str: dict>;
    # :param pack_to_send: ordinal number (like an id) of the packet meant to be sent now;
    # :type pack_to_send: int;
    # :param packet_size: number of sequences in the packet;
    # :type packet_size: int;
    # :param packet_mode: packet forming mode, saved to the temporary file for resuming;
    # :param filename: name of the source file, used in log messages;
    # :type filename: str;
    # :param tmp_fpath: path to the temporary file where request state (RID etc.) is saved;
    # :type tmp_fpath: str;
    #
    # Returns XML text of type 'str' with BLAST response.

    payload = request["payload"]
    headers = request["headers"]

    server = "blast.ncbi.nlm.nih.gov"
    url = "/blast/Blast.cgi"
    error = True

    # Send the request, retrying every 30 seconds while NCBI is unreachable.
    while error:
        conn = None
        try:
            conn = http.client.HTTPSConnection(server)  # create a connection
            conn.request("POST", url, payload, headers)  # send the request
            response = conn.getresponse()  # get the response
            response_text = str(response.read(), "utf-8")  # get response text
        except OSError as oserr:
            printlog_info_time(
                "`https://blast.ncbi.nlm.nih.gov` is not available.")
            printlog_info(str(oserr))
            printlog_info(
                "barapost will try to connect again in 30 seconds...\n")
            sleep(30)
        # if no exception occured
        else:
            error = False
        finally:
            # Bug fix: close the connection on the failure path too --
            # previously a connection that raised OSError leaked until retry.
            if conn is not None:
                conn.close()
        # end try
    # end while

    try:
        rid = re.search(r"RID = (.+)", response_text).group(1)  # get Request ID
        rtoe = int(re.search(r"RTOE = ([0-9]+)", response_text).group(
            1))  # get time to wait provided by the NCBI server
    except AttributeError:
        # NCBI returned a page without RID/RTOE -- request was denied.
        printlog_error_time("Seems, NCBI has denied your request.")
        printlog_error("Response is in file `request_denial_response.html`")
        with open("request_denial_response.html", 'w') as den_file:
            den_file.write(response_text)
        # end with
        platf_depend_exit(1)
    # end try

    # Save temporary data so an interrupted run can be resumed.
    with open(tmp_fpath, 'w') as tmpfile:
        tmpfile.write("Request_ID: {}\n".format(rid))
        tmpfile.write("Packet_size: {}\n".format(packet_size))
        tmpfile.write("Packet_mode: {}".format(packet_mode))
    # end with

    # Wait for results of alignment
    return wait_for_align(rid, rtoe, pack_to_send, filename)
def bin_fastqa_file(fq_fa_lst, tax_annot_res_dir, sens, n_thr, min_qual,
        min_qlen, min_pident, min_coverage, no_trash):
    # Function for parallel binning FASTQ and FASTA files.
    # Actually bins multiple files: reads taxonomic annotation (TSV) for each
    # input file, applies quality/length and identity/coverage filters, and
    # appends each record to the output file of its best hit (or a trash file).
    #
    # :param fq_fa_lst: list of paths to FASTQ (or FASTA) files meant to be processed;
    # :type fq_fa_lst: list<str>;
    # :param tax_annot_res_dir: path to directory with taxonomic annotation results;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity (passed through to `configure_resfile_lines`);
    # :param n_thr: number of records accumulated per write batch;
    # :type n_thr: int;
    # :param min_qual: threshold for quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if user does NOT want to output trash files;
    # :type no_trash: bool;
    #
    # Returns tuple (seqs_pass, QL_seqs_fail, align_seqs_fail) of 'int' counters.

    # Output directory is derived from the log file location
    # (same "tricky trick" used elsewhere in this file).
    outdir_path = os.path.dirname(
        logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0  # counter for sequences, which pass filters
    QL_seqs_fail = 0  # counter for too short or too low-quality sequences
    align_seqs_fail = 0  # counter for sequences, which align to their best hit with too low identity or coverage

    for fq_fa_path in fq_fa_lst:

        new_dpath = get_curr_res_dpath(fq_fa_path, tax_annot_res_dir)
        tsv_res_fpath = get_res_tsv_fpath(new_dpath)
        taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy",
                                     "taxonomy.tsv")
        # Map read id -> (hit names, values for filtering)
        resfile_lines = configure_resfile_lines(tsv_res_fpath, sens,
                                                taxonomy_path)

        # Choose record reader/writer matching the input format
        if is_fastq(fq_fa_path):
            seq_records_generator = fastq_records
            write_fun = write_fastq_record
        else:
            seq_records_generator = fasta_records
            write_fun = write_fasta_record
        # end if

        # Make filter for quality and length
        QL_filter = get_QL_filter(fq_fa_path, min_qual, min_qlen)
        # Configure path to trash file
        if not no_trash:
            QL_trash_fpath = get_QL_trash_fpath(
                fq_fa_path,
                outdir_path,
                min_qual,
                min_qlen,
            )
        else:
            QL_trash_fpath = None
        # end if

        # Make filter for identity and coverage
        align_filter = get_align_filter(min_pident, min_coverage)
        # Configure path to this trash file
        if not no_trash:
            align_trash_fpath = get_align_trash_fpath(fq_fa_path, outdir_path,
                                                      min_pident, min_coverage)
        else:
            align_trash_fpath = None
        # end if

        # Create an iterator that will yield records
        seq_records_iterator = iter(seq_records_generator(fq_fa_path))
        # Dict for storing batches of sequences meant to be written to output files:
        to_write = dict()
        stop = False  # for outer while-loop

        while not stop:

            # Extract batch of records of 'n_thr' size and find their destination paths:
            for _ in range(n_thr):

                try:
                    fastqa_rec = next(seq_records_iterator)
                except StopIteration:
                    stop = True  # for outer while-loop
                    break
                # end try

                read_name = sys.intern(fmt_read_id(
                    fastqa_rec["seq_id"])[1:])  # get ID of the sequence

                try:
                    hit_names, *vals_to_filter = resfile_lines[
                        read_name]  # find hit corresponding to this sequence
                except KeyError:
                    printlog_error_time("Error: read `{}` not found in TSV file containing taxonomic annotation."\
                        .format(read_name))
                    printlog_error("This TSV file: `{}`".format(tsv_res_fpath))
                    printlog_error(
                        "Make sure that this read has been already processed by \
`barapost-prober.py` and `barapost-local.py`.")
                    platf_depend_exit(1)
                # end try

                # If read is found in TSV file:
                if not QL_filter(vals_to_filter):
                    # Place this sequence to QL trash file
                    to_write[read_name] = (fastqa_rec, QL_trash_fpath)
                    QL_seqs_fail += 1
                elif not align_filter(vals_to_filter):
                    # Place this sequence to align trash file
                    to_write[read_name] = (fastqa_rec, align_trash_fpath)
                    align_seqs_fail += 1
                else:
                    for hit_name in hit_names.split("&&"):
                        # Get name of result FASTQ file to write this read in
                        binned_file_path = os.path.join(
                            outdir_path, "{}.fast{}".format(
                                hit_name,
                                'q' if is_fastq(fq_fa_path) else 'a'))
                        # NOTE(review): for multi-hit reads ("&&"-joined) each
                        # iteration overwrites to_write[read_name], so only the
                        # last hit's file receives the record -- confirm intent.
                        to_write[read_name] = (fastqa_rec, binned_file_path)
                    # end for
                    seqs_pass += 1
                # end if
            # end for

            # Write batch of records to output files:
            with write_lock:
                for record, fpath in to_write.values():
                    write_fun(fpath, record)
                # end for
            # end with
            to_write.clear()
        # end while

        with write_lock:
            # Write the rest of 'uneven' data to output files:
            if len(to_write) != 0:
                for record, fpath in to_write.values():
                    write_fun(fpath, record)
                # end for
            # end if
            sys.stdout.write('\r')
            printlog_info_time("File `{}` is binned.".format(
                os.path.basename(fq_fa_path)))
            printn(" Working...")
        # end with
    # end for

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)
def lingering_https_get_request(server, url, request_for=None, acc=None):
    # Function performs a "lingering" HTTPS request.
    # It means that the function tries to get the response
    # again and again if the request fails.
    #
    # :param server: server address;
    # :type server: str;
    # :param url: the rest of url;
    # :type url: str;
    # :param request_for: some comment for error message;
    # :type request_for: str;
    # :param acc: GenBank accession;
    # :type acc: str;
    #
    # Returns obtained response coded in UTF-8 ('str').

    error = True

    # We can get spurious 404 or sth due to instability of NCBI servers work.
    # Let's give it 3 attempts (with 15 sec spans in between),
    # and if all of them are unsuccessful -- terminate execution.
    attempt_i = 0
    max_attempts = 3

    while error:
        conn = None  # so `finally` is safe even if HTTPSConnection() itself raises
        try:
            conn = http.client.HTTPSConnection(server, timeout=30)  # create connection
            conn.request("GET", url)  # ask if there are results
            response = conn.getresponse()  # get the response

            if response.code != 200:
                if attempt_i < max_attempts and "ncbi.nlm.nih.gov" in server:
                    printlog_error("Error {}: {}.".format(
                        response.code, response.reason))
                    printlog_error(
                        "It may be due to instable work of NCBI servers.")
                    printlog_error("{} attempts to connect left, waiting 15 sec..."\
                        .format(max_attempts - attempt_i))
                    attempt_i += 1
                    # Bug fix: previously this branch neither waited nor
                    # retried -- the try's `else` still ran, `error` became
                    # False, and the error page body was returned as result.
                    sleep(15)
                    continue
                else:
                    printlog_error("Cannot find {} for {}.".format(
                        request_for, acc))
                    printlog_error("Request failed with status code {}: {}"\
                        .format(response.code, response.reason))
                    platf_depend_exit(1)
                # end if
            # end if
            resp_content = str(response.read(), "utf-8")  # get response text
        except (OSError,\
                http.client.RemoteDisconnected,\
                socket.gaierror,\
                http.client.CannotSendRequest) as err:
            comment_str = ""
            if not request_for is None:
                comment_str += " requesting for {}".format(request_for)
                if not acc is None:
                    comment_str += " (accession: `{}`)".format(acc)
                # end if
                comment_str += '.'
            # end if
            print()
            printlog_info("Can't connect to `{}`{}".format(
                server + url, comment_str))
            printlog_info(str(err))
            printlog_info(
                """the program will sleep for 30 seconds and try to connect again."""
            )
            sleep(30)
        else:
            error = False  # if no exception occured, get out of the loop
        finally:
            if conn is not None:
                conn.close()
        # end try
    # end while
    return resp_content