def form_packet_totalbp(fastq_file, packet_size, fmt_func, max_seq_len):
    # Function reads lines from 'fastq_file' and composes a packet of 'packet_size' base pairs.
    #
    # :param fastq_file: file instance from which to read;
    # :type fastq_file: _io.TextIOWrapper or gzip.File;
    # :param packet_size: number of base pairs to retrieve from the file;
    # :type packet_size: int;
    # :param fmt_func: formatting function from the FORMATTING_FUNCS tuple;
    # :param max_seq_len: maximum length of a sequence processed;
    # :type max_seq_len: int (float("inf") if pruning is disabled);

    packet = ""
    qual_dict = dict()  # {<seq_id>: <read_quality>}
    eof = False

    totalbp = 0

    while totalbp < packet_size:

        try:
            read_id = fmt_func(fastq_file.readline())
        except UnicodeDecodeError as err:
            print()
            printlog_warning("Warning: current file is broken: {}."\
                .format(str(err)))
            printlog_warning("File: `{}`".format(os.path.abspath(fastq_file.name)))
            printlog_warning("Ceasing reading sequences from this file.")
            eof = True
            break
        # end try

        if read_id == "":  # if eof is reached, leave now
            eof = True
            break
        # end if

        read_id = fmt_read_id(read_id)
        seq = fmt_func(fastq_file.readline())
        fastq_file.readline()  # pass comment
        avg_qual = get_read_avg_qual(
            fmt_func(fastq_file.readline())
        )

        packet += read_id + '\n' + seq + '\n'
        qual_dict[read_id[1:]] = avg_qual

        totalbp += min(len(seq), max_seq_len)
    # end while

    if max_seq_len < float("inf"):  # prune sequences
        packet = prune_seqs(packet, max_seq_len)
    # end if

    return {"fasta": packet, "qual": qual_dict}, eof
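
# Usage sketch (illustrative only, not part of the original module): read a single
# ~100 kbp packet from a (possibly gzipped) FASTQ file with pruning disabled.
# The helper name `_example_read_one_packet` and the 100000 bp figure are assumptions.
def _example_read_one_packet(fastq_path):
    fastq_file = OPEN_FUNCS[is_gzipped(fastq_path)](fastq_path)
    fmt_func = FORMATTING_FUNCS[is_gzipped(fastq_path)]
    packet, eof = form_packet_totalbp(fastq_file, 100000, fmt_func, float("inf"))
    fastq_file.close()
    # packet["fasta"] holds fasta-formatted records, packet["qual"] maps read IDs
    # to average read quality; `eof` tells whether the file has been exhausted.
    return packet, eof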
def process_paral(fq_fa_list, packet_size, tax_annot_res_dir, blast_algorithm,
                  use_index, db_path, nfiles):
    # Function performs 'many_files'-parallel mode of barapost-local.py.
    #
    # :param fq_fa_list: list of paths to FASTA and FASTQ files meant to be processed;
    # :type fq_fa_list: list<str>;
    # :param packet_size: number of sequences processed by blast in a single launching;
    # :type packet_size: int;
    # :param tax_annot_res_dir: path to output directory that contains taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param blast_algorithm: blast algorithm to use;
    # :type blast_algorithm: str;
    # :param use_index: logic value indicating whether to use index;
    # :type use_index: bool;
    # :param db_path: path to database;
    # :type db_path: str;
    # :param nfiles: total number of files;
    # :type nfiles: int;

    queries_tmp_dir = os.path.join(tax_annot_res_dir, "queries-tmp")

    # Iterate over source FASTQ and FASTA files
    for fq_fa_path in fq_fa_list:

        # Create the result directory with the name of the FASTQ or FASTA file being processed:
        new_dpath = create_result_directory(fq_fa_path, tax_annot_res_dir)

        # "hname" means human readable name (i.e. without file path and extension)
        infile_hname = os.path.basename(fq_fa_path)
        infile_hname = re.search(r"(.+)\.(m)?f(ast)?(a|q)(\.gz)?$", infile_hname).group(1)

        # Look around and check if there are results of previous runs of this script.
        # If 'look_around' returns None -- there is no data from a previous run.
        previous_data = look_around(new_dpath, fq_fa_path)

        if previous_data is None:  # If there is no data from previous run
            num_done_seqs = 0  # number of successfully processed sequences
            tsv_res_path = os.path.join(new_dpath, "classification.tsv")  # form result tsv file path
        else:  # if there is data from previous run
            num_done_seqs = previous_data["n_done_reads"]  # get number of successfully processed sequences
            tsv_res_path = previous_data["tsv_respath"]  # result tsv file should be the same as during previous run
        # end if

        how_to_open = OPEN_FUNCS[is_gzipped(fq_fa_path)]
        fmt_func = FORMATTING_FUNCS[is_gzipped(fq_fa_path)]

        if is_fastq(fq_fa_path):
            packet_generator = fastq_packets
            num_seqs = sum(1 for line in how_to_open(fq_fa_path)) // 4  # 4 lines per record
        else:
            packet_generator = fasta_packets
            try:
                num_seqs = len(tuple(filter(
                    lambda l: l.startswith('>'),
                    map(fmt_func, how_to_open(fq_fa_path).readlines()))))
            except UnicodeDecodeError as err:
                with print_lock:
                    print()
                    printlog_warning("Warning: current file is broken: {}."\
                        .format(str(err)))
                    printlog_warning("File: `{}`".format(os.path.abspath(fq_fa_path)))
                    printlog_warning("This file will not be processed.")
                    continue
                # end with
            # end try
        # end if

        if num_seqs == num_done_seqs:
            with counter_lock:
                file_counter.value += 1
                i = file_counter.value  # save to local var and release lock
            # end with
            with print_lock:
                sys.stdout.write('\r')
                printlog_info_time("File #{}/{} (`{}`) has been already completely processed.".\
                    format(i, nfiles, fq_fa_path))
                printlog_info("Omitting it.")
                printn("Working...")
            # end with
            continue
        # end if

        for packet in packet_generator(fq_fa_path, packet_size, num_done_seqs):

            # Blast the packet
            align_xml_text = launch_blastn(packet["fasta"], blast_algorithm,
                                           use_index, queries_tmp_dir, db_path)

            # Configure result TSV lines
            result_tsv_lines = parse_align_results_xml(align_xml_text, packet["qual"])

            # Write the result to tsv
            write_classification(result_tsv_lines, tsv_res_path)
        # end for

        with counter_lock:
            file_counter.value += 1
            i = file_counter.value  # save to local var and release lock
        # end with

        with print_lock:
            sys.stdout.write('\r')
            printlog_info_time("File #{}/{} (`{}`) is processed.".\
                format(i, nfiles, os.path.basename(fq_fa_path)))
            printn("Working...")
        # end with
    # end for

    query_fpath = os.path.join(queries_tmp_dir,
        "query{}_tmp.fasta".format(os.getpid()))
    remove_tmp_files(query_fpath)
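
# Illustrative sketch (not part of the original module): 'many_files' mode hands each
# worker process a slice of the input file list. The round-robin helper below is
# hypothetical; the real dispatch (and the `print_lock`/`counter_lock`/`file_counter`
# globals used above) is set up elsewhere in barapost-local.py.
def _example_split_file_list(fq_fa_list, n_processes):
    # Distribute files round-robin so each call to process_paral gets a similar share.
    return [fq_fa_list[i::n_processes] for i in range(n_processes)]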
def fasta_packets(fasta, packet_size, num_done_seqs, packet_mode=0,
                  saved_packet_size=None, saved_packet_mode=None,
                  max_seq_len=float("inf"), probing_batch_size=float("inf")):
    # Generator yields fasta-formatted packets of records from a fasta file.
    # This function passes 'num_done_seqs' sequences (i.e. they will not be processed)
    #   using 'pass_processed_seqs'.
    #
    # :param fasta: path to fasta file;
    # :type fasta: str;
    # :param packet_size: number of sequences to align in one request ('blastn' launching);
    # :type packet_size: int;
    # :param num_done_seqs: number of sequences in the current file that have already been processed;
    # :type num_done_seqs: int;
    # :param packet_mode: packet mode (see -c option);
    # :type packet_mode: int;
    # :param saved_packet_size: size of the last sent packet from a tmp file. Necessary for resumption.
    #   It will be None if no tmp file was in the classification directory;
    # :type saved_packet_size: int;
    # :param saved_packet_mode: mode used while forming the last sent packet from a tmp file.
    #   Necessary for resumption. It will be None if no tmp file was in the classification directory;
    # :type saved_packet_mode: int;
    # :param max_seq_len: maximum length of a sequence processed;
    # :type max_seq_len: int (float("inf") if pruning is disabled);

    how_to_open = OPEN_FUNCS[is_gzipped(fasta)]
    fmt_func = FORMATTING_FUNCS[is_gzipped(fasta)]

    with how_to_open(fasta) as fasta_file:
        # Next line retrieving is implemented as simple line-from-file reading.
        get_next_line = lambda: fmt_func(fasta_file.readline())

        # Variable that contains the ID of the next sequence in the current FASTA file.
        # If no (or all) sequences in the current FASTA file have already been processed,
        #   this variable is None.
        # There is no way to count sequences in a multi-FASTA file except by counting sequence IDs.
        # Therefore 'next_id_line' should be saved in memory right after the moment a packet is formed.
        next_id_line = pass_processed_seqs(fasta_file, num_done_seqs, fmt_func)

        if next_id_line == "":
            yield {"fasta": "", "qual": dict()}
        # end if

        packet = ""

        # We are resuming; the nucleotide sequence will be saved in the 'line' variable here:
        try:
            line = get_next_line()
        except UnicodeDecodeError as err:
            print()
            printlog_warning("Warning: current file is broken: {}."\
                .format(str(err)))
            printlog_warning("File: `{}`".format(fasta))
            printlog_warning("Ceasing reading sequences from this file.")
            return
        # end try

        if line.startswith('>'):
            line = fmt_read_id(line)  # format sequence ID
        # end if

        # If some sequences have been passed, this if-statement will be executed.
        # A new packet should start with a sequence ID line.
        if not next_id_line is None:
            packet += next_id_line + '\n'
        # end if
        packet += line + '\n'  # add recently read line

        # Here goes the check for saved packet size and mode:
        if not saved_packet_size is None:
            wrk_pack_size = saved_packet_size
        else:
            wrk_pack_size = packet_size
        # end if

        if not saved_packet_mode is None:
            wrk_pack_mode = saved_packet_mode
        else:
            wrk_pack_mode = packet_mode
        # end if

        eof = False
        while not eof:  # till the end of file

            counter = 0  # variable for counting sequences within packet
            seqlen = 0

            while counter < wrk_pack_size:

                try:
                    line = get_next_line()
                except UnicodeDecodeError as err:
                    print()
                    printlog_warning("Warning: current file is broken: {}."\
                        .format(str(err)))
                    printlog_warning("File: `{}`".format(fasta))
                    printlog_warning("Ceasing reading sequences from this file.")
                    line = ""
                    break
                # end try

                if line.startswith('>'):
                    line = fmt_read_id(line)
                    if packet_mode == 0:
                        counter += 1
                    else:
                        counter += min(seqlen, max_seq_len)
                        seqlen = 0
                    # end if
                # end if

                if line == "":  # if end of file (data) is reached
                    break
                # end if

                if not line.startswith('>'):
                    seqlen += len(line.strip())
                # end if

                packet += line + '\n'  # add line to packet
            # end while

            if line != "":
                next_id_line = packet.splitlines()[-1]  # save sequence ID next packet will start with
                packet = '\n'.join(packet.splitlines()[:-1])  # exclude 'next_id_line' from packet
            else:
                eof = True
                next_id_line = None
            # end if

            # Get list of sequence IDs:
            names = filter(lambda l: l.startswith('>'), packet.splitlines())
            names = map(lambda l: l.replace('>', ''), names)

            # {<seq_id>: '-'}, since it is a fasta file
            qual_dict = {name: '-' for name in names}

            if max_seq_len < float("inf"):  # prune sequences
                packet = prune_seqs(packet, max_seq_len)
            # end if

            if packet != "":
                yield {"fasta": packet, "qual": qual_dict}

                if packet_mode == 0:
                    probing_batch_size -= wrk_pack_size
                    wrk_pack_size = min(packet_size, probing_batch_size)
                else:
                    probing_batch_size -= len(qual_dict)
                # end if

                # Switch back to the standard packet size.
                # As Vorotos said, repeated assignment is the best check:
                if wrk_pack_mode != packet_mode:
                    wrk_pack_mode = packet_mode
                # end if

                if not next_id_line is None:
                    packet = next_id_line + '\n'
                # end if
            else:
                return
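
# Usage sketch (illustrative only, not part of the original module): iterate over
# packets of 100 records from a FASTA file, with nothing processed on previous runs.
def _example_iter_fasta_packets(fasta_path):
    for packet in fasta_packets(fasta_path, packet_size=100, num_done_seqs=0):
        # packet["fasta"] is a fasta-formatted string; packet["qual"] maps each
        # sequence ID to '-', since plain fasta records carry no quality values.
        print("{} sequences in this packet".format(len(packet["qual"])))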
def configure_resfile_lines(tsv_res_fpath, sens, taxonomy_path):
    # Function returns a dictionary where keys are IDs of query sequences (i.e. sequences
    #   meant to be binned) and values are lists holding the formatted hit taxonomy name
    #   and alignment statistics.
    #
    # :param tsv_res_fpath: path to the current TSV file. Binning will be performed according to this TSV file;
    # :type tsv_res_fpath: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param taxonomy_path: path to taxonomy file;
    # :type taxonomy_path: str;

    resfile_lines = dict()

    tax_dict = src.taxonomy.get_tax_dict(taxonomy_path)

    with open(tsv_res_fpath, 'r') as brpst_resfile:

        brpst_resfile.readline()  # pass the head of the table
        line = brpst_resfile.readline().strip()  # get the first informative line

        while line != "":
            splt = line.split('\t')
            read_name = sys.intern(splt[0])
            hit_name = splt[1]
            hit_acc = splt[2]

            try:
                quality = float(splt[8])  # we will filter by quality
            except ValueError as verr:
                if splt[8] == '-':
                    # Keep the minus sign as quality if there is no quality information.
                    # No error will be raised.
                    quality = splt[8]
                else:
                    printlog_error_time("query quality parsing error")
                    printlog_error(str(verr))
                    printlog_error("Please, contact the developer.")
                    platf_depend_exit(1)
                # end if
            # end try

            try:
                query_len = int(splt[3])  # we will filter by length
            except ValueError as verr:
                printlog_error_time("query length parsing error")
                printlog_error(str(verr))
                printlog_error("Please, contact the developer.")
                platf_depend_exit(1)
            # end try

            try:
                pident = float(splt[5])  # we will filter by identity
            except ValueError as verr:
                if splt[5] == '-':
                    # Keep the minus sign as identity if there is no identity information.
                    # No error will be raised.
                    pident = splt[5]
                else:
                    printlog_error_time("Alignment percent of identity parsing error")
                    printlog_error(str(verr))
                    printlog_error("Please, contact the developer.")
                    platf_depend_exit(1)
                # end if
            # end try

            try:
                coverage = float(splt[4])  # we will filter by coverage
            except ValueError as verr:
                if splt[4] == '-':
                    # Keep the minus sign as coverage if there is no coverage information.
                    # No error will be raised.
                    coverage = splt[4]
                else:
                    printlog_error_time("alignment coverage parsing error")
                    printlog_error(str(verr))
                    printlog_error("Please, contact the developer.")
                    platf_depend_exit(1)
                # end if
            # end try

            try:
                resfile_lines[read_name] = [
                    format_taxonomy_name(hit_acc, hit_name, sens, tax_dict),
                    quality, query_len, pident, coverage
                ]
            except NoTaxonomyError:
                printlog_warning("Can't find taxonomy for reference sequence `{}`".format(hit_acc))
                printlog_warning("Trying to recover taxonomy.")

                # Recover
                src.taxonomy.recover_taxonomy(hit_acc, hit_name, taxonomy_path)
                printlog_info("Taxonomy for {} is recovered.".format(hit_acc))

                # Update tax_dict
                tax_dict = src.taxonomy.get_tax_dict(taxonomy_path)

                # Format again -- with the new tax_dict
                resfile_lines[read_name] = [
                    format_taxonomy_name(hit_acc, hit_name, sens, tax_dict),
                    quality, query_len, pident, coverage
                ]
            # end try

            line = brpst_resfile.readline().strip()  # get next line
        # end while
    # end with

    return resfile_lines
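
# Usage sketch (illustrative only; the file paths and the "genus" sensitivity value
# are assumptions): load the binning table produced by the classifier.
def _example_load_binning_table():
    resfile_lines = configure_resfile_lines("classification.tsv", "genus",
                                            "taxonomy/taxonomy.tsv")
    # Each value is [formatted taxonomy name, quality, query length, pident, coverage],
    # matching the TSV columns parsed above (indices 8, 3, 5 and 4 of each row).
    for read_name, (hit, quality, query_len, pident, coverage) in resfile_lines.items():
        print(read_name, hit, quality, query_len, pident, coverage)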
def process(fq_fa_list, n_thr, packet_size, tax_annot_res_dir,
            blast_algorithm, use_index, db_path):
    # Function performs "few_files"-parallel mode.
    #
    # :param fq_fa_list: list of paths to files meant to be processed;
    # :type fq_fa_list: list<str>;
    # :param n_thr: number of threads to launch;
    # :type n_thr: int;
    # :param packet_size: number of sequences processed by blast in a single launching;
    # :type packet_size: int;
    # :param tax_annot_res_dir: path to output directory that contains taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param blast_algorithm: blast algorithm to use;
    # :type blast_algorithm: str;
    # :param use_index: logic value indicating whether to use index;
    # :type use_index: bool;
    # :param db_path: path to database;
    # :type db_path: str;

    nfiles = len(fq_fa_list)

    for i, fq_fa_path in enumerate(fq_fa_list):

        # Create the result directory with the name of the FASTQ or FASTA file being processed:
        new_dpath = create_result_directory(fq_fa_path, tax_annot_res_dir)

        # "hname" means human readable name (i.e. without file path and extension)
        infile_hname = os.path.basename(fq_fa_path)
        infile_hname = re.search(r"(.+)\.(m)?f(ast)?(a|q)(\.gz)?$", infile_hname).group(1)

        # Look around and check if there are results of previous runs of this script.
        # If 'look_around' returns None -- there is no data from a previous run.
        previous_data = look_around(new_dpath, fq_fa_path)

        if previous_data is None:  # If there is no data from previous run
            num_done_seqs = 0  # number of successfully processed sequences
            tsv_res_path = os.path.join(new_dpath, "classification.tsv")  # form result tsv file path
        else:  # if there is data from previous run
            num_done_seqs = previous_data["n_done_reads"]  # get number of successfully processed sequences
            tsv_res_path = previous_data["tsv_respath"]  # result tsv file should be the same as during previous run
        # end if

        how_to_open = OPEN_FUNCS[is_gzipped(fq_fa_path)]
        fmt_func = FORMATTING_FUNCS[is_gzipped(fq_fa_path)]

        if is_fastq(fq_fa_path):
            packet_generator = fastq_packets
            num_seqs = sum(1 for line in how_to_open(fq_fa_path)) // 4  # 4 lines per record
        else:
            packet_generator = fasta_packets
            try:
                num_seqs = len(tuple(filter(
                    lambda l: l.startswith('>'),
                    map(fmt_func, how_to_open(fq_fa_path).readlines()))))
            except UnicodeDecodeError as err:
                print()
                printlog_warning("Warning: current file is broken: {}."\
                    .format(str(err)))
                printlog_warning("File: `{}`".format(os.path.abspath(fq_fa_path)))
                printlog_warning("This file will not be processed.")
                continue
            # end try
        # end if

        packet_size = min(packet_size, num_seqs // n_thr)

        if num_seqs == num_done_seqs:
            sys.stdout.write('\r')
            printlog_info_time("File #{}/{} (`{}`) has been already completely processed."\
                .format(i+1, nfiles, fq_fa_path))
            printlog_info("Omitting it.")
            printn("Working...")
            return
        # end if

        # Get the number of sequences to pass to each thread
        file_part_size = num_seqs // n_thr
        if num_seqs % n_thr != 0:
            file_part_size += 1
        # end if

        pool = mp.Pool(n_thr, initializer=init_proc_single_file_in_paral,
                       initargs=(mp.Lock(), mp.Lock(),))

        pool.starmap(process_part_of_file,
                     [(file_part, tsv_res_path, packet_size, tax_annot_res_dir,
                       blast_algorithm, use_index, db_path)
                      for file_part in packet_generator(fq_fa_path, file_part_size, num_done_seqs)])

        # Reaping zombies
        pool.close()
        pool.join()

        sys.stdout.write('\r')
        printlog_info_time("File #{}/{} (`{}`) is processed.".\
            format(i+1, nfiles, os.path.basename(fq_fa_path)))
        printn("Working...")
    # end for
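
# Worked sketch of the partition arithmetic above (illustrative, not part of the
# original module): with num_seqs = 10 and n_thr = 4, file_part_size becomes
# 10 // 4 + 1 = 3, so the packet generator yields parts of 3, 3, 3 and 1 sequences.
def _example_file_part_size(num_seqs, n_thr):
    file_part_size = num_seqs // n_thr
    if num_seqs % n_thr != 0:
        file_part_size += 1
    # end if
    return file_part_size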
        printlog_error_time(str(err))
        platf_depend_exit(1)
    # end try
# end if

taxonomy_path = os.path.join(taxonomy_dir, "taxonomy.tsv")

# Check if there is a legacy taxonomy file and, if so, reformat it to the new (TSV) format
legacy_taxonomy_handling.check_deprecated_taxonomy(tax_annot_res_dir)

from src.barapost_local_modules.build_local_db import build_local_db

# Indexed discontiguous searches are not supported:
# https://www.ncbi.nlm.nih.gov/books/NBK279668/#usermanual.Megablast_indexed_searches
if use_index == "true" and blast_algorithm == "dc-megablast":
    printlog_warning("Warning: BLAST index cannot be used if the alignment algorithm is DiscoMegablast.")
    printlog_warning("Index will be created anyway.")
# end if

# Build a database
db_path = build_local_db(tax_annot_res_dir, acc_fpath, your_own_fasta_lst,
                         accs_to_download, use_index)

if blast_algorithm == "dc-megablast":
    use_index = "false"
# end if

if use_index == "true" and len(glob(os.path.join(tax_annot_res_dir,
                                                 "local_database", "*idx"))) == 0:
    printlog_warning(
def _get_related_replicons(acc, acc_dict):
    # Function finds replicons (other chromosomes or plasmids, sometimes even proviruses),
    #   which are related to a GenBank record "discovered" by barapost-prober.py.
    #
    # :param acc: accession of a record "discovered" by barapost-prober.py;
    # :type acc: str;
    # :param acc_dict: dictionary {<ACCESSION>: <HIT_DEFINITION>};
    # :type acc_dict: dict<str: tuple<str>>;
    #
    # Returns a list of tuples of the following structure:
    #   (<ACCESSION>, <RECORD_DEFINITION>)

    # We will save all titles in order not to duplicate records in our database
    repl_list = [(acc, acc_dict[acc])]

    # The elink utility returns links in DB_1 that are connected to a given ID in DB_2
    eutils_server = "eutils.ncbi.nlm.nih.gov"
    elink = "elink.fcgi"

    # = Find BioSample ID =

    # Configure URL
    nuc2biosmp_url = "/entrez/eutils/{}?dbfrom=nuccore&db=biosample&id={}".format(elink, acc)

    # Get XML with our links
    text_link_to_bsmp = lingering_https_get_request(eutils_server, nuc2biosmp_url,
                                                    "BioSample page", acc)

    # Parse this XML
    root = ElementTree.fromstring(text_link_to_bsmp)
    linkset = next(iter(root.getchildren())).find("LinkSetDb")

    # XML should contain the element "LinkSetDb"
    if linkset is None:
        printlog_warning("Cannot check replicons for `{}`: there is no BioSample page for this record."
            .format(acc))
        return list()
    # end if

    # Here we have the BioSample ID
    biosmp_id = linkset.find("Link").find("Id").text

    # = Find the assembly associated with this BioSample ID =

    # We will pass this BioSample ID through nuccore in order not to
    #   allow requesting over 7k transcripts, like for this fungus:
    #   https://www.ncbi.nlm.nih.gov/biosample/SAMN07457167
    # After this, only scaffolds (nearly 130 sequences) will be downloaded.

    # Configure URL
    biosmp2ass_url = "/entrez/eutils/{}?dbfrom=biosample&db=assembly&id={}".format(elink, biosmp_id)

    # Get XML with our links
    text_link_to_ass = lingering_https_get_request(eutils_server, biosmp2ass_url,
        "Assembly link associated with BioSample ID {}".format(biosmp_id))

    # Parse this XML
    root = ElementTree.fromstring(text_link_to_ass)
    linkset = next(iter(root.getchildren())).find("LinkSetDb")

    # XML should contain the element "LinkSetDb"
    if linkset is None:
        printlog_warning("Cannot check replicons for `{}`: there is no assembly page for this record."
            .format(acc))
        return list()
    # end if

    # Here we have the Assembly ID
    ass_id = linkset.find("Link").find("Id").text

    # = Find GIs in nuccore associated with this Assembly ID =

    # Configure URL
    ass2nuc_url = "/entrez/eutils/{}?dbfrom=assembly&db=nuccore&id={}".format(elink, ass_id)

    # Get XML with our links
    text_link_to_nuc = lingering_https_get_request(eutils_server, ass2nuc_url,
        "Nucleotide links associated with assembly {}".format(ass_id))

    # Parse this XML
    root = ElementTree.fromstring(text_link_to_nuc)
    linkset = next(iter(root.getchildren())).find("LinkSetDb")

    # XML should contain the element "LinkSetDb"
    if linkset is None:
        printlog_error_time("Cannot check replicons for `{}`: failed to find nuccore records for assembly {}."
            .format(acc, ass_id))
        printlog_error("Please, contact the developer.")
        platf_depend_exit(1)
    # end if

    # We will entertain the user -- show this spinning thing (like conda does),
    #   indicating that the script is actually working.
    krutiolka = ('|', '/', '-', '\\')
    krut_i = 0
    sys.stdout.write("\r {}".format(krutiolka[3]))
    sys.stdout.flush()

    # Collect links
    for elem in linkset.iter():

        if elem.tag == "Id":  # element "Id" contains our GI
            # Get GI, title and accession:
            rel_gi = elem.text
            rel_def, rel_acc = _get_record_title(rel_gi)

            # Print this spinning thing
            sys.stdout.write("\r {}".format(krutiolka[krut_i]))
            sys.stdout.flush()
            krut_i = krut_i + 1 if krut_i != 3 else 0

            # If accession is new -- update the list
            if not rel_acc in map(lambda x: x[0], repl_list):
                # acc_dict[rel_acc] = rel_def # update acc_dict
                repl_list.append((rel_acc, rel_def))
            # end if
        # end if
    # end for

    return repl_list
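
# Usage sketch (illustrative only; the accession and hit definition are made up):
def _example_collect_replicons():
    acc_dict = {"NZ_CP000000.1": ("Example bacterium chromosome",)}
    for rel_acc, rel_def in _get_related_replicons("NZ_CP000000.1", acc_dict):
        # Each item is an (<ACCESSION>, <RECORD_DEFINITION>) tuple, including the
        # record that the search started from.
        print(rel_acc, rel_def)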
def _ling_https_getreq_handl_301(server, url, request_for=None, acc=None):
    # The name stands for "Lingering Https Get Request Handling 301".
    # Function performs a "lingering" HTTPS request.
    # It means that the function tries to get the response
    #   again and again if the request fails.
    # It handles 301-redirection in order to search for replicons related to "NC-records".
    #
    # :param server: server address;
    # :type server: str;
    # :param url: the rest of the url;
    # :type url: str;
    # :param request_for: some comment for the error message;
    # :type request_for: str;
    # :param acc: GenBank accession;
    # :type acc: str;
    #
    # Returns the obtained response decoded from UTF-8 ('str').

    error = True
    while error:
        try:
            conn = http.client.HTTPSConnection(server, timeout=10)  # create connection
            conn.request("GET", url)  # ask whether there are results
            response = conn.getresponse()  # get the response

            # Handle redirection
            if response.code == 301:
                # Link to the identical GenBank record is in the "Location" header:
                redirect_url = response.getheader("Location") \
                    + "?report=accnlist&log$=seqview&format=text"
            else:
                raise _DoesNotRedirectError("NCBI does not redirect, although it must!")
            # end if
        except (OSError, http.client.RemoteDisconnected,
                socket.gaierror, http.client.CannotSendRequest) as err:
            comment_str = ""
            if not request_for is None:
                comment_str += " requesting for {}".format(request_for)
                if not acc is None:
                    comment_str += " (accession: '{}')".format(acc)
                # end if
                comment_str += '.'
            # end if
            print()
            printlog_warning("Can't connect to `{}`{}".format(server + url, comment_str))
            printlog_warning(str(err))
            printlog_warning("The program will sleep for 30 seconds and try to connect again.")
            sleep(30)
        except _DoesNotRedirectError as err:
            printlog_error_time(str(err))
            printlog_error("Please, contact the developer.")
            platf_depend_exit(1)
        else:
            error = False  # if no exception occurred, get out of the loop
        finally:
            conn.close()
        # end try
    # end while

    # And here goes a simple "lingering_https_get_request",
    #   which will retrieve content from the redirected location
    return lingering_https_get_request(server, redirect_url, request_for, acc)
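
# Usage sketch (illustrative only; both the server and the /nuccore/<accession> path
# are assumptions made for this example -- the real caller supplies its own URL):
def _example_follow_nc_redirect(acc):
    server = "www.ncbi.nlm.nih.gov"
    url = "/nuccore/{}".format(acc)
    # Follow the 301 redirect for an "NC-record" and return the redirected page text.
    return _ling_https_getreq_handl_301(server, url,
                                        request_for="related replicons", acc=acc)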