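# -*- coding: utf-8 -*-
# The functions below are collected from several barapost modules. A hedged
# sketch of the standard-library imports their bodies rely on (inferred from
# the calls in the code, not copied from the original module headers).
# Project-specific helpers (printlog_info, printn, getwt, platf_depend_exit,
# OPEN_FUNCS, taxonomy, etc.) come from barapost's own modules and are not
# listed here.

import os
import re
import sys
import glob  # some modules instead use `from glob import glob` and call `glob(...)` directly
import gzip
import shutil
import socket
import logging
import http.client
import urllib.request
import subprocess as sp
from subprocess import Popen as sp_Popen
import multiprocessing as mp
from time import sleep
from threading import Thread, Event
from xml.etree import ElementTree
from gzip import open as open_as_gzip
from shutil import copyfileobj as shutil_copyfileobj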
def ask_for_resumption():
    # Function asks a user if they want to resume the previous run.
    # Returns True if the decision is to resume, else False.

    resume = None

    while resume is None:
        resume = input("""
Would you like to resume the previous run?
   1 -- Resume!
   2 -- Start from the beginning.

Enter a number (1 or 2):>> """)
        # Check if the entered value is an integer. If not, give another attempt.
        try:
            resume = int(resume)
            # Check if the input number is 1 or 2
            if resume != 1 and resume != 2:
                print("\n   Not a VALID number entered!\a\n" + '~' * 20)
                resume = None
            else:
                action = "resume the previous run" if resume == 1 else "start from the beginning"
                printlog_info("You have chosen to {}.".format(action))
                print()
            # end if
        except ValueError:
            print("\nNot an integer number entered!\a\n" + '~' * 20)
            resume = None
        # end try
    # end while

    return resume == 1
def gzip_outfiles(outdir):
    # Function gzips all fastq files in directory `outdir`.
    #
    # :param outdir: path to outdir;
    # :type outdir: str;

    # Get gzipping function
    gzip_func = _get_gzip_func()

    print()
    printlog_info_time('Gzipping output files...')

    # Get fastq files
    is_fastq = lambda x: not re.match(r'.+\.f(ast)?q$', x) is None
    fq_fpaths = filter(is_fastq, glob.iglob(os.path.join(outdir, '*')))

    # Gzip them!
    for fpath in fq_fpaths:
        try:
            gzip_func(fpath)
        except OSError as err:
            printlog_info('Error: cannot gzip file `{}`: {}.'.format(fpath, err))
            platf_depend_exit(1)
        # end try
    # end for

    printlog_info_time('Output files are gzipped.')
def gzip_with_shutil(fpath):
    # Function for gzipping using Python functionality.

    printlog_info('Gzipping `{}`'.format(fpath))

    with open(fpath, 'rb') as plain_file, gzip.open(fpath + '.gz', 'wb') as gz_file:
        shutil.copyfileobj(plain_file, gz_file)
    # end with

    os.unlink(fpath)
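# Design note: `shutil.copyfileobj` streams the data in chunks, so even a
# large FASTQ file is compressed without being read into memory at once.
# A minimal usage sketch (the path is hypothetical):
#
#   gzip_with_shutil('outdir/merged_reads.fastq')
#   # creates `outdir/merged_reads.fastq.gz` and removes the source file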
def rename_file_verbosely(file):
    # Function verbosely renames the file (or directory) given to it.
    # :param file: path to file (directory) meant to be renamed;
    # :type file: str;

    if not os.path.exists(file):
        return None
    # end if

    # Path to `file`'s parent directory
    pardir = os.path.abspath(os.path.dirname(file))

    # Function can rename directories
    if os.path.isdir(file):
        is_analog = lambda f: not re.search(r"{}.*(_old_[0-9]+)?$"\
            .format(os.path.basename(file)), f) is None
        word = "directory"
        name_itself = file
        ext = ""
    else:
        is_analog = lambda f: re.search(r"(.*)\..*$", os.path.basename(file)).group(1) in f
        word = "file"
        name_itself = re.search(r"(.*)\..*$", file).group(1)
        ext = re.search(r".*(\..*)$", file).group(1)
    # end if

    # Count files in `pardir` whose names are analogous to `file`'s name:
    num_analog_files = len(list(filter(is_analog, os.listdir(pardir))))

    if re.search(r"_old_[0-9]+", file) is None:
        # Append "_old_<number>"
        new_name = name_itself + "_old_" + str(num_analog_files) + ext
    else:
        # Merely substitute the new number
        new_name = file.replace(
            re.search(r"_old_([0-9]+)", file).group(1),
            str(num_analog_files + 1))
    # end if

    try:
        print()
        printlog_info(" - Renaming old {}:".format(word))
        printlog_info("  `{}` --> `{}`".format(file, new_name))
        os.rename(file, new_name)
    except OSError as err:
        printlog_error_time("Error: {} `{}` cannot be renamed:".format(word, str(file)))
        printlog_error(str(err))
        platf_depend_exit(1)
    # end try

    return new_name
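# A minimal usage sketch (the path is hypothetical). Assuming it is the only
# file in its directory whose name contains "classification", the call below
# renames it to `outdir/classification_old_1.tsv` and returns the new path:
#
#   new_path = rename_file_verbosely('outdir/classification.tsv')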
def check_deprecated_taxonomy(classif_dir):

    legacy_tax_path = os.path.join(classif_dir, "taxonomy", "taxonomy")

    if os.path.exists(legacy_tax_path):
        print()
        printlog_info("Legacy taxonomy file detected: `{}`.".format(legacy_tax_path))
        printlog_info("It will be reformatted to the new format -- plain TSV.")
        _reformat_legacy_file(legacy_tax_path)
    # end if
def search_for_related_replicons(acc_dict):
    # Function searches for replicons related to those in 'hits_to_download.tsv'
    #   or specified with the '-s' option.
    # :param acc_dict: dictionary containing accession data of hits;
    # :type acc_dict: dict<str: tuple<str, str, int>>;

    print()
    printlog_info_time("Searching for related replicons...")

    start_accs = tuple(acc_dict.keys()) # accessions, which were "discovered" by prober

    for i, acc in enumerate(start_accs):
        printlog_info("{}. {} ({}):".format(i + 1, acc, acc_dict[acc]))

        # Search for related replicons:
        try:
            related_repls = _get_related_replicons(acc, acc_dict)
        except AttributeError:
            printlog_error_time("Parsing error: cannot find replicons related to {}.".format(acc))
            printlog_error("Please, contact the developer.")
            platf_depend_exit(1)
        else:
            related_repls = _deduplicate_replicons(related_repls, acc)
        # end try

        for rel_acc, rel_def in related_repls:
            acc_dict[rel_acc] = rel_def
        # end for
    # end for

    print()
    if len(start_accs) != len(acc_dict): # there are some new replicons
        printlog_info_time("{} related replicons have been found.".\
            format(len(acc_dict) - len(start_accs)))
    else:
        printlog_info_time("No related replicons found.")
    # end if
    print()
# end def search_for_related_replicons
def verify_taxids(taxid_list):
    # Function verifies TaxIDs passed to prober with the `-g` option.
    # It requests NCBI Taxonomy Browser and parses the organism's name from the HTML response.
    # Moreover, this function configures the `organisms` list, which will be included in BLAST submissions.
    #
    # :param taxid_list: list of TaxIDs. TaxIDs are strings, but they are verified to be integers
    #   during CL argument parsing;
    # :type taxid_list: list<str>;
    #
    # Returns a list of strings of the following format: "<tax_name> (taxid:<TaxID>)"

    organisms = list()

    if len(taxid_list) > 0:
        printlog_info("Verifying TaxIDs:")
        for taxid in taxid_list:
            printn("   {} - ".format(taxid))
            try:
                tax_resp = lingering_https_get_request(
                    "www.ncbi.nlm.nih.gov",
                    "/Taxonomy/Browser/wwwtax.cgi?mode=Info&id={}".format(taxid),
                    "taxonomy")
                tax_name = re.search(r"Taxonomy browser \((.+?)\)", tax_resp).group(1)
            except AttributeError:
                printlog_error("\aError: TaxID not found")
                printlog_error("Please, check your TaxID: https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi")
                platf_depend_exit(1)
            except OSError as oserr:
                printlog_error("Something is wrong with the connection:")
                printlog_error(str(oserr))
                platf_depend_exit(-2)
            else:
                print(tax_name)
                log_info("{} - {}".format(taxid, tax_name))
                organisms.append("{} (taxid:{})".format(tax_name, taxid))
            # end try
        # end for
        print('-' * 30 + '\n')
    # end if

    return organisms
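# A minimal usage sketch (TaxIDs 2 and 561 are real NCBI Taxonomy IDs for
# Bacteria and the genus Escherichia, respectively):
#
#   organisms = verify_taxids(["2", "561"])
#   # -> ["Bacteria (taxid:2)", "Escherichia (taxid:561)"]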
def whether_to_build_index(index_dirpath):
    # Function checks if there are any files in the index directory.
    # If there are, it asks a user whether to create a new index or to use the old one.
    # :param index_dirpath: path to index directory;
    # :type index_dirpath: str;

    use_old_index = False

    if len(os.listdir(index_dirpath)) != 0:
        printlog_info("Index file created by `-u` option already exists (left from previous run).")

        error = True
        while error:
            reply = input("""  Press ENTER to make a new index file
  or enter 'u' to use the old index file:>>""")
            if reply == "":
                try:
                    for path in glob(os.path.join(index_dirpath, '*')):
                        os.unlink(path)
                    # end for
                except OSError as oserr:
                    printlog_error_time("Error: cannot remove old index files!")
                    printlog_error(str(oserr))
                    platf_depend_exit(1)
                # end try
                error = False
            elif reply == 'u':
                use_old_index = True
                error = False
            else:
                print("Invalid reply!\n")
            # end if
        # end while
        printlog_info("You have chosen to {} index file.".format(
            "use old" if use_old_index else "make new"))
        print()
    # end if

    return use_old_index
def verify_cl_accessions(accs_to_download, acc_dict):
    # Function checks the existence of GenBank records that correspond to accessions
    #   specified with the '-s' option. After checking, the function fills 'acc_dict'.
    # :param accs_to_download: list of accessions from command line ('-s');
    # :type accs_to_download: list<str>;
    # :param acc_dict: dictionary {<ACCESSION>: <HIT_DEFINITION>};
    # :type acc_dict: dict<str: tuple<str>>;

    check_connection("https://www.ncbi.nlm.nih.gov/")

    printlog_info_time("Verifying `-s` accessions...")
    sys.stdout.write("0/{}".format(len(accs_to_download)))

    for i, acc in enumerate(accs_to_download):
        server = "eutils.ncbi.nlm.nih.gov"
        url = "/entrez/eutils/esummary.fcgi?db=nuccore&id={}".format(acc)
        text = lingering_https_get_request(server, url, "record's name", acc)

        name = re.search(r"\<Item Name=\"Title\" Type=\"String\"\>(.+)\</Item\>", text)

        if name is None:
            printlog_info("Cannot find GenBank record with accession '{}'".format(acc))
            platf_depend_exit(1)
        else:
            name = name.group(1)
        # end if
        acc_dict[acc] = name
        sys.stdout.write("\r{}/{}".format(i + 1, len(accs_to_download)))
    # end for

    print()
    printlog_info_time("OK.")
def gzip_with_gnu_gzip(fpath):
    # Function for gzipping with GNU gzip.
    printlog_info('Gzipping `{}`'.format(fpath))
    os.system('{} {}'.format(gzip_util, fpath))
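# Design note: `os.system` passes the path through the shell, so a file name
# containing spaces or shell metacharacters can break the command. A hedged
# alternative sketch using `subprocess` (it assumes `gzip_util` holds the
# name or path of the gzip executable, as above):
#
#   import subprocess
#   subprocess.run([gzip_util, fpath], check=True)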
def wait_for_align(rid, rtoe, pack_to_send, filename):
    # Function waits until the BLAST server accomplishes the request.
    #
    # :param rid: Request ID to wait for;
    # :type rid: str;
    # :param rtoe: time in seconds estimated by the BLAST server needed to accomplish the request;
    # :type rtoe: int;
    # :param pack_to_send: current packet (id) number to send (the first element is the ordinal number);
    # :type pack_to_send: list<int>;
    # :param filename: basename of the current FASTA file;
    # :type filename: str;
    #
    # Returns XML response ('str').

    print()
    print("Requesting the current query status. Request ID: {}".format(rid))
    print(" `{}`; Submission #{}".format(filename, pack_to_send[0]))
    log_info("Requesting the current query status.")
    log_info("Request ID: {}; `{}`; Submission #{}".format(rid, filename, pack_to_send[0]))

    # RTOE can be zero at the very beginning of resumption
    if rtoe > 0:
        printlog_info_time("BLAST server estimates that alignment will be accomplished in {} seconds".format(rtoe))
        printlog_info_time("Waiting for {}+3 (+3 extra) seconds...".format(rtoe))
        # The server might be wrong -- we will give it 3 extra seconds
        sleep(rtoe + 3)
        printlog_info_time("{} seconds have passed. Checking if alignment is accomplished...".format(rtoe + 3))
    # end if

    server = "blast.ncbi.nlm.nih.gov"
    wait_url = "/blast/Blast.cgi?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=" + rid

    whtspc_len = 6 + len("(requesting)")

    while True:
        resp_content = lingering_https_get_request(server, wait_url, "BLAST response")

        # if the server asks to wait
        if "Status=WAITING" in resp_content:
            printn("\r{} - The request is being processed. Waiting{}{}".format(getwt(),
                ' ' * whtspc_len, "\033[%dD" % whtspc_len))
            # indicate each 10 seconds with a dot
            for i in range(1, 7):
                sleep(10)
                printn("\r{} - The request is being processed. Waiting{}".format(getwt(), '.' * i))
            # end for
            printn("(requesting)")
            continue
        elif "Status=FAILED" in resp_content:
            # if the job failed
            print()
            printlog_info_time("Job failed\a\n")
            printlog_info("Resending this packet.")
            return None, BlastError(2)
        elif "Status=UNKNOWN" in resp_content:
            # if the job expired
            print()
            printlog_info_time("Job expired\a\n")
            printlog_info("Resending this packet.")
            return None, BlastError(1)
        # if results are ready
        elif "Status=READY" in resp_content:
            print()
            printlog_info("Result for query `{}` #{} is ready!".format(filename, pack_to_send[0]))
            # if there are hits
            if "ThereAreHits=yes" in resp_content:
                for i in range(15, 0, -5):
                    print('-' * i)
                # end for
                print("-\nRetrieving results...")

                # Retrieve human-readable text and put it into the result directory
                retrieve_text_url = "/Blast.cgi?CMD=Get&FORMAT_TYPE=Text&DESCRIPTIONS=1&ALIGNMENTS=1&RID=" + rid
                txt_align_res = lingering_https_get_request(server, retrieve_text_url,
                    "text version of BLAST response")

                # Count already existing plain text files in outdir:
                is_txt_response = lambda f: not re.search(r"prober_blast_response_[0-9]+\.txt", f) is None
                outdir_path = os.path.dirname(logging.getLoggerClass().root.handlers[0].baseFilename) # tricky trick
                response_num = len(tuple(filter(is_txt_response, os.listdir(outdir_path))))

                # The current txt response file will have number `response_num + 1`
                txt_hpath = os.path.join(outdir_path,
                    "prober_blast_response_{}.txt".format(response_num + 1))
                # Write the text result for a human to read
                with open(txt_hpath, 'w') as txt_file:
                    txt_file.write(txt_align_res)
                # end with
            elif "ThereAreHits=no" in resp_content:
                # if there are no hits
                printlog_info_time("There are no hits. It happens.\n")
            else:
                # probably, the job has failed if execution reaches here
                print()
                printlog_info_time("Job failed\a\n")
                printlog_info("Resending this packet.")
                return None, BlastError(2)
            # end if
            break
        # end if

        # Execution should not reach here
        printlog_error_time("Fatal error (-122). Please contact the developer.\a\n")
        platf_depend_exit(-122)
    # end while

    # Retrieve the XML result
    retrieve_xml_url = "/Blast.cgi?CMD=Get&FORMAT_TYPE=XML&ALIGNMENTS=1&RID=" + rid
    xml_text = lingering_https_get_request(server, retrieve_xml_url, "XML BLAST response")

    if "Bad Gateway" in xml_text:
        print()
        printlog_info_time("Bad Gateway. Data from the last packet has been lost.")
        printlog_info("Resending this packet.")
        return None, BlastError(1)
    elif "Status=FAILED" in xml_text:
        print()
        printlog_info_time("BLAST error: request failed")
        printlog_info("Resending this packet.")
        return None, BlastError(2)
    elif "to start it again" in xml_text:
        print()
        printlog_info_time("BLAST error")
        printlog_info("Resending this packet.")
        return None, BlastError(2)
    elif "[blastsrv4.REAL]" in xml_text:
        blastsrv4_match = re.search(r"(\[blastsrv4\.REAL\].*\))", xml_text)
        blastsrv4_str = "" if blastsrv4_match is None else ": {}".format(blastsrv4_match.group(1))
        printlog_info_time("BLAST server error{}".format(blastsrv4_str))
        # Error code 2 indicates that we need to split the packet and resubmit
        return None, BlastError(2)
    # end if

    return xml_text, BlastError(0)
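# For reference, the BlastError codes used above (as inferred from this
# module's comments and from `submit` further below): 0 - success;
# 1 - transient failure, resend the same packet as is; 2 - the server
# rejected or failed the job, split the packet (or prune its only
# sequence) and resubmit.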
def ngmerge_runner(args):
    # Runner function for the NGmerge task.
    #
    # :param args: arguments for NGmerge task;
    # :type args: NGmergeArguments;
    #
    # Returns two collections:
    #   1. A collection of valid ("merged") paths.
    #   2. A collection of trash ("unmerged") paths.

    print()
    printlog_info_time('Running NGmerge...')

    # NGmerge puts result files into the working directory --
    #   we will temporarily go to the output directory
    old_dir = os.getcwd()
    os.chdir(args.outdir)

    # Configure output files' names
    merged_basename, unmerged_prefix = ofn.get_ngmerge_outprefixes(args.infpaths[0])

    # Configure the command
    ngmerge_cmd = '{} -1 {} -2 {} -o {} -f {} -n {} -v -m {} -p {} -q {}'\
        .format(args.ngmerge, args.infpaths[0], args.infpaths[1],
            merged_basename, unmerged_prefix, args.n_thr, args.min_overlap,
            args.mismatch_frac, args.phred_offset)
    printlog_info('Command: `{}`'.format(ngmerge_cmd))

    # Run NGmerge
    print('NGmerge is doing its job silently...')
    pipe = sp.Popen(ngmerge_cmd, shell=True, stderr=sp.PIPE)
    stderr = pipe.communicate()[1].decode('utf-8') # run NGmerge

    if pipe.returncode != 0:
        # error
        printlog_error('Error running NGmerge: {}'.format(stderr))
        platf_depend_exit(pipe.returncode)
    # end if

    # Parse merging statistics from NGmerge's stderr
    stderr = stderr.splitlines()[1:]
    reads_pattern = r'Fragments \(pairs of reads\) analyzed: ([0-9]+)'
    merged_pattern = r'Successfully stitched: ([0-9]+)'

    # Collect statistics
    try:
        reads_processed = int(re.search(reads_pattern, stderr[0]).group(1))
        merged_reads = int(re.search(merged_pattern, stderr[1]).group(1))
    except (ValueError, AttributeError) as err:
        printlog_error('Error 78 ({}). Please, contact the developer.'.format(err))
        platf_depend_exit(78)
    # end try

    os.chdir(old_dir) # return to the old directory

    printlog_info_time('NGmerge merged {}/{} ({}%) read pairs.'\
        .format(merged_reads, reads_processed,
            round(merged_reads / reads_processed * 100, 2)))

    # Configure absolute paths to output files.
    merged_fpath = os.path.join(args.outdir, merged_basename)
    unmerged_fpaths = sorted(glob.glob(
        os.path.join(args.outdir, '{}*.fastq'.format(unmerged_prefix))))

    # Oh yeah, the first returned value must be a collection.
    return [merged_fpath], unmerged_fpaths
def send_request(request, pack_to_send, packet_size, packet_mode, filename, tmp_fpath):
    # Function sends a request to "blast.ncbi.nlm.nih.gov/blast/Blast.cgi"
    #   and then waits for satisfaction of the request and retrieves the response text.
    #
    # :param request: request data (the dict that the `configure_request()` function returns);
    # :type request: dict<dict>;
    # :param pack_to_send: ordinal number (like an id) of the packet meant to be sent now;
    # :type pack_to_send: list<int>;
    # :param packet_size: number of sequences in the packet;
    # :type packet_size: int;
    # :param packet_mode: packet forming mode;
    # :type packet_mode: int;
    # :param filename: basename of the current FASTA file;
    # :type filename: str;
    # :param tmp_fpath: path to the temporary file;
    # :type tmp_fpath: str;
    #
    # Returns XML text of type 'str' with the BLAST response.

    payload = request["payload"]
    headers = request["headers"]

    server = "blast.ncbi.nlm.nih.gov"
    url = "/blast/Blast.cgi"
    error = True

    while error:
        try:
            conn = http.client.HTTPSConnection(server) # create a connection
            conn.request("POST", url, payload, headers) # send the request
            response = conn.getresponse() # get the response
            response_text = str(response.read(), "utf-8") # get response text
        except OSError as oserr:
            printlog_info_time("`https://blast.ncbi.nlm.nih.gov` is not available.")
            printlog_info(str(oserr))
            printlog_info("barapost will try to connect again in 30 seconds...\n")
            sleep(30)
        # if no exception occurred
        else:
            error = False
        # end try
    # end while

    try:
        rid = re.search(r"RID = (.+)", response_text).group(1) # get Request ID
        rtoe = int(re.search(r"RTOE = ([0-9]+)", response_text).group(1)) # get time to wait provided by the NCBI server
    except AttributeError:
        printlog_error_time("Seems, NCBI has denied your request.")
        printlog_error("Response is in file `request_denial_response.html`")
        with open("request_denial_response.html", 'w') as den_file:
            den_file.write(response_text)
        # end with
        platf_depend_exit(1)
    finally:
        conn.close()
    # end try

    # Save temporary data
    with open(tmp_fpath, 'w') as tmpfile:
        tmpfile.write("Request_ID: {}\n".format(rid))
        tmpfile.write("Packet_size: {}\n".format(packet_size))
        tmpfile.write("Packet_mode: {}".format(packet_mode))
    # end with

    # Wait for results of alignment
    return wait_for_align(rid, rtoe, pack_to_send, filename)
def lingering_https_get_request(server, url, request_for=None, acc=None):
    # Function performs a "lingering" HTTPS request.
    # It means that the function tries to get the response
    #   again and again if the request fails.
    #
    # :param server: server address;
    # :type server: str;
    # :param url: the rest of the URL;
    # :type url: str;
    # :param request_for: some comment for the error message;
    # :type request_for: str;
    # :param acc: GenBank accession;
    # :type acc: str;
    #
    # Returns the obtained response decoded as UTF-8 ('str').

    error = True

    # We can get a spurious 404 or the like due to the instability of NCBI servers.
    # Let's give it 3 attempts (with 15-second spans in between),
    #   and if all of them are unsuccessful -- terminate execution.
    attempt_i = 0
    max_attempts = 3

    while error:
        try:
            conn = http.client.HTTPSConnection(server, timeout=30) # create connection
            conn.request("GET", url) # send the GET request
            response = conn.getresponse() # get the response

            if response.code != 200:
                if attempt_i < max_attempts and "ncbi.nlm.nih.gov" in server:
                    printlog_error("Error {}: {}.".format(response.code, response.reason))
                    printlog_error("It may be due to the unstable work of NCBI servers.")
                    printlog_error("{} attempts to connect left, waiting 15 sec..."\
                        .format(max_attempts - attempt_i))
                    attempt_i += 1
                    sleep(15)
                    continue # retry the request
                else:
                    printlog_error("Cannot find {} for {}.".format(request_for, acc))
                    printlog_error("Request failed with status code {}: {}"\
                        .format(response.code, response.reason))
                    platf_depend_exit(1)
                # end if
            # end if

            resp_content = str(response.read(), "utf-8") # get response text
        except (OSError,
                http.client.RemoteDisconnected,
                socket.gaierror,
                http.client.CannotSendRequest) as err:
            comment_str = ""
            if not request_for is None:
                comment_str += " requesting for {}".format(request_for)
                if not acc is None:
                    comment_str += " (accession: `{}`)".format(acc)
                # end if
                comment_str += '.'
            # end if
            print()
            printlog_info("Can't connect to `{}`{}".format(server + url, comment_str))
            printlog_info(str(err))
            printlog_info("The program will sleep for 30 seconds and try to connect again.")
            sleep(30)
        else:
            error = False # if no exception occurred, get out of the loop
        finally:
            conn.close()
        # end try
    # end while

    return resp_content
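# A minimal usage sketch (NC_000913.3 is a real E. coli K-12 accession;
# `esummary.fcgi` is a real NCBI E-utilities endpoint):
#
#   summary_xml = lingering_https_get_request(
#       "eutils.ncbi.nlm.nih.gov",
#       "/entrez/eutils/esummary.fcgi?db=nuccore&id=NC_000913.3",
#       request_for="e-summary of nuccore record",
#       acc="NC_000913.3")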
def look_around(outdir_path, new_dpath, infile_path, blast_algorithm, acc_dict, probing_batch_size):
    # Function looks around in order to check if there are results from previous run(s) of this script,
    #   in order to resume the previous run.
    #
    # Returns None if there is no result from a previous run.
    # If there are results from a previous run, returns a dict of the following structure:
    # {
    #     "RID": saved_RID <str>,
    #     "packet_size_save": saved packet size <int>,
    #     "packet_mode_save": saved packet mode <int>,
    #     "tsv_respath": path_to_tsv_file_from_previous_run <str>,
    #     "n_done_reads": number_of_successful_requests_from_current_FASTA_file <int>,
    #     "tmp_fpath": path_to_temporary_file <str>,
    #     "decr_pb": value decreasing the size of the probing batch (see below, where this variable is defined) <int>
    # }
    #
    # :param outdir_path: path to output directory;
    # :type outdir_path: str;
    # :param new_dpath: path to current (corresponding to fq_fa_path file) result directory;
    # :type new_dpath: str;
    # :param infile_path: path to current (corresponding to fq_fa_path file) FASTA file;
    # :type infile_path: str;
    # :param blast_algorithm: BLASTn algorithm to use.
    #   This parameter is necessary because it is included in the names of result files;
    # :type blast_algorithm: str;
    # :param acc_dict: dictionary of accession info of hits;
    # :type acc_dict: dict<str: tuple<str, str, int>>;
    # :param probing_batch_size: number of sequences meant to be processed in a single run;
    # :type probing_batch_size: int;

    # "hname" means human-readable name (i.e. without file path and extension)
    fasta_hname = os.path.basename(infile_path) # get rid of absolute path
    fasta_hname = re.search(r"(.*)\.(m)?f(ast)?a", fasta_hname).group(1) # get rid of the `.fasta` extension

    # Form path to the temporary file
    tmp_fpath = "{}_{}_temp.txt".format(os.path.join(new_dpath, fasta_hname), blast_algorithm)
    # Form path to the result file
    tsv_res_fpath = os.path.join(new_dpath, "classification.tsv")
    # Form path to the file with hits to download
    acc_fpath = os.path.join(outdir_path, "hits_to_download.tsv")

    num_done_seqs = 0 # variable to keep the number of successfully processed sequences

    resume = None
    # Check if there are results from a previous run.
    if os.path.exists(tsv_res_fpath) or os.path.exists(tmp_fpath):
        print()
        printlog_info("A result file from a previous run is found in the directory:")
        printlog_info("   `{}`".format(new_dpath))
        # Politely ask whether to continue from the last successfully sent packet.
        resume = ask_for_resumption()
    # end if

    if not resume:
        rename_file_verbosely(tsv_res_fpath)
        rename_file_verbosely(tmp_fpath)
        rename_file_verbosely(acc_fpath)
    else:
        printlog_info("Let's try to resume...")

        # Collect information from the result file
        if os.path.exists(tsv_res_fpath):
            # There can be invalid information in this file
            try:
                with open(tsv_res_fpath, 'r') as res_file:
                    lines = res_file.readlines()
                    num_done_seqs = len(lines) - 1 # the first line is a header
                    last_line = lines[-1]
                    last_seq_id = last_line.split('\t')[0]
                # end with
                # There must be 10 columns in each row:
                if any(map(lambda l: l.count('\t') != 9, lines)):
                    raise ValueError("There must be 10 columns separated by tabs in file `classification.tsv`")
                # end if
            except Exception as err:
                printlog_error_time("\nData in classification file `{}` not found or broken. Reason:"\
                    .format(tsv_res_fpath))
                printlog_error(' ' + str(err))

                # If the reason is known -- print the erroneous lines
                if isinstance(err, ValueError):
                    printlog_error("Here are the numbers of improper lines:")
                    for i, line in enumerate(lines):
                        if line.count('\t') != 9:
                            printlog_error(str(i + 1) + ": `{}`".format(line))
                        # end if
                    # end for
                # end if

                # Ask a user if they want to start from the beginning or to quit
                error = True
                while error:
                    reply = input("""Press ENTER to start from the beginning
  or enter `q` to quit:>> """)
                    if reply == "":
                        error = False
                        printlog_info("You have chosen to start from the beginning.\n")
                        rename_file_verbosely(tsv_res_fpath)
                        rename_file_verbosely(tmp_fpath)
                        rename_file_verbosely(acc_fpath)
                        return None
                    elif reply == 'q':
                        platf_depend_exit(0)
                    else:
                        print("! - Invalid reply: `{}`\n".format(reply))
                    # end if
                # end while
            else:
                printlog_info("Last classified sequence: " + last_seq_id)
                printlog_info("{} sequences have already been processed".format(num_done_seqs))
            # end try
        # end if

        # Collect information from the accession file
        if os.path.exists(acc_fpath):
            # There can be invalid information in this file
            try:
                with open(acc_fpath, 'r') as acc_file:
                    lines = acc_file.readlines()[9:] # omit the description and the table header
                    local_files_filtered = list(filter(lambda x: not os.path.exists(x), lines)) # omit file paths
                    for line in local_files_filtered:
                        vals = line.split('\t')
                        acc = sys.intern(vals[0].strip())
                        if len(vals) == 1:
                            acc_dict[acc] = ["No definition of the sequence provided", 1]
                        elif len(vals) == 2:
                            acc_dict[acc] = [vals[1].strip(), 1]
                        else:
                            acc_dict[acc] = [vals[1].strip(), int(vals[2].strip())]
                        # end if
                    # end for
                # end with
            except Exception as err:
                printlog_error_time("Data in accession file `{}` not found or broken. Reason:"\
                    .format(acc_fpath))
                printlog_error(' ' + str(err))
                printlog_error("Invalid line: `{}`".format(line))

                # Ask a user if they want to start from the beginning or to quit
                error = True
                while error:
                    reply = input("""Press ENTER to start from the beginning
  or enter `q` to quit:>> """)
                    if reply == "":
                        error = False
                        printlog_info("You have chosen to start from the beginning.\n")
                        rename_file_verbosely(tsv_res_fpath)
                        rename_file_verbosely(tmp_fpath)
                        rename_file_verbosely(acc_fpath)
                        return None
                    elif reply == 'q':
                        platf_depend_exit(0)
                    else:
                        print("! - Invalid reply: `{}`\n".format(reply))
                    # end if
                # end while
            else:
                print()
                printlog_info("Here are the GenBank records encountered during previous run(s):")
                for acc, other_info in sorted(acc_dict.items(), key=lambda x: -x[1][1]):
                    s_letter = "s" if other_info[1] > 1 else ""
                    printlog_info(" {} hit{} - {}, `{}`".format(other_info[1], s_letter, acc, other_info[0]))
                # end for
                print('-' * 20)
            # end try
        # end if

        # Get the packet size, the number of the last sent packet, and the RID from the temp file.
        # There can be invalid information in the tmp file, or the tmp file may not exist.
        try:
            with open(tmp_fpath, 'r') as tmp_file:
                temp_lines = tmp_file.readlines()
            # end with
            RID_save = re.search(r"Request_ID: (.+)", temp_lines[0]).group(1).strip()
            packet_size_save = int(re.search(r"Packet_size: ([0-9]*)", temp_lines[1]).group(1).strip())
            packet_mode_save = int(re.search(r"Packet_mode: ([0-9]{1})", temp_lines[2]).group(1).strip())
        except (AttributeError, OSError):
            # There is no need to disturb a user, merely proceed.
            return {
                "RID": None,
                "packet_size_save": None,
                "packet_mode_save": None,
                "tsv_respath": tsv_res_fpath,
                "n_done_reads": num_done_seqs,
                "tmp_fpath": tmp_fpath,
                "decr_pb": 0
            }
        else:
            # Let's assume that a user won't modify their probing_batch_size between erroneous runs:
            #   subtract num_done_seqs if probing_batch_size > num_done_seqs.
            decr_pb = num_done_seqs if num_done_seqs < probing_batch_size else 0
            # Return data from the previous run
            return {
                "RID": RID_save,
                "packet_size_save": packet_size_save,
                "packet_mode_save": packet_mode_save,
                "tsv_respath": tsv_res_fpath,
                "n_done_reads": num_done_seqs,
                "tmp_fpath": tmp_fpath,
                "decr_pb": decr_pb
            }
        # end try
    # end if

    return None
def process(fq_fa_list, n_thr, packet_size, tax_annot_res_dir, blast_algorithm, use_index, db_path):
    # Function performs the "few files" parallel mode.
    #
    # :param fq_fa_list: list of paths to files meant to be processed;
    # :type fq_fa_list: list<str>;
    # :param n_thr: number of threads to launch;
    # :type n_thr: int;
    # :param packet_size: number of sequences processed by blast in a single launching;
    # :type packet_size: int;
    # :param tax_annot_res_dir: path to the output directory that contains taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param blast_algorithm: blast algorithm to use;
    # :type blast_algorithm: str;
    # :param use_index: logical value indicating whether to use the index;
    # :type use_index: bool;
    # :param db_path: path to the database;
    # :type db_path: str;

    nfiles = len(fq_fa_list)

    for i, fq_fa_path in enumerate(fq_fa_list):
        # Create the result directory with the name of the FASTQ or FASTA file being processed:
        new_dpath = create_result_directory(fq_fa_path, tax_annot_res_dir)

        # "hname" means human-readable name (i.e. without file path and extension)
        infile_hname = os.path.basename(fq_fa_path)
        infile_hname = re.search(r"(.+)\.(m)?f(ast)?(a|q)(\.gz)?$", infile_hname).group(1)

        # Look around and check if there are results of previous runs of this script.
        # If 'look_around' returns None -- there is no data from a previous run.
        previous_data = look_around(new_dpath, fq_fa_path)

        if previous_data is None: # If there is no data from a previous run
            num_done_seqs = 0 # number of successfully processed sequences
            tsv_res_path = os.path.join(new_dpath, "classification.tsv") # form result tsv file path
        else: # if there is data from a previous run
            num_done_seqs = previous_data["n_done_reads"] # get the number of successfully processed sequences
            tsv_res_path = previous_data["tsv_respath"] # the result tsv file should be the same as during the previous run
        # end if

        how_to_open = OPEN_FUNCS[is_gzipped(fq_fa_path)]
        fmt_func = FORMATTING_FUNCS[is_gzipped(fq_fa_path)]

        if is_fastq(fq_fa_path):
            packet_generator = fastq_packets
            num_seqs = sum(1 for line in how_to_open(fq_fa_path)) // 4 # 4 lines per record
        else:
            packet_generator = fasta_packets
            try:
                num_seqs = len(tuple(filter(lambda l: l.startswith('>'),
                    map(fmt_func, how_to_open(fq_fa_path).readlines()))))
            except UnicodeDecodeError as err:
                print()
                printlog_warning("Warning: current file is broken: {}."\
                    .format(str(err)))
                printlog_warning("File: `{}`".format(os.path.abspath(fq_fa_path)))
                printlog_warning("This file will not be processed.")
                continue
            # end try
        # end if

        packet_size = min(packet_size, num_seqs // n_thr)

        if num_seqs == num_done_seqs:
            sys.stdout.write('\r')
            printlog_info_time("File #{}/{} (`{}`) has already been completely processed."\
                .format(i + 1, nfiles, fq_fa_path))
            printlog_info("Omitting it.")
            printn("Working...")
            continue # go to the next file
        # end if

        # Get the number of sequences to pass to each thread
        file_part_size = num_seqs // n_thr
        if num_seqs % n_thr != 0:
            file_part_size += 1
        # end if

        pool = mp.Pool(n_thr, initializer=init_proc_single_file_in_paral,
            initargs=(mp.Lock(), mp.Lock(),))

        pool.starmap(process_part_of_file,
            [(file_part, tsv_res_path, packet_size, tax_annot_res_dir,
              blast_algorithm, use_index, db_path)
             for file_part in packet_generator(fq_fa_path, file_part_size, num_done_seqs)])

        # Reaping zombies
        pool.close()
        pool.join()

        sys.stdout.write('\r')
        printlog_info_time("File #{}/{} (`{}`) is processed.".\
            format(i + 1, nfiles, os.path.basename(fq_fa_path)))
        printn("Working...")
    # end for
def _reformat_legacy_file(legacy_tax_path):

    import shelve

    # Check if this file is corrupted
    try:
        with shelve.open(legacy_tax_path, 'r') as tax_file:
            pass
        # end with
    except OSError as err:
        printlog_error("Legacy taxonomy file appears to be corrupted.")
        printlog_error("This error might be fatal.")
        str_err = str(err)
        if "dbm.gnu" in str_err and "module is not" in str_err:
            printlog_error("Installing `python3-gdbm` might solve this problem.")
        else:
            printlog_error("The program can't recover taxonomy from the broken file.")
            printlog_error("Seems, you have to annotate your sequences again.")
            printlog_error("Sorry for that :(")
        # end if
        platf_depend_exit(1)
    # end try

    new_tax_path = "{}.tsv".format(legacy_tax_path)

    taxonomy.init_tax_file(new_tax_path)

    printn("Reformatting: `{}` ->".format(legacy_tax_path))
    log_info("Reformatting: `{}` ->".format(legacy_tax_path))

    with shelve.open(legacy_tax_path, 'r') as old_tax_file, open(new_tax_path, 'w') as new_tax_file:
        for acc, taxonomy_from_file in old_tax_file.items():
            if isinstance(taxonomy_from_file, tuple):
                tax_str = taxonomy.config_taxonomy_str(taxonomy_from_file)
                new_tax_file.write("{}\n".format('\t'.join((acc, tax_str))))
            elif isinstance(taxonomy_from_file, str):
                new_tax_file.write("{}\n".format('\t'.join((acc, taxonomy_from_file))))
            else:
                # Execution must not reach here
                printlog_error_time("Fatal error 8755.")
                printlog_error("Please, contact the developer.")
                platf_depend_exit(8755)
            # end if
        # end for
    # end with

    printlog_info(" `<same_dir>/{}`".format(os.path.basename(new_tax_path)))

    try:
        renamed_legacy_file = "{}_deprecated".format(legacy_tax_path)
        os.rename(legacy_tax_path, renamed_legacy_file)
    except OSError as err:
        printlog_error_time("Cannot rename legacy taxonomy file `{}`:".format(legacy_tax_path))
        printlog_error(str(err))
        printlog_error("But it's not a problem -- we will proceed with our work.")
    else:
        printlog_info("Renamed: `{}` -> `<same_dir>/{}`".format(legacy_tax_path,
            os.path.basename(renamed_legacy_file)))
    # end try

    printlog_info("Legacy taxonomy file is reformatted to TSV format.")
def retrieve_fastas_by_acc(acc_dict, db_dir, local_fasta):
    # Function downloads a set of records from GenBank according to the accessions passed to it.
    # The downloaded FASTA file will be placed in the 'db_dir' directory and named 'local_seq_set.fasta'.
    # :param acc_dict: dictionary containing accession data of hits;
    # :type acc_dict: dict<str: tuple<str, str, int>>;
    # :param db_dir: path to the directory in which the downloaded FASTA file will be placed;
    # :type db_dir: str;
    # :param local_fasta: path to the file with reference sequences to be included in the database;
    # :type local_fasta: str;

    # Path to the file with the current chunk (see below: "100 accession numbers...")
    tmp_fasta = os.path.join(db_dir, "tmp.fasta")

    accessions = tuple(set(acc_dict.keys()))
    if len(accessions) == 0: # just in case
        return
    # end if

    # 100 accession numbers, in order not to make the URL too long.
    # Download genomes in chunks of 100 sequences.
    max_accnum = 100
    i = 0
    accnum = len(accessions)

    while i < accnum:
        curr_accessions = accessions[i:i + max_accnum] # slice chunk

        accs_del_comma = ','.join(curr_accessions) # accessions must be comma-separated in the URL

        # E-utilities provide a possibility to download records from GenBank by accessions.
        retrieve_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?\
db=nuccore&id={}&rettype=fasta&retmode=text".format(accs_del_comma)
        log_info("Retrieve URL: `{}`".format(retrieve_url))

        # The GNU wget utility is safer, but it may or may not be present :)
        wget_util = "wget"
        util_found = False
        for d in os.environ["PATH"].split(os.pathsep):
            if os.path.isdir(d) and wget_util in os.listdir(d):
                util_found = True
                break
            # end if
        # end for

        print()
        printlog_info("{} - Downloading {} reference sequences...".format(getwt(), len(curr_accessions)))

        if util_found:
            # If we have wget -- just use it
            wget_cmd = 'wget --no-check-certificate "{}" -O {}'.format(retrieve_url, tmp_fasta)
            pipe = sp_Popen(wget_cmd, shell=True)
            pipe.communicate()
            if pipe.returncode != 0:
                printlog_error_time("Error occurred while downloading reference sequences")
                platf_depend_exit(pipe.returncode)
            # end if
        else:
            # If there is no wget -- we will download sequences with Python tools
            stop_wait = Event() # a flag variable that will signal the waiter function to stop executing

            def download_waiter(stop_wait):
                """
                Function waits until the 'tmp_fasta' file is downloaded.
                It prints the size of the downloaded data to the console during downloading.
                This function just waits -- it won't bring you the menu :).
                """
                # Wait until downloading starts
                while not os.path.exists(tmp_fasta):
                    if not stop_wait.is_set():
                        return
                    # end if
                    sleep(1)
                # end while

                MB_size = 1024**2 # we will divide by it to get megabytes

                while stop_wait.is_set():
                    # Get the size of the downloaded data
                    fsize = round(os.path.getsize(tmp_fasta) / MB_size, 1) # get megabytes
                    printn("\r{} - {} MB downloaded ".format(getwt(), fsize))
                    sleep(1) # instant updates are not necessary
                # end while

                # Print the total size of the downloaded file (it can be deleted by this time)
                try:
                    fsize = round(os.path.getsize(tmp_fasta) / MB_size, 1)
                except OSError:
                    # We can pass this exception -- we do delete this file if downloading crashes.
                    # And this function just waits :)
                    pass
                # end try
                printlog_info("\r{} - {} MB downloaded ".format(getwt(), fsize))
            # end def download_waiter

            error = True
            while error:
                try:
                    waiter = Thread(target=download_waiter, args=(stop_wait,)) # create the thread
                    stop_wait.set() # raise the flag
                    waiter.start() # start waiting
                    urllib.request.urlretrieve(retrieve_url, tmp_fasta) # retrieve the FASTA file
                except OSError as err:
                    printlog_error_time("Error occurred while downloading the fasta file.")
                    printlog_error(str(err))
                    printlog_error("`barapost-local.py` will try again in 30 seconds")
                    if os.path.exists(tmp_fasta):
                        os.unlink(tmp_fasta)
                    # end if
                    sleep(30)
                else:
                    error = False
                finally:
                    stop_wait.clear() # lower the flag
                    waiter.join() # the main thread will wait until the waiter function finishes
                # end try
            # end while
        # end if

        printlog_info_time("Downloading is completed")

        # Write the chunk to the result fasta file
        with open(tmp_fasta, 'r') as infile, open(local_fasta, 'a') as outfile:
            outfile.write(infile.read())
        # end with

        # Remove the temporary chunk file
        os.unlink(tmp_fasta)

        i += max_accnum # go to the next chunk
    # end while
check_connection("https://blast.ncbi.nlm.nih.gov")

print("|=== barapost-prober.py (version {}) ===|\n".format(__version__))
log_info("barapost-prober.py (version {})".format(__version__))
print(get_full_time() + "- Start working\n")
log_info("Start working.")

from src.prober_modules.prober_spec import look_around
from src.prober_modules.networking import verify_taxids
from src.prober_modules.kernel import submit, retrieve_ready_job

# Make sure that TaxIDs specified by the user actually exist
organisms = verify_taxids(taxid_list)

# Print information about the run
printlog_info(" - Output directory: `{}`;".format(outdir_path))
printlog_info(" - Logging to `{}`".format(
    logging.getLoggerClass().root.handlers[0].baseFilename))
if user_email != "":
    printlog_info(" - Your email: <{}>".format(user_email))
# end if

printlog_info(" - Probing batch size: {} sequences;".format(
    "all" if send_all else probing_batch_size))

mode_comment = "number of sequences" if packet_mode == 0 else "sum of sequences' lengths"
printlog_info(" - Packet forming mode: {} ({});".format(packet_mode, mode_comment))
del mode_comment

if packet_mode == 0:
    tmp_str = "sequences"
def _get_record_title(record_id):
    # Function retrieves the title (aka definition) and accession
    #   of a GenBank record by a given accession or GI number.
    # :param record_id: accession or GI number of the record;
    # :type record_id: str;
    # Returns a tuple of two elements:
    #   (<RECORD_TITLE>, <RECORD_ACCESSION>)

    # We'll use E-utilities to communicate with GenBank
    eutils_server = "eutils.ncbi.nlm.nih.gov"
    esummary = "esummary.fcgi" # utility name

    # Configure the URL
    url = "/entrez/eutils/{}?db=nuccore&id={}".format(esummary, record_id)

    # Sometimes (I never figured out why) this XML arrives empty, and StopIteration emerges.
    # So, if we just repeat this request, everything is going to be ok.
    error = True
    print_ok = False
    while error:
        # Send the request and get the response
        summary = lingering_https_get_request(eutils_server, url,
            "e-summary of nuccore record {}".format(record_id))

        # Parse the XML that we've got
        root = ElementTree.fromstring(summary)

        # Elements of our interest are all named "Item",
        #   but they have different tags.
        # They are children of the element "DocSum", which is
        #   the first child of root.
        try:
            docsum = next(iter(root))
        except StopIteration:
            print()
            printlog_info_time("Failed to retrieve data for record {}. Trying again...".format(record_id))
            print_ok = True # print this "ok" only after a successful attempt following a failure
        else:
            if print_ok:
                printlog_info("ok")
            # end if
            error = False
        # end try
    # end while

    record_title = None
    record_acc = None

    # Search for title and accession
    for item in docsum.iter("Item"):
        if item.attrib["Name"] == "Title":
            record_title = item.text
        elif item.attrib["Name"] == "AccessionVersion":
            # Remove the version, just in case
            record_acc = re.search(r"(.*)\.[0-9]+", item.text).group(1)
        # end if
    # end for

    if record_title is None or record_acc is None:
        printlog_error_time("Error 8989: can't access e-summary for `{}`".format(record_id))
        platf_depend_exit(1)
    # end if

    return record_title, record_acc
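# A minimal usage sketch (NC_000913.3 is a real E. coli K-12 record;
# the exact title text is illustrative):
#
#   title, acc = _get_record_title("NC_000913.3")
#   # title -> "Escherichia coli str. K-12 substr. MG1655, complete genome"
#   # acc   -> "NC_000913" (the ".3" version is pruned)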
def _split_and_resubmit(packet, packet_size, packet_mode, pack_to_send, seqs_processed,
        fq_fa_path, tmp_fpath, taxonomy_path, tsv_res_path, acc_fpath,
        blast_algorithm, user_email, organisms, acc_dict, out_of_n):
    # :param packet: "packet" dictionary described in "barapost-prober.py" before the kernel loop;
    # :type packet: dict;
    # :param packet_size: size of the packet (see option `-c` for definition);
    # :type packet_size: int;
    # :param packet_mode: packet forming mode (see option `-c` for definition);
    # :type packet_mode: int;
    # :param pack_to_send: ordinal number of the packet to send
    #   (it is a list rather than an int because it should be mutable);
    # :type pack_to_send: list<int>;
    # :param seqs_processed: number of sequences processed
    #   (it is a list rather than an int because it should be mutable);
    # :type seqs_processed: list<int>;
    # :param fq_fa_path: path to the current input file;
    # :type fq_fa_path: str;
    # :param tmp_fpath: path to the current temporary file;
    # :type tmp_fpath: str;
    # :param taxonomy_path: path to the taxonomy file;
    # :type taxonomy_path: str;
    # :param tsv_res_path: path to the current classification file;
    # :type tsv_res_path: str;
    # :param acc_fpath: path to the file `hits_to_download.tsv`;
    # :type acc_fpath: str;
    # :param blast_algorithm: BLAST algorithm to use (see option `-a`);
    # :type blast_algorithm: str;
    # :param user_email: user email to send with the request;
    # :type user_email: str;
    # :param organisms: list of strings defining `nt` database slices;
    # :type organisms: list<str>;
    # :param acc_dict: accession dictionary for writing to `hits_to_download.tsv`;
    # :type acc_dict: dict<str: (str, int)>;
    # :param out_of_n: dictionary for printing how many packets are left;
    # :type out_of_n: dict<str: str, str: int>;

    # Number of sequences in the packet to be split:
    pack_len = len(packet["qual"])

    if pack_len > 1:
        # Split the current packet into two (of equal numbers of sequences) and resubmit them one by one
        printlog_info("Splitting the current packet into two and submitting each of them one by one.")

        # Update this dictionary to print how many packets are left
        if not out_of_n["npacks"] is None:
            out_of_n["npacks"] += 1
            out_of_n["msg"] = " out of {}".format(out_of_n["npacks"])
        # end if

        # Calculate the size of a subpacket
        new_pack_size_0 = pack_len // 2
        if pack_len % 2 != 0:
            new_pack_size_0 += 1
        # end if

        # Split the packet
        for splitted_packet in fasta_packets_from_str(packet["fasta"], new_pack_size_0):
            # Inherit quality information from the "ancestor" qual_dict
            for query_name in splitted_packet["qual"].keys():
                splitted_packet["qual"][query_name] = packet["qual"][query_name]
            # end for

            # Submit the subpacket
            submit(splitted_packet, new_pack_size_0, 0, pack_to_send, seqs_processed,
                fq_fa_path, tmp_fpath, taxonomy_path, tsv_res_path, acc_fpath,
                blast_algorithm, user_email, organisms, acc_dict, out_of_n)
        # end for
    else:
        # Prune the only sequence in the packet and resend it
        printlog_info("The current packet contains only one sequence.")
        printlog_info("prober will prune this sequence twofold and resubmit it.")

        # Calculate the new length for this sequence.
        # Generator of stripped sequence-containing lines:
        old_seq = map(str.strip, packet["fasta"].splitlines()[1:])
        old_len = len(''.join(old_seq)) # calculate the length of the old sequence

        new_len = old_len // 2
        if old_len % 2 != 0:
            new_len += 1
        # end if

        packet["fasta"] = prune_seqs(packet["fasta"], new_len)

        submit(packet, packet_size, packet_mode, pack_to_send, seqs_processed,
            fq_fa_path, tmp_fpath, taxonomy_path, tsv_res_path, acc_fpath,
            blast_algorithm, user_email, organisms, acc_dict, out_of_n)
for fpath in fq_fa_list:
    # Validate new_dpath existence for FASTA and FASTQ files:
    if not os.path.isdir(get_curr_res_dpath(fpath, tax_annot_res_dir)):
        printlog_error_time("Error: the directory that should have contained results of taxonomic annotation \
for the following file does not exist: `{}`.".format(os.path.basename(fpath)))
        printlog_error("Please, make sure that this file has already been processed \
by `barapost-prober.py` and `barapost-local.py`.")
        platf_depend_exit(1)
    # end if
# end for

sys.stdout.write('\r')
printlog_info("Primary validation...ok")
print()

is_fastQA5 = lambda f: not re.search(r".*\.(m)?f(ast)?(a|q|5)(\.gz)?$", f) is None

# Check if there are some results in the output directory
if len(list(filter(is_fastQA5, os.listdir(outdir_path)))) != 0:
    printlog_info("Attention! Output directory `{}` is not empty!".format(outdir_path))
    printlog_info("List of sequence-containing files in it:")
    for i, file in enumerate(filter(is_fastQA5, os.listdir(outdir_path))):
        printlog_info(" {}. `{}`".format(i + 1, file))
    # end for
    print()
def configure_resfile_lines(tsv_res_fpath, sens, taxonomy_path):
    # Function returns a dictionary where keys are IDs of sequences meant to be binned,
    #   and values are the corresponding hit names.
    #
    # :param tsv_res_fpath: path to the current TSV file. Binning will be performed according to this TSV file;
    # :type tsv_res_fpath: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param taxonomy_path: path to the taxonomy file;
    # :type taxonomy_path: str;

    resfile_lines = dict()

    tax_dict = src.taxonomy.get_tax_dict(taxonomy_path)

    with open(tsv_res_fpath, 'r') as brpst_resfile:
        brpst_resfile.readline() # pass the table header
        line = brpst_resfile.readline().strip() # get the first informative line

        while line != "":
            splt = line.split('\t')
            read_name = sys.intern(splt[0])
            hit_name = splt[1]
            hit_acc = splt[2]

            try:
                quality = float(splt[8]) # we will filter by quality
            except ValueError as verr:
                if splt[8] == '-':
                    # Keep the minus as the quality if there is no quality information.
                    # An error will not be raised.
                    quality = splt[8]
                else:
                    printlog_error_time("query quality parsing error")
                    printlog_error(str(verr))
                    printlog_error("Please, contact the developer.")
                    platf_depend_exit(1)
                # end if
            # end try

            try:
                query_len = int(splt[3]) # we will filter by length
            except ValueError as verr:
                printlog_error_time("query length parsing error")
                printlog_error(str(verr))
                printlog_error("Please, contact the developer.")
                platf_depend_exit(1)
            # end try

            try:
                pident = float(splt[5]) # we will filter by identity
            except ValueError as verr:
                if splt[5] == '-':
                    # Keep the minus as the identity if there is no identity information.
                    # An error will not be raised.
                    pident = splt[5]
                else:
                    printlog_error_time("Alignment percent of identity parsing error")
                    printlog_error(str(verr))
                    printlog_error("Please, contact the developer.")
                    platf_depend_exit(1)
                # end if
            # end try

            try:
                coverage = float(splt[4]) # we will filter by coverage
            except ValueError as verr:
                if splt[4] == '-':
                    # Keep the minus as the coverage if there is no coverage information.
                    # An error will not be raised.
                    coverage = splt[4]
                else:
                    printlog_error_time("alignment coverage parsing error")
                    printlog_error(str(verr))
                    printlog_error("Please, contact the developer.")
                    platf_depend_exit(1)
                # end if
            # end try

            try:
                resfile_lines[read_name] = [
                    format_taxonomy_name(hit_acc, hit_name, sens, tax_dict),
                    quality, query_len, pident, coverage
                ]
            except NoTaxonomyError:
                printlog_warning("Can't find taxonomy for reference sequence `{}`".format(hit_acc))
                printlog_warning("Trying to recover taxonomy.")

                # Recover
                src.taxonomy.recover_taxonomy(hit_acc, hit_name, taxonomy_path)
                printlog_info("Taxonomy for {} is recovered.".format(hit_acc))

                # Update tax_dict
                tax_dict = src.taxonomy.get_tax_dict(taxonomy_path)

                # Format again -- with the new tax_dict
                resfile_lines[read_name] = [
                    format_taxonomy_name(hit_acc, hit_name, sens, tax_dict),
                    quality, query_len, pident, coverage
                ]
            # end try

            line = brpst_resfile.readline().strip() # get the next line
        # end while
    # end with

    return resfile_lines
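# For reference, the column positions the parser above relies on (the indices
# are taken from the code itself; other columns are not read here):
#   0 - query ID, 1 - hit name, 2 - hit accession, 3 - query length,
#   4 - alignment coverage, 5 - percent of identity, 8 - average quality.
# A '-' stands in for any of the numeric fields when no value exists.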
    format='%(levelname)s: %(asctime)s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
    filemode='w')
log_info(sys.platform)
log_info(sys.implementation)
log_info(sys.version)

print("|=== barapost-local.py (version {}) ===|\n".format(__version__))
log_info("barapost-local.py (version {})".format(__version__))
print(get_full_time() + "- Start working\n")
log_info("Start working.")

# |===== Proceed =====|

printlog_info(" - Logging to `{}`".format(
    logging.getLoggerClass().root.handlers[0].baseFilename))
printlog_info(" - Output directory: `{}`;".format(tax_annot_res_dir))
printlog_info(" - Packet size: {} sequences;".format(packet_size))
printlog_info(" - BLAST algorithm: {};".format(blast_algorithm))
printlog_info(" - Threads: {};".format(n_thr))
print()

s_letter = '' if len(fq_fa_list) == 1 else 's'
printlog_info(" {} file{} will be processed.".format(len(fq_fa_list), s_letter))
if len(fq_fa_list) != 1:
    log_info("Here they are:")
else:
    log_info("Here it is:")
# end if
for i, path in enumerate(fq_fa_list):
def build_local_db(tax_annot_res_dir, acc_fpath, your_own_fasta_lst, accs_to_download, use_index): # Function creates a database with utilities from 'blast+' toolkit # according to acc_dict and your_own_fasta_lst. # # :param tax_annot_res_dir: path to current result directory # (each processed file has it's own result directory); # :type tax_annot_res_dir: str; # :param acc_fpath: path to file "hits_to_download.tsv"; # :type acc_fpath: str; # :param your_own_fasta_lst: list of user's fasta files to be included in database; # :type your_own_fasta_lst: list<str>; # :param accs_to_download: list of accessions from command line ('-s'); # :type accs_to_download: list<str>; # :param use_index: whether to use index; # :type use_index: str; # Returns path to created database. # Path to directory in which database will be placed db_dir = os.path.join(tax_annot_res_dir, "local_database") # Path to DBM taxonomy file taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv") try: os.makedirs(db_dir) except OSError: #If this directory exists while True: if len(os.listdir(db_dir)) == 0: # If db directory is empty -- break and build a database break else: print() printlog_info("Database directory is not empty:") printlog_info(" `{}`".format(os.path.abspath(db_dir))) printlog_info("Here is it's content:") for i, fname in enumerate(os.listdir(os.path.abspath(db_dir))): printlog_info(" {}. `{}`".format(i + 1, fname)) # end for reply = input( """\nPress ENTER to start classification using existing database. Enter 'r' to remove all files in this directory and create the database from the beginning:>>""" ) if reply == "": # Do not build a database, just return path to it. printlog_info("You have chosen to use extant database.") # Return path to DB located in this directory dbpath = next(iter(os.listdir(db_dir))) dbpath = dbpath.partition(".fasta")[0] + dbpath.partition( ".fasta")[1] # remove all after '.fasta' return os.path.join(db_dir, dbpath) elif reply == 'r': printlog_info("You have chosen to rebuild the database.") # Rename old classification files and write actual data to new one: old_classif_dirs = filter( lambda d: os.path.exists( os.path.join(d, "classification.tsv")), glob(os.path.join(tax_annot_res_dir, "*"))) old_classif_files = tuple( map(lambda f: os.path.join(f, "classification.tsv"), old_classif_dirs)) if len(old_classif_files) > 0: print() printlog_info("Renaming old classification files:") for classif_file in old_classif_files: rename_file_verbosely(classif_file) # end for # end if # Empty database directory for file in glob("{}{}*".format(db_dir, os.sep)): os.unlink(file) # end for # Break from the loop in order to build a database break else: print("Invalid reply: `{}`\n".format(reply)) continue # end if # end if # end while # end try # It is a dictionary of accessions and record names. # Accessions are keys, record names are values. acc_dict = configure_acc_dict(acc_fpath, your_own_fasta_lst, accs_to_download) if len(accs_to_download) != 0: verify_cl_accessions(accs_to_download, acc_dict) # end if # Retrieve already existing taxonomy data from taxonomy file tax_exist_accs = taxonomy.get_tax_keys(taxonomy_path) # If accession file does not exist and execution has reached here -- everything is OK -- # we are building a database from user's files only. 
if len(acc_dict) != 0: print() print("""Following sequences (and all replicons related to them) will be downloaded from Genbank for further taxonomic classification on your local machine:\n""") printlog_info( "Following sequences (and all replicons related to them) \ will be downloaded from Genbank for further taxonomic classification \ on your local machine:") for i, acc in enumerate(acc_dict.keys()): printlog_info(" {}. {} - `{}`".format(i + 1, acc, acc_dict[acc])) # end for search_for_related_replicons(acc_dict) printlog_info_time("Completing taxonomy file...") for i, acc in enumerate(acc_dict.keys()): if not acc in tax_exist_accs: taxonomy.find_taxonomy(acc, acc_dict[acc][1], taxonomy_path) # end if # Accessions can be of different length printn("\r{} - {}: {}/{}".format(getwt(), acc, i + 1, len(acc_dict)) + " " * 10 + "\b" * 10) # end for print() printlog_info_time("Taxonomy file is consistent.") # end if local_fasta = os.path.join( db_dir, "local_seq_set.fasta") # path to downloaded FASTA file add_lambda_phage(local_fasta, taxonomy_path) # add lambda phage control sequence retrieve_fastas_by_acc( acc_dict, db_dir, local_fasta) # download main fasta data from GenBank # Add 'your own' fasta files to database if not len(your_own_fasta_lst) == 0: # This variable counts sequences from local files. # It is necessary for not allowing duplicated accessions. own_seq_counter = 0 # Check if these files are assembly made by SPAdes or a5 spades_patt = r">NODE_[0-9]+" # this pattern will match sequence IDs generated y SPAdes a5_patt = r">scaffold_[0-9]+" # this pattern will match sequence IDs generated y a5 assemblies = list( ) # this list will contain paths to assembly files (SPAdes or a5) for own_fasta_path in reversed(your_own_fasta_lst): how_to_open = OPEN_FUNCS[is_gzipped(own_fasta_path)] fmt_func = FORMATTING_FUNCS[is_gzipped(own_fasta_path)] with how_to_open(own_fasta_path) as fasta_file: first_seq_id = fmt_func(fasta_file.readline( )) # get the first line in file (the first seq ID) # end with # if we've got SPAdes assembly if not re.search(spades_patt, first_seq_id) is None: assemblies.append(own_fasta_path) # Remove these file from list -- they will be processed in a specific way your_own_fasta_lst.remove(own_fasta_path) continue # end if # if we've got a5 assembly if not re.search(a5_patt, first_seq_id) is None: assemblies.append(own_fasta_path) your_own_fasta_lst.remove(own_fasta_path) continue # end if # end for # Include assemblies files in multi-fasta file # Find common prefix of all assembly paths and remove it from assembly names if len(assemblies) > 1: assemblies_formatted = tuple( map(lambda f: os.path.abspath(f).replace(os.sep, '-'), assemblies)) common_prefix = find_common_prefix(assemblies_formatted) assemblies_formatted = tuple( map(lambda f: f.replace(common_prefix, ''), assemblies_formatted)) elif len(assemblies) > 0: common_prefix = '' assemblies_formatted = tuple(map(os.path.basename, assemblies)) # end if # Add assembled sequences to database with open(local_fasta, 'a') as fasta_db: for i, assm_path in enumerate(assemblies): printlog_info("Adding `{}` to database...".format( os.path.basename(assm_path))) assm_name_fmt = assemblies_formatted[i] how_to_open = OPEN_FUNCS[is_gzipped(assm_path)] fmt_func = FORMATTING_FUNCS[is_gzipped(assm_path)] with how_to_open(assm_path) as fasta_file: for line in fasta_file: line = fmt_func(line) # You can find comments to "OWN_SEQ..." below. 
# Paths will be written to seq IDs in following way: # some-happy-path.fastq-- # in order to retrieve them securely with regex later. if line.startswith('>'): own_seq_counter += 1 own_acc = "OWN_SEQ_{}".format(own_seq_counter) own_def = "{}--".format( assm_name_fmt.replace(common_prefix, '')) + line[1:] own_def = remove_bad_chars(own_def) taxonomy.save_taxonomy_directly( taxonomy_path, own_acc, own_def) line = ">" + "{} {}".format(own_acc, own_def) # end if fasta_db.write(line + '\n') # end for # end with # end for # end with with open(local_fasta, 'a') as fasta_db: for own_fasta_path in your_own_fasta_lst: printlog_info("Adding `{}` to database...".format( os.path.basename(own_fasta_path))) how_to_open = OPEN_FUNCS[is_gzipped(own_fasta_path)] fmt_func = FORMATTING_FUNCS[is_gzipped(own_fasta_path)] with how_to_open(own_fasta_path) as fasta_file: for line in fasta_file: line = fmt_func(line) # 'makeblastdb' considers first word (sep. is space) as sequence ID # and throws an error if there are duplicated IDs. # In order not to allow this duplication we'll create our own sequence IDs: # 'OWN_SEQ_<NUMBER>' and write it in the beginning of FASTA record name. if line.startswith('>'): own_seq_counter += 1 own_acc = "OWN_SEQ_{}".format(own_seq_counter) taxonomy.save_taxonomy_directly( taxonomy_path, own_acc, line[1:]) line = ">" + own_acc + ' ' + remove_bad_chars( line[1:]) # end if fasta_db.write(line + '\n') # end for # end with # end for # end with # end if # 'lcl|ACCESSION...' entries can be given with '.1' # (or '.2', whatever) terminus by blastn. # There is no '.1' terminus in taxonomy file. # Therefore we will prune accessions in advance. print() printn("{} - Formatting accessions...".format(getwt())) log_info("Formatting accessions...") corrected_path = os.path.join(db_dir, "corrected_seqs.fasta") with open(local_fasta, 'r') as source_file, open(corrected_path, 'w') as dest_file: for line in source_file: if line.startswith('>'): line = line.strip() acc, seq_name = (line.partition(' ')[0], line.partition(' ')[2]) acc = acc.partition('.')[0] seq_name = remove_bad_chars(seq_name) seq_name = re.sub(r'[^\x00-\x7F]+', '_', seq_name) # remove non-ascii chars line = ' '.join((acc, seq_name)) + '\n' # end if dest_file.write(line) # end for # end with os.unlink(local_fasta) os.rename(corrected_path, local_fasta) sys.stdout.write("\r{} - Formatting accessions... 
ok".format(getwt())) log_info("Formatting accessions done.") # Configure command line make_db_cmd = "makeblastdb -in {} -parse_seqids -dbtype nucl".format( local_fasta) exit_code = os.system(make_db_cmd) # make a blast-format database if exit_code != 0: printlog_error_time("Error occured while making the database") platf_depend_exit(exit_code) # end if print("\033[1A{} - Database is successfully created: `{}`\n".format( getwt(), local_fasta)) log_info("Database is successfully created: `{}`".format(local_fasta)) if use_index == "true": printlog_info_time("Database index creating started") # Configure command line make_index_cmd = "makembindex -input {} -iformat blastdb -verbosity verbose".format( local_fasta) exit_code = os.system( make_index_cmd) # create an index for the database if exit_code != 0: printlog_info_time("Error occured while creating database index") platf_depend_exit(exit_code) # end if printlog_info_time("Database index has been successfully created") # end if # Gzip downloaded FASTA file printlog_info_time("Gzipping FASTA file: `{}`".format(local_fasta)) if gzip_util_found: os.system("{} -v {}".format(gzip_util, local_fasta)) else: # form .fasta.gz file 'by hand' with open(local_fasta, 'rb') as fasta_file, open_as_gzip(local_fasta + ".gz", "wb") as fagz_file: shutil_copyfileobj(fasta_file, fagz_file) # end with os.unlink(local_fasta) # remove source FASTA file, not the database # end if return local_fasta
def report_run_params(args):
    # Function prints run parameters.
    #
    # :param args: argument dictionary returned by handle_args;
    # :type args: dict;

    print()
    printlog_info(' -- Run parameters --')
    printlog_info(' General:')
    printlog_info('- Tasks: {}.'.format(', '.join(args['tasks'])))
    printlog_info('- Forward reads: `{}`.'.format(args['1']))
    if not args['2'] is None:
        printlog_info('- Reverse reads: `{}`.'.format(args['2']))
    else:
        printlog_info('- Reverse reads: none.')
    # end if
    printlog_info('- Output directory: `{}`.'.format(args['o']))
    printlog_info('- Threads: {}.'.format(args['t']))
    printlog_info('- Gzip output files afterwards: {}.'.format(args['z']))
    printlog_info(' Crosstalks detection:')
    if args['r'] is None:
        printlog_info('- Primers: standard Illumina 16S rRNA V3-V4 primers.')
    else:
        printlog_info('- Primers file: `{}`.'.format(args['r']))
    # end if
    printlog_info('- Threshold: {}.'.format(args['x']))
    printlog_info('- Max offset: {}.'.format(args['s']))
    printlog_info('- Cut off primers: {}.'.format(args['c']))
    printlog_info(' Read merging:')
    printlog_info('- Minimum overlap: {}.'.format(args['m']))
    printlog_info('- Mismatch fraction: {}.'.format(args['p']))
    printlog_info('- Phred offset: {}.'.format(args['q']))
    printlog_info('- NGmerge path: {}.'.format(args['ngmerge-path']))
    print('-' * 10 + '\n')
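
# For reference, a sketch of the `args` dictionary this function expects.
#   The keys are taken from the accesses above; every value is an illustrative
#   placeholder, not a default of the real argument parser.
def _example_args_sketch():
    return {
        'tasks': ['<task-name>'],      # hypothetical task list
        '1': 'reads_R1.fastq.gz',      # forward reads
        '2': 'reads_R2.fastq.gz',      # reverse reads (or None)
        'o': 'outdir',                 # output directory
        't': 4,                        # threads
        'z': True,                     # gzip output files afterwards
        'r': None,                     # primers file (None -> standard primers)
        'x': 0.5, 's': 2, 'c': True,   # crosstalks detection settings
        'm': 20, 'p': 0.1, 'q': 33,    # read merging settings
        'ngmerge-path': 'NGmerge',     # path to the NGmerge executable
    }
# end def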
def submit(packet, packet_size, packet_mode, pack_to_send, seqs_processed,
           fq_fa_path, tmp_fpath, taxonomy_path, tsv_res_path, acc_fpath,
           blast_algorithm, user_email, organisms, acc_dict, out_of_n):
    # :param packet: "packet" dictionary described in "barapost-prober.py" before the kernel loop;
    # :type packet: dict;
    # :param packet_size: size of the packet (see option `-c` for definition);
    # :type packet_size: int;
    # :param packet_mode: packet forming mode (see option `-c` for definition);
    # :type packet_mode: int;
    # :param pack_to_send: ordinal number of the packet to send
    #   (it is a list rather than an int because it must be mutable);
    # :type pack_to_send: list<int>;
    # :param seqs_processed: number of sequences processed
    #   (it is a list rather than an int because it must be mutable);
    # :type seqs_processed: list<int>;
    # :param fq_fa_path: path to current input file;
    # :type fq_fa_path: str;
    # :param tmp_fpath: path to current temporary file;
    # :type tmp_fpath: str;
    # :param taxonomy_path: path to taxonomy file;
    # :type taxonomy_path: str;
    # :param tsv_res_path: path to current classification file;
    # :type tsv_res_path: str;
    # :param acc_fpath: path to file `hits_to_download.tsv`;
    # :type acc_fpath: str;
    # :param blast_algorithm: BLAST algorithm to use (see option `-a`);
    # :type blast_algorithm: str;
    # :param user_email: user email to send with the request;
    # :type user_email: str;
    # :param organisms: list of strings specifying `nt` database slices;
    # :type organisms: list<str>;
    # :param acc_dict: accession dictionary for writing to `hits_to_download.tsv`;
    # :type acc_dict: dict<str: (str, int)>;
    # :param out_of_n: dictionary for printing how many packets are left;
    # :type out_of_n: dict<str: str, str: int>;

    s_letter = 's' if len(packet["qual"]) != 1 else ''

    print()
    printlog_info("Going to BLAST (" + blast_algorithm + ")")

    # Count base pairs in the packet
    lines = filter(lambda x: not x.startswith('>'),
                   packet["fasta"].splitlines())
    totalbp = len(''.join(map(lambda x: x.strip(), lines)))
    totalbp = "{:,}".format(totalbp)
    del lines

    printlog_info("Request number {}{}. Sending {} sequence{} ({} bp in total)."\
        .format(pack_to_send[0], out_of_n["msg"],
                len(packet["qual"]), s_letter, totalbp))

    error = BlastError(-1)

    while error.code != 0:  # until a successful attempt

        # Get the request
        request = configure_request(packet["fasta"], blast_algorithm,
                                    organisms, user_email)

        # Send the request and get the BLAST XML response.
        # 'align_xml_text' will be None if an error occurs.
        align_xml_text, error = send_request(request, pack_to_send,
                                             packet_size, packet_mode,
                                             os.path.basename(fq_fa_path),
                                             tmp_fpath)

        if error.code == 0:
            # Write results and leave the loop
            _handle_result(align_xml_text, packet, taxonomy_path, tsv_res_path,
                           acc_dict, acc_fpath, seqs_processed, pack_to_send,
                           tmp_fpath)
        elif error.code == 2:
            # If the NCBI BLAST server rejects the request because it contains
            #   too much data -- split the packet in two or, if it holds only
            #   one sequence, prune that sequence. Then resend the request.
            _split_and_resubmit(packet, packet_size, packet_mode,
                                pack_to_send, seqs_processed, fq_fa_path,
                                tmp_fpath, taxonomy_path, tsv_res_path,
                                acc_fpath, blast_algorithm, user_email,
                                organisms, acc_dict, out_of_n)
            error = BlastError(0)  # _split_and_resubmit will process the packet successfully
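
# The retry logic above can be pictured with a minimal sketch of the splitting
#   rule used when the server rejects a packet as too large (error code 2):
#   a multi-sequence packet is halved, while a single-sequence packet must be
#   pruned (shortened) instead. The function below is illustrative only -- it
#   is not the actual _split_and_resubmit implementation.
def _halve_packet_sketch(fasta_records):
    # :param fasta_records: list of single-record FASTA strings;
    # Returns a tuple: (list of sub-packets, whether pruning is needed).
    if len(fasta_records) == 1:
        # A lone sequence cannot be split -- it would be pruned instead
        return [fasta_records], True
    # end if
    middle = len(fasta_records) // 2
    return [fasta_records[:middle], fasta_records[middle:]], False
# end def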
def process_paral(fq_fa_list, packet_size, tax_annot_res_dir, blast_algorithm,
                  use_index, db_path, nfiles):
    # Function performs the 'many_files'-parallel mode of barapost-local.py.
    # :param fq_fa_list: list of paths to FASTA and FASTQ files meant to be processed;
    # :type fq_fa_list: list<str>;
    # :param packet_size: number of sequences processed by BLAST in a single launch;
    # :type packet_size: int;
    # :param tax_annot_res_dir: path to the output directory that contains taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param blast_algorithm: BLAST algorithm to use;
    # :type blast_algorithm: str;
    # :param use_index: logic value indicating whether to use the index;
    # :type use_index: bool;
    # :param db_path: path to the database;
    # :type db_path: str;
    # :param nfiles: total number of files;
    # :type nfiles: int;

    queries_tmp_dir = os.path.join(tax_annot_res_dir, "queries-tmp")

    # Iterate over source FASTQ and FASTA files
    for fq_fa_path in fq_fa_list:

        # Create the result directory with the name of the FASTQ or FASTA file being processed
        new_dpath = create_result_directory(fq_fa_path, tax_annot_res_dir)

        # "hname" means human readable name (i.e. without file path and extension)
        infile_hname = os.path.basename(fq_fa_path)
        infile_hname = re.search(r"(.+)\.(m)?f(ast)?(a|q)(\.gz)?$",
                                 infile_hname).group(1)

        # Look around and check if there are results of previous runs of this script.
        # If 'look_around' returns None -- there is no data from a previous run.
        previous_data = look_around(new_dpath, fq_fa_path)

        if previous_data is None:  # if there is no data from a previous run
            num_done_seqs = 0  # number of successfully processed sequences
            tsv_res_path = os.path.join(
                new_dpath, "classification.tsv")  # form result tsv file path
        else:  # if there is data from a previous run
            num_done_seqs = previous_data[
                "n_done_reads"]  # get number of successfully processed sequences
            tsv_res_path = previous_data[
                "tsv_respath"]  # result tsv file should be the same as during the previous run
        # end if

        how_to_open = OPEN_FUNCS[is_gzipped(fq_fa_path)]
        fmt_func = FORMATTING_FUNCS[is_gzipped(fq_fa_path)]

        if is_fastq(fq_fa_path):
            packet_generator = fastq_packets
            num_seqs = sum(
                1 for line in how_to_open(fq_fa_path)) // 4  # 4 lines per record
        else:
            packet_generator = fasta_packets
            try:
                num_seqs = len(
                    tuple(
                        filter(
                            lambda l: l.startswith('>'),
                            map(fmt_func, how_to_open(fq_fa_path).readlines()))))
            except UnicodeDecodeError as err:
                with print_lock:
                    print()
                    printlog_warning("Warning: current file is broken: {}."\
                        .format(str(err)))
                    printlog_warning("File: `{}`".format(
                        os.path.abspath(fq_fa_path)))
                    printlog_warning("This file will not be processed.")
                    continue
                # end with
            # end try
        # end if

        if num_seqs == num_done_seqs:
            with counter_lock:
                file_counter.value += 1
                i = file_counter.value  # save to local var and release the lock
            # end with
            with print_lock:
                sys.stdout.write('\r')
                printlog_info_time("File #{}/{} (`{}`) has already been completely processed.".\
                    format(i, nfiles, fq_fa_path))
                printlog_info("Omitting it.")
                printn("Working...")
            # end with
            continue
        # end if

        for packet in packet_generator(fq_fa_path, packet_size, num_done_seqs):

            # BLAST the packet
            align_xml_text = launch_blastn(packet["fasta"], blast_algorithm,
                                           use_index, queries_tmp_dir, db_path)

            # Configure result TSV lines
            result_tsv_lines = parse_align_results_xml(align_xml_text,
                                                       packet["qual"])

            # Write the result to tsv
            write_classification(result_tsv_lines, tsv_res_path)
        # end for

        with counter_lock:
            file_counter.value += 1
            i = file_counter.value  # save to local var and release the lock
        # end with
        with print_lock:
            sys.stdout.write('\r')
            printlog_info_time("File #{}/{} (`{}`) has been processed.".\
                format(i, nfiles, os.path.basename(fq_fa_path)))
            printn("Working...")
        # end with
    # end for

    query_fpath = os.path.join(queries_tmp_dir,
                               "query{}_tmp.fasta".format(os.getpid()))
    remove_tmp_files(query_fpath)
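
# A rough usage sketch for the 'many_files' mode above: the input file list is
#   split into roughly equal chunks, one per worker process. Note that
#   `print_lock`, `counter_lock` and `file_counter` are module-level shared
#   objects referenced by process_paral(); how they are created and shared is
#   assumed here, not shown -- the script's actual launcher may differ.
def _launch_many_files_sketch(fq_fa_list, n_procs, packet_size, tax_annot_res_dir,
                              blast_algorithm, use_index, db_path):
    import multiprocessing as mp
    # Round-robin split of the file list into `n_procs` chunks
    chunks = [fq_fa_list[i::n_procs] for i in range(n_procs)]
    workers = [
        mp.Process(target=process_paral,
                   args=(chunk, packet_size, tax_annot_res_dir,
                         blast_algorithm, use_index, db_path, len(fq_fa_list)))
        for chunk in chunks if len(chunk) != 0
    ]
    for worker in workers:
        worker.start()
    # end for
    for worker in workers:
        worker.join()
    # end for
# end def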