def config_taxonomy_own_seq(taxonomy_str):
    # Function parses the ID of a user's reference sequence and forms a taxonomy string
    #   if there is a proper taxonomy string in the ID line in fasta format.
    # A "proper" taxonomy string looks like this:
    # '[ANYTHING BEFORE] <Domain>;<Phylum>;<Class>;<Order>;<Family>;<Genus>;<species> [ANYTHING AFTER]'
    # Spaces are not allowed. Ranks can be omitted like this
    #   (order and species are missing):
    # '[ANYTHING BEFORE] <Domain>;<Phylum>;<Class>;;<Family>;<Genus>; [ANYTHING AFTER]'
    # If there is no taxonomy string in the sequence ID, we merely save this ID to the taxonomy file.
    #
    # :param taxonomy_str: taxonomy string to parse;
    # :type taxonomy_str: str;
    #
    # Returns taxonomy string.

    # Check if `taxonomy_str` matches `proposed_fmt`
    proper_tax_match = re.search(proposed_fmt, taxonomy_str)

    # If there is a match and the taxonomic names are not all empty,
    #   use the matched taxonomy string:
    if not proper_tax_match is None and proper_tax_match.group(0) != ";" * (len(ranks) - 1):
        taxonomy = proper_tax_match.group(0)
    # Otherwise we will merely use this sequence ID
    else:
        taxonomy = remove_bad_chars(taxonomy_str)
    # end if

    return taxonomy
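# --- Illustrative sketch (not part of the pipeline) ---------------------------
# `proposed_fmt` and `ranks` are module-level globals defined elsewhere in this
#   file. The helper below is hypothetical and never called: it shows one regex
#   that is *consistent with* the convention documented above -- an assumption
#   for illustration, not the module's actual pattern.
def _example_proper_taxonomy_match():
    example_ranks = ("Domain", "Phylum", "Class", "Order", "Family", "Genus", "species")
    # Seven fields separated by six semicolons, no spaces; any field may be empty:
    example_fmt = r"[^;\s]*" + r"(?:;[^;\s]*)" * (len(example_ranks) - 1)
    hit = re.search(example_fmt,
                    "seq1 Bacteria;Proteobacteria;;Enterobacterales;Erwiniaceae;Erwinia;")
    miss = re.search(example_fmt, "my_contig_1")  # plain ID, no taxonomy
    # hit.group(0) -> 'Bacteria;Proteobacteria;;Enterobacterales;Erwiniaceae;Erwinia;'
    # miss -> None, so config_taxonomy_own_seq() would fall back to the raw sequence ID
    return hit.group(0), miss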
def parse_align_results_xml(xml_text, qual_dict, acc_dict, taxonomy_path):
    # Function parses BLAST XML response and returns tsv lines containing gathered information:
    #   1. Query name.
    #   2. Hit name (cleaned by 'remove_bad_chars()').
    #   3. Hit accession.
    #   4. Length of query sequence.
    #   5. Length of alignment.
    #   6. Percent of identity.
    #   7. Percent of gaps.
    #   8. E-value.
    #   9. Average quality of a read (if source file is FASTQ).
    #   10. Read accuracy (%) (if source file is FASTQ).
    #
    # :param xml_text: XML text with results of alignment;
    # :type xml_text: str;
    # :param qual_dict: dict, which maps sequence IDs to their quality;
    # :type qual_dict: dict<str: float>;
    # :param acc_dict: dictionary containing accession data of hits;
    # :type acc_dict: dict<str: tuple<str, str, int>>;
    # :param taxonomy_path: path to DBM file with taxonomy;
    # :type taxonomy_path: str;
    #
    # Returns list<str>.

    result_tsv_lines = list()

    # /=== Parse BLAST XML response ===/

    root = ElementTree.fromstring(xml_text)  # get tree instance

    # Iterate over "Iteration" and "Iteration_hits" nodes
    for iter_elem, iter_hit in zip(root.iter("Iteration"), root.iter("Iteration_hits")):
        # "Iteration" node contains query name information
        query_name = sys.intern(iter_elem.find("Iteration_query-def").text)
        query_len = iter_elem.find("Iteration_query-len").text

        avg_quality = qual_dict[query_name]
        if avg_quality != '-':
            miscall_prop = round(10**(avg_quality / -10), 3)
            # expected percent of correctly called bases
            accuracy = round(100 * (1 - miscall_prop), 2)
            qual_info_to_print = "  Average quality of this read is {}, i.e. accuracy is {}%;\n".format(
                avg_quality, accuracy)
        else:
            # If a FASTA file is being processed, print dashes in the quality columns
            avg_quality = "-"
            accuracy = "-"  # expected percent of correctly called bases
            qual_info_to_print = ""
        # end if

        # Check if there are any hits
        chck_h = iter_hit.find("Hit")

        if chck_h is None:
            # If there is no hit for the current sequence
            print("\n{} -- No significant similarity found;\n Query length - {};".format(
                query_name, query_len))
            result_tsv_lines.append('\t'.join(
                (query_name, "No significant similarity found", "-", query_len,
                 "-", "-", "-", "-", str(avg_quality), str(accuracy))))
        else:
            # If there are any hits, node "Iteration_hits" contains at least one "Hit" child.
            # Get the first (i.e. the highest) bit score and iterate over hits
            #   that share this highest bit score:
            top_bitscore = next(chck_h.find("Hit_hsps").iter("Hsp")).find("Hsp_bit-score").text

            annotations = list()
            hit_accs = list()

            for hit in iter_hit:
                # Find the first HSP
                hsp = next(hit.find("Hit_hsps").iter("Hsp"))

                if hsp.find("Hsp_bit-score").text != top_bitscore:
                    break
                # end if

                # Get full hit name (e.g. "Erwinia amylovora strain S59/5, complete genome")
                hit_def = remove_bad_chars(hit.find("Hit_def").text)
                annotations.append(hit_def)

                curr_acc = sys.intern(hit.find("Hit_accession").text)
                hit_accs.append(curr_acc)  # get hit accession

                # Get taxonomy
                find_taxonomy(curr_acc, hit_def, taxonomy_path)

                # Update accession dictionary
                try:
                    acc_dict[curr_acc][1] += 1
                except KeyError:
                    acc_dict[curr_acc] = [hit_def, 1]
                # end try

                align_len = hsp.find("Hsp_align-len").text.strip()
                pident = hsp.find("Hsp_identity").text  # get number of matched nucleotides
                gaps = hsp.find("Hsp_gaps").text  # get number of gaps
                evalue = hsp.find("Hsp_evalue").text  # get e-value

                pident_ratio = round(float(pident) / int(align_len) * 100, 2)
                gaps_ratio = round(float(gaps) / int(align_len) * 100, 2)
            # end for

            # Separate annotations and accessions with '&&'
            annotations = '&&'.join(annotations)
            hit_accs = '&&'.join(hit_accs)

            print("""\n{} - {}
    Query length - {} nt;
    Identity - {}/{} ({}%); Gaps - {}/{} ({}%);""".format(
                query_name, annotations, query_len, pident, align_len,
                pident_ratio, gaps, align_len, gaps_ratio))

            # Append new tsv line containing recently collected information
            result_tsv_lines.append('\t'.join(
                (query_name, annotations, hit_accs, query_len, align_len,
                 pident, gaps, evalue, str(avg_quality), str(accuracy))))
        # end if

        printn(qual_info_to_print)
    # end for

    return result_tsv_lines
def parse_align_results_xml(xml_text, qual_dict):
    # Function parses BLAST XML response and returns tsv lines containing gathered information:
    #   1. Query name.
    #   2. Hit name (cleaned by 'remove_bad_chars()').
    #   3. Hit accession.
    #   4. Length of query sequence.
    #   5. Length of alignment.
    #   6. Percent of identity.
    #   7. Percent of gaps.
    #   8. E-value.
    #   9. Average Phred33 quality of a read (if source file is FASTQ).
    #   10. Read accuracy (%) (if source file is FASTQ).
    #
    # :param xml_text: XML text with results of alignment;
    # :type xml_text: str;
    # :param qual_dict: dict, which maps sequence IDs to their quality;
    # :type qual_dict: dict<str: float>;
    #
    # Returns list<str>.

    result_tsv_lines = list()

    # /=== Parse BLAST XML response ===/

    root = ElementTree.fromstring(xml_text)  # get tree instance

    # Iterate over "Iteration" and "Iteration_hits" nodes
    for iter_elem, iter_hit in zip(root.iter("Iteration"), root.iter("Iteration_hits")):
        # "Iteration" node contains query name information
        query_name = iter_elem.find("Iteration_query-def").text
        query_len = iter_elem.find("Iteration_query-len").text

        avg_quality = qual_dict[query_name]
        if avg_quality != '-':
            miscall_prop = round(10**(avg_quality / -10), 3)
            # expected percent of correctly called bases
            accuracy = round(100 * (1 - miscall_prop), 2)
        else:
            # If a FASTA file is being processed, print dashes in the quality columns
            avg_quality = "-"
            accuracy = "-"  # expected percent of correctly called bases
        # end if

        # Check if there are any hits
        chck_h = iter_hit.find("Hit")

        if chck_h is None:
            # If there is no hit for the current sequence
            result_tsv_lines.append('\t'.join(
                (query_name, "No significant similarity found", "-", query_len,
                 "-", "-", "-", "-", str(avg_quality), str(accuracy))))
        else:
            # If there are any hits, node "Iteration_hits" contains at least one "Hit" child.
            # Get the first (i.e. the highest) bit score and iterate over hits
            #   that share this highest bit score:
            top_bitscore = next(chck_h.find("Hit_hsps").iter("Hsp")).find("Hsp_bit-score").text

            annotations = list()
            hit_accs = list()

            for hit in iter_hit:
                # Find the first HSP (we need only the first one)
                hsp = next(hit.find("Hit_hsps").iter("Hsp"))

                if hsp.find("Hsp_bit-score").text != top_bitscore:
                    break
                # end if

                curr_acc = sys.intern(hit.find("Hit_accession").text)  # get hit accession
                hit_accs.append(curr_acc)

                # Get full hit name (e.g. "Erwinia amylovora strain S59/5, complete genome")
                hit_def = remove_bad_chars(hit.find("Hit_def").text)
                annotations.append(hit_def)

                align_len = hsp.find("Hsp_align-len").text.strip()
                pident = hsp.find("Hsp_identity").text  # get number of matched nucleotides
                gaps = hsp.find("Hsp_gaps").text  # get number of gaps
                evalue = hsp.find("Hsp_evalue").text  # get e-value
            # end for

            # Separate annotations and accessions with '&&'
            annotations = '&&'.join(annotations)
            hit_accs = '&&'.join(hit_accs)

            # Append new tsv line containing recently collected information
            result_tsv_lines.append('\t'.join(
                (query_name, annotations, hit_accs, query_len, align_len,
                 pident, gaps, evalue, str(avg_quality), str(accuracy))))
        # end if
    # end for

    return result_tsv_lines
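# --- Illustrative sketch (not part of the pipeline) ---------------------------
# A minimal, hand-written fragment of the BLAST XML layout that both parsers
#   above walk. The tag names are the ones the code queries; the values are
#   made up for illustration.
def _example_blast_xml_walk():
    from xml.etree import ElementTree
    xml_text = """<BlastOutput><BlastOutput_iterations><Iteration>
      <Iteration_query-def>read_1</Iteration_query-def>
      <Iteration_query-len>1500</Iteration_query-len>
      <Iteration_hits><Hit>
        <Hit_def>Erwinia amylovora strain S59/5, complete genome</Hit_def>
        <Hit_accession>EXAMPLE_ACC</Hit_accession>
        <Hit_hsps><Hsp>
          <Hsp_bit-score>2700.5</Hsp_bit-score>
          <Hsp_identity>1480</Hsp_identity>
          <Hsp_align-len>1500</Hsp_align-len>
          <Hsp_gaps>5</Hsp_gaps>
          <Hsp_evalue>0</Hsp_evalue>
        </Hsp></Hit_hsps>
      </Hit></Iteration_hits>
    </Iteration></BlastOutput_iterations></BlastOutput>"""
    root = ElementTree.fromstring(xml_text)
    hit = next(root.iter("Iteration_hits")).find("Hit")
    hsp = next(hit.find("Hit_hsps").iter("Hsp"))  # the first (best) HSP
    return hsp.find("Hsp_bit-score").text  # -> '2700.5'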
def build_local_db(tax_annot_res_dir, acc_fpath, your_own_fasta_lst,
                   accs_to_download, use_index):
    # Function creates a database with utilities from the 'blast+' toolkit
    #   according to acc_dict and your_own_fasta_lst.
    #
    # :param tax_annot_res_dir: path to current result directory
    #   (each processed file has its own result directory);
    # :type tax_annot_res_dir: str;
    # :param acc_fpath: path to file "hits_to_download.tsv";
    # :type acc_fpath: str;
    # :param your_own_fasta_lst: list of user's fasta files to be included in the database;
    # :type your_own_fasta_lst: list<str>;
    # :param accs_to_download: list of accessions from command line ('-s');
    # :type accs_to_download: list<str>;
    # :param use_index: whether to use index;
    # :type use_index: str;
    #
    # Returns path to the created database.

    # Path to the directory in which the database will be placed
    db_dir = os.path.join(tax_annot_res_dir, "local_database")
    # Path to the taxonomy file
    taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv")

    try:
        os.makedirs(db_dir)
    except OSError:
        # If this directory already exists
        while True:
            if len(os.listdir(db_dir)) == 0:
                # If the db directory is empty -- break and build a database
                break
            else:
                print()
                printlog_info("Database directory is not empty:")
                printlog_info("  `{}`".format(os.path.abspath(db_dir)))
                printlog_info("Here is its content:")
                for i, fname in enumerate(os.listdir(os.path.abspath(db_dir))):
                    printlog_info(" {}. `{}`".format(i + 1, fname))
                # end for

                reply = input("""\nPress ENTER to start classification using the existing database.
Enter 'r' to remove all files in this directory and create the database from scratch:>>""")

                if reply == "":
                    # Do not build a database, just return the path to it.
                    printlog_info("You have chosen to use the existing database.")

                    # Return path to the DB located in this directory
                    dbpath = next(iter(os.listdir(db_dir)))
                    # Remove everything after '.fasta'
                    dbpath = dbpath.partition(".fasta")[0] + dbpath.partition(".fasta")[1]

                    return os.path.join(db_dir, dbpath)
                elif reply == 'r':
                    printlog_info("You have chosen to rebuild the database.")
                    # Rename old classification files so that actual data is written to new ones:
                    old_classif_dirs = filter(
                        lambda d: os.path.exists(os.path.join(d, "classification.tsv")),
                        glob(os.path.join(tax_annot_res_dir, "*")))
                    old_classif_files = tuple(
                        map(lambda f: os.path.join(f, "classification.tsv"),
                            old_classif_dirs))

                    if len(old_classif_files) > 0:
                        print()
                        printlog_info("Renaming old classification files:")
                        for classif_file in old_classif_files:
                            rename_file_verbosely(classif_file)
                        # end for
                    # end if

                    # Empty the database directory
                    for file in glob("{}{}*".format(db_dir, os.sep)):
                        os.unlink(file)
                    # end for

                    # Break from the loop in order to build a database
                    break
                else:
                    print("Invalid reply: `{}`\n".format(reply))
                    continue
                # end if
            # end if
        # end while
    # end try

    # It is a dictionary of accessions and record names.
    # Accessions are keys, record names are values.
    acc_dict = configure_acc_dict(acc_fpath, your_own_fasta_lst, accs_to_download)

    if len(accs_to_download) != 0:
        verify_cl_accessions(accs_to_download, acc_dict)
    # end if

    # Retrieve already existing taxonomy data from the taxonomy file
    tax_exist_accs = taxonomy.get_tax_keys(taxonomy_path)

    # If the accession file does not exist and execution has reached here -- everything is OK --
    #   we are building a database from the user's files only.
    if len(acc_dict) != 0:
        print()
        printlog_info("Following sequences (and all replicons related to them) "
                      "will be downloaded from GenBank for further taxonomic classification "
                      "on your local machine:")
        for i, acc in enumerate(acc_dict.keys()):
            printlog_info(" {}. {} - `{}`".format(i + 1, acc, acc_dict[acc]))
        # end for

        search_for_related_replicons(acc_dict)

        printlog_info_time("Completing taxonomy file...")
        for i, acc in enumerate(acc_dict.keys()):
            if not acc in tax_exist_accs:
                taxonomy.find_taxonomy(acc, acc_dict[acc][1], taxonomy_path)
            # end if
            # Accessions can be of different length
            printn("\r{} - {}: {}/{}".format(getwt(), acc, i + 1, len(acc_dict))
                   + " " * 10 + "\b" * 10)
        # end for
        print()
        printlog_info_time("Taxonomy file is consistent.")
    # end if

    local_fasta = os.path.join(db_dir, "local_seq_set.fasta")  # path to downloaded FASTA file

    add_lambda_phage(local_fasta, taxonomy_path)  # add lambda phage control sequence

    retrieve_fastas_by_acc(acc_dict, db_dir, local_fasta)  # download main fasta data from GenBank

    # Add 'your own' fasta files to the database
    if not len(your_own_fasta_lst) == 0:

        # This variable counts sequences from local files.
        # It is necessary to prevent duplicated accessions.
        own_seq_counter = 0

        # Check if these files are assemblies made by SPAdes or a5
        spades_patt = r">NODE_[0-9]+"  # this pattern will match sequence IDs generated by SPAdes
        a5_patt = r">scaffold_[0-9]+"  # this pattern will match sequence IDs generated by a5
        assemblies = list()  # this list will contain paths to assembly files (SPAdes or a5)

        for own_fasta_path in reversed(your_own_fasta_lst):
            how_to_open = OPEN_FUNCS[is_gzipped(own_fasta_path)]
            fmt_func = FORMATTING_FUNCS[is_gzipped(own_fasta_path)]

            with how_to_open(own_fasta_path) as fasta_file:
                # get the first line in the file (the first seq ID)
                first_seq_id = fmt_func(fasta_file.readline())
            # end with

            # if we've got a SPAdes assembly
            if not re.search(spades_patt, first_seq_id) is None:
                assemblies.append(own_fasta_path)
                # Remove this file from the list -- it will be processed in a specific way
                your_own_fasta_lst.remove(own_fasta_path)
                continue
            # end if

            # if we've got an a5 assembly
            if not re.search(a5_patt, first_seq_id) is None:
                assemblies.append(own_fasta_path)
                your_own_fasta_lst.remove(own_fasta_path)
                continue
            # end if
        # end for

        # Include assembly files in the multi-fasta file.
        # Find the common prefix of all assembly paths and remove it from assembly names
        if len(assemblies) > 1:
            assemblies_formatted = tuple(
                map(lambda f: os.path.abspath(f).replace(os.sep, '-'), assemblies))
            common_prefix = find_common_prefix(assemblies_formatted)
            assemblies_formatted = tuple(
                map(lambda f: f.replace(common_prefix, ''), assemblies_formatted))
        elif len(assemblies) > 0:
            common_prefix = ''
            assemblies_formatted = tuple(map(os.path.basename, assemblies))
        # end if

        # Add assembled sequences to the database
        with open(local_fasta, 'a') as fasta_db:
            for i, assm_path in enumerate(assemblies):
                printlog_info("Adding `{}` to the database...".format(
                    os.path.basename(assm_path)))
                assm_name_fmt = assemblies_formatted[i]

                how_to_open = OPEN_FUNCS[is_gzipped(assm_path)]
                fmt_func = FORMATTING_FUNCS[is_gzipped(assm_path)]
                with how_to_open(assm_path) as fasta_file:
                    for line in fasta_file:
                        line = fmt_func(line)
                        # You can find comments on "OWN_SEQ..." below.
                        # Paths will be written to seq IDs in the following way:
                        #   some-happy-path.fastq--
                        # in order to retrieve them reliably with a regex later.
                        if line.startswith('>'):
                            own_seq_counter += 1
                            own_acc = "OWN_SEQ_{}".format(own_seq_counter)
                            own_def = "{}--".format(
                                assm_name_fmt.replace(common_prefix, '')) + line[1:]
                            own_def = remove_bad_chars(own_def)
                            taxonomy.save_taxonomy_directly(taxonomy_path, own_acc, own_def)
                            line = ">" + "{} {}".format(own_acc, own_def)
                        # end if
                        fasta_db.write(line + '\n')
                    # end for
                # end with
            # end for
        # end with

        with open(local_fasta, 'a') as fasta_db:
            for own_fasta_path in your_own_fasta_lst:
                printlog_info("Adding `{}` to the database...".format(
                    os.path.basename(own_fasta_path)))

                how_to_open = OPEN_FUNCS[is_gzipped(own_fasta_path)]
                fmt_func = FORMATTING_FUNCS[is_gzipped(own_fasta_path)]
                with how_to_open(own_fasta_path) as fasta_file:
                    for line in fasta_file:
                        line = fmt_func(line)
                        # 'makeblastdb' considers the first word (space-separated)
                        #   to be the sequence ID and throws an error on duplicated IDs.
                        # To prevent such duplication, we create our own sequence IDs:
                        #   'OWN_SEQ_<NUMBER>', written at the beginning of the FASTA record name.
                        if line.startswith('>'):
                            own_seq_counter += 1
                            own_acc = "OWN_SEQ_{}".format(own_seq_counter)
                            taxonomy.save_taxonomy_directly(taxonomy_path, own_acc, line[1:])
                            line = ">" + own_acc + ' ' + remove_bad_chars(line[1:])
                        # end if
                        fasta_db.write(line + '\n')
                    # end for
                # end with
            # end for
        # end with
    # end if

    # 'lcl|ACCESSION...' entries can be given a '.1' (or '.2', whatever)
    #   suffix by blastn, but there are no such suffixes in the taxonomy file.
    # Therefore we prune the accessions in advance.
    print()
    printn("{} - Formatting accessions...".format(getwt()))
    log_info("Formatting accessions...")
    corrected_path = os.path.join(db_dir, "corrected_seqs.fasta")
    with open(local_fasta, 'r') as source_file, open(corrected_path, 'w') as dest_file:
        for line in source_file:
            if line.startswith('>'):
                line = line.strip()
                acc, seq_name = (line.partition(' ')[0], line.partition(' ')[2])
                acc = acc.partition('.')[0]
                seq_name = remove_bad_chars(seq_name)
                seq_name = re.sub(r'[^\x00-\x7F]+', '_', seq_name)  # remove non-ASCII chars
                line = ' '.join((acc, seq_name)) + '\n'
            # end if
            dest_file.write(line)
        # end for
    # end with
    os.unlink(local_fasta)
    os.rename(corrected_path, local_fasta)
    sys.stdout.write("\r{} - Formatting accessions... ok".format(getwt()))
ok".format(getwt())) log_info("Formatting accessions done.") # Configure command line make_db_cmd = "makeblastdb -in {} -parse_seqids -dbtype nucl".format( local_fasta) exit_code = os.system(make_db_cmd) # make a blast-format database if exit_code != 0: printlog_error_time("Error occured while making the database") platf_depend_exit(exit_code) # end if print("\033[1A{} - Database is successfully created: `{}`\n".format( getwt(), local_fasta)) log_info("Database is successfully created: `{}`".format(local_fasta)) if use_index == "true": printlog_info_time("Database index creating started") # Configure command line make_index_cmd = "makembindex -input {} -iformat blastdb -verbosity verbose".format( local_fasta) exit_code = os.system( make_index_cmd) # create an index for the database if exit_code != 0: printlog_info_time("Error occured while creating database index") platf_depend_exit(exit_code) # end if printlog_info_time("Database index has been successfully created") # end if # Gzip downloaded FASTA file printlog_info_time("Gzipping FASTA file: `{}`".format(local_fasta)) if gzip_util_found: os.system("{} -v {}".format(gzip_util, local_fasta)) else: # form .fasta.gz file 'by hand' with open(local_fasta, 'rb') as fasta_file, open_as_gzip(local_fasta + ".gz", "wb") as fagz_file: shutil_copyfileobj(fasta_file, fagz_file) # end with os.unlink(local_fasta) # remove source FASTA file, not the database # end if return local_fasta