def cazy_parser(tsv, namemap, output, summary_method='best', min_identity=60, min_bitscore=0, min_length=60,
                max_evalue=0.000001, top_fraction=1, max_hits=10, table_name="cazy"):
    """Parse BLAST hits from CAZy reference database.

    The BLAST hits are assumed to be sorted by query with decreasing bitscores (best alignment first):

    \b
        sort -k1,1 -k12,12rn tsv > sorted_tsv

    Expected columns in the CAZy database:

    \b
        cazy_gene
        cazy_family
        cazy_class
        cazy_ec

    For a given gene match, all possible ECs are returned in a single line separated by '|'.

    One tab-delimited row is written to ``output`` per query ORF, preceded by a header row. ORFs whose
    hits were all filtered out, or whose selected hit is missing from the database, are written with
    "NA" in every annotation column.

    Args:
        tsv (str): blast hits file path in default tabular format
        namemap (str): sqlite database file path
        output (file): writable file handle, e.g. :py:class:`click.File` in write mode
        summary_method (str): either 'majority' or 'best'; summary method for annotating ORFs; when majority and
            there is no majority, best is used
        min_identity (int): minimum allowable percent ID of BLAST hit
        min_bitscore (int): minimum allowable bitscore of BLAST hit; 0 disables
        min_length (int): minimum allowable BLAST alignment length
        max_evalue (float): maximum allowable e-value of BLAST hit
        top_fraction (float): filters ORF BLAST hits before finding majority by only keep hits within this
            fraction, e.g. 0.98, of the highest bitscore; 1 disables
        max_hits (int): maximum number of BLAST hits to consider when summarizing ORFs as a majority
        table_name (str): table name within namemap database; expected columns are listed above
    """
    logging.info("Parsing %s", tsv)
    if top_fraction == 1:
        top_fraction = None

    columns = ("cazy_gene", "cazy_family", "cazy_class", "cazy_ec")
    # A table name cannot be bound as a parameter, so it is still interpolated;
    # the hit ID, which comes from the BLAST file, is bound with a placeholder so
    # that quotes or column-like names in it cannot alter the query.
    lookup_sql = "SELECT %s FROM %s WHERE cazy_gene=?" % (", ".join(columns), table_name)

    header = ["contig", "orf"]
    header.extend(columns)
    header.extend(["%s_evalue" % table_name, "%s_bitscore" % table_name])
    print(*header, sep="\t", file=output)

    with contextlib.closing(sqlite3.connect(namemap)) as conn, gzopen(tsv) as blast_tab_fh:
        cursor = conn.cursor()
        # hits are grouped on the query ID (first column); input must be sorted by it
        for query, qgroup in groupby(blast_tab_fh, key=lambda x: x.partition("\t")[0]):
            # the query ID is the contig name plus an ORF index suffix: <contig>_<idx>
            contig_name, _, orf_idx = query.rpartition("_")
            hit_id, evalue, bitscore = get_hit_from_blast_group(qgroup, max_hits, top_fraction, min_length,
                                                                min_identity, max_evalue, min_bitscore,
                                                                summary_method)

            annotation = ["NA"] * len(columns)
            # everything could have been filtered out due to user constraints
            if hit_id:
                cursor.execute(lookup_sql, (hit_id,))
                row = cursor.fetchone()
                if row is None:
                    # the hit has no metadata row; report it and keep the NA placeholders
                    logging.warning("'%s' not present in database", hit_id)
                else:
                    annotation = list(row)

            fields = [contig_name, "%s_%s" % (contig_name, orf_idx)]
            fields.extend(annotation)
            fields.extend([evalue, bitscore])
            print(*fields, sep="\t", file=output)
    logging.info("Complete")
def parse_blast_results_with_tree(blast_tab, name_map, summary_method, tree, min_identity=70, min_bitscore=0,
                                  min_length=60, max_evalue=0.000001, max_hits_per_orf=10,
                                  top_fraction_of_hits=None, table_name="refseq", lca_threshold=1):
    """Parse BLAST results (-outfmt 6), filter, and aggregate ORF taxonomies.

    Each input line is expected to carry the contig name in the first column and the ORF ID in the
    second, followed by the remaining BLAST tabular fields. Lines are grouped by ORF ID, so the input
    must be sorted by ORF with decreasing bitscore within each ORF.

    Args:
        blast_tab (str): file path to blast TSV file
        name_map (str): sqlite database file path; ``table_name`` must provide the columns
            ``name``, ``function`` and ``taxonomy``
        summary_method (str): method of ORF annotation selection; one of 'lca', 'best', 'majority'
        tree: object providing ``lca(names, threshold=...)``; only used when summary_method is 'lca'
        min_identity (int): minimum allowable percent ID of BLAST hit
        min_bitscore (int): minimum allowable bitscore of BLAST hit; 0 disables
        min_length (int): minimum allowable BLAST alignment length
        max_evalue (float): maximum allowable e-value of BLAST hit
        max_hits_per_orf (int): maximum number of hits handed to the per-ORF hit collection
        top_fraction_of_hits (float): bitscore fraction handed to the per-ORF hit collection;
            1 is converted to None (disabled)
        table_name (str): table name within the name_map database
        lca_threshold (float): the first parent above this fraction of representation (its count is
            greater than the total * lca_threshold)

    Returns:
        dict: dict of dicts where first key is contig name, inner key is ORF ID; values are tuple of
            protein function, taxonomy ID, bitscore, evalue. ORFs without any passing hit get
            ("hypothetical protein", "1", "NA", "NA").

    Raises:
        AssertionError when ORF summary method is not supported (['lca', 'best', 'majority'])
    """
    # raised explicitly rather than via ``assert`` so the check survives ``python -O``
    if summary_method not in ("lca", "best", "majority"):
        raise AssertionError("Unsupported ORF summary method: %s" % summary_method)

    # a top fraction of 1 keeps every hit, which is the same as disabling the filter
    if top_fraction_of_hits == 1:
        top_fraction_of_hits = None

    # A table name cannot be bound as a parameter, so it is still interpolated;
    # the subject ID comes from the BLAST file and is bound with a placeholder.
    lookup_sql = "SELECT function, taxonomy FROM %s WHERE name=?" % table_name

    contigs = defaultdict(dict)

    with contextlib.closing(sqlite3.connect(name_map)) as conn, gzopen(blast_tab) as blast_tab_fh:
        cursor = conn.cursor()
        # group hits by ORF (column 2)
        for orf_id, qgroup in groupby(blast_tab_fh, key=lambda x: x.split("\t")[1]):
            # defaults reported when no hit survives the filters
            protein_function = "hypothetical protein"
            taxonomy_id = "1"
            bitscore = "NA"
            evalue = "NA"
            orf_hits = BlastHits(max_hits=max_hits_per_orf, top_fraction=top_fraction_of_hits)
            lines = []

            # iterate over blast hits per ORF
            for hsp in qgroup:
                # HSPs have the contig in column 1; strip it so the rest lines up with BLAST6
                toks = hsp.strip().split("\t")
                contig_name = toks.pop(0)
                toks = dict(zip(BLAST6, toks))

                if (int(toks["length"]) < min_length
                        or float(toks["pident"]) < min_identity
                        or float(toks["evalue"]) > max_evalue):
                    continue
                if min_bitscore and float(toks["bitscore"]) < min_bitscore:
                    # input is sorted by decreasing bitscore, so no later hit can pass
                    break

                cursor.execute(lookup_sql, (toks["sseqid"],))
                row = cursor.fetchone()
                if row is None:
                    # no metadata for this subject; it cannot contribute a function or taxonomy
                    logging.warning("'%s' not present in database", toks["sseqid"])
                    continue
                current_function, current_taxonomy = row

                if summary_method == "best":
                    # first passing hit is the best one; nothing else to collect
                    taxonomy_id = current_taxonomy
                    protein_function = current_function
                    bitscore = toks["bitscore"]
                    evalue = toks["evalue"]
                    break

                # TODO implement bitscore ratio as a measure of alignment quality as a function of
                # input sequence
                orf_hits.add(current_taxonomy, toks["bitscore"])
                toks["current_function"] = current_function
                toks["current_taxonomy"] = current_taxonomy
                lines.append(toks)

            # summary method is 'majority' or 'lca' and we have passing HSPs
            if summary_method != "best" and lines:
                if summary_method == "majority":
                    taxonomy_id = orf_hits.majority()
                    # report the metadata of the best hit carrying the majority taxonomy
                    for toks in lines:
                        if toks["current_taxonomy"] == taxonomy_id:
                            bitscore = toks["bitscore"]
                            evalue = toks["evalue"]
                            protein_function = toks["current_function"]
                            break
                # summary method is 'lca'
                else:
                    orf_hits.names.reverse()
                    taxonomy_id = tree.lca(orf_hits.names, threshold=lca_threshold)
                    # grabbing best hit's bitscore and evalue
                    bitscore = lines[0]["bitscore"]
                    evalue = lines[0]["evalue"]
                    protein_function = lines[0]["current_function"]

            if bitscore == "NA":
                logging.critical("The summarized ID (%s) was not assigned metadata", taxonomy_id)

            contigs[contig_name][orf_id] = (protein_function, taxonomy_id, bitscore, evalue)

    return contigs
def eggnog_parser(tsv, namemap, output, summary_method, min_identity, min_bitscore, min_length, max_evalue,
                  top_fraction, max_hits, table_name):
    """Parse BLAST hits from EGGNOG.

    The BLAST hits are assumed to be sorted by query with decreasing bitscores (best alignment first):

    \b
        sort -k1,1 -k12,12rn tsv > sorted_tsv

    Expected columns in the EggNOG database:

    \b
        uniprot_ac
        eggnog_ssid_b
        eggnog_species_id
        uniprot_id
        ko_id
        ko_level1_name
        ko_level2_name
        ko_level3_id
        ko_level3_name
        ko_gene_symbol
        ko_product
        ko_ec

    One tab-delimited row is written to ``output`` per query ORF, preceded by a header row. ORFs whose
    hits were all filtered out, or whose selected hit is missing from the database, are written with
    "NA" in every annotation column.

    Args:
        tsv (str): blast hits file path in default tabular format
        namemap (str): sqlite database file path
        output (file): writable file handle, e.g. :py:class:`click.File` in write mode
        summary_method (str): either 'majority' or 'best'; summary method for annotating ORFs; when majority and
            there is no majority, best is used
        min_identity (int): minimum allowable percent ID of BLAST hit
        min_bitscore (int): minimum allowable bitscore of BLAST hit; 0 disables
        min_length (int): minimum allowable BLAST alignment length
        max_evalue (float): maximum allowable e-value of BLAST hit
        top_fraction (float): filters ORF BLAST hits before finding majority by only keep hits within this
            fraction, e.g. 0.98, of the highest bitscore; 1 disables
        max_hits (int): maximum number of BLAST hits to consider when summarizing ORFs as a majority
        table_name (str): table name within namemap database; expected columns are listed above
    """
    logging.info("Parsing %s", tsv)
    if top_fraction == 1:
        top_fraction = None

    # single source of truth for the header, the SELECT list and the NA defaults
    columns = ("uniprot_ac", "eggnog_ssid_b", "eggnog_species_id", "uniprot_id", "ko_id", "ko_level1_name",
               "ko_level2_name", "ko_level3_id", "ko_level3_name", "ko_gene_symbol", "ko_product", "ko_ec")
    # A table name cannot be bound as a parameter, so it is still interpolated;
    # the hit ID, which comes from the BLAST file, is bound with a placeholder so
    # that quotes or column-like names in it cannot alter the query.
    lookup_sql = "SELECT %s FROM %s WHERE eggnog_ssid_b=?" % (", ".join(columns), table_name)

    header = ["contig", "orf"]
    header.extend(columns)
    header.extend(["%s_evalue" % table_name, "%s_bitscore" % table_name])
    print(*header, sep="\t", file=output)

    with contextlib.closing(sqlite3.connect(namemap)) as conn, gzopen(tsv) as blast_tab_fh:
        cursor = conn.cursor()
        # hits are grouped on the query ID (first column); input must be sorted by it
        for query, qgroup in groupby(blast_tab_fh, key=lambda x: x.partition("\t")[0]):
            # the query ID is the contig name plus an ORF index suffix: <contig>_<idx>
            contig_name, _, orf_idx = query.rpartition("_")
            hit_id, evalue, bitscore = get_hit_from_blast_group(qgroup, max_hits, top_fraction, min_length,
                                                                min_identity, max_evalue, min_bitscore,
                                                                summary_method)

            annotation = ["NA"] * len(columns)
            # everything could have been filtered out due to user constraints
            if hit_id:
                cursor.execute(lookup_sql, (hit_id,))
                row = cursor.fetchone()
                if row is None:
                    # legacy before database was pruned; can have hits not in metadata
                    logging.warning("'%s' not present in database", hit_id)
                else:
                    annotation = list(row)

            # print for this query
            fields = [contig_name, "%s_%s" % (contig_name, orf_idx)]
            fields.extend(annotation)
            fields.extend([evalue, bitscore])
            print(*fields, sep="\t", file=output)
    logging.info("Complete")