Пример #1
0
def get_desired_ranks(taxid, desired_ranks):
    """Return the lineage taxid found at each of the requested ranks.

    The lineage of ``taxid`` is looked up in the NCBI taxonomy and, for
    every rank name in ``desired_ranks`` (in the given order), the lineage
    member carrying that rank is returned.  A rank that does not occur in
    the lineage is reported as the string ``'0'``.
    """
    taxonomy_db = NCBITaxa()
    lineage_names = taxonomy_db.get_taxid_translator(
        taxonomy_db.get_lineage(taxid))
    taxid_to_rank = taxonomy_db.get_rank(lineage_names)

    # Invert taxid -> rank; if a rank occurs more than once in the lineage,
    # the entry iterated last wins.
    rank_to_taxid = {}
    for lineage_taxid, rank_name in taxid_to_rank.items():
        rank_to_taxid[rank_name] = lineage_taxid

    found = []
    for wanted_rank in desired_ranks:
        found.append(rank_to_taxid.get(wanted_rank, '0'))
    return found
Пример #2
0
    def annotate_ncbi_taxa(self,
                           taxid_attr='species',
                           tax2name=None,
                           tax2track=None,
                           tax2rank=None,
                           dbfile=None):
        """Annotate this tree with NCBI taxonomy information.

        Leaf nodes are expected to carry a feature (``taxid_attr``,
        ``'species'`` by default) holding a valid NCBI taxid.  The work is
        delegated to ``NCBITaxa.annotate_tree``.

        Every descendant node (internal nodes included) receives these
        features:

        `Node.spname`: scientific species name as stored in the NCBI
        taxonomy database

        `Node.named_lineage`: the NCBI lineage track, as scientific names

        `Node.taxid`: NCBI taxid number

        `Node.lineage`: same as named_lineage, but as taxid codes

        For internal nodes the NCBI information refers to the first common
        lineage of the species grouped under the node.

        :param taxid_attr: name of the node feature from which the taxid of
            each node is read.

        :param None tax2name: optional dictionary mapping taxid numbers to
            NCBI scientific names.  Passing it avoids database queries when
            many trees containing the same set of taxids are annotated.

        :param None tax2track: optional dictionary mapping taxid numbers to
            NCBI lineage tracks (taxids); same purpose as ``tax2name``.

        :param None tax2rank: optional dictionary mapping taxid numbers to
            NCBI rank names; same purpose as ``tax2name``.

        :param None dbfile: if given, this file is used as the local copy
            of the NCBI taxonomy database.

        :returns: tax2name (taxid -> scientific name), tax2lineage (taxid ->
            NCBI lineage track) and tax2rank (taxid -> rank name).

        """
        precomputed = dict(tax2name=tax2name,
                           tax2track=tax2track,
                           tax2rank=tax2rank)
        taxonomy_db = NCBITaxa(dbfile=dbfile)
        return taxonomy_db.annotate_tree(self,
                                         taxid_attr=taxid_attr,
                                         **precomputed)
Пример #3
0
def get_rank(taxid, rank_level):
    """Return the taxid in the lineage of ``taxid`` whose rank is ``rank_level``.

    The first matching lineage member, in the iteration order of the rank
    dictionary, is returned.  When no member has the requested rank the
    string ``"None"`` (not the ``None`` object) is returned.
    """
    taxonomy_db = NCBITaxa()
    taxid_to_rank = taxonomy_db.get_rank(taxonomy_db.get_lineage(taxid))
    matching_taxids = (member for member, rank_name in taxid_to_rank.items()
                       if rank_name == rank_level)
    return next(matching_taxids, "None")
def get_family(taxid):
    """Return the family-level taxid in the lineage of ``taxid``.

    Returns the string ``"None"`` (not the ``None`` object) when the lineage
    has no member ranked ``'family'``.
    """
    taxonomy_db = NCBITaxa()
    ancestors = taxonomy_db.get_lineage(taxid)
    for ancestor, rank_name in taxonomy_db.get_rank(ancestors).items():
        if rank_name != 'family':
            continue
        return ancestor

    return "None"
    def get_rank(self, rank_level):
        """Return the taxid in this object's lineage ranked ``rank_level``.

        The lineage is looked up from ``self.tax_id``.  The string ``"N/A"``
        is returned when no lineage member has the requested rank.
        """
        taxonomy_db = NCBITaxa()
        taxid_to_rank = taxonomy_db.get_rank(
            taxonomy_db.get_lineage(self.tax_id))
        hits = [member for member, rank_name in taxid_to_rank.items()
                if rank_name == rank_level]
        if hits:
            # Same pick as scanning the dictionary and stopping at the first hit.
            return hits[0]

        return "N/A"
Пример #6
0
def getNcbiTaxonomy():
	"""Write the named NCBI lineage of each organism in ORGANISM_NAMES_LIST.

	Every taxid that the organism name translates to produces one line in
	OUTPUT_FILE: the scientific names along its lineage, tab-separated.
	Lines are emitted in the order of ORGANISM_NAMES_LIST.
	"""
	taxonomy_db = NCBITaxa()
	name_to_taxids = taxonomy_db.get_name_translator(ORGANISM_NAMES_LIST)
	with open(OUTPUT_FILE, "w") as out_handle:
		for organism in ORGANISM_NAMES_LIST:
			# Plain indexing: a name missing from the translator result
			# raises KeyError instead of being skipped.
			for tax_id in name_to_taxids[organism]:
				lineage = taxonomy_db.get_lineage(str(tax_id))
				taxid_to_name = taxonomy_db.get_taxid_translator(lineage)
				row = "\t".join(taxid_to_name[member] for member in lineage)
				out_handle.write(row + "\n")
Пример #7
0
def parseVSearchOutputAgainstNCBI(vsearch_out, database, output_file,
                                  min_coverage, min_similarity):
    """Annotate vsearch matches with their NCBI taxonomy.

    Reads a vsearch ``usearch_global`` output file line by line.  For every
    accepted match, the GI in column 2 is resolved to a taxid through the
    ``gi_taxid`` table of ``database``; the original columns plus a
    ``phylum:class:order:family:genus:species`` string are then written,
    tab-separated, to ``output_file`` (and echoed via ``printVerbose``).
    GIs without a taxid are reported through ``printErrorMissingID``.

    :param vsearch_out: An output file from vsearch's usearch_global program.
    :param database: Path to the SQLite database holding the ``gi_taxid``
        table used to map GIs to taxids.
    :param output_file: Where to write the resulting file that maps
        sequence ID to taxonomic name.
    :param min_coverage: The minimum coverage (column 5) for an acceptable
        vsearch match.
    :param min_similarity: The minimum similarity (column 3) for an
        acceptable vsearch match.
    """
    min_simm = float(min_similarity)
    min_coverage = float(min_coverage)
    ncbi = NCBITaxa()

    # Bound parameter instead of string interpolation: the GI comes straight
    # from the input file and must not be spliced into the SQL text.
    query = "select taxid from gi_taxid where gi=?"
    rank_order = ("species", "genus", "family", "order", "class", "phylum")

    def getTaxFromId(taxId, taxonomy=rank_order):
        """Return the names at the ranks in `taxonomy`, highest rank first,
        joined by ':'; ranks missing from the lineage stay empty."""
        lineage = ncbi.get_lineage(int(taxId))
        # One rank query and one name query for the whole lineage instead of
        # two queries per lineage member.
        ranks = ncbi.get_rank(lineage)
        names = ncbi.get_taxid_translator(lineage)
        myTaxonomy = dict((rank, "") for rank in taxonomy)
        for lin in lineage:
            rank = ranks.get(lin)
            if rank in myTaxonomy:
                myTaxonomy[rank] = names.get(lin, "")

        return ":".join(myTaxonomy[rank] for rank in taxonomy[::-1])

    conn = sqlite3.connect(database)
    try:
        c = conn.cursor()
        with open(output_file, 'w') as out, open(vsearch_out, 'r') as hits:
            for line in hits:
                data = line.split()
                if not data:
                    # Blank line: nothing to evaluate.
                    continue

                # NOTE(review): `or` accepts a match that clears only ONE of
                # the two thresholds; the parameter docs suggest both minima
                # were meant to apply (`and`). Behaviour kept as-is - confirm.
                if float(data[4]) > min_coverage or float(data[2]) > min_simm:
                    # Bind the GI as an integer when it is one, mirroring the
                    # numeric literal the interpolated query used to contain.
                    try:
                        gi = int(data[1])
                    except ValueError:
                        gi = data[1]
                    hit = c.execute(query, (gi,)).fetchone()
                    if hit:
                        taxonomy = getTaxFromId(hit[0])
                        data.append(taxonomy)
                        printVerbose("\t".join(data))
                        out.write("\t".join(data))
                        out.write("\n")
                    else:
                        printErrorMissingID(out, data[1])
    finally:
        # The connection used to be left open for the life of the process.
        conn.close()
Пример #8
0
    def __init__(self, data_path, workbench=None, genomes=None, taxDb=None):
        """Set up paths, the taxonomy handle and the genome list.

        :param data_path: base directory; a ``metadata`` sub-directory is
            created inside it if it does not exist yet.
        :param workbench: stored as ``self.workbench`` without further use
            here.
        :param genomes: initial list of genomes.  Defaults to a fresh empty
            list per instance (a shared ``[]`` default would be mutated by
            every instance that appends to ``self.genomes``).
        :param taxDb: taxonomy database object to reuse; when falsy a new
            ``NCBITaxa()`` is created.
        """
        self.data_path = data_path
        self.workbench = workbench
        self.metadata_path = pjoin(self.data_path, "metadata")
        if not os.path.exists(self.metadata_path):
            os.makedirs(self.metadata_path)

        self.metadata_file = pjoin(self.metadata_path, "metadata.csv")

        self.taxDb = taxDb if taxDb else NCBITaxa()

        # A caller-supplied list is stored as-is (not copied), as before.
        self.genomes = [] if genomes is None else genomes
Пример #9
0
    def ncbi_compare(self, autodetect_duplications=True, cached_content=None):
        """Run NCBITaxa.get_broken_branches on this tree or its speciation trees.

        If the nodes cached for this tree all have distinct ``species``
        values, the tree itself is checked.  Otherwise the set of species is
        printed and the check runs on each tree returned by
        ``get_speciation_trees``.

        :param autodetect_duplications: forwarded to ``get_speciation_trees``.
        :param cached_content: node-content cache; fetched with
            ``get_cached_content()`` when falsy.
        """
        if not cached_content:
            cached_content = self.get_cached_content()
        species_seen = set(node.species for node in cached_content[self])

        if len(species_seen) == len(cached_content[self]):
            # Every cached node has its own species: no splitting needed.
            trees_to_check = [self]
        else:
            print(species_seen)
            _ntrees, _ndups, trees_to_check = self.get_speciation_trees(
                autodetect_duplications=autodetect_duplications,
                map_features=["taxid"])

        taxonomy_db = NCBITaxa()
        for tree in trees_to_check:
            taxonomy_db.get_broken_branches(tree, cached_content)
Пример #10
0
def check_taxa_db_age(dbLocation):
    """Refresh the NCBI taxonomy database when it is missing or stale.

    :param dbLocation: path of the taxonomy database file whose timestamp
        (``os.path.getctime``) is inspected.
    :returns: a log message describing what was done.

    The database is updated when the file cannot be stat'ed or when its
    timestamp is more than 30 days old.  Errors raised by the update itself
    propagate to the caller.
    """
    ncbi = NCBITaxa()

    # Only the timestamp lookup is guarded.  The previous bare `except:` also
    # wrapped the update call, so any failure during an update (or Ctrl-C)
    # triggered a second update and a misleading "didn't exist" message.
    try:
        filetime = datetime.fromtimestamp(path.getctime(dbLocation))
    except OSError:
        ncbi.update_taxonomy_database()
        return "<> NCBITaxa Database didn't exist, downloaded it <>"

    one_month_ago = datetime.now() - timedelta(days=30)
    if filetime < one_month_ago:
        # File older than 1 month, update it.
        ncbi.update_taxonomy_database()
        return '<> NCBITaxa Database older than 1 month, updating it <>'

    return '<> NCBITaxa Database up to date <>'
Пример #11
0
def main():
    """Write a per-sequence table of family, gene length and taxonomy.

    For every record of the CAZy FASTA file one tab-separated line is
    written to the output file: the full header, the family (second
    ``|``-separated header field), the HMM length of that family (1000 when
    unknown) and the dot-joined taxon path of the accession (first header
    field), or ``unclassified`` when a KeyError occurs while resolving it.
    """
    args = parser.parse_args()

    cazy_path, taxid_map_path = args.cazy_fp, args.p2taxid_fp
    out_path, hmm_len_path = args.output_fp, args.len_fp

    taxonomy_db = NCBITaxa()

    # Protein accession -> taxid, kept in memory as a dict.
    prot_to_taxid = read_NCBI_prot_to_taxid_gz(taxid_map_path)

    # Family -> HMM length.
    hmm_len = read_hmm_len_fp(hmm_len_path)

    # Iterator over (header, sequence) pairs of the CAZy FASTA file.
    records = fasta_iter(cazy_path)

    with open(out_path, 'w') as out_handle:
        for header, _seq in records:
            fields = header.split('|')
            acc, fam = fields[0], fields[1]

            try:
                gene_length = hmm_len[fam]
            except KeyError:
                gene_length = 1000

            try:
                # A KeyError raised inside get_taxon_path is caught as well.
                taxonomy = '.'.join(
                    get_taxon_path(prot_to_taxid[acc], taxonomy_db))
            except KeyError:
                taxonomy = 'unclassified'

            out_handle.write('{0}\t{1}\t{2}\t{3}\n'.format(
                header, fam, gene_length, taxonomy))
Пример #12
0
def blast2summary_dict(db, blastpath):
    """Merge BLAST hits with the taxonomy of their subject sequences.

    Reads the tab-separated BLAST output at ``blastpath``, resolves all
    subject sequence ids to taxids with a single ``get_taxid(db, seqids)``
    call, then merges each retained row with ``taxonomy(ncbi, taxid)``.

    :param db: passed through to ``get_taxid``.
    :param blastpath: path of the BLAST tabular output file.
    :returns: the result of ``dictmap`` over a taxid -> row mapping.

    NOTE(review): rows are keyed by taxid, so when several hits share a
    taxid only the last one survives.
    """
    fieldnames = [
        'qseqid', 'sseqid', 'pid', 'alnlen', 'gapopen',
        'qstart', 'qend', 'sstart', 'send', 'evalue',
        'bitscore'
    ]
    # Read everything inside a `with` so the file handle is closed; it used
    # to be opened inline and never closed.
    with open(blastpath) as blast_file:
        rows = list(
            csv.DictReader(blast_file, delimiter='\t', fieldnames=fieldnames))

    # Materialised because the ids are consumed twice below (a lazy map
    # object would be exhausted after the first pass).
    seqids = list(map(get('sseqid'), rows))
    taxids = get_taxid(db, seqids)
    gis = (s.split('|')[1] for s in seqids)
    matches = dict(
        (taxids[gi], row) for gi, row in zip(gis, rows) if gi in taxids)
    ncbi = NCBITaxa(
    )  # downloads database and creates SQLite database if needed
    return dictmap(lambda tid, row: merge(row, taxonomy(ncbi, tid)), matches)
Пример #13
0
def get_desired_ranks(taxid, desired_ranks):
    """ Look up the ancestors of a taxon at the requested ranks.

        Args:
            taxid: TaxID of the taxon whose lineage is searched.
            desired_ranks: Rank names (taxonomic levels) for which the
                           lineage member should be reported.

        Returns:
            List with one entry per requested rank, in the same order: the
            TaxID of the lineage member at that rank, or the string '0'
            when the lineage has no member with that rank.

    """
    taxonomy_db = NCBITaxa()
    ancestors = taxonomy_db.get_lineage(taxid)
    ancestor_names = taxonomy_db.get_taxid_translator(ancestors)
    taxid_to_rank = taxonomy_db.get_rank(ancestor_names)

    # Inverted mapping; a rank seen more than once keeps the last taxid.
    rank_to_taxid = {
        rank_name: ancestor
        for ancestor, rank_name in taxid_to_rank.items()
    }

    return [
        rank_to_taxid[wanted] if wanted in rank_to_taxid else '0'
        for wanted in desired_ranks
    ]
Пример #14
0
def blast2summary_dict(db, blastpath, ete2_db):
    """Merge BLAST hits with the taxonomy of their subject sequences.

    Reads the tab-separated BLAST output at ``blastpath`` (columns named by
    ``blast_columns``), resolves all subject sequence ids to taxids with a
    single ``get_taxid(db, seqids)`` call, attaches the taxid and the
    ``taxonomy(ncbi, taxid)`` fields to every row whose GI was resolved, and
    keeps only the keys listed in ``csv_fields``.

    :param db: passed through to ``get_taxid``.
    :param blastpath: path of the BLAST tabular output file.
    :param ete2_db: passed to ``NCBITaxa`` as its database argument.
    :returns: a lazy iterator (``imap``) over one dict per retained row.
    :raises ValueError: if a subject sequence id has no ``|``-separated GI
        field.
    """
    # Read everything inside a `with` so the file handle is closed; it used
    # to be opened inline and never closed.
    with open(blastpath) as blast_file:
        rows = list(
            csv.DictReader(blast_file,
                           delimiter='\t',
                           fieldnames=blast_columns))

    # Materialised because the ids are consumed twice below (a lazy map
    # object would be exhausted after the first pass).
    seqids = list(map(get('sseqid'), rows))
    taxids = get_taxid(db, seqids)

    def get_gi(s):
        fields = s.split('|')
        if len(fields) > 1:
            return fields[1]
        else:
            raise ValueError("Seq ID %s is missing GI fields and '|'" % s)

    gis = imap(get_gi, seqids)
    ncbi = NCBITaxa(
        ete2_db)  # downloads database and creates SQLite database if needed
    # One dict per hit (not keyed by taxid), so hits sharing a taxid are all
    # kept.
    matches = [
        assoc(row, 'taxid', taxids[gi]) for gi, row in zip(gis, rows)
        if gi in taxids
    ]
    items = [merge(row1, taxonomy(ncbi, row1['taxid'])) for row1 in matches]
    res = imap(partial(keyfilter, csv_fields.__contains__), items)
    return res
Пример #15
0
# Third component of every entry in `hsv` (defined earlier in the script;
# presumably the HSV "value" channel - confirm).
val = [color[2] for color in hsv]

# np.lexsort treats the LAST key as the primary one, so the indices are
# ordered by `hue`, then `sat`, then `val`.
ind = np.lexsort((val, sat, hue))
sorted_colors = [colors_[i] for i in ind]
colors_final = []

# Entries of `sorted_colors` are (name, color) pairs; only the color is kept.
for i, (name, color) in enumerate(sorted_colors):
    colors_final.append(color)

# Shuffle in place; no seed is set here, so the order differs between runs.
import random
random.shuffle(colors_final)
colors_mapping = {}

# Set up the NCBI taxonomy handle from a fixed local SQLite file.
from ete2 import NCBITaxa
ncbi = NCBITaxa('/dfs/scratch0/manans/.etetoolkit/taxa.sqlite')

# Read the .nemb embedding file: each line is split on single spaces, the
# first field is stored as the species id and the remaining fields are
# parsed as floats.
EMBEDDING_FILE = 'emb/n2v-avg.nemb'
histograms = []
species_ids = []

with open(EMBEDDING_FILE, 'r') as tf:
    for line in tf:
        ls = line.split(' ')
        species_ids.append(ls[0])
        v = []
        for n in ls[1:]:
            v.append(float(n))
        # histograms[k] is the vector belonging to species_ids[k].
        histograms.append(v)
Пример #16
0
from pandas import DataFrame
from Bio import SeqIO
from pandas import Index
from ete2 import NCBITaxa

# Hard-coded input/output locations for this one-off import script.
data_path = "/home/moritz/people/MoreData/genomes/img_od1s"
img_fasta = "/home/moritz/people/MoreData/raw_imgs/od1s.fasta"
img_xls = "/home/moritz/people/MoreData/raw_imgs/od1s.xls"
# Prefix prepended to every genome id to form its key/name.
name = "parcu_from_img_"
taxDb = NCBITaxa()

# Despite the .xls extension the table is read as tab-separated text.
contigs = DataFrame.from_csv(img_xls, sep="\t", header=0, index_col=0)
# First taxid returned for the name 'Candidatus Parcubacteria'; it is
# assigned to every genome below.
# NOTE(review): `.values()[0]` relies on Python 2 dicts returning a list.
manual_taxo = taxDb.get_name_translator(['Candidatus Parcubacteria'
                                         ]).values()[0][0]
# One metadata record per distinct 'Genome ID', keyed by prefix + id.
# 'long_name' is the 'Genome' value of the first row with that id.
metadata = {
    name + str(g): {
        'IMG_ID': g,
        'name': name + str(g),
        'species_taxid': manual_taxo,
        'long_name': contigs.loc[contigs['Genome ID'] == g]['Genome'].iloc[0]
    }
    for g in set(contigs['Genome ID'])
}

# Genome key -> list of its sequence records, filled below.
seq_dict = {k: [] for k in metadata}

# The i-th FASTA record is attributed to the genome of the i-th table row,
# i.e. this assumes the FASTA file and the table list contigs in the same
# order - TODO confirm.  (`file` shadows the Python 2 builtin of that name.)
with open(img_fasta, "r") as file:
    for i, c in enumerate(SeqIO.parse(file, "fasta")):
        seq_dict[name + str(contigs.iloc[i]['Genome ID'])] += [c]
Пример #17
0
import sys
import os
import urllib2
import gzip
import biom
import shutil
from numpy import random as np_rand
from ete2 import NCBITaxa
from scripts.loggingwrapper import LoggingWrapper as logger
try:
    from configparser import ConfigParser
except ImportError:
    from ConfigParser import ConfigParser

# Module-wide NCBI taxonomy handle, instantiated once when this module is
# imported.
ncbi = NCBITaxa()
# Rank names, listed from species up to superkingdom.
RANKS = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom']
# NOTE(review): presumably the highest rank taken into account by the code
# using it; that code is not visible here - confirm.
MAX_RANK = 'family'
# Module-level logger slot; NOTE(review): presumably assigned later - confirm.
_log = None

"""
Reads a BIOM file and creates map of OTU: lineage, abundance
BIOM file format needs to have a taxonomy field in metadata which contains the taxonomy in the format:
RANK__SCINAME; LOWERRANK_LOWERSCINAME
"""
def read_taxonomic_profile(biom_profile, config, no_samples = None):
    # NOTE(review): only the opening statements of this function are visible
    # in this excerpt; the comments below cover just these lines.
    table = biom.load_table(biom_profile)
    # Ids along the "observation" axis of the BIOM table.
    ids = table.ids(axis="observation")
    # Ids along the table's default axis, used here as the samples.
    samples = table.ids()

    # When no sample count is given, use the number of samples in the table.
    if no_samples is None:
        no_samples = len(samples)
Пример #18
0
def run(args):
    """Resolve the ``--search`` terms against NCBI taxonomy and report them.

    Each search term is treated as a taxid when it parses as an integer and
    as a taxon name otherwise.  Names are translated to taxids; with
    ``args.fuzzy`` set, names without an exact match are retried through
    ``get_fuzzy_name_translation``.

    Depending on ``args`` the result is either

    * ``args.taxonomy`` (the default when neither mode is chosen): the
      annotated NCBI topology of all taxids is passed to ``dump``; or
    * ``args.info``: one tab-separated line per taxid is printed with
      taxid, scientific name, rank, named lineage and taxid lineage.

    Exits with status -1 when no search terms were supplied.
    """
    from ete2 import NCBITaxa

    if not args.taxonomy and not args.info:
        args.taxonomy = True

    # Validate the input before opening the taxonomy database.
    if not args.search:
        log.error('Search terms should be provided (i.e. --search) ')
        sys.exit(-1)

    ncbi = NCBITaxa()

    all_taxids = {}
    all_names = set()
    # Bookkeeping for fuzzy matches.  These used to be assigned to without
    # ever being created, so the first fuzzy hit raised NameError.
    # NOTE(review): if module-level dicts of these names were intended,
    # remove the two lines below - none are visible in this excerpt.
    name2realname = {}
    name2score = {}

    for n in args.search:
        try:
            all_taxids[int(n)] = None
        except ValueError:
            all_names.add(n.strip())

    # translate names
    name2tax = ncbi.get_name_translator(all_names)
    for translated in name2tax.values():
        # The translator may yield a list of taxids per name (lists are
        # unhashable and cannot be used as keys directly) or a single taxid.
        if isinstance(translated, (list, tuple, set)):
            all_taxids.update((taxid, None) for taxid in translated)
        else:
            all_taxids[translated] = None

    not_found_names = all_names - set(name2tax.keys())
    if args.fuzzy and not_found_names:
        log.warn("%s unknown names", len(not_found_names))
        for name in not_found_names:
            tax, realname, sim = ncbi.get_fuzzy_name_translation(
                name, args.fuzzy)
            if tax:
                all_taxids[tax] = None
                name2tax[name] = tax
                name2realname[name] = realname
                name2score[name] = "Fuzzy:%0.2f" % sim

    if args.taxonomy:
        log.info("Dumping NCBI taxonomy of %d taxa..." % (len(all_taxids)))
        t = ncbi.get_topology(list(all_taxids),
                              intermediate_nodes=args.full_lineage,
                              rank_limit=args.rank_limit,
                              collapse_subspecies=args.collapse_subspecies)

        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            # Keep the raw taxid before the node name is rewritten below.
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s - %s" % (id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_lineage(n.taxid)
            n.add_features(
                named_lineage='|'.join(ncbi.translate_to_names(lineage)))
        dump(t,
             features=[
                 "taxid", "name", "rank", "bgcolor", "sci_name",
                 "collapse_subspecies", "named_lineage"
             ])
    elif args.info:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('# ' + '\t'.join(
            ["Taxid", "Sci.Name", "Rank", "Named Lineage", "Taxid Lineage"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid, name in translator.items():
            lineage = ncbi.get_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage_string = ','.join(map(str, lineage))
            print('\t'.join([
                str(taxid), name,
                ranks.get(taxid, ''), named_lineage, lineage_string
            ]))
Пример #19
0
def get_name(taxid):
    """Return the name NCBITaxa's taxid translator gives for ``taxid``.

    The translator result is indexed with ``taxid`` exactly as passed in,
    so a KeyError is raised when it holds no entry under that key.
    """
    taxid_to_name = NCBITaxa().get_taxid_translator([taxid])
    return taxid_to_name[taxid]
Пример #20
0
def taxo_msa(outfile='taxo_msa.svg',
             taxids=[],
             annotation='',
             msa=[],
             title='',
             width=2000):
    """
    Visualize an MSA together with a taxonomy tree.

    outfile - file the tree is rendered to.
    taxids - list of taxids in the same order as the sequences in msa.
    annotation - string drawn as the sequence of an extra 'annotation' row;
        when empty, a run of spaces as long as the first msa sequence is
        used instead.
    msa - sequence records; each one must provide ``.seq`` and ``.id``.
    title - text placed in the tree style title.
    width - passed to render() as ``w``.
    """
    ncbi = NCBITaxa()
    # Materialise the converted taxids: layout() below needs list.index(),
    # which a lazy map object does not provide.  (The list defaults in the
    # signature are only read, never mutated.)
    taxids = list(map(int, taxids))

    t = ncbi.get_topology(taxids, intermediate_nodes=False)
    # Extra leaf that carries the annotation row.
    a = t.add_child(name='annotation')
    a.add_feature('sci_name', 'annotation')
    t.sort_descendants(attr='sci_name')
    ts = TreeStyle()

    def add_aligned_row(node, sequence, label):
        """Attach the sequence face (column 0) and its label (column 1)."""
        seq_face = SeqMotifFace(
            sequence, [[0, len(sequence), "seq", 10, 10, None, None, None]],
            scale_factor=1)
        add_face_to_node(seq_face, node, 0, position="aligned")
        add_face_to_node(TextFace(label),
                         node,
                         column=1,
                         position="aligned")

    def layout(node):
        # Higher ranks get their scientific name printed above the branch.
        if getattr(node, "rank", None) in ('order', 'class', 'phylum',
                                           'kingdom'):
            rank_face = AttrFace("sci_name", fsize=7, fgcolor="indianred")
            node.add_face(rank_face, column=0, position="branch-top")

        if not node.is_leaf():
            return

        sciname_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue")
        node.add_face(sciname_face, column=0, position="branch-right")

        if node.name == 'annotation':
            s = annotation if annotation else ' ' * len(msa[0].seq)
            add_aligned_row(node, s, ' ' + 'SEQ_ID')
        else:
            # Leaf names are taxids; look the record up once instead of
            # repeating the search for the sequence and for the id.
            record = msa[taxids.index(int(node.name))]
            add_aligned_row(node, str(record.seq), ' ' + record.id)

    ts.layout_fn = layout
    ts.show_leaf_name = False
    ts.title.add_face(TextFace(title, fsize=20), column=0)
    t.render(outfile, w=width, dpi=300, tree_style=ts)
Пример #21
0
def taxo_seq_architecture(seqreclist=[],
                          outfile='taxo_arch.svg',
                          taxids=[],
                          annotation='',
                          title='',
                          width=2000):
    """
    Visualize sequence architecture together with a taxonomy tree.

    seqreclist - list of sequence records. Each record should have a list
    of features in Biopython SeqFeature format.

    Features of type "domain" are drawn as "[]" boxes labelled with the
    feature's 'name' qualifier.
    Features of type "motif" are drawn as "seq" segments; where a motif
    overlaps "seq" segments added earlier for the same record, only the
    stretches found by the scan in layout() are drawn.

    taxids - list of taxids in the same order as the records in seqreclist;
    if not provided, they are obtained by applying get_taxid_from_gbrec to
    every record (the records are then assumed to be in GenBank format).

    annotation - currently has no visible effect: the code that would add
    and draw the annotation row is commented out.

    outfile, width - passed to the tree's render() call as the output file
    and the ``w`` argument.

    title - text placed in the tree style title.
    """
    # NOTE(review): `aa` is not referenced anywhere else in this function.
    aa = [
        'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F',
        'P', 'S', 'T', 'W', 'Y', 'V', 'B', 'Z', 'X', '.', '-'
    ]

    # Picks one of nine colors from the hash of its argument (the parameter
    # name shadows the builtin `str` inside this helper).
    # NOTE(review): on interpreters with randomized string hashing the
    # chosen color can differ between runs - confirm whether that matters.
    def get_color(str):
        colorlist = [
            'red', 'green', 'yellow', 'lightblue', 'cyan', 'magenta', 'orange',
            'pink', 'lightgreen'
        ]
        return colorlist[hash(str) % 9]

    # No taxids supplied: derive them from the records themselves.
    if len(taxids) == 0:
        taxids = map(get_taxid_from_gbrec, seqreclist)

    ncbi = NCBITaxa()
    # NOTE(review): layout() calls taxids.index(), which needs a list; that
    # holds when map() returns a list (Python 2).
    taxids = map(int, taxids)

    t = ncbi.get_topology(taxids, intermediate_nodes=False)
    # a=t.add_child(name='annotation')
    # a.add_feature('sci_name','annotation')
    t.sort_descendants(attr='sci_name')
    ts = TreeStyle()

    def layout(node):
        # print node.rank
        # print node.sci_name
        # Higher ranks get their scientific name printed above the branch.
        if getattr(node, "rank", None):
            if (node.rank in ['order', 'class', 'phylum', 'kingdom']):
                rank_face = AttrFace("sci_name", fsize=7, fgcolor="indianred")
                node.add_face(rank_face, column=0, position="branch-top")
        if node.is_leaf():
            sciname_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue")
            node.add_face(sciname_face, column=0, position="branch-right")
        if node.is_leaf() and not node.name == 'annotation':
            # Build the motif list for the SeqMotifFace of this leaf. The
            # record is found through the position of the leaf's taxid in
            # `taxids` (first occurrence).
            seq = str(seqreclist[taxids.index(int(node.name))].seq)
            motifs = []  #[[0,len(seq), "seq", 10, 10, None, None, None]]
            for f in seqreclist[taxids.index(int(node.name))].features:
                if f.type == 'domain':
                    # Domains: "[]" box, filled with the color derived from
                    # the name and labelled with the name.
                    motifs.append([
                        f.location.start, f.location.end, "[]", None, 10,
                        "blue",
                        get_color(f.qualifiers['name']),
                        "arial|8|black|%s" % f.qualifiers['name']
                    ])
                if f.type == 'motif':
                    # Motifs may overlap "seq" segments added earlier; the
                    # earlier segment stays on top, so the new motif is cut
                    # into the pieces found by the scan below.
                    s = f.location.start
                    e = f.location.end
                    flag = True  # stays True while no overlap is found
                    overlappedm = []
                    # Only previously added segments of shape 'seq' are
                    # considered; "[]" domain boxes are ignored here.
                    for m in motifs:
                        if m[2] == 'seq' and m[0] < e and m[
                                1] > s:  # overlap; the preceding motif always stays on top
                            flag = False
                            overlappedm.append(m)
                    if not flag:  # resolve (possibly multiple) overlaps
                        # Scan every position from s to e inclusive.
                        # `ts`/`te` are locals of layout() holding the
                        # start/end of the piece being built; they are
                        # unrelated to the TreeStyle `ts` of the enclosing
                        # function.
                        sflag = False  # a piece has been started
                        eflag = False  # the started piece has been ended
                        for x in range(s, e + 1):
                            if not sflag:  # can a piece start at x?
                                overlap = False
                                for m in overlappedm:
                                    if x >= m[0] and x < m[1]:
                                        overlap = True
                                if not overlap:
                                    ts = x
                                    sflag = True

                            # The piece ends where an overlapping segment
                            # begins, or at the end of the motif.
                            if sflag and not eflag:
                                overlap = False
                                for m in overlappedm:
                                    if x == m[0]:
                                        overlap = True
                                if overlap or x == e:
                                    te = x
                                    eflag = True

                            # Piece complete: record it and look for the
                            # next one.
                            if sflag and eflag:
                                motifs.append([
                                    ts, te, "seq", 10, 10, "black",
                                    get_color(f.qualifiers['name']), None
                                ])
                                sflag = False
                                eflag = False
                    if flag:
                        # No overlap: draw the motif over its full range.
                        motifs.append([
                            f.location.start, f.location.end, "seq", 10, 10,
                            "black",
                            get_color(f.qualifiers['name']), None
                        ])
            seqFace = SeqMotifFace(seq,
                                   motifs,
                                   scale_factor=1,
                                   seq_format="[]")
            seqFace.overlaping_motif_opacity = 1.0
            # seqFace.fg=aafgcolors
            # seqFace.bg=aabgcolors_gray

            add_face_to_node(seqFace, node, 0, position="aligned")
            # gi=taxid2gi[int(node.name)]
            # Second aligned column: the record id, padded with spaces.
            add_face_to_node(
                TextFace(' ' + seqreclist[taxids.index(int(node.name))].id +
                         '         '),
                node,
                column=1,
                position="aligned")
            # add_face_to_node(TextFace('      '+str(int(node.name))+' '),node,column=2, position = "aligned")
            # add_face_to_node(TextFace('      '+str(gi2variant[gi])+' '),node,column=3, position = "aligned")

        # Annotation is currently disabled: no 'annotation' child is added
        # above, and `s` computed in this branch is not used because the
        # face-adding calls are commented out.
        if node.is_leaf() and node.name == 'annotation':
            if (annotation):
                s = annotation
                # get_hist_ss_in_aln_as_string(msa_tr)
            else:
                s = ' ' * max(map(lambda x: len(x.seq), seqreclist))
            # seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1)
            # add_face_to_node(seqFace, node, 0, position="aligned")
            # add_face_to_node(TextFace(' '+'SEQ_ID'),node,column=1, position = "aligned")

            # add_face_to_node(TextFace('       '+'NCBI_TAXID'+' '),node,column=2, position = "aligned")
            # add_face_to_node(TextFace('       '+'Variant'+'       '),node,column=3, position = "aligned")

    ts.layout_fn = layout
    ts.show_leaf_name = False
    ts.title.add_face(TextFace(title, fsize=20), column=0)
    t.render(outfile, w=width, dpi=300, tree_style=ts)
Пример #22
0
def main():
    """Attach NCBI lineages to the taxa of a HUMAnN2 gene table.

    Every row yielded by ``read_humann2_genetable_generator`` carries a
    gene, the table header, the row values and an optional taxon.  Taxa are
    resolved to an NCBI id once (results are cached per taxon string) and
    expanded to a lineage with ``get_taxon_path``; taxa that cannot be
    resolved keep the placeholder lineage ``rank_headers``.

    Two optional outputs are produced:

    * ``tdt_out_fp``: a table with one line per taxon-stratified row - the
      gene, the row values and one column per rank;
    * ``h2gt_out_fp``: a HUMAnN2-style gene table in which the taxon is
      replaced by the dot-joined lineage (``unknown`` for unresolved taxa);
      rows without a taxon are copied through.
    """
    args = parser.parse_args()

    input_fp = args.input_fp
    tdt_out_fp = args.tdt_out_fp
    h2gt_out_fp = args.h2gt_out_fp
    rank_headers = args.rank_headers.split(',')
    ranks = args.ranks.split(',')

    ncbi = NCBITaxa()

    # taxon string -> lineage, so every taxon is resolved only once.
    tax_dict = {}

    input_f = open(input_fp)
    tdt_out_f = None
    h2gt_out_f = None
    try:
        h2gt = read_humann2_genetable_generator(input_f)

        if tdt_out_fp:
            tdt_out_f = open(tdt_out_fp, 'w')
        if h2gt_out_fp:
            h2gt_out_f = open(h2gt_out_fp, 'w')

        first_row = True
        for gene, header, line, tax in h2gt:
            # Placeholder lineage, used when the taxon cannot be resolved.
            lineage = list(rank_headers)

            if tax:
                if tax not in tax_dict:
                    family, genus, species = clean_humann2_taxon(tax)
                    best_id = get_best_ncbi_id(family, genus, species, ncbi)

                    if best_id is not None:
                        lineage = get_taxon_path(best_id,
                                                 ncbi,
                                                 ranks=ranks,
                                                 rank_headers=rank_headers)

                    tax_dict[tax] = lineage

                lineage = tax_dict[tax]

            if tdt_out_f is not None:
                if first_row:
                    tdt_out_f.write('Gene Family\t{0}\t{1}\n'.format(
                        '\t'.join(header), '\t'.join(ranks)))
                # Deliberately not `elif`: the row that triggers the header
                # must be written too, otherwise the first record is lost.
                if tax:
                    tdt_out_f.write('{0}\t{1}\t{2}\n'.format(
                        gene, '\t'.join(line), '\t'.join(lineage)))

            if h2gt_out_f is not None:
                if first_row:
                    h2gt_out_f.write('# Gene Family\t{0}\n'.format(
                        '\t'.join(header)))
                if tax:
                    if lineage == rank_headers:
                        # Still the placeholder: the taxon was not resolved.
                        h2gt_out_f.write('{0}|{1}\t{2}\n'.format(
                            gene, 'unknown', '\t'.join(line)))
                    else:
                        h2gt_out_f.write('{0}|{1}\t{2}\n'.format(
                            gene, '.'.join(lineage), '\t'.join(line)))
                else:
                    h2gt_out_f.write('{0}\t{1}\n'.format(
                        gene, '\t'.join(line)))

            first_row = False
    finally:
        # Close whatever was opened, also when an error interrupts the loop.
        # The old cleanup tested `h2gt_out_f`, which is undefined (NameError)
        # when no h2gt output path was given.
        for handle in (input_f, tdt_out_f, h2gt_out_f):
            if handle is not None:
                handle.close()