Exemplo n.º 1
0
def download(args):
    """Dispatch a download request to the matching helper in ``prepare``.

    The value of ``args.db`` selects what is fetched:

    * ``"taxonomy"``: the taxonomy dump files are downloaded into
      ``args.taxdir`` and the sqlite taxonomy database ``args.sqlitedb``
      is initialized from them.
    * ``"idmap"``: the seqid->taxid mapfile is downloaded.
    * anything else: ``args.db`` is treated as the name of a protein fasta
      database (uniref50, uniref90, uniref100 or nr) and handed to
      ``prepare.download_fasta``.

    Parameters
    ----------
    args:
        Object exposing the attributes read below (``db``, ``force`` and,
        depending on the branch, ``taxdir``/``sqlitedb`` or
        ``dldir``/``tmpdir``/``skip_check``/``skip_idmap``).
    """
    target = args.db

    if target == "taxonomy":
        # Fetch the dump first, then build the sqlite database on top of it
        prepare.download_ncbi_taxonomy(args.taxdir, args.force)
        prepare.init_sqlite_taxdb(args.taxdir, args.sqlitedb, args.force)
        return

    if target == "idmap":
        prepare.download_nr_idmap(args.dldir, args.tmpdir, args.force)
        return

    # Every remaining value is passed through as a fasta database name
    prepare.download_fasta(
        args.dldir,
        args.db,
        args.tmpdir,
        args.force,
        args.skip_check,
        args.skip_idmap,
    )
Exemplo n.º 2
0
def make_lineage_df(taxids, taxdir, dbname, ranks, cpus=1):
    """
    Creates a lineage dataframe with full taxonomic information for a list of taxids.

    Example of the table assembled internally (one row per taxid):
    taxid   species phylum  genus   genus.name      phylum.name     species.name
    859655  305     1224    48736   Ralstonia       Proteobacteria  Ralstonia solanacearum
    387344  1580    1239    1578    Lactobacillus   Firmicutes      Lactobacillus brevis
    358681  1393    1239    55080   Brevibacillus   Firmicutes      Brevibacillus brevis

    Only the integer-typed columns of that table are returned; columns holding
    other values (presumably the ``*.name`` columns above -- verify against
    ``add_names``) are left out of the returned frame.

    Parameters
    ----------
    taxids: list
        List of taxonomic ids to obtain information for. Every element must be
        convertible with ``int()``.
    taxdir: str
        Path to directory holding taxonomic info
    dbname: str
        Name of ete3 sqlite database within taxdir
    ranks: list
        Ranks to store information for
    cpus: int
        Number of cpus to use

    Returns
    -------
    lineage_df: pandas.DataFrame
        Data frame indexed by taxid (as int) containing the integer-typed columns
    name_dict:
        The value returned by ``make_name_dict(lineage_df, ranks)``, computed
        before the non-integer columns are dropped
    """
    # Read the taxonomy db
    ncbi_taxa = init_sqlite_taxdb(taxdir, dbname)
    lineages = ncbi_taxa.get_lineage_translator(taxids)
    # Taxids for which no lineage was found under the id that was passed in.
    # They are normalised to int so the comparison against the lineage keys
    # also works when the caller supplies strings.
    missing_taxids = {int(x) for x in taxids}.difference(lineages)
    # Get possible translations for taxids that have been changed. The
    # normalised set is reused so only ids that are actually missing are
    # looked up.
    _, translate_dict = ncbi_taxa._translate_merged(list(missing_taxids))
    # Reverse mapping (translated id -> requested id), used below to index the
    # result by the ids the caller asked for
    rename = {new: old for old, new in translate_dict.items()}
    # Update lineages with the translated taxids
    lineages.update(ncbi_taxa.get_lineage_translator(translate_dict.values()))
    items = [[taxid, ranks, taxdir, dbname, lineage] for taxid, lineage in lineages.items()]
    with Pool(processes=cpus) as pool:
        res = list(
            tqdm.tqdm(pool.imap(process_lineages, items), desc="Making lineages", total=len(items),
                      unit=" taxids", ncols=100))
    lineage_df = pd.concat(res, sort=False)
    lineage_df.rename(index=rename, inplace=True)
    lineage_df.rename(index=int, inplace=True)
    for rank in ranks:
        lineage_df[rank] = pd.to_numeric(lineage_df[rank])
    name_dict = make_name_dict(lineage_df, ranks)
    if missing_taxids:
        sys.stderr.write("#WARNING: Missing taxids found:\n")
        sys.stderr.write("#{}\n".format(",".join([str(x) for x in missing_taxids])))
        sys.stderr.write("#To fix this, you can try to update the taxonomy database using\n")
        sys.stderr.write("#tango download taxonomy --force\n")
    # Select integer columns by dtype kind rather than comparing against the
    # builtin ``int``: that comparison resolves to a platform-dependent width
    # and can fail to match the int64 columns produced by pd.to_numeric.
    integer_columns = [pd.api.types.is_integer_dtype(dtype) for dtype in lineage_df.dtypes]
    return lineage_df.loc[:, integer_columns], name_dict
Exemplo n.º 3
0
def process_lineages(items):
    """
    Builds the single-row lineage table for one taxid.

    Parameters
    ----------
    items: list
        Five values packed into one argument, in this order:
        ``taxid, ranks, taxdir, dbname, lineage``. ``lineage`` is a list of
        taxonomic ids corresponding to the full lineage of ``taxid``.

    Returns
    -------
    pandas.DataFrame
        The value returned by ``add_names`` for a frame indexed by ``taxid``
        whose columns are the ranks from ``ranks`` found in the lineage,
        after ``propagate_lower`` has been applied.
    """
    taxid, ranks, taxdir, dbname, lineage = items
    # Read the taxonomy db
    taxonomy = init_sqlite_taxdb(taxdir, dbname)
    # Rank of every taxid in the lineage
    rank_by_taxid = taxonomy.get_rank(lineage)
    # One row per lineage member with a single "rank" column
    rank_table = pd.DataFrame(rank_by_taxid, index=["rank"]).T
    # Keep only the requested ranks, then flip so that ranks become columns
    wanted = rank_table["rank"].isin(ranks)
    row = rank_table.loc[wanted].reset_index().T
    row.columns = row.loc["rank"]
    row = row.drop("rank")
    # The one remaining row holds the lineage taxids; label it with the query taxid
    row.index = [taxid]
    # Add taxids for lower ranks in the hierarchy
    row = propagate_lower(row, taxid, ranks)
    # Add names for taxids
    return add_names(row, taxid, taxonomy)