예제 #1
0
def build_taxmap(features, target_rank, lca_choice, p_tol_lca):
    """Build a mapping from UniRef ids / genus strata to a taxon label.

    The file at ``p_tol_lca`` is read as tab-delimited text. A row whose first
    field equals ``c_tol_header`` starts the tree-of-life section (each
    following row becomes a ``Taxon`` node); a row whose first field equals
    ``c_lca_header`` starts the LCA section (each following row describes one
    UniRef family). Both header rows also supply the column names used to
    turn the following rows into dicts via ``rowdict``.

    Args:
        features: iterable of feature strings understood by ``util.fsplit``.
        target_rank: rank to report; compared for equality against the ranks
            yielded by ``tol.get_lineage``.
        lca_choice: name of the LCA column to read (upper-cased before lookup).
        p_tol_lca: path of the taxonomy data file.

    Returns:
        dict whose keys are UniRef ids or strata and whose values are labels
        of the form ``<first letter of rank, lower-cased>__<simplified name>``
        (or the genus token itself when ``target_rank == "Genus"``).
    """
    # pull out relevant uniref ids (avoids full uniref taxonomy in memory)
    unirefs = {util.fsplit(k)[0] for k in features}
    unirefs = {k for k in unirefs if "UniRef" in k}
    # tree-of-life object
    tol = TreeOfLife()
    # mapping from uniref id (or stratum) to taxon label
    taxmap = {}

    def label_at_target(start):
        # first lineage entry at the target rank, formatted; None if absent
        for rank, sciname in tol.get_lineage(start):
            if rank == target_rank:
                return rank.lower()[0] + "__" + simplify(sciname)
        return None

    tol_mode = False
    lca_mode = False
    headers = None
    with util.try_zip_open(p_tol_lca) as fh:
        print("  Loading taxonomic data from:", p_tol_lca, file=sys.stderr)
        for row in csv.reader(fh, csv.excel_tab):
            # csv.reader yields [] for blank lines; row[0] would raise there
            if not row:
                continue
            # determine which parsing mode we're in
            if row[0] == c_tol_header:
                print("  Loading TOL data", file=sys.stderr)
                tol_mode = True
                headers = row
                continue
            if row[0] == c_lca_header:
                print("  Loading LCA data", file=sys.stderr)
                tol_mode = False
                lca_mode = True
                headers = row
                continue
            if tol_mode:
                # row is a taxon, add to tree
                tol.add_node(Taxon(rowdict(headers, row)))
            elif lca_mode:
                # row is a uniref lca entry, add to taxmap if relevant;
                # the lineage is resolved immediately, so only taxa already
                # added to the tree are visible here
                record = rowdict(headers, row)
                uniref = record["FAMILY"]
                if uniref in unirefs:
                    label = label_at_target(record[lca_choice.upper()])
                    if label is not None:
                        taxmap[uniref] = label

    # augment taxmap with genus-level lineage information for stratified features
    for item in features:
        _, _, stratum = util.fsplit(item)
        if stratum is None or "g__" not in stratum:
            continue
        genus = stratum.split(util.c_taxon_delim)[0]
        if target_rank == "Genus":
            # directly assign stratified genus
            taxmap[stratum] = genus
            continue
        # try to find genus ancestor based on name
        taxid = tol.get_genus_taxid(genus.replace("g__", ""))
        if taxid is not None:
            label = label_at_target(taxid)
            if label is not None:
                taxmap[stratum] = label
    return taxmap
예제 #2
0
def regroup( table, map_feature_groups, function, precision, ungrouped=False ):
    """Collapse the rows of ``table`` into groups, modifying it in place.

    Each row head is split with ``util.fsplit``; its feature is looked up in
    ``map_feature_groups`` to find the groups it contributes to. Rows that
    land in the same (stratum-qualified) group are combined column by column
    with the function selected from ``c_funcmap``. ``table.rowheads`` and
    ``table.data`` are then replaced, and a summary line is printed to stderr.

    Args:
        table: object exposing ``rowheads``, ``colheads`` and ``data``.
        map_feature_groups: mapping from feature to an iterable of group names.
        function: key into ``c_funcmap`` selecting the collapsing function.
        precision: digits passed to ``round`` for each new value, or ``None``
            to skip rounding.
        ungrouped: when true, features missing from ``map_feature_groups``
            are placed in ``util.c_ungrouped`` instead of being dropped.
    """
    collapse = c_funcmap[function]
    seen_before = set()
    # feature -> number of real (non-ungrouped) groups it was assigned to
    feature_counts = {}
    # index of new group names to old table rows
    mapping = {}

    for i, rowhead in enumerate( table.rowheads ):
        feature, name, stratum = util.fsplit( rowhead )
        feature_counts.setdefault( feature, 0 )
        # decide which groups to use
        if feature in map_feature_groups:
            groups = map_feature_groups[feature]
        elif ungrouped:
            groups = [util.c_ungrouped]
        else:
            groups = []
        # only the first row seen for a feature contributes to its count
        first_instance = feature not in seen_before
        for group in groups:
            if first_instance and group != util.c_ungrouped:
                feature_counts[feature] += 1
            # account for stratified feature
            groupname = group
            if stratum is not None:
                groupname = util.fjoin( groupname, stratum=stratum )
            mapping.setdefault( groupname, [] ).append( i )
        # we have processed an instance of this feature
        seen_before.add( feature )

    # rebuild table
    groupnames = util.fsort( mapping.keys( ) )
    n_cols = len( table.colheads )
    groupdata = []
    for groupname in groupnames:
        row_indices = mapping[groupname]
        # one block of member values per column, then collapse each block
        newrow = [collapse( [float( table.data[i][j] ) for i in row_indices] )
                  for j in range( n_cols )]
        if precision is not None:
            newrow = [round( k, precision ) for k in newrow]
        groupdata.append( newrow )
    table.rowheads = groupnames
    table.data = groupdata

    # report
    n = len( feature_counts )
    tallies = list( feature_counts.values( ) )
    grouped_total = n - tallies.count( 0 )
    grouped_multi = grouped_total - tallies.count( 1 )

    def percent( count ):
        # a table without rows has n == 0; report 0% instead of dividing by zero
        return 100 * count / float( n ) if n else 0.0

    print( "Original Feature Count: %d; Grouped 1+ times: %d (%.1f%%); Grouped 2+ times: %d (%.1f%%)" % \
           ( n,
             grouped_total,
             percent( grouped_total ),
             grouped_multi,
             percent( grouped_multi ),
         ), file=sys.stderr )
예제 #3
0
def tax_connect(feature, taxmap):
    """Return ``feature`` re-joined with a stratum looked up in ``taxmap``.

    When the feature has no stratum, or its stratum equals ``c_unclassified``,
    the lookup key is the feature id; otherwise it is the existing stratum.
    Keys missing from ``taxmap`` resolve to ``c_unclassified``.
    """
    feature, name, stratum = util.fsplit(feature)
    needs_inference = stratum is None or stratum == c_unclassified
    key = feature if needs_inference else stratum
    return util.fjoin(feature, name, taxmap.get(key, c_unclassified))
예제 #4
0
def build_taxmap(features, target_rank, p_datafile):
    """Build a mapping from UniRef ids / genus strata to a taxon label.

    The file at ``p_datafile`` is read as tab-delimited text. A row whose
    first field equals ``c_tol_header`` starts the tree-of-life section (the
    fields of each following row are passed positionally to ``Taxon``); a row
    whose first field equals ``c_lca_header`` starts the LCA section, whose
    rows are ``(uniref id, lca)`` pairs.

    Args:
        features: iterable of feature strings understood by ``util.fsplit``.
        target_rank: rank to report; compared for equality against the ranks
            yielded by ``tol.get_lineage``.
        p_datafile: path of the taxonomy data file.

    Returns:
        dict whose keys are UniRef ids or strata and whose values are labels
        of the form ``<first letter of rank, lower-cased>__<simplified name>``
        (or the genus token itself when ``target_rank == "Genus"``).
    """
    unirefs = {util.fsplit(k)[0] for k in features}
    unirefs = {k for k in unirefs if "UniRef" in k}
    # load tree of life, subset uniref lca annotation and add to taxmap
    tol = TreeOfLife()
    taxmap = {}

    def label_at_target(start):
        # first lineage entry at the target rank, formatted; None if absent
        for rank, common in tol.get_lineage(start):
            if rank == target_rank:
                return rank.lower()[0] + "__" + simplify(common)
        return None

    tol_mode = False
    lca_mode = False
    with util.try_zip_open(p_datafile) as fh:
        print("Loading taxonomic data from: " + p_datafile, file=sys.stderr)
        for row in csv.reader(fh, csv.excel_tab):
            # csv.reader yields [] for blank lines; row[0] would raise there
            if not row:
                continue
            if row[0] == c_tol_header:
                print("  Loading TOL data", file=sys.stderr)
                tol_mode = True
                continue
            if row[0] == c_lca_header:
                print("  Loading LCA data", file=sys.stderr)
                tol_mode = False
                lca_mode = True
                continue
            if tol_mode:
                tol.attach(Taxon(*row))
            elif lca_mode:
                # the lineage is resolved immediately, so only taxa already
                # attached to the tree are visible here
                uni, lca = row
                if uni in unirefs:
                    label = label_at_target(lca)
                    if label is not None:
                        taxmap[uni] = label

    # augment taxmap with genus-level lineage information for stratified features
    for item in features:
        _, _, stratum = util.fsplit(item)
        if stratum is None or "g__" not in stratum:
            continue
        genus = stratum.split(util.c_taxon_delim)[0]
        if target_rank == "Genus":
            # the stratum's own genus token is already the answer
            taxmap[stratum] = genus
            continue
        label = label_at_target(genus.replace("g__", ""))
        if label is not None:
            taxmap[stratum] = label
    return taxmap
예제 #5
0
    def __init__(self,
                 path,
                 focal_feature=None,
                 last_metadata=None,
                 focal_metadata=None,
                 exclude_unclassified=False):
        """Load the stratified rows of one feature from the table at ``path``.

        The first row read supplies the column heads. A row whose head equals
        ``focal_metadata`` is stored as ``self.metarow``. When
        ``last_metadata`` is given, feature rows are only considered from the
        row with that head onward; otherwise every row after the header is
        considered. A considered row is kept when its code equals
        ``focal_feature`` and it has a stratum (skipping the stratum equal to
        ``c_unclassified_str`` when ``exclude_unclassified`` is set).

        Calls ``util.die`` if no matching stratified row was found.
        """
        # table features
        self.colheads = None
        self.rowheads = []
        self.data = []
        self.metarow = None
        self.focus_name = None
        # without a metadata boundary, feature rows start right after the header
        past_metadata = last_metadata is None

        # pull relevant rows from input table
        for row in tsv_reader(path):
            rowhead = row[0]
            values = row[1:]
            if self.colheads is None:
                self.colheads = values
                continue
            # focal and last metadata may be the same row, so test both
            if focal_metadata is not None and rowhead == focal_metadata:
                self.metarow = values
            if last_metadata is not None and rowhead == last_metadata:
                past_metadata = True
            if not past_metadata:
                continue
            code, name, stratum = util.fsplit(rowhead)
            if code != focal_feature or stratum is None:
                continue
            if exclude_unclassified and stratum == c_unclassified_str:
                continue
            self.focus_name = util.fjoin(code, name)
            self.rowheads.append(stratum)
            self.data.append([float(k) for k in values])

        # check that we found something
        if self.focus_name is None:
            util.die(
                "Requested feature <{}> was missing or not stratified".format(
                    focal_feature))

        # update the table
        self.data = np.array(self.data)
        self.update()
예제 #6
0
def main():
    """Infer taxonomy for the rows of the input table and write a new table.

    Steps: load the table, build a taxon map with ``build_taxmap``, drop taxa
    whose share of map entries is below ``args.threshold``, decide for each
    input row which output rows it contributes to (depending on ``args.mode``),
    sum the contributing rows, write the result and print a summary to stderr.
    """
    args = get_args()
    T = util.Table(args.input)

    # build the taxmap (uniref -> taxon mapping)
    print("Building taxonomic map for input table", file=sys.stderr)
    if args.devdb is not None:
        p_datafile = args.devdb
    elif args.database in databases:
        p_datafile = databases[args.database]
    else:
        sys.exit(
            "Must specify a valid database (from utility mapping or --devdb)")
    taxmap = build_taxmap(T.rowheads, args.level, args.lca_choice, p_datafile)

    # refine the taxmap (remove rare taxa)
    counts = Counter(taxmap.values())
    total = float(sum(counts.values()))
    fractions = {k: v / total for k, v in counts.items()}
    taxmap = {
        old: new
        for old, new in taxmap.items() if fractions[new] >= args.threshold
    }

    # reindex the table (which rows to keep, which rows to pseudo-stratify)
    print("Reindexing the input table", file=sys.stderr)
    ticker = util.Ticker(T.rowheads)
    # output row head -> indices of the input rows summed into it
    index = {}
    for i, rowhead in enumerate(T.rowheads):
        ticker.tick()
        feature, name, stratum = util.fsplit(rowhead)
        new_rowhead = tax_connect(rowhead, taxmap)
        # unmapped is never stratified
        if feature == util.c_unmapped:
            index.setdefault(rowhead, []).append(i)
        # outside of unclassified mode, keep totals
        elif stratum is None and args.mode != c_umode:
            index.setdefault(rowhead, []).append(i)
            # in totals mode, guess at taxonomy from uniref name
            if args.mode == c_tmode:
                index.setdefault(new_rowhead, []).append(i)
        # in unclassified mode, make a new row for the total...
        elif stratum == c_unclassified and args.mode == c_umode:
            index.setdefault(util.fjoin(feature, name, None), []).append(i)
            # ...then replace "unclassified" with inferred taxonomy
            index.setdefault(new_rowhead, []).append(i)
        # update strata in stratified mode
        elif stratum is not None and args.mode == c_smode:
            index.setdefault(new_rowhead, []).append(i)

    # rebuild the table
    print("Rebuilding the input table", file=sys.stderr)
    rowheads2, data2 = [], []
    ticker = util.Ticker(index)
    for rowhead in util.fsort(index):
        ticker.tick()
        rowheads2.append(rowhead)
        newrow = [0 for k in T.colheads]
        for i in index[rowhead]:
            oldrow = map(float, T.data[i])
            newrow = [a + b for a, b in zip(newrow, oldrow)]
        data2.append(newrow)
    T.rowheads = rowheads2
    T.data = data2
    print("Writing new table", file=sys.stderr)
    T.write(args.output, unfloat=True)

    # report on performance
    success, total = 0, 0
    for rowhead in T.rowheads:
        feature, name, stratum = util.fsplit(rowhead)
        if stratum is not None:
            total += 1
            if stratum != c_unclassified:
                success += 1
    # the output may contain no stratified rows at all; avoid dividing by zero
    percent = round(100 * success / float(total), 1) if total else 0.0
    print(
        "Summary: Of {TOTAL} stratifications, {SUCCESS} mapped at {TARGET} level ({PERCENT}%)"
        .format(
            TOTAL=total,
            SUCCESS=success,
            TARGET=args.level,
            PERCENT=percent,
        ),
        file=sys.stderr,
    )
예제 #7
0
def main():
    """Infer taxonomy for the rows of the input table and write a new table.

    Steps: load the table, build a taxon map with ``build_taxmap``, drop taxa
    whose share of map entries is below ``args.threshold``, decide for each
    input row which output rows it contributes to (depending on ``args.mode``),
    sum the contributing rows, write the result and print a summary to stderr.
    """
    args = get_args()
    tbl = util.Table(args.input)
    # build the taxmap
    print("Building taxonomic map for input table", file=sys.stderr)
    p_datafile = args.dev if args.dev is not None else databases[
        args.resolution]
    taxmap = build_taxmap(tbl.rowheads, args.level, p_datafile)
    # refine the taxmap: tally entries per taxon, then drop rare taxa
    counts = {}
    for new in taxmap.values():
        counts[new] = counts.get(new, 0) + 1
    total = float(sum(counts.values()))
    fractions = {k: v / total for k, v in counts.items()}
    taxmap = {
        old: new
        for old, new in taxmap.items() if fractions[new] >= args.threshold
    }
    # reindex the table
    print("Reindexing the input table", file=sys.stderr)
    ticker = util.Ticker(tbl.rowheads)
    # output row head -> indices of the input rows summed into it
    index = {}
    for i, rowhead in enumerate(tbl.rowheads):
        ticker.tick()
        feature, name, stratum = util.fsplit(rowhead)
        new_rowhead = tax_connect(rowhead, taxmap)
        # unmapped is never stratified
        if feature == util.c_unmapped:
            index.setdefault(rowhead, []).append(i)
        # outside of unclassified mode, keep totals
        elif stratum is None and args.mode != c_umode:
            index.setdefault(rowhead, []).append(i)
            # in totals mode, guess at taxonomy from uniref name
            if args.mode == c_tmode:
                index.setdefault(new_rowhead, []).append(i)
        elif stratum == c_unclassified and args.mode == c_umode:
            # in unclassified mode, make a new row for the total...
            index.setdefault(util.fjoin(feature, name, None), []).append(i)
            # ...then replace "unclassified" with inferred taxonomy
            index.setdefault(new_rowhead, []).append(i)
        elif stratum is not None and args.mode == c_smode:
            # update strata in stratified mode
            index.setdefault(new_rowhead, []).append(i)
    # rebuild the table
    print("Rebuilding the input table", file=sys.stderr)
    rowheads2, data2 = [], []
    ticker = util.Ticker(index)
    for rowhead in util.fsort(index):
        ticker.tick()
        rowheads2.append(rowhead)
        newrow = [0 for k in tbl.colheads]
        for i in index[rowhead]:
            oldrow = map(float, tbl.data[i])
            newrow = [a + b for a, b in zip(newrow, oldrow)]
        data2.append(newrow)
    tbl.rowheads = rowheads2
    tbl.data = data2
    # output
    print("Writing new table", file=sys.stderr)
    tbl.write(args.output, unfloat=True)
    # report on performance
    success, total = 0, 0
    for rowhead in tbl.rowheads:
        feature, name, stratum = util.fsplit(rowhead)
        if stratum is not None:
            total += 1
            if stratum != c_unclassified:
                success += 1
    # the output may contain no stratified rows at all; avoid dividing by zero
    percent = round(100 * success / float(total), 1) if total else 0.0
    print(
        "Summary: Of {TOTAL} stratifications, {SUCCESS} mapped at {TARGET} level ({PERCENT}%)"
        .format(
            TOTAL=total,
            SUCCESS=success,
            TARGET=args.level,
            PERCENT=percent,
        ),
        file=sys.stderr,
    )