def build_taxmap(features, target_rank, lca_choice, p_tol_lca):
    """Build a mapping from UniRef ids / strata to a taxon label at one rank.

    The data file is read as tab-delimited rows. A row whose first field
    equals ``c_tol_header`` starts the tree-of-life section (each following
    row becomes a ``Taxon`` node); a row whose first field equals
    ``c_lca_header`` starts the LCA section (each following row annotates one
    UniRef family). The header row of each section supplies the column names
    passed to ``rowdict``. Rows seen before either header are ignored, and
    blank rows are skipped.

    Args:
        features: iterable of row headers; each is split with ``util.fsplit``.
        target_rank: rank name compared against the ranks yielded by
            ``tol.get_lineage`` (the code special-cases ``"Genus"``).
        lca_choice: name of the LCA column to read; upper-cased before lookup.
        p_tol_lca: path handed to ``util.try_zip_open``.

    Returns:
        dict whose keys are UniRef ids (from the LCA section) and strata
        containing ``"g__"`` (from ``features``), and whose values are labels
        made of the rank's lower-cased first letter, ``"__"``, and
        ``simplify(sciname)`` (or the stratum's own genus token when
        ``target_rank == "Genus"``). Keys with no ancestor at ``target_rank``
        are absent.
    """
    # pull out relevant uniref ids (avoids full uniref taxonomy in memory)
    unirefs = {util.fsplit(k)[0] for k in features}
    unirefs = {k for k in unirefs if "UniRef" in k}

    # tree-of-life object
    tol = TreeOfLife()

    def label_at_target_rank(taxid):
        # first lineage entry at the requested rank wins; None if there is none
        for rank, sciname in tol.get_lineage(taxid):
            if rank == target_rank:
                return rank.lower()[0] + "__" + simplify(sciname)
        return None

    # mapping from uniref id to taxon sciname or unclassified
    taxmap = {}
    section = None  # None until a header is seen, then "tol" or "lca"
    headers = None
    with util.try_zip_open(p_tol_lca) as fh:
        print(" Loading taxonomic data from:", p_tol_lca, file=sys.stderr)
        for row in csv.reader(fh, csv.excel_tab):
            # csv.reader yields [] for a blank line; indexing it would raise
            if not row:
                continue
            # determine which parsing mode we're in
            if row[0] == c_tol_header:
                print(" Loading TOL data", file=sys.stderr)
                section = "tol"
                headers = row
                continue
            if row[0] == c_lca_header:
                print(" Loading LCA data", file=sys.stderr)
                section = "lca"
                headers = row
                continue
            # row is a taxon, add to tree
            if section == "tol":
                tol.add_node(Taxon(rowdict(headers, row)))
            # row is a uniref lca entry, add to taxmap if relevant
            elif section == "lca":
                record = rowdict(headers, row)
                family = record["FAMILY"]
                if family in unirefs:
                    label = label_at_target_rank(record[lca_choice.upper()])
                    if label is not None:
                        taxmap[family] = label

    # augment taxmap with genus-level lineage information for stratified features
    for feature in features:
        _, _, stratum = util.fsplit(feature)
        if stratum is None or "g__" not in stratum:
            continue
        genus = stratum.split(util.c_taxon_delim)[0]
        if target_rank == "Genus":
            # directly assign stratified genus
            taxmap[stratum] = genus
            continue
        # try to find genus ancestor based on name
        taxid = tol.get_genus_taxid(genus.replace("g__", ""))
        if taxid is not None:
            label = label_at_target_rank(taxid)
            if label is not None:
                taxmap[stratum] = label
    return taxmap
def regroup(table, map_feature_groups, function, precision, ungrouped=False):
    """Collapse the rows of ``table`` into groups, in place, and report coverage.

    Each row header is split with ``util.fsplit`` into feature, name and
    stratum. The row is added to every group listed for its feature in
    ``map_feature_groups``; stratified rows go to the group name joined with
    their stratum via ``util.fjoin``. Rows landing in the same group are
    combined column by column with the callable selected from ``c_funcmap``.

    Args:
        table: object exposing ``rowheads``, ``colheads`` and ``data``;
            ``rowheads`` and ``data`` are replaced with the grouped result.
        map_feature_groups: mapping from feature to an iterable of group names.
        function: key into ``c_funcmap`` choosing the collapsing callable.
        precision: number of digits passed to ``round``; ``None`` disables
            rounding.
        ungrouped: when true, features absent from ``map_feature_groups`` are
            placed in the ``util.c_ungrouped`` group instead of being dropped.

    Side effects:
        Mutates ``table`` and prints a one-line summary to stderr. For an
        empty table the percentages are reported as 0.0 instead of raising
        ZeroDivisionError.
    """
    collapse = c_funcmap[function]
    seen_before = set()
    feature_counts = {}
    # index of new group names to old table rows
    mapping = {}
    for i, rowhead in enumerate(table.rowheads):
        feature, name, stratum = util.fsplit(rowhead)
        feature_counts.setdefault(feature, 0)
        # decide which groups to use
        if feature in map_feature_groups:
            groups = map_feature_groups[feature]
        elif ungrouped:
            groups = [util.c_ungrouped]
        else:
            groups = []
        # group memberships are counted only on a feature's first row, so
        # its stratified rows do not inflate the tally
        first_sighting = feature not in seen_before
        for group in groups:
            if first_sighting and group != util.c_ungrouped:
                feature_counts[feature] += 1
            # account for stratified feature
            groupname = group
            if stratum is not None:
                groupname = util.fjoin(groupname, stratum=stratum)
            mapping.setdefault(groupname, []).append(i)
        # we have processed an instance of this feature
        seen_before.add(feature)

    # rebuild table
    groupnames = util.fsort(mapping.keys())
    n_cols = len(table.colheads)
    groupdata = []
    for groupname in groupnames:
        # one bucket of member values per output column
        columns = [[] for _ in range(n_cols)]
        for i in mapping[groupname]:
            oldrow = table.data[i]
            for j in range(n_cols):
                columns[j].append(float(oldrow[j]))
        # collapse groups
        newrow = [collapse(block) for block in columns]
        if precision is not None:
            newrow = [round(k, precision) for k in newrow]
        groupdata.append(newrow)
    table.rowheads = groupnames
    table.data = groupdata

    # report
    tallies = list(feature_counts.values())
    n = len(tallies)
    grouped_total = n - tallies.count(0)
    grouped_multi = grouped_total - tallies.count(1)

    def percent(k):
        # an empty table has no features; avoid dividing by zero
        return 100 * k / float(n) if n else 0.0

    print(
        "Original Feature Count: %d; Grouped 1+ times: %d (%.1f%%); Grouped 2+ times: %d (%.1f%%)"
        % (
            n,
            grouped_total,
            percent(grouped_total),
            grouped_multi,
            percent(grouped_multi),
        ),
        file=sys.stderr,
    )
def tax_connect(feature, taxmap):
    """Return the row header for ``feature`` with its stratum taken from ``taxmap``.

    The header is split with ``util.fsplit``. A header whose stratum is
    missing or equal to ``c_unclassified`` is looked up by its feature code;
    any other header is looked up by its stratum. Keys absent from ``taxmap``
    fall back to ``c_unclassified``. The result is rebuilt with ``util.fjoin``.
    """
    code, label, stratum = util.fsplit(feature)
    has_taxon = stratum is not None and stratum != c_unclassified
    lookup_key = stratum if has_taxon else code
    return util.fjoin(code, label, taxmap.get(lookup_key, c_unclassified))
def build_taxmap(features, target_rank, p_datafile):
    """Build a mapping from UniRef ids / strata to a taxon label at one rank.

    The data file is read as tab-delimited rows. A row whose first field
    equals ``c_tol_header`` starts the tree-of-life section (each following
    row is unpacked into ``Taxon(*row)`` and attached to the tree); a row
    whose first field equals ``c_lca_header`` starts the LCA section (each
    following row is a ``(uniref, lca)`` pair). Rows seen before either
    header are ignored, and blank rows are skipped.

    Args:
        features: iterable of row headers; each is split with ``util.fsplit``.
        target_rank: rank name compared against the ranks yielded by
            ``tol.get_lineage`` (the code special-cases ``"Genus"``).
        p_datafile: path handed to ``util.try_zip_open``; concatenated into a
            log message, so it must be a ``str``.

    Returns:
        dict whose keys are UniRef ids (from the LCA section) and strata
        containing ``"g__"`` (from ``features``), and whose values are labels
        made of the rank's lower-cased first letter, ``"__"``, and
        ``simplify(name)`` (or the stratum's own genus token when
        ``target_rank == "Genus"``). Keys with no ancestor at ``target_rank``
        are absent.
    """
    unirefs = {util.fsplit(k)[0] for k in features}
    unirefs = {k for k in unirefs if "UniRef" in k}

    # load tree of life, subset uniref lca annotation and add to taxmap
    tol = TreeOfLife()

    def label_at_target_rank(node):
        # first lineage entry at the requested rank wins; None if there is none
        for rank, common in tol.get_lineage(node):
            if rank == target_rank:
                return rank.lower()[0] + "__" + simplify(common)
        return None

    taxmap = {}
    section = None  # None until a header is seen, then "tol" or "lca"
    with util.try_zip_open(p_datafile) as fh:
        print("Loading taxonomic data from: " + p_datafile, file=sys.stderr)
        for row in csv.reader(fh, csv.excel_tab):
            # csv.reader yields [] for a blank line; indexing it would raise
            if not row:
                continue
            if row[0] == c_tol_header:
                print(" Loading TOL data", file=sys.stderr)
                section = "tol"
                continue
            if row[0] == c_lca_header:
                print(" Loading LCA data", file=sys.stderr)
                section = "lca"
                continue
            if section == "tol":
                tol.attach(Taxon(*row))
            elif section == "lca":
                uni, lca = row
                if uni in unirefs:
                    label = label_at_target_rank(lca)
                    if label is not None:
                        taxmap[uni] = label

    # augment taxmap with genus-level lineage information for stratified features
    for feature in features:
        _, _, stratum = util.fsplit(feature)
        if stratum is None or "g__" not in stratum:
            continue
        genus = stratum.split(util.c_taxon_delim)[0]
        if target_rank == "Genus":
            taxmap[stratum] = genus
            continue
        # the lineage is looked up by the bare genus name (prefix removed)
        label = label_at_target_rank(genus.replace("g__", ""))
        if label is not None:
            taxmap[stratum] = label
    return taxmap
def __init__(self, path, focal_feature=None, last_metadata=None, focal_metadata=None, exclude_unclassified=False):
    """Load the stratified rows of one feature from a tab-delimited table.

    The first row read from ``tsv_reader(path)`` supplies the column headers.
    Every later row is inspected as follows:

    * a row whose header equals ``focal_metadata`` is stored as ``metarow``;
    * if ``last_metadata`` is given, feature rows are only considered from
      the row carrying that header onward (the same row may also be the
      focal metadata row);
    * a row is kept when ``util.fsplit`` gives a code equal to
      ``focal_feature`` and a non-None stratum, unless that stratum equals
      ``c_unclassified_str`` and ``exclude_unclassified`` is set.

    Kept rows contribute their stratum to ``rowheads`` and their values,
    converted to float, to ``data``. If no row is kept, ``util.die`` is called
    with an explanatory message. Finally ``data`` becomes a numpy array and
    ``self.update()`` is invoked.
    """
    # table features
    self.colheads = None
    self.rowheads = []
    self.data = []
    self.metarow = None
    self.focus_name = None

    # without a declared metadata block every row after the header qualifies
    in_features = last_metadata is None

    # pull relevant rows from input table
    for fields in tsv_reader(path):
        rowhead = fields[0]
        values = fields[1:]
        if self.colheads is None:
            self.colheads = values
            continue
        # ****focal meta and last meta can be the same thing****
        if focal_metadata is not None and rowhead == focal_metadata:
            self.metarow = values
        if last_metadata is not None and rowhead == last_metadata:
            in_features = True
        if not in_features:
            continue
        code, name, stratum = util.fsplit(rowhead)
        if code != focal_feature or stratum is None:
            continue
        if stratum == c_unclassified_str and exclude_unclassified:
            continue
        self.focus_name = util.fjoin(code, name)
        self.rowheads.append(stratum)
        self.data.append(list(map(float, values)))

    # check that we found something
    if self.focus_name is None:
        util.die(
            "Requested feature <{}> was missing or not stratified".format(
                focal_feature))

    # update the table
    self.data = np.array(self.data)
    self.update()
def main():
    """Re-stratify the input table by inferred taxonomy and write the result.

    Steps, in order:

    1. Parse arguments and load the table from ``args.input``.
    2. Choose the taxonomy data file (``args.devdb`` if given, otherwise the
       entry for ``args.database`` in ``databases``; exits if neither applies)
       and build the taxmap.
    3. Drop taxmap entries whose taxon accounts for less than
       ``args.threshold`` of all taxmap entries.
    4. Decide, per input row and ``args.mode``, which output rows it
       contributes to.
    5. Sum contributing rows into the new table and write it to
       ``args.output``.
    6. Print a summary of how many stratified output rows received a taxon
       other than ``c_unclassified``. With no stratified rows the percentage
       is reported as 0.0 instead of raising ZeroDivisionError.
    """
    args = get_args()
    T = util.Table(args.input)

    # build the taxmap (uniref -> taxon mapping)
    print("Building taxonomic map for input table", file=sys.stderr)
    if args.devdb is not None:
        p_datafile = args.devdb
    elif args.database in databases:
        p_datafile = databases[args.database]
    else:
        sys.exit(
            "Must specify a valid database (from utility mapping or --devdb)")
    taxmap = build_taxmap(T.rowheads, args.level, args.lca_choice, p_datafile)

    # refine the taxmap (remove rare taxa)
    counts = Counter(taxmap.values())
    n_entries = float(sum(counts.values()))
    # an empty taxmap yields an empty comprehension, so no division happens
    fractions = {taxon: hits / n_entries for taxon, hits in counts.items()}
    taxmap = {
        old: new
        for old, new in taxmap.items()
        if fractions[new] >= args.threshold
    }

    # reindex the table (which rows to keep, which rows to pseudo-stratify)
    print("Reindexing the input table", file=sys.stderr)
    ticker = util.Ticker(T.rowheads)
    index = {}
    for i, rowhead in enumerate(T.rowheads):
        ticker.tick()
        feature, name, stratum = util.fsplit(rowhead)
        new_rowhead = tax_connect(rowhead, taxmap)
        # unmapped is never stratified
        if feature == util.c_unmapped:
            index.setdefault(rowhead, []).append(i)
        # outside of unclassified mode, keep totals
        elif stratum is None and args.mode != c_umode:
            index.setdefault(rowhead, []).append(i)
            # in totals mode, guess at taxonomy from uniref name
            if args.mode == c_tmode:
                index.setdefault(new_rowhead, []).append(i)
        # in unclassified mode, make a new row for the total...
        elif stratum == c_unclassified and args.mode == c_umode:
            index.setdefault(util.fjoin(feature, name, None), []).append(i)
            # ...then replace "unclassified" with inferred taxonomy
            index.setdefault(new_rowhead, []).append(i)
        # update strata in stratified mode
        elif stratum is not None and args.mode == c_smode:
            index.setdefault(new_rowhead, []).append(i)

    # rebuild the table
    print("Rebuilding the input table", file=sys.stderr)
    rowheads2, data2 = [], []
    ticker = util.Ticker(index)
    for rowhead in util.fsort(index):
        ticker.tick()
        rowheads2.append(rowhead)
        # each output row is the column-wise sum of its contributing rows
        newrow = [0 for _ in T.colheads]
        for i in index[rowhead]:
            oldrow = map(float, T.data[i])
            newrow = [a + b for a, b in zip(newrow, oldrow)]
        data2.append(newrow)
    T.rowheads = rowheads2
    T.data = data2
    print("Writing new table", file=sys.stderr)
    T.write(args.output, unfloat=True)

    # report on performance
    n_mapped, n_strata = 0, 0
    for rowhead in T.rowheads:
        feature, name, stratum = util.fsplit(rowhead)
        if stratum is not None:
            n_strata += 1
            if stratum != c_unclassified:
                n_mapped += 1
    # the output may contain no stratified rows at all (e.g. stratified mode
    # on an unstratified table); report 0.0% rather than dividing by zero
    percent = round(100 * n_mapped / float(n_strata), 1) if n_strata else 0.0
    print(
        "Summary: Of {TOTAL} stratifications, {SUCCESS} mapped at {TARGET} level ({PERCENT}%)"
        .format(
            TOTAL=n_strata,
            SUCCESS=n_mapped,
            TARGET=args.level,
            PERCENT=percent,
        ),
        file=sys.stderr,
    )
def main():
    """Re-stratify the input table by inferred taxonomy and write the result.

    Steps, in order:

    1. Parse arguments and load the table from ``args.input``.
    2. Choose the taxonomy data file (``args.dev`` if given, otherwise
       ``databases[args.resolution]``) and build the taxmap.
    3. Drop taxmap entries whose taxon accounts for less than
       ``args.threshold`` of all taxmap entries.
    4. Decide, per input row and ``args.mode``, which output rows it
       contributes to.
    5. Sum contributing rows into the new table and write it to
       ``args.output``.
    6. Print a summary of how many stratified output rows received a taxon
       other than ``c_unclassified``. With no stratified rows the percentage
       is reported as 0.0 instead of raising ZeroDivisionError.
    """
    args = get_args()
    tbl = util.Table(args.input)

    # build the taxmap
    print("Building taxonomic map for input table", file=sys.stderr)
    if args.dev is not None:
        p_datafile = args.dev
    else:
        p_datafile = databases[args.resolution]
    taxmap = build_taxmap(tbl.rowheads, args.level, p_datafile)

    # refine the taxmap
    counts = {}
    for taxon in taxmap.values():
        counts[taxon] = counts.get(taxon, 0) + 1
    n_entries = float(sum(counts.values()))
    # an empty taxmap yields an empty comprehension, so no division happens
    fractions = {taxon: hits / n_entries for taxon, hits in counts.items()}
    taxmap = {
        old: new
        for old, new in taxmap.items()
        if fractions[new] >= args.threshold
    }

    # reindex the table
    print("Reindexing the input table", file=sys.stderr)
    ticker = util.Ticker(tbl.rowheads)
    index = {}
    for i, rowhead in enumerate(tbl.rowheads):
        ticker.tick()
        feature, name, stratum = util.fsplit(rowhead)
        new_rowhead = tax_connect(rowhead, taxmap)
        # unmapped is never stratified
        if feature == util.c_unmapped:
            index.setdefault(rowhead, []).append(i)
        # outside of unclassified mode, keep totals
        elif stratum is None and args.mode != c_umode:
            index.setdefault(rowhead, []).append(i)
            # in totals mode, guess at taxonomy from uniref name
            if args.mode == c_tmode:
                index.setdefault(new_rowhead, []).append(i)
        elif stratum == c_unclassified and args.mode == c_umode:
            # in unclassified mode, make a new row for the total...
            index.setdefault(util.fjoin(feature, name, None), []).append(i)
            # ...then replace "unclassified" with inferred taxonomy
            index.setdefault(new_rowhead, []).append(i)
        elif stratum is not None and args.mode == c_smode:
            index.setdefault(new_rowhead, []).append(i)

    # rebuild the table
    print("Rebuilding the input table", file=sys.stderr)
    rowheads2, data2 = [], []
    ticker = util.Ticker(index)
    for rowhead in util.fsort(index):
        ticker.tick()
        rowheads2.append(rowhead)
        # each output row is the column-wise sum of its contributing rows
        newrow = [0 for _ in tbl.colheads]
        for i in index[rowhead]:
            oldrow = map(float, tbl.data[i])
            newrow = [a + b for a, b in zip(newrow, oldrow)]
        data2.append(newrow)
    tbl.rowheads = rowheads2
    tbl.data = data2

    # output
    print("Writing new table", file=sys.stderr)
    tbl.write(args.output, unfloat=True)

    # report on performance
    n_mapped, n_strata = 0, 0
    for rowhead in tbl.rowheads:
        feature, name, stratum = util.fsplit(rowhead)
        if stratum is not None:
            n_strata += 1
            if stratum != c_unclassified:
                n_mapped += 1
    # the output may contain no stratified rows at all (e.g. stratified mode
    # on an unstratified table); report 0.0% rather than dividing by zero
    percent = round(100 * n_mapped / float(n_strata), 1) if n_strata else 0.0
    print(
        "Summary: Of {TOTAL} stratifications, {SUCCESS} mapped at {TARGET} level ({PERCENT}%)"
        .format(
            TOTAL=n_strata,
            SUCCESS=n_mapped,
            TARGET=args.level,
            PERCENT=percent,
        ),
        file=sys.stderr,
    )