def action(args):
    """Write the lineage table for ``args.tax_ids`` to ``args.outfile`` as CSV.

    Attributes read from ``args``: ``url`` (database URL passed to
    ``sqlalchemy.create_engine``), ``verbosity``, ``schema``, ``tax_ids``
    and ``outfile`` (a writable text file object).
    """
    # SQL statements are echoed only at verbosity levels above 3.
    engine = sqlalchemy.create_engine(args.url, echo=args.verbosity > 3)
    try:
        tax = Taxonomy(engine, schema=args.schema)
        writer = csv.writer(args.outfile)
        for row in tax._get_lineage_table(args.tax_ids):
            writer.writerow(row)
    finally:
        # Release pooled connections even when the query or the write fails.
        engine.dispose()
def action(args):
    """Write a taxtable (one row per taxon, one column per rank) as CSV.

    The tax_ids come from the first of these that is set on ``args``:
    ``tax_ids`` (an iterable of ids), ``tax_id_file`` (a readable file of
    whitespace-separated ids) or ``seq_info`` (a readable CSV file with a
    ``tax_id`` column). The process exits with an error message when none
    of them is provided.

    Other attributes read from ``args``: ``url``, ``verbosity``, ``schema``
    and ``outfile`` (a writable text file object).
    """
    log.info('reading tax_ids')
    if args.tax_ids:
        tax_ids = set(args.tax_ids)
    elif args.tax_id_file:
        tax_ids = set(args.tax_id_file.read().split())
    elif args.seq_info:
        tax_ids = {row['tax_id'] for row in csv.DictReader(args.seq_info)}
    else:
        sys.exit('Error: no tax_ids were specified')

    engine = sqlalchemy.create_engine(args.url, echo=args.verbosity > 3)
    tax = Taxonomy(engine, schema=args.schema)

    rows = tax._get_lineage_table(tax_ids)

    log.info('grouping lineages')
    all_ranks = set()
    taxtable = {}
    # Sentinel that lets us detect an empty lineage table after the loop;
    # otherwise ``tax_id`` would be unbound below.
    no_lineage = object()
    tax_id = no_lineage
    for tax_id, grp in groupby(rows, lambda row: row[0]):
        ranks, tax_rows = as_taxtable_rows(grp, seen=taxtable)
        taxtable.update(dict(tax_rows))
        all_ranks |= set(ranks)

    if tax_id is no_lineage:
        log.warning('no lineages were found for the specified tax_ids')
    else:
        # guppy requires that tax_id == parent_id for the root node;
        # identify the root node by calculating an arbitrary lineage
        # (the last tax_id seen in the loop above).
        root_id = tax.lineage(tax_id)['root']
        taxtable[root_id]['parent_id'] = root_id

    sorted_ranks = sorted(all_ranks, key=order_ranks(tax.ranks[::-1]))

    # guppy requires this column order
    fieldnames = ['tax_id', 'parent_id', 'rank', 'tax_name'] + sorted_ranks

    log.info('sorting lineages')
    # Missing or empty rank values sort as '' so that rows stay comparable.
    output = sorted(
        taxtable.values(),
        key=lambda row: tuple(row.get(rank) or '' for rank in sorted_ranks)
    )

    log.info('writing taxtable')
    writer = csv.DictWriter(
        args.outfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
    writer.writeheader()
    writer.writerows(output)
def action(args):
    """Dump the nodes and names of one source as YAML documents.

    Selects every row of the nodes and names tables whose source is named
    ``args.source_name`` and writes them to ``args.outfile``: first one
    document per node (with its names attached under ``'names'``), then one
    document per tax_id that has names but no node from this source.

    Attributes read from ``args``: ``url``, ``verbosity``, ``schema``,
    ``source_name`` and ``outfile`` (a writable text file object).
    """
    engine = sqlalchemy.create_engine(args.url, echo=args.verbosity > 2)
    tax = Taxonomy(engine, schema=args.schema)

    with engine.connect() as con:
        cmd = (
            " select nodes.*, source.name as source_name from {nodes}"
            " join {source} on nodes.source_id = source.id"
            " where source.name = {x} "
        ).format(x=tax.placeholder, nodes=tax.nodes, source=tax.source)

        result = con.execute(cmd, (args.source_name,))
        keys = list(result.keys())
        nodes = [clean_dict(keys, vals) for vals in result.fetchall()]

        # get the complete lineage for each node, and provide an
        # ordering for all nodes so that children may be placed after
        # parents: each tax_id is ranked by the position of its first
        # appearance in the lineage table.
        tax_ids = list(map(itemgetter('tax_id'), nodes))
        lineages = tax._get_lineage_table(tax_ids)
        ordering = {}
        for i, lineage in enumerate(lineages):
            tax_id = lineage[1]
            if tax_id not in ordering:
                ordering[tax_id] = i

        nodes = sorted(nodes, key=lambda n: ordering[n['tax_id']])

        cmd = (
            " select names.*, source.name as source_name from {names}"
            " join {source} on names.source_id = source.id"
            " where source.name = {x} "
        ).format(x=tax.placeholder, names=tax.names, source=tax.source)

        result = con.execute(cmd, (args.source_name,))
        keys = list(result.keys())
        names = [clean_dict(keys, vals) for vals in result.fetchall()]

    # Group names by tax_id. The query has no ORDER BY, so rows for one
    # tax_id are not guaranteed to be adjacent; accumulating into a dict
    # (rather than itertools.groupby, which only groups consecutive rows)
    # keeps every name.
    namedict = {}
    for name in names:
        namedict.setdefault(name['tax_id'], []).append(name)

    for node in nodes:
        node['type'] = 'node'
        tax_id = node['tax_id']
        if tax_id in namedict:
            # pop so that only names without a node remain afterwards
            node['names'] = namedict.pop(tax_id)

    yaml.safe_dump_all(nodes, args.outfile, default_flow_style=False,
                       explicit_start=True, indent=2)

    # prepare remaining names: tax_id is hoisted to the enclosing record
    # and removed from each individual name
    remaining_names = []
    for tax_id, names in list(namedict.items()):
        for name in names:
            del name['tax_id']

        remaining_names.append({
            'tax_id': tax_id,
            'type': 'name',
            'names': names
        })

    yaml.safe_dump_all(remaining_names, args.outfile,
                       default_flow_style=False,
                       explicit_start=True, indent=2)
def action(args):
    """Export the nodes and names of a single source as a YAML stream.

    Rows of the nodes and names tables whose source name equals
    ``args.source_name`` are written to ``args.outfile`` as YAML documents:
    node documents first (each carrying its names under ``'names'`` when it
    has any), followed by one ``'type': 'name'`` document for every tax_id
    whose names were not attached to a node.

    Attributes read from ``args``: ``url``, ``verbosity``, ``schema``,
    ``source_name`` and ``outfile`` (a writable text file object).
    """
    engine = sqlalchemy.create_engine(args.url, echo=args.verbosity > 2)
    tax = Taxonomy(engine, schema=args.schema)

    with engine.connect() as con:
        cmd = (
            " select nodes.*, source.name as source_name from {nodes}"
            " join {source} on nodes.source_id = source.id"
            " where source.name = {x} "
        ).format(x=tax.placeholder, nodes=tax.nodes, source=tax.source)

        result = con.execute(cmd, (args.source_name, ))
        keys = list(result.keys())
        nodes = [clean_dict(keys, vals) for vals in result.fetchall()]

        # get the complete lineage for each node, and provide an
        # ordering for all nodes so that children may be placed after
        # parents. A tax_id's sort position is the index of the first
        # lineage-table row in which it appears.
        tax_ids = list(map(itemgetter('tax_id'), nodes))
        lineages = tax._get_lineage_table(tax_ids)
        ordering = {}
        for i, lineage in enumerate(lineages):
            tax_id = lineage[1]
            if tax_id not in ordering:
                ordering[tax_id] = i

        nodes = sorted(nodes, key=lambda n: ordering[n['tax_id']])

        cmd = (
            " select names.*, source.name as source_name from {names}"
            " join {source} on names.source_id = source.id"
            " where source.name = {x} "
        ).format(x=tax.placeholder, names=tax.names, source=tax.source)

        result = con.execute(cmd, (args.source_name, ))
        keys = list(result.keys())
        names = [clean_dict(keys, vals) for vals in result.fetchall()]

    # Collect names per tax_id without assuming any row order: the query is
    # unordered, and itertools.groupby would split non-adjacent rows of the
    # same tax_id into separate groups, the last of which would overwrite
    # the others in a dict.
    namedict = {}
    for name in names:
        namedict.setdefault(name['tax_id'], []).append(name)

    for node in nodes:
        node['type'] = 'node'
        tax_id = node['tax_id']
        if tax_id in namedict:
            # consume the entry; whatever is left has no matching node
            node['names'] = namedict.pop(tax_id)

    yaml.safe_dump_all(nodes, args.outfile, default_flow_style=False,
                       explicit_start=True, indent=2)

    # prepare remaining names; each name drops its own tax_id because the
    # enclosing record already carries it
    remaining_names = []
    for tax_id, names in list(namedict.items()):
        for name in names:
            del name['tax_id']

        remaining_names.append({
            'tax_id': tax_id,
            'type': 'name',
            'names': names
        })

    yaml.safe_dump_all(remaining_names, args.outfile,
                       default_flow_style=False,
                       explicit_start=True, indent=2)