import sys
import csv
import argparse
from collections import defaultdict

import sourmash_lib
from sourmash_lib import sourmash_args
from . import lca_utils
from .lca_utils import LineagePair, zip_lineage

# NOTE: notify/error/debug/set_debug/set_quiet/print_results, along with
# helpers such as classify_signature, summarize, make_lca_counts,
# generate_report, gather_main, and DEFAULT_THRESHOLD, are assumed to be
# imported or defined elsewhere in this module.


def load_taxonomy_assignments(filename, delimiter=',', start_column=2,
                              use_headers=True, force=False):
    # parse spreadsheet!
    fp = open(filename, 'rt', newline='')
    r = csv.reader(fp, delimiter=delimiter)

    row_headers = ['identifiers']
    row_headers += ['_skip_'] * (start_column - 2)
    row_headers += list(lca_utils.taxlist())

    # first check that headers are interpretable.
    if use_headers:
        notify('examining spreadsheet headers...')
        first_row = next(iter(r))

        n_disagree = 0
        for (column, value) in zip(row_headers, first_row):
            if column == '_skip_':
                continue

            if column.lower() != value.lower():
                notify("** assuming column '{}' is {} in spreadsheet",
                       value, column)
                n_disagree += 1
                if n_disagree > 2:
                    error('whoa, too many assumptions. are the headers right?')
                    error('expecting {}', ",".join(row_headers))
                    if not force:
                        sys.exit(-1)
                    notify('...continuing, because --force was specified.')

    # convert each remaining row into a tuple of lineage pairs
    assignments = {}
    num_rows = 0
    for row in r:
        if row and row[0].strip():        # want non-empty row
            num_rows += 1
            lineage = list(zip(row_headers, row))
            lineage = [x for x in lineage if x[0] != '_skip_']

            ident = lineage[0][1]
            lineage = lineage[1:]

            # clean lineage of null names, replace with 'unassigned'
            lineage = [(a, lca_utils.filter_null(b)) for (a, b) in lineage]
            lineage = [LineagePair(a, b) for (a, b) in lineage]

            # remove end nulls
            while lineage and lineage[-1].name == 'unassigned':
                lineage = lineage[:-1]

            # store lineage tuple
            if lineage:
                assignments[ident] = tuple(lineage)

    fp.close()

    return assignments, num_rows
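
# A minimal sketch (hypothetical data; not called anywhere) of what the loader
# above does with one spreadsheet row: empty rank names become 'unassigned'
# via lca_utils.filter_null, and trailing unassigned ranks are trimmed, so the
# stored lineage tuple ends at the last assigned rank.
def _example_taxonomy_row():
    row_headers = ['identifiers'] + list(lca_utils.taxlist())
    row = ['ident1', 'Bacteria', 'Proteobacteria', '', '', '', '', '']
    lineage = [LineagePair(a, lca_utils.filter_null(b))
               for (a, b) in zip(row_headers[1:], row[1:])]
    while lineage and lineage[-1].name == 'unassigned':
        lineage = lineage[:-1]
    # => {'ident1': (LineagePair('superkingdom', 'Bacteria'),
    #                LineagePair('phylum', 'Proteobacteria'))}
    return {row[0]: tuple(lineage)}
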

usage = '''sourmash lca <command> [<args>]

Commands can be:

index <taxonomy.csv> <output_db name> <signature [...]>   - create LCA database
classify --db <db_name [...]> --query <signature [...]>   - classify genomes
summarize --db <db_name [...]> --query <signature [...]>  - summarize mixture
rankinfo <db_name [...]>                                   - database rank info
compare_csv <csv1> <csv2>                                  - compare spreadsheets

Use '-h' to get subcommand-specific help, e.g.

sourmash lca index -h
'''


def main(sysv_args):
    set_quiet(False)

    commands = {'classify': classify,
                'index': index,
                'summarize': summarize_main,
                'rankinfo': rankinfo_main,
                'gather': gather_main,   # (not listed in the usage text above)
                'compare_csv': compare_csv}

    parser = argparse.ArgumentParser(
        description='lowest-common ancestor (LCA) utilities', usage=usage)
    parser.add_argument('lca_command', nargs='?')
    args = parser.parse_args(sysv_args[0:1])

    if not args.lca_command:
        print(usage)
        sys.exit(1)

    if args.lca_command not in commands:
        error('Unrecognized command: {}', args.lca_command)
        parser.print_help()
        sys.exit(1)

    cmd = commands.get(args.lca_command)
    cmd(sysv_args[1:])
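
# Example dispatch (hypothetical invocation): 'sourmash lca classify --db
# db.lca.json --query q.sig' reaches this function as
#
#   main(['classify', '--db', 'db.lca.json', '--query', 'q.sig'])
#
# Only sysv_args[0:1] is parsed above; everything after the subcommand name is
# handed unparsed to that subcommand's own ArgumentParser.
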

def rankinfo_main(args):
    """
    rankinfo!
    """
    p = argparse.ArgumentParser(prog="sourmash lca rankinfo")
    p.add_argument('db', nargs='+')
    p.add_argument('--scaled', type=float)
    p.add_argument('-d', '--debug', action='store_true')
    args = p.parse_args(args)

    if not args.db:
        error('Error! must specify at least one LCA database')
        sys.exit(-1)

    if args.debug:
        set_debug(args.debug)

    if args.scaled:
        args.scaled = int(args.scaled)

    # load all the databases
    dblist, ksize, scaled = lca_utils.load_databases(args.db, args.scaled)

    # count all the LCAs across these databases
    counts = make_lca_counts(dblist)

    # collect counts across all ranks
    counts_by_rank = defaultdict(int)
    for lineage, count in counts.items():
        if lineage:
            lineage_tup = lineage[-1]
            counts_by_rank[lineage_tup.rank] += count

    # output!
    total = float(sum(counts_by_rank.values()))
    for rank in lca_utils.taxlist():
        count = counts_by_rank.get(rank, 0)
        print('{}: {} ({:.1f}%)'.format(rank, count, count / total * 100.))
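
# A minimal sketch (hypothetical counts; not called anywhere) of the rank
# binning above: each LCA lineage contributes its count at its most specific
# (last) rank.
def _example_rank_binning():
    counts = {
        (LineagePair('superkingdom', 'Bacteria'),): 10,
        (LineagePair('superkingdom', 'Bacteria'),
         LineagePair('phylum', 'Proteobacteria')): 5,
    }
    counts_by_rank = defaultdict(int)
    for lineage, count in counts.items():
        counts_by_rank[lineage[-1].rank] += count
    return dict(counts_by_rank)   # {'superkingdom': 10, 'phylum': 5}
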

def classify(args):
    """
    main single-genome classification function.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--db', nargs='+', action='append')
    p.add_argument('--query', nargs='+', action='append')
    p.add_argument('--threshold', type=int, default=DEFAULT_THRESHOLD)
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   help='output CSV to this file instead of stdout')
    p.add_argument('--scaled', type=float)
    p.add_argument('--traverse-directory', action='store_true',
                   help='load all signatures underneath directories.')
    p.add_argument('-d', '--debug', action='store_true')
    args = p.parse_args(args)

    if not args.db:
        error('Error! must specify at least one LCA database with --db')
        sys.exit(-1)

    if not args.query:
        error('Error! must specify at least one query signature with --query')
        sys.exit(-1)

    if args.debug:
        set_debug(args.debug)

    # flatten --db and --query
    args.db = [item for sublist in args.db for item in sublist]
    args.query = [item for sublist in args.query for item in sublist]

    # load all the databases
    dblist, ksize, scaled = lca_utils.load_databases(args.db, args.scaled)
    notify('ksize={} scaled={}', ksize, scaled)

    # find all the queries
    notify('finding query signatures...')
    if args.traverse_directory:
        inp_files = list(sourmash_args.traverse_find_sigs(args.query))
    else:
        inp_files = list(args.query)

    # set up output
    csvfp = csv.writer(sys.stdout)
    if args.output:
        notify("outputting classifications to '{}'", args.output.name)
        csvfp = csv.writer(args.output)
    else:
        notify("outputting classifications to stdout")

    csvfp.writerow(['ID', 'status'] + list(lca_utils.taxlist()))

    # for each query, gather all the matches across databases
    total_count = 0
    n = 0
    total_n = len(inp_files)
    for query_filename in inp_files:
        n += 1
        for query_sig in sourmash_lib.load_signatures(query_filename,
                                                      ksize=ksize):
            notify(u'\r\033[K', end=u'')
            notify('... classifying {} (file {} of {})', query_sig.name(),
                   n, total_n, end='\r')
            debug('classifying', query_sig.name())
            total_count += 1

            # make sure we're looking at the same scaled value as database
            query_sig.minhash = query_sig.minhash.downsample_scaled(scaled)

            # do the classification
            lineage, status = classify_signature(query_sig, dblist,
                                                 args.threshold)
            debug(lineage)

            # output each classification to the spreadsheet
            row = [query_sig.name(), status]
            row += lca_utils.zip_lineage(lineage)

            # when outputting to stdout, make output intelligible
            if not args.output:
                notify(u'\r\033[K', end=u'')
            csvfp.writerow(row)

    notify(u'\r\033[K', end=u'')
    notify('classified {} signatures total', total_count)
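
# Example output (hypothetical query): a signature named 'genome1' classified
# down to phylum level yields a CSV row along the lines of
#
#   genome1,<status>,Bacteria,Proteobacteria,,,...
#
# where <status> is the status string returned by classify_signature and the
# remaining rank columns are left empty.
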

def index(args):
    """
    main function for building an LCA database.
    """
    p = argparse.ArgumentParser()
    p.add_argument('csv', help='taxonomy spreadsheet')
    p.add_argument('lca_db_out', help='name to save database to')
    p.add_argument('signatures', nargs='+',
                   help='one or more sourmash signatures')
    p.add_argument('--scaled', default=10000, type=float)
    p.add_argument('-k', '--ksize', default=31, type=int)
    p.add_argument('-d', '--debug', action='store_true')
    p.add_argument('-C', '--start-column', default=2, type=int,
                   help='column at which taxonomic assignments start')
    p.add_argument('--tabs', action='store_true',
                   help='input spreadsheet is tab-delimited (default: commas)')
    p.add_argument('--no-headers', action='store_true',
                   help='no headers present in taxonomy spreadsheet')
    p.add_argument('--split-identifiers', action='store_true',
                   help='split names in signatures on whitespace and period')
    p.add_argument('-f', '--force', action='store_true')
    p.add_argument('--traverse-directory', action='store_true',
                   help='load all signatures underneath directories.')
    p.add_argument('--report', help='output a report on anomalies, if any.')
    args = p.parse_args(args)

    if args.start_column < 2:
        error('error, --start-column cannot be less than 2')
        sys.exit(-1)

    if args.debug:
        set_debug(args.debug)

    args.scaled = int(args.scaled)

    # first, load taxonomy spreadsheet
    delimiter = ','
    if args.tabs:
        delimiter = '\t'
    assignments, num_rows = load_taxonomy_assignments(args.csv,
                                           delimiter=delimiter,
                                           start_column=args.start_column,
                                           use_headers=not args.no_headers,
                                           force=args.force)

    # convert lineages to numbers.
    next_lineage_index = 0
    lineage_dict = {}

    assignments_idx = {}
    lineage_to_idx = {}
    for (ident, lineage) in assignments.items():
        idx = lineage_to_idx.get(lineage)
        if idx is None:
            idx = next_lineage_index
            next_lineage_index += 1

            lineage_dict[idx] = lineage
            lineage_to_idx[lineage] = idx

        assignments_idx[ident] = idx

    notify('{} distinct lineages in spreadsheet out of {} rows',
           len(lineage_dict), num_rows)

    # load signatures, construct index of hashvals to lineages
    hashval_to_lineage = defaultdict(set)
    md5_to_lineage = {}

    notify('finding signatures...')
    if args.traverse_directory:
        yield_all_files = False           # only pick up *.sig files?
        if args.force:
            yield_all_files = True
        inp_files = list(sourmash_args.traverse_find_sigs(args.signatures,
                                          yield_all_files=yield_all_files))
    else:
        inp_files = list(args.signatures)

    n = 0
    total_n = len(inp_files)
    record_duplicates = set()
    record_no_lineage = set()
    record_remnants = set(assignments_idx.keys())
    for filename in inp_files:
        n += 1
        for sig in sourmash_lib.load_signatures(filename, ksize=args.ksize):
            notify(u'\r\033[K', end=u'')
            notify('... loading signature {} (file {} of {})',
                   sig.name()[:30], n, total_n, end='\r')
            debug(filename, sig.name())

            if sig.md5sum() in md5_to_lineage:
                notify('\nWARNING: in file {}, duplicate md5sum: {}; skipping',
                       filename, sig.md5sum())
                record_duplicates.add(filename)
                continue

            name = sig.name()
            if args.split_identifiers:  # hack for NCBI-style names, etc.
                name = name.split(' ')[0].split('.')[0]

            # is this one for which we have a lineage assigned?
            lineage_idx = assignments_idx.get(name)
            if lineage_idx is None:
                notify('\nWARNING: no lineage assignment for {}.', name)
                record_no_lineage.add(name)
            else:
                # remove from our list of remnant lineages
                record_remnants.remove(name)

                # downsample to specified scaled; this has the side effect of
                # making sure they're all at the same scaled value!
                minhash = sig.minhash.downsample_scaled(args.scaled)

                # connect hashvals to lineage
                for hashval in minhash.get_mins():
                    hashval_to_lineage[hashval].add(lineage_idx)

                # store md5 -> lineage too
                md5_to_lineage[sig.md5sum()] = lineage_idx

    notify(u'\r\033[K', end=u'')
    notify('...found {} genomes with lineage assignments!!',
           len(md5_to_lineage))

    # remove those lineages with no genomes associated
    assigned_lineages = set(md5_to_lineage.values())
    lineage_dict_2 = {}
    for idx in assigned_lineages:
        lineage_dict_2[idx] = lineage_dict[idx]

    unused_lineages = set(lineage_dict.values()) - set(lineage_dict_2.values())

    notify('{} assigned lineages out of {} distinct lineages in spreadsheet',
           len(lineage_dict_2), len(lineage_dict))
    lineage_dict = lineage_dict_2

    # now, save!
    db_outfile = args.lca_db_out
    if not (db_outfile.endswith('.lca.json') or
            db_outfile.endswith('.lca.json.gz')):
        db_outfile += '.lca.json'
    notify('saving to LCA DB: {}'.format(db_outfile))

    db = lca_utils.LCA_Database()
    db.lineage_dict = lineage_dict
    db.hashval_to_lineage_id = hashval_to_lineage
    db.ksize = int(args.ksize)
    db.scaled = int(args.scaled)
    db.signatures_to_lineage = md5_to_lineage

    db.save(db_outfile)

    if record_duplicates or record_no_lineage or record_remnants or unused_lineages:
        if record_duplicates:
            notify('WARNING: {} duplicate signatures.', len(record_duplicates))
        if record_no_lineage:
            notify('WARNING: no lineage provided for {} signatures.',
                   len(record_no_lineage))
        if record_remnants:
            notify('WARNING: no signatures for {} lineage assignments.',
                   len(record_remnants))
        if unused_lineages:
            notify('WARNING: {} unused lineages.', len(unused_lineages))

        if args.report:
            notify("generating a report and saving in '{}'", args.report)
            generate_report(record_duplicates, record_no_lineage,
                            record_remnants, unused_lineages, args.report)
        else:
            notify('(You can use --report to generate a detailed report.)')
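
# A minimal sketch (hypothetical values; not called anywhere) of the index
# structure built above: each hashval maps to the set of lineage indices it
# occurs in, and the integer indices ensure each distinct lineage is stored
# only once in lineage_dict.
def _example_hashval_index():
    hashval_to_lineage = defaultdict(set)
    for hashval, lineage_idx in [(1001, 0), (1002, 0), (1002, 1)]:
        hashval_to_lineage[hashval].add(lineage_idx)
    return dict(hashval_to_lineage)   # {1001: {0}, 1002: {0, 1}}
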

def summarize_main(args):
    """
    main summarization function.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--db', nargs='+', action='append')
    p.add_argument('--query', nargs='+', action='append')
    p.add_argument('--threshold', type=int, default=DEFAULT_THRESHOLD)
    p.add_argument('--traverse-directory', action='store_true',
                   help='load all signatures underneath directories.')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   help='CSV output')
    p.add_argument('--scaled', type=float)
    p.add_argument('-d', '--debug', action='store_true')
    args = p.parse_args(args)

    if not args.db:
        error('Error! must specify at least one LCA database with --db')
        sys.exit(-1)

    if not args.query:
        error('Error! must specify at least one query signature with --query')
        sys.exit(-1)

    if args.debug:
        set_debug(args.debug)

    if args.scaled:
        args.scaled = int(args.scaled)

    # flatten --db and --query
    args.db = [item for sublist in args.db for item in sublist]
    args.query = [item for sublist in args.query for item in sublist]

    # load all the databases
    dblist, ksize, scaled = lca_utils.load_databases(args.db, args.scaled)
    notify('ksize={} scaled={}', ksize, scaled)

    # find all the queries
    notify('finding query signatures...')
    if args.traverse_directory:
        inp_files = list(sourmash_args.traverse_find_sigs(args.query))
    else:
        inp_files = list(args.query)

    # for each query, gather all the hashvals across databases
    total_count = 0
    n = 0
    total_n = len(inp_files)
    hashvals = defaultdict(int)
    for query_filename in inp_files:
        n += 1
        for query_sig in sourmash_lib.load_signatures(query_filename,
                                                      ksize=ksize):
            notify(u'\r\033[K', end=u'')
            notify('... loading {} (file {} of {})', query_sig.name(),
                   n, total_n, end='\r')
            total_count += 1

            mh = query_sig.minhash.downsample_scaled(scaled)
            for hashval in mh.get_mins():
                hashvals[hashval] += 1

    notify(u'\r\033[K', end=u'')
    notify('loaded {} signatures from {} files total.', total_count, n)

    # get the full counted list of lineage counts in this signature
    lineage_counts = summarize(hashvals, dblist, args.threshold)

    # output!
    total = float(len(hashvals))
    for (lineage, count) in lineage_counts.items():
        if lineage:
            lineage = lca_utils.zip_lineage(lineage, truncate_empty=True)
            lineage = ';'.join(lineage)
        else:
            lineage = '(root)'

        p = count / total * 100.
        p = '{:.1f}%'.format(p)

        print_results('{:5} {:>5} {}'.format(p, count, lineage))

    # CSV:
    if args.output:
        w = csv.writer(args.output)
        headers = ['count'] + list(lca_utils.taxlist())
        w.writerow(headers)

        for (lineage, count) in lineage_counts.items():
            debug('lineage:', lineage)
            row = [count] + lca_utils.zip_lineage(lineage)
            w.writerow(row)
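
# Example output line (hypothetical numbers): with 40 of 200 distinct hashvals
# assigned to Bacteria;Proteobacteria, the loop above prints
#
#   20.0%    40 Bacteria;Proteobacteria
#
# i.e. percent of hashvals, then raw hashval count, then the ';'-joined
# lineage.
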

def compare_csv(args):
    p = argparse.ArgumentParser()
    p.add_argument('csv1', help='taxonomy spreadsheet output by classify')
    p.add_argument('csv2', help='custom taxonomy spreadsheet')
    p.add_argument('-d', '--debug', action='store_true')
    p.add_argument('-C', '--start-column', default=2, type=int,
                   help='column at which taxonomic assignments start')
    p.add_argument('--tabs', action='store_true',
                   help='input spreadsheet is tab-delimited (default: commas)')
    p.add_argument('--no-headers', action='store_true',
                   help='no headers present in taxonomy spreadsheet')
    p.add_argument('-f', '--force', action='store_true')
    args = p.parse_args(args)

    if args.start_column < 2:
        error('error, --start-column cannot be less than 2')
        sys.exit(-1)

    if args.debug:
        set_debug(args.debug)

    # first, load classify-style spreadsheet
    notify('loading classify output from: {}', args.csv1)
    assignments0, num_rows0 = load_taxonomy_assignments(args.csv1,
                                                        start_column=3)
    notify('loaded {} distinct lineages, {} rows',
           len(set(assignments0.values())), num_rows0)
    notify('----')

    # next, load custom taxonomy spreadsheet
    delimiter = ','
    if args.tabs:
        delimiter = '\t'
    notify('loading custom spreadsheet from: {}', args.csv2)
    assignments, num_rows = load_taxonomy_assignments(args.csv2,
                                           delimiter=delimiter,
                                           start_column=args.start_column,
                                           use_headers=not args.no_headers,
                                           force=args.force)
    notify('loaded {} distinct lineages, {} rows',
           len(set(assignments.values())), num_rows)

    # now, compute basic differences:
    missing_1 = set(assignments0.keys()) - set(assignments.keys())
    missing_2 = set(assignments.keys()) - set(assignments0.keys())
    if missing_2:
        notify('missing {} assignments in classify spreadsheet.',
               len(missing_2))
    if missing_1:
        notify('missing {} assignments in custom spreadsheet.',
               len(missing_1))
    if missing_1 or missing_2:
        notify('(these will not be evaluated any further)')
    else:
        notify('note: all IDs are in both spreadsheets!')

    # next, look at differences in lineages
    common = set(assignments0.keys())
    common.intersection_update(assignments.keys())

    n_total = 0
    n_different = 0
    n_compat = 0
    n_incompat = 0
    incompat_rank = defaultdict(int)
    for k in common:
        n_total += 1
        v0 = assignments0[k]
        v1 = assignments[k]
        if v0 != v1:
            n_different += 1

            tree = lca_utils.build_tree([v0])
            lca_utils.build_tree([v1], tree)

            lca, reason = lca_utils.find_lca(tree)
            if reason == 0:               # compatible lineages
                n_compat += 1
                print_results("{},compatible,{}", k,
                              ";".join(zip_lineage(lca)))
            else:
                n_incompat += 1
                print_results("{},incompatible,{}", k,
                              ";".join(zip_lineage(lca)))
                rank = next(iter(lca_utils.taxlist()))
                if lca:
                    rank = lca[-1].rank
                incompat_rank[rank] += 1

    notify("{} total assignments, {} differ between spreadsheets.",
           n_total, n_different)
    notify("{} are compatible (one lineage is an ancestor of the other).",
           n_compat)
    notify("{} are incompatible (there is a disagreement in the trees).",
           n_incompat)

    if n_incompat:
        for rank in lca_utils.taxlist():
            notify('{} incompatible at rank {}', incompat_rank[rank], rank)
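
# Example comparison (hypothetical lineages): if csv1 assigns ident1
# 'Bacteria;Proteobacteria' and csv2 assigns it plain 'Bacteria', the merged
# tree has no branch point, so find_lca reports reason == 0 and ident1 is
# printed as 'compatible'. If csv2 instead said 'Archaea', the two trees
# disagree at the very first rank, and ident1 is counted as incompatible at
# rank 'superkingdom'.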