def main():
    """Load signature minhashes and an LCA database, then summarize
    taxonomic purity of the minhashes against that database.

    Command-line arguments:
        sigs:   one or more signature files to load minhashes from
        lca_db: a single LCA database file
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('sigs', nargs='+')
    parser.add_argument('lca_db')
    args = parser.parse_args()

    minhashes = []
    for filename in args.sigs:
        ss = sourmash.load_one_signature(filename)
        minhashes.append(ss.minhash)

    # load the LCA database
    dblist, ksize, scaled = lca_utils.load_databases([args.lca_db], None)
    db = dblist[0]

    # double check scaled requirements: if the database is coarser (larger
    # scaled value) than the minhashes, many hashes will go unclassified.
    # NOTE: strict '>' here -- equal scaled values are compatible and must
    # not trigger the warning (previously '>=' warned spuriously on equality).
    some_mh = minhashes[0]
    mh_scaled = some_mh.scaled
    if scaled > mh_scaled:
        print('** warning: many minhashes will go unclassified because LCA database scaled is {}'.format(scaled), file=sys.stderr)
        print('** warning: the minhash scaled is {}'.format(mh_scaled), file=sys.stderr)

    summarize_taxonomic_purity(minhashes, db, verbose=True,
                               filenames=args.sigs)
def test_databases():
    """Load two test LCA databases and verify their ksize/scaled metadata."""
    paths = [utils.get_test_data('lca/delmont-{}.lca.json'.format(n))
             for n in (1, 2)]

    dblist, ksize, scaled = lca_utils.load_databases(paths)
    print(dblist)

    assert len(dblist) == 2
    assert ksize == 31
    assert scaled == 10000
def main(args):
    """Count LCAs across an LCA database and write confused hashvals.

    Parses `args` (a list of command-line strings), loads exactly one LCA
    database, counts LCAs with make_lca_counts(), and writes the confused
    hash values to 'confused_hashvals.txt' in the current directory.
    """
    p = argparse.ArgumentParser()
    p.add_argument('db', nargs='+')
    p.add_argument('--scaled', type=float)
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-d', '--debug', action='store_true',
                   help='output debugging output')
    p.add_argument('--minimum-num', type=int, default=0,
                   help='Minimum number of different lineages a k-mer must be in to be counted')
    p.add_argument('--minimum-hashes', type=int, default=5,
                   help='Minimum number of hashes lineages must share to be reported')
    p.add_argument('--lowest-rank', default='phylum')
    p.add_argument('--prefix', default=None, help='prefix for output files')
    args = p.parse_args(args)

    # 'db' is a positional argument with nargs='+', so argparse itself
    # rejects an empty list; this is a defensive check. (Old message
    # wrongly told the user to pass '--db', which does not exist.)
    if not args.db:
        error('Error! must specify at least one LCA database')
        sys.exit(-1)

    set_quiet(args.quiet, args.debug)

    # --scaled is parsed as float so exponent notation (e.g. 1e5) works;
    # downstream code needs an int.
    if args.scaled:
        args.scaled = int(args.scaled)

    # load all the databases
    print('loading databases:', args.db)
    dblist, ksize, scaled = lca_utils.load_databases(args.db, args.scaled)
    assert len(dblist) == 1

    # count all the LCAs across these databases
    counts, confused_hashvals = make_lca_counts(dblist,
                                                lowest_rank=args.lowest_rank,
                                                min_num=args.minimum_num,
                                                min_hashes=args.minimum_hashes,
                                                prefix=args.prefix)

    with open('confused_hashvals.txt', 'wt') as fp:
        fp.write("\n".join([str(i) for i in confused_hashvals]))
def main():
    """Summarize taxonomic purity of per-node minhashes against an LCA db.

    Command-line arguments:
        filename: basename of the pickled minhash dict ('.node_mh' is appended)
        lca_db:   a single LCA database file
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('filename')
    parser.add_argument('lca_db')
    args = parser.parse_args()

    # load the minhashes calculated by search.characterize_catlas_regions.
    # Use a context manager so the file handle is closed (the old
    # pickle.load(open(...)) leaked it).
    # NOTE: pickle.load is only safe on trusted, locally-produced files.
    with open(args.filename + '.node_mh', 'rb') as fp:
        group_ident = pickle.load(fp)

    # load the LCA database
    dblist, ksize, scaled = lca_utils.load_databases([args.lca_db], None)
    db = dblist[0]

    # double check scaled requirements: warn if the database is coarser
    # (larger scaled value) than the minhashes.
    some_mh = next(iter(group_ident.values()))
    mh_scaled = some_mh.scaled
    if scaled > mh_scaled:
        print('** warning: many minhashes will go unclassified because LCA database scaled is {}'.format(scaled), file=sys.stderr)
        print('** warning: the minhash scaled is {}'.format(mh_scaled), file=sys.stderr)

    summarize_taxonomic_purity(group_ident.values(), db, verbose=True)
def main(args):
    """Count k-mers shared across lineages and write a summary CSV.

    Loads the databases named in args.db, tallies shared k-mers per
    lineage, and writes one CSV row per lineage to args.csv.
    """
    # load all the databases
    dblist, ksize, scaled = lca_utils.load_databases(args.db, args.ksize,
                                                     args.scaled)

    # count all the shared kmers across these databases
    counts, total_kmer_count = count_shared_kmers(dblist)

    # write out: one row per lineage with its shared-kmer count and the
    # fraction of the total, rounded to two decimal places.
    with open(args.csv, 'wt') as fp:
        writer = csv.writer(fp)
        writer.writerow(['rank', 'lineage', 'num_shared_kmers',
                         'percent_shared_kmers'])

        for lineage, shared in counts.items():
            fraction = shared / total_kmer_count
            writer.writerow([lineage[-1].rank,
                             lca_utils.display_lineage(lineage),
                             str(shared),
                             str(round(fraction, 2))])