def test_jaccard_on_real_data():
    """Compare two stored n=10000 sketches at progressively smaller sizes.

    The sketches are compared in both directions at full size and again
    after each successive ``downsample_n`` step.
    """
    from sourmash_lib.signature import load_signatures

    def first_minhash(relpath):
        # Only the first signature stored in each test file is used.
        path = utils.get_test_data(relpath)
        return list(load_signatures(path))[0].minhash

    mh1 = first_minhash(
        'n10000/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz')
    mh2 = first_minhash(
        'n10000/GCF_000006945.1_ASM694v1_genomic.fna.gz.sig.gz')

    assert mh1.compare(mh2) == 0.0183
    assert mh2.compare(mh1) == 0.0183

    # Each step downsamples the sketches produced by the previous step.
    for num, expected in ((1000, 0.011), (100, 0.01), (10, 0.0)):
        mh1 = mh1.downsample_n(num)
        mh2 = mh2.downsample_n(num)
        assert mh1.compare(mh2) == expected
        assert mh2.compare(mh1) == expected
def test_scaled_on_real_data_2():
    """Compare two stored scaled=100 sketches at progressively coarser scales.

    The comparison is symmetric-checked at the stored resolution and after
    each successive ``downsample_scaled`` step, rounding to the number of
    digits that step is expected to be stable at.
    """
    from sourmash_lib.signature import load_signatures

    def first_minhash(relpath):
        # Only the first signature stored in each test file is used.
        path = utils.get_test_data(relpath)
        return list(load_signatures(path))[0].minhash

    mh1 = first_minhash(
        'scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz')
    mh2 = first_minhash(
        'scaled100/GCF_000006945.1_ASM694v1_genomic.fna.gz.sig.gz')

    assert round(mh1.compare(mh2), 5) == 0.01644
    assert round(mh2.compare(mh1), 5) == 0.01644

    # (scaled value, rounding digits, expected similarity); each step
    # downsamples the sketches produced by the previous step.
    steps = (
        (1000, 4, 0.0187),
        (10000, 3, 0.01),
        (100000, 2, 0.01),
    )
    for scaled, digits, expected in steps:
        mh1 = mh1.downsample_scaled(scaled)
        mh2 = mh2.downsample_scaled(scaled)
        assert round(mh1.compare(mh2), digits) == expected
        assert round(mh2.compare(mh1), digits) == expected
def search(self, args):
    """Search a query sig against one or more signatures; report top match.

    ``args`` is the list of command-line arguments for this subcommand:
    a query signature file, one or more signature files to search, and the
    optional ``--threshold``, ``-k/--ksize`` and ``-f/--force`` flags.

    The query file must contain exactly one signature at the selected
    ksize; otherwise an ``Exception`` is raised.  Unless ``--force`` is
    given, a database file whose name equals the query file name is
    skipped.  Matches at or above the threshold are sorted by decreasing
    similarity and at most the first three are printed to stdout.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('query')
    parser.add_argument('against', nargs='+')
    parser.add_argument('--threshold', default=0.08, type=float)
    parser.add_argument('-k', '--ksize', default=DEFAULT_K, type=int)
    parser.add_argument('-f', '--force', action='store_true')
    args = parser.parse_args(args)

    # get the query signature; the handle is closed as soon as loading ends
    with open(args.query, 'r') as fp:
        sl = sig.load_signatures(fp, select_ksize=args.ksize)
    if len(sl) != 1:
        raise Exception("%d query signatures; need exactly one" % len(sl))
    query = sl[0]

    # get the signatures to query
    print('loading db of signatures from %d files' % len(args.against),
          file=sys.stderr)
    against = []
    for filename in args.against:
        if filename == args.query and not args.force:
            print('excluding query from database (file %s)' % filename,
                  file=sys.stderr)
            continue

        # collect the signatures while the file is still open
        with open(filename, 'r') as fp:
            sl = sig.load_signatures(fp, select_ksize=args.ksize)
            for x in sl:
                against.append((x, filename))

    # compute query x db
    distances = []
    for (x, filename) in against:
        distance = query.similarity(x)
        if distance >= args.threshold:
            distances.append((distance, x, filename))

    # any matches? sort, show.
    if distances:
        distances.sort(reverse=True, key=lambda x: x[0])
        print('%d matches:' % len(distances))
        # only the three best matches are displayed
        for distance, match, filename in distances[:3]:
            print('\t', match.name(), '\t', "%.3f" % distance,
                  '\t', filename)
    else:
        print('** no matches in %d signatures' % len(against),
              file=sys.stderr)
def load(info, dirname):
    """Build a ``SigLeaf`` from the signature file described by ``info``.

    ``info`` supplies the ``'filename'``, ``'metadata'`` and ``'name'``
    entries; the file is looked up relative to ``dirname`` and must hold
    exactly one signature (unpacking fails otherwise).
    """
    from sourmash_lib import signature

    path = os.path.join(dirname, info['filename'])
    loaded = list(signature.load_signatures(path))
    # should only be one signature
    (data,) = loaded
    return SigLeaf(info['metadata'], data, name=info['name'])
def test_binary_nary_tree(SBTImplementation):
    """Trees built with d=2, d=5 and d=10 return the same search results.

    Every signature file matching the glob is added to all three trees and
    the last leaf added is used as the search query.
    """
    factory = GraphFactory(31, 1e5, 4)
    trees = {
        2: SBTImplementation(factory),
        5: SBTImplementation(factory, d=5),
        10: SBTImplementation(factory, d=10),
    }

    for path in glob("urchin/lividus*.sig"):
        with open(path, 'r') as handle:
            loaded = signature.load_signatures(handle)
        leaf = SigLeaf(os.path.basename(path), loaded[0])
        for current in trees.values():
            current.add_node(leaf)
        to_search = leaf

    print('*' * 60)
    print("{}:".format(to_search.metadata))

    results = {}
    for width, current in trees.items():
        found = current.find(search_minhashes, to_search.data, 0.1)
        results[width] = [str(hit) for hit in found]
    print(*results[2], sep='\n')

    assert set(results[2]) == set(results[5])
    assert set(results[5]) == set(results[10])
    assert len(results) > 0
def test_tree_save_load(SBTImplementation):
    """A tree saved as 'urchin' and reloaded gives the same search results.

    The last leaf added to the tree is used as the query both before saving
    and after loading.
    """
    factory = GraphFactory(31, 1e5, 4)
    tree = SBTImplementation(factory)

    for path in glob("urchin/lividus*.sig"):
        with open(path, 'r') as handle:
            loaded = signature.load_signatures(handle)
        leaf = SigLeaf(os.path.basename(path), loaded[0])
        tree.add_node(leaf)
        to_search = leaf

    banner = '*' * 60

    print(banner)
    print("{}:".format(to_search.metadata))
    old_result = [
        str(hit)
        for hit in tree.find(search_minhashes, to_search.data, 0.1)
    ]
    print(*old_result, sep='\n')

    tree.save('urchin')
    tree2 = SBTImplementation.load('urchin.sbt.json',
                                   leaf_loader=SigLeaf.load)

    print(banner)
    print("{}:".format(to_search.metadata))
    new_result = [
        str(hit)
        for hit in tree2.find(search_minhashes, to_search.data, 0.1)
    ]
    print(*new_result, sep='\n')

    assert set(old_result) == set(new_result)
    assert len(old_result) > 0
def test_tree_save_load(n_children):
    """An SBT with ``n_children`` children per node survives save and load.

    The tree is saved into a temporary directory, reloaded from it, and the
    search for the last-added leaf must return the same set of results.
    """
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=n_children)

    for name in utils.SIG_FILES:
        loaded = next(signature.load_signatures(utils.get_test_data(name)))
        leaf = SigLeaf(os.path.basename(name), loaded)
        tree.add_node(leaf)
        to_search = leaf

    banner = '*' * 60

    print(banner)
    print("{}:".format(to_search.metadata))
    old_result = set()
    for hit in tree.find(search_minhashes, to_search.data, 0.1):
        old_result.add(str(hit))
    print(*old_result, sep='\n')

    with utils.TempDirectory() as location:
        tree.save(os.path.join(location, 'demo'))
        tree = SBT.load(os.path.join(location, 'demo'),
                        leaf_loader=SigLeaf.load)

        # Search while the temporary directory still exists.
        print(banner)
        print("{}:".format(to_search.metadata))
        new_result = set()
        for hit in tree.find(search_minhashes, to_search.data, 0.1):
            new_result.add(str(hit))
        print(*new_result, sep='\n')

        assert old_result == new_result
def test_binary_nary_tree():
    """SBTs built with d=2, d=5 and d=10 hold the same leaves and results.

    Every test signature is added to all three trees; afterwards each tree
    must report the same number of leaves and the same search results for
    the last-added leaf.
    """
    factory = GraphFactory(31, 1e5, 4)
    trees = {
        2: SBT(factory),
        5: SBT(factory, d=5),
        10: SBT(factory, d=10),
    }

    n_leaves = 0
    for name in utils.SIG_FILES:
        loaded = next(signature.load_signatures(utils.get_test_data(name)))
        leaf = SigLeaf(os.path.basename(name), loaded)
        for current in trees.values():
            current.add_node(leaf)
        to_search = leaf
        n_leaves += 1

    assert all([len(t.leaves()) == n_leaves for t in trees.values()])

    print('*' * 60)
    print("{}:".format(to_search.metadata))

    results = {}
    for width, current in trees.items():
        found = current.find(search_minhashes, to_search.data, 0.1)
        results[width] = {str(hit) for hit in found}
    print(*results[2], sep='\n')

    assert results[2] == results[5]
    assert results[5] == results[10]
def test_sbt_fsstorage():
    """An SBT saved through ``FSStorage`` reloads with identical results.

    Also checks that the reloaded tree's storage path and the '.fstree'
    directory exist inside the temporary location.
    """
    factory = GraphFactory(31, 1e5, 4)
    banner = '*' * 60

    with utils.TempDirectory() as location:
        tree = SBT(factory)
        for name in utils.SIG_FILES:
            loaded = next(
                signature.load_signatures(utils.get_test_data(name)))
            leaf = SigLeaf(os.path.basename(name), loaded)
            tree.add_node(leaf)
            to_search = leaf

        print(banner)
        print("{}:".format(to_search.metadata))
        old_result = set()
        for hit in tree.find(search_minhashes, to_search.data, 0.1):
            old_result.add(str(hit))
        print(*old_result, sep='\n')

        with FSStorage(os.path.join(location, '.fstree')) as storage:
            tree.save(os.path.join(location, 'tree'), storage=storage)

        tree = SBT.load(os.path.join(location, 'tree'),
                        leaf_loader=SigLeaf.load)

        print(banner)
        print("{}:".format(to_search.metadata))
        new_result = set()
        for hit in tree.find(search_minhashes, to_search.data, 0.1):
            new_result.add(str(hit))
        print(*new_result, sep='\n')

        assert old_result == new_result

        assert os.path.exists(os.path.join(location, tree.storage.path))
        assert os.path.exists(os.path.join(location, '.fstree'))
def test_sbt_combine(n_children):
    """Combining two partial SBTs matches a tree built from all leaves.

    The first four signatures go into ``tree_1`` and the rest into
    ``tree_2``; after ``tree_1.combine(tree_2)`` the leaves and the search
    results must equal those of the reference tree holding everything.
    """
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=n_children)
    tree_1 = SBT(factory, d=n_children)
    tree_2 = SBT(factory, d=n_children)

    n_leaves = 0
    for name in utils.SIG_FILES:
        loaded = next(signature.load_signatures(utils.get_test_data(name)))
        leaf = SigLeaf(os.path.basename(name), loaded)

        tree.add_node(leaf)
        partial = tree_1 if n_leaves < 4 else tree_2
        partial.add_node(leaf)
        n_leaves += 1

    tree_1.combine(tree_2)

    t1_leaves = set(str(l) for l in tree_1.leaves())
    t_leaves = set(str(l) for l in tree.leaves())

    assert len(t1_leaves) == n_leaves
    assert len(t_leaves) == len(t1_leaves)
    assert t1_leaves == t_leaves

    query_path = utils.get_test_data(utils.SIG_FILES[0])
    to_search = next(signature.load_signatures(query_path))
    t1_result = set(
        str(s) for s in tree_1.find(search_minhashes, to_search, 0.1))
    tree_result = set(
        str(s) for s in tree.find(search_minhashes, to_search, 0.1))
    assert t1_result == tree_result

    # TODO: save and load both trees

    # check if adding a new node will use the next empty position
    next_empty = 0
    for n, d in tree_1.nodes.items():
        if d is None:
            next_empty = n
            break
    # A zero value means no usable empty slot was recorded above; fall back
    # to the position right after the last one visited.
    if not next_empty:
        next_empty = n + 1

    tree_1.add_node(leaf)
    assert tree_1.max_node == next_empty
def test_do_sourmash_check_protein_comparisons():
    """Check 2 x 2 similarities between protein and translated-DNA sigs.

    Signatures are computed with the ``sourmash compute`` script from
    'ecoli.faa' (protein input) and 'ecoli.genes.fna' (DNA translated to
    protein), then compared pairwise against known values.
    """
    # this test checks 2 x 2 protein comparisons with E. coli genes.

    def by_name(entry):
        return entry.name()

    def first_word(entry):
        return entry.name().split()[0]

    def rounded(left, right):
        return round(left.similarity(right), 3)

    with utils.TempDirectory() as location:
        protein_input = utils.get_test_data('ecoli.faa')
        status, out, err = utils.runscript(
            'sourmash',
            ['compute', '-k', '21', '--input-is-protein', '--singleton',
             protein_input],
            in_directory=location)
        protein_sigfile = os.path.join(location, 'ecoli.faa.sig')
        assert os.path.exists(protein_sigfile)

        dna_input = utils.get_test_data('ecoli.genes.fna')
        status, out, err = utils.runscript(
            'sourmash',
            ['compute', '-k', '21', '--protein', '--no-dna', '--singleton',
             dna_input],
            in_directory=location)
        dna_sigfile = os.path.join(location, 'ecoli.genes.fna.sig')
        assert os.path.exists(dna_sigfile)

        # I'm not sure why load_signatures is randomizing order, but ok.
        loaded = list(signature.load_signatures(protein_sigfile))
        sig1_aa, sig2_aa = sorted(loaded, key=by_name)

        loaded = list(signature.load_signatures(dna_sigfile))
        sig1_trans, sig2_trans = sorted(loaded, key=by_name)

        name1 = first_word(sig1_aa)
        assert name1 == 'NP_414543.1'
        name2 = first_word(sig2_aa)
        assert name2 == 'NP_414544.1'
        name3 = first_word(sig1_trans)
        assert name3 == 'gi|556503834:2801-3733'
        name4 = first_word(sig2_trans)
        assert name4 == 'gi|556503834:337-2799'

        # (protein name, translated name, protein sig, translated sig,
        #  expected rounded similarity)
        comparisons = (
            (name1, name3, sig1_aa, sig1_trans, 0.0),
            (name2, name3, sig2_aa, sig1_trans, 0.273),
            (name1, name4, sig1_aa, sig2_trans, 0.174),
            (name2, name4, sig2_aa, sig2_trans, 0.0),
        )

        # Show every value first so a failure still leaves the full table.
        for aa_name, trans_name, aa_sig, trans_sig, _ in comparisons:
            print(aa_name, trans_name, rounded(aa_sig, trans_sig))

        for _, _, aa_sig, trans_sig, expected in comparisons:
            assert rounded(aa_sig, trans_sig) == expected
def test_load_minified(track_abundance):
    """Re-saving a stored signature file gives shorter, single-line output."""
    path = utils.get_test_data('genome-s10+s11.sig')
    minified = save_signatures(load_signatures(path))

    with open(path, 'r') as handle:
        original_text = handle.read()

    assert len(minified) < len(original_text)
    assert '\n' not in minified
def compare(self, args):
    """Compare multiple signature files and create a distance matrix.

    ``args`` is the list of command-line arguments for this subcommand:
    one or more signature files plus the optional ``-k/--ksize`` and
    ``-o/--output`` flags.

    All signatures at the selected ksize are loaded and compared
    all-by-all with ``similarity``; one matrix row per signature is
    printed to stdout.  If no signature could be loaded the process exits
    with status -1.  With ``--output``, the labels are written to
    ``<output>.labels.txt`` and the matrix is saved with ``numpy.save``
    to ``<output>``.
    """
    import numpy

    parser = argparse.ArgumentParser()
    parser.add_argument('signatures', nargs='+')
    parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
    parser.add_argument('-o', '--output')
    args = parser.parse_args(args)

    # load in the various signatures
    siglist = []
    for filename in args.signatures:
        print('loading', filename, file=sys.stderr)
        # read through a context manager so the handle is always closed
        with open(filename) as fp:
            data = fp.read()
        loaded = sig.load_signatures(data, select_ksize=args.ksize)
        if not loaded:
            print('warning: no signatures loaded at given ksize from %s' %
                  filename, file=sys.stderr)
        siglist.extend(loaded)

    if len(siglist) == 0:
        print('no signatures!', file=sys.stderr)
        sys.exit(-1)

    # build the distance matrix
    D = numpy.zeros([len(siglist), len(siglist)])
    numpy.set_printoptions(precision=3, suppress=True)

    # do all-by-all calculation
    labeltext = []
    for i, E in enumerate(siglist):
        for j, E2 in enumerate(siglist):
            D[i][j] = E.similarity(E2)

        # row i is complete at this point; show it next to its label
        print('%d-%20s\t%s' % (i, E.name(), D[i, :, ],))
        labeltext.append(E.name())

    print('min similarity in matrix:', numpy.min(D), file=sys.stderr)

    # shall we output a matrix?
    if args.output:
        labeloutname = args.output + '.labels.txt'
        print('saving labels to:', labeloutname, file=sys.stderr)
        with open(labeloutname, 'w') as fp:
            fp.write("\n".join(labeltext))

        print('saving distance matrix to:', args.output, file=sys.stderr)
        with open(args.output, 'wb') as fp:
            numpy.save(fp, D)
def test_do_sourmash_check_knowngood_dna_comparisons():
    """A freshly computed DNA signature matches the stored benchmark one."""
    # this test checks against a known good signature calculated
    # by utils/compute-dna-mh-another-way.py
    with utils.TempDirectory() as location:
        fasta = utils.get_test_data('ecoli.genes.fna')
        status, out, err = utils.runscript(
            'sourmash',
            ['compute', '-k', '21', '--singleton', '--dna', fasta],
            in_directory=location)
        sigfile = os.path.join(location, 'ecoli.genes.fna.sig')
        assert os.path.exists(sigfile)

        # Exactly two signatures are expected; only the second one in name
        # order is compared against the benchmark.
        computed = list(signature.load_signatures(sigfile))
        sig1, sig2 = sorted(computed, key=lambda entry: entry.name())

        benchmark_path = utils.get_test_data('benchmark.dna.sig')
        good = list(signature.load_signatures(benchmark_path))[0]

        assert sig2.similarity(good) == 1.0
def test_roundtrip(track_abundance):
    """A signature saved and reloaded is fully similar to the original."""
    minhash = sourmash_lib.MinHash(n=1, ksize=20,
                                   track_abundance=track_abundance)
    minhash.add("AT" * 10)
    original = SourmashSignature(minhash)

    serialized = save_signatures([original])
    restored = list(load_signatures(serialized))[0]
    # Accessing the attribute confirms the reloaded signature has a minhash.
    restored_minhash = restored.minhash

    assert original.similarity(restored) == 1.0
    assert restored.similarity(original) == 1.0
def test_roundtrip(track_abundance):
    """A signature saved and reloaded is fully similar to the original."""
    estimator = sourmash_lib.Estimators(n=1, ksize=20,
                                        track_abundance=track_abundance)
    estimator.add("AT" * 10)
    original = SourmashSignature('*****@*****.**', estimator)

    serialized = save_signatures([original])
    restored = list(load_signatures(serialized))[0]
    # Accessing the attribute confirms the reloaded signature has one.
    restored_estimator = restored.estimator

    assert original.similarity(restored) == 1.0
    assert restored.similarity(original) == 1.0
def test_do_sourmash_compute():
    """``sourmash compute`` on 'short.fa' writes a loadable 'short.fa.sig'."""
    with utils.TempDirectory() as location:
        fasta = utils.get_test_data('short.fa')
        status, out, err = utils.runscript(
            'sourmash', ['compute', fasta], in_directory=location)

        expected_output = os.path.join(location, 'short.fa.sig')
        assert os.path.exists(expected_output)

        first = next(signature.load_signatures(expected_output))
        assert first.name().endswith('short.fa')
def test_roundtrip_empty(track_abundance):
    """A signature with nothing added round-trips with similarity 0."""
    # edge case, but: empty estimator? :)
    estimator = sourmash_lib.Estimators(n=1, ksize=20,
                                        track_abundance=track_abundance)
    original = SourmashSignature('*****@*****.**', estimator)

    serialized = save_signatures([original])
    restored = list(load_signatures(serialized))[0]
    # Accessing the attribute confirms the reloaded signature has one.
    restored_estimator = restored.estimator

    assert original.similarity(restored) == 0
    assert restored.similarity(original) == 0
def sbt_index(self, args):
    """Build an SBT from signature files and save it under ``sbt_name``.

    ``args`` is the list of command-line arguments for this subcommand.
    With ``--traverse-directory`` the positional arguments are treated as
    directories that are walked for files ending in '.sig'; otherwise
    they are used as signature files directly.  Signatures are selected by
    ksize and molecule type, and each becomes a leaf named by its md5sum.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name')
    parser.add_argument('signatures', nargs='+')
    parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
    parser.add_argument('--traverse-directory', action='store_true')
    parser.add_argument('-x', '--bf-size', type=float, default=1e5)
    sourmash_args.add_moltype_args(parser)
    args = parser.parse_args(args)

    # Settle on a single molecule type; protein wins unless --dna was also
    # requested explicitly, which is an error.
    if not args.protein:
        args.dna = True
        moltype = 'dna'
    else:
        if args.dna is True:
            raise Exception('cannot specify both --dna and --protein!')
        args.dna = False
        moltype = 'protein'

    tree = SBT(GraphFactory(1, args.bf_size, 4))

    if args.traverse_directory:
        inp_files = []
        for dirname in args.signatures:
            for root, _dirs, files in os.walk(dirname):
                for name in files:
                    if not name.endswith('.sig'):
                        continue
                    inp_files.append(os.path.join(root, name))
    else:
        inp_files = list(args.signatures)

    print('loading {} files into SBT'.format(len(inp_files)))

    n = 0
    for path in inp_files:
        selected = sig.load_signatures(path, select_ksize=args.ksize,
                                       select_moltype=moltype)
        for ss in selected:
            tree.add_node(SigLeaf(ss.md5sum(), ss))
            n += 1

    print('loaded {} sigs; saving SBT under "{}".'.format(n, args.sbt_name))
    tree.save(args.sbt_name)
def test_roundtrip_empty(track_abundance):
    """A signature with nothing added round-trips with similarity 0."""
    # edge case, but: empty minhash? :)
    minhash = sourmash_lib.MinHash(n=1, ksize=20,
                                   track_abundance=track_abundance)
    original = SourmashSignature(minhash)

    serialized = save_signatures([original])
    restored = list(load_signatures(serialized))[0]
    # Accessing the attribute confirms the reloaded signature has a minhash.
    restored_minhash = restored.minhash

    assert original.similarity(restored) == 0
    assert restored.similarity(original) == 0
def test_do_sourmash_compute_multik():
    """``compute -k 21,31`` writes two signatures, one per k-mer size."""
    with utils.TempDirectory() as location:
        fasta = utils.get_test_data('short.fa')
        status, out, err = utils.runscript(
            'sourmash', ['compute', '-k', '21,31', fasta],
            in_directory=location)

        outfile = os.path.join(location, 'short.fa.sig')
        assert os.path.exists(outfile)

        loaded = list(signature.load_signatures(outfile))
        assert len(loaded) == 2

        ksizes = {entry.estimator.ksize for entry in loaded}
        assert 21 in ksizes
        assert 31 in ksizes
def test_save_minified(track_abundance):
    """Saved output is a single line and both named signatures reload."""
    first = SourmashSignature(
        sourmash_lib.MinHash(n=1, ksize=20,
                             track_abundance=track_abundance),
        name="foo")
    second = SourmashSignature(
        sourmash_lib.MinHash(n=1, ksize=25,
                             track_abundance=track_abundance),
        name="bar baz")

    serialized = save_signatures([first, second])
    assert '\n' not in serialized
    assert len(serialized.split('\n')) == 1

    reloaded = list(load_signatures(serialized))
    assert len(reloaded) == 2

    # Order of the reloaded signatures is not relied upon.
    names = [entry.name() for entry in reloaded]
    assert 'foo' in names
    assert 'bar baz' in names
def main():
    """Downsample every signature in a file and write them to stdout.

    Reads the signature file named on the command line, replaces each
    signature's minhash with ``downsample_scaled(--scaled)`` (default
    10000), and saves all signatures to ``sys.stdout``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('sigfile')
    parser.add_argument('--scaled', default=10000, type=int)
    options = parser.parse_args()

    loaded = list(signature.load_signatures(options.sigfile))
    print('loaded {} signatures'.format(len(loaded)), file=sys.stderr)

    # Signatures are updated in place, so the loaded list is what is saved.
    for entry in loaded:
        entry.minhash = entry.minhash.downsample_scaled(options.scaled)

    signature.save_signatures(loaded, sys.stdout)
def test_roundtrip_max_hash(track_abundance):
    """``max_hash`` is preserved when a signature is saved and reloaded."""
    minhash = sourmash_lib.MinHash(n=0, ksize=20,
                                   track_abundance=track_abundance,
                                   max_hash=10)
    minhash.add_hash(5)
    original = SourmashSignature(minhash)

    serialized = save_signatures([original])
    restored = list(load_signatures(serialized))[0]
    restored_minhash = restored.minhash

    assert minhash.max_hash == restored_minhash.max_hash

    assert original.similarity(restored) == 1.0
    assert restored.similarity(original) == 1.0
def test_roundtrip_seed(track_abundance):
    """The seed is preserved when a signature is saved and reloaded."""
    estimator = sourmash_lib.Estimators(n=1, ksize=20,
                                        track_abundance=track_abundance,
                                        seed=10)
    estimator.mh.add_hash(5)
    original = SourmashSignature('*****@*****.**', estimator)

    serialized = save_signatures([original])
    restored = list(load_signatures(serialized))[0]
    restored_estimator = restored.estimator

    assert estimator.seed == restored_estimator.seed

    assert original.similarity(restored) == 1.0
    assert restored.similarity(original) == 1.0
def test_save_load_multisig(track_abundance):
    """Two distinct signatures saved together are both found after loading."""
    first = SourmashSignature(
        sourmash_lib.MinHash(n=1, ksize=20,
                             track_abundance=track_abundance))
    second = SourmashSignature(
        sourmash_lib.MinHash(n=1, ksize=25,
                             track_abundance=track_abundance))

    serialized = save_signatures([first, second])
    reloaded = list(load_signatures(serialized))

    print(serialized)

    assert len(reloaded) == 2
    assert first in reloaded            # order not guaranteed, note.
    assert second in reloaded
    assert first != second
def test_do_sourmash_compute_with_seed():
    """``compute --seed 43`` records seed 43 in every signature written."""
    with utils.TempDirectory() as location:
        fasta = utils.get_test_data('short.fa')
        outfile = os.path.join(location, 'FOO.xxx')
        command = ['compute', '-k', '21,31', '--seed', '43', fasta,
                   '-o', outfile]
        status, out, err = utils.runscript('sourmash', command,
                                           in_directory=location)

        assert os.path.exists(outfile)

        loaded = list(signature.load_signatures(outfile))
        assert len(loaded) == 2

        seeds = [entry.estimator.seed for entry in loaded]
        assert len(seeds) == 2
        assert set(seeds) == {43}
def test_do_sourmash_compute_multik_with_protein():
    """``compute -k 21,30 --protein`` yields four signatures over both ksizes.

    The signature file is read as text and the text itself is handed to
    ``load_signatures``.
    """
    with utils.TempDirectory() as location:
        fasta = utils.get_test_data('short.fa')
        status, out, err = utils.runscript(
            'sourmash', ['compute', '-k', '21,30', '--protein', fasta],
            in_directory=location)

        outfile = os.path.join(location, 'short.fa.sig')
        assert os.path.exists(outfile)

        with open(outfile, 'rt') as handle:
            sigdata = handle.read()
        loaded = list(signature.load_signatures(sigdata))
        assert len(loaded) == 4

        ksizes = {entry.estimator.ksize for entry in loaded}
        assert 21 in ksizes
        assert 30 in ksizes
def dump(self, args):
    """Write the minimum hashes of each signature file to a text file.

    ``args`` is the list of command-line arguments for this subcommand:
    one or more signature file names and an optional ``-k/--ksize``.
    Each file must contain exactly one signature at the selected ksize;
    its hash values are written space-separated to
    ``<filename>.dump.txt``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('filenames', nargs='+')
    parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
    # Parse the arguments handed to this command rather than re-reading
    # sys.argv, so the function honours its own parameter.
    args = parser.parse_args(args)

    for filename in args.filenames:
        # context managers guarantee both files are closed, even on error
        with open(filename) as fp:
            data = fp.read()

        print('loading', filename)
        siglist = sig.load_signatures(data, select_ksize=args.ksize)
        assert len(siglist) == 1

        s = siglist[0]

        with open(filename + '.dump.txt', 'w') as fp:
            fp.write(" ".join(map(str, s.estimator.mh.get_mins())))
def dump(self, args):
    """Write the minimum hashes of each signature file to a text file.

    ``args`` is the list of command-line arguments for this subcommand:
    one or more signature file names and an optional ``-k/--ksize``.
    Each file must contain exactly one signature at the selected ksize;
    its hash values are written space-separated to
    ``<filename>.dump.txt``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('filenames', nargs='+')
    parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K,
                        help='k-mer size (default: %(default)i)')
    args = parser.parse_args(args)

    for filename in args.filenames:
        print('loading', filename)
        siglist = sig.load_signatures(filename, select_ksize=args.ksize)
        siglist = list(siglist)
        assert len(siglist) == 1

        s = siglist[0]

        # the context manager closes the output file even if writing fails
        with open(filename + '.dump.txt', 'w') as fp:
            fp.write(" ".join(map(str, s.estimator.mh.get_mins())))