def test_name_3(track_abundance):
    """name() reports the explicit name when a filename is also given."""
    estimator = sourmash_lib.Estimators(
        n=1, ksize=20, track_abundance=track_abundance)
    signature_obj = SourmashSignature(
        '*****@*****.**', estimator, name='foo', filename='foo.txt')

    assert signature_obj.name() == 'foo'
def test_roundtrip(track_abundance):
    """A saved-then-loaded signature has similarity 1.0 with the original."""
    sketch = sourmash_lib.MinHash(
        n=1, ksize=20, track_abundance=track_abundance)
    sketch.add("AT" * 10)
    original = SourmashSignature(sketch)

    serialized = save_signatures([original])
    loaded = list(load_signatures(serialized))
    reloaded = loaded[0]
    # The reloaded signature must still expose its sketch.
    reloaded_sketch = reloaded.minhash

    # Similarity is checked in both directions.
    assert original.similarity(reloaded) == 1.0
    assert reloaded.similarity(original) == 1.0
def test_compare(track_abundance):
    """Signatures with the same content and the same name compare equal."""
    # same content, same name -> equal
    e = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    e.add("AT" * 10)
    sig1 = SourmashSignature(e, name='foo')

    f = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    f.add("AT" * 10)
    sig2 = SourmashSignature(f, name='foo')

    # The underlying sketches hold the same content...
    assert e == f
    # ...and the signatures wrapping them must be equal too. Previously the
    # signatures were constructed but never compared, so the behavior this
    # test is named for went unchecked.
    assert sig1 == sig2
def test_load_one_fail_multisig(track_abundance):
    """load_one_signature raises ValueError on a two-signature payload."""
    signatures = []
    for _ in range(2):
        sketch = sourmash_lib.MinHash(
            n=1, ksize=20, track_abundance=track_abundance)
        signatures.append(SourmashSignature(sketch))

    serialized = save_signatures(signatures)

    with pytest.raises(ValueError):
        load_one_signature(serialized)
def test_compare_ne(track_abundance):
    """Signatures with the same content but different names are unequal."""
    first_sketch = sourmash_lib.MinHash(
        n=1, ksize=20, track_abundance=track_abundance)
    first_sketch.add("AT" * 10)
    named_foo = SourmashSignature(first_sketch, name='foo')

    second_sketch = sourmash_lib.MinHash(
        n=1, ksize=20, track_abundance=track_abundance)
    second_sketch.add("AT" * 10)
    named_bar = SourmashSignature(second_sketch, name='bar')

    assert named_foo != named_bar
def test_roundtrip(track_abundance):
    """A saved-then-loaded signature has similarity 1.0 with the original."""
    estimator = sourmash_lib.Estimators(
        n=1, ksize=20, track_abundance=track_abundance)
    estimator.add("AT" * 10)
    original = SourmashSignature('*****@*****.**', estimator)

    serialized = save_signatures([original])
    loaded = list(load_signatures(serialized))
    reloaded = loaded[0]
    # The reloaded signature must still expose its estimator.
    reloaded_estimator = reloaded.estimator

    # Similarity is checked in both directions.
    assert original.similarity(reloaded) == 1.0
    assert reloaded.similarity(original) == 1.0
def test_compare_ne2_reverse(track_abundance):
    """A name-only and a filename-only signature are unequal both ways."""
    # same content, one has filename, other does not -> different
    name_sketch = sourmash_lib.MinHash(
        n=1, ksize=20, track_abundance=track_abundance)
    name_sketch.add("AT" * 10)
    with_name = SourmashSignature(name_sketch, name='foo')

    file_sketch = sourmash_lib.MinHash(
        n=1, ksize=20, track_abundance=track_abundance)
    file_sketch.add("AT" * 10)
    with_filename = SourmashSignature(file_sketch, filename='b')

    # Inequality is checked with each operand on the left-hand side.
    assert with_filename != with_name
    assert with_name != with_filename
def test_roundtrip_empty(track_abundance):
    """Round-tripping an empty estimator gives similarity 0 both ways."""
    # edge case, but: empty estimator? :)
    estimator = sourmash_lib.Estimators(
        n=1, ksize=20, track_abundance=track_abundance)
    original = SourmashSignature('*****@*****.**', estimator)

    serialized = save_signatures([original])
    loaded = list(load_signatures(serialized))
    reloaded = loaded[0]
    reloaded_estimator = reloaded.estimator

    assert original.similarity(reloaded) == 0
    assert reloaded.similarity(original) == 0
def test_roundtrip_empty(track_abundance):
    """Round-tripping an empty minhash gives similarity 0 both ways."""
    # edge case, but: empty minhash? :)
    sketch = sourmash_lib.MinHash(
        n=1, ksize=20, track_abundance=track_abundance)
    original = SourmashSignature(sketch)

    serialized = save_signatures([original])
    loaded = list(load_signatures(serialized))
    reloaded = loaded[0]
    reloaded_sketch = reloaded.minhash

    assert original.similarity(reloaded) == 0
    assert reloaded.similarity(original) == 0
def test_str(track_abundance):
    """str() and repr() agree, and include the name once one is set."""
    # signatures should be printable
    sketch = sourmash_lib.MinHash(
        n=1, ksize=20, track_abundance=track_abundance)
    sketch.add("AT" * 10)
    sig = SourmashSignature(sketch)

    print(sig)

    # Unnamed signature: only the short hash is shown.
    expected_unnamed = 'SourmashSignature(59502a74)'
    assert str(sig) == expected_unnamed
    assert repr(sig) == expected_unnamed

    # After a name is stored, it is shown ahead of the hash.
    sig.d['name'] = 'fizbar'
    expected_named = 'SourmashSignature(\'fizbar\', 59502a74)'
    assert str(sig) == expected_named
    assert repr(sig) == expected_named
def test_save_minified(track_abundance):
    """save_signatures output is a single line that still loads both names."""
    sketch_20 = sourmash_lib.MinHash(
        n=1, ksize=20, track_abundance=track_abundance)
    sig_foo = SourmashSignature(sketch_20, name="foo")

    sketch_25 = sourmash_lib.MinHash(
        n=1, ksize=25, track_abundance=track_abundance)
    sig_bar_baz = SourmashSignature(sketch_25, name="bar baz")

    serialized = save_signatures([sig_foo, sig_bar_baz])

    # The serialized form must not contain any line breaks.
    assert '\n' not in serialized
    assert len(serialized.split('\n')) == 1

    loaded = list(load_signatures(serialized))
    assert len(loaded) == 2
    # Load order is not relied upon; only presence of each name is checked.
    assert 'foo' in (sig.name() for sig in loaded)
    assert 'bar baz' in (sig.name() for sig in loaded)
def test_save_load_multisig(track_abundance):
    """Two signatures saved together are both present after loading."""
    sketch_20 = sourmash_lib.MinHash(
        n=1, ksize=20, track_abundance=track_abundance)
    first = SourmashSignature(sketch_20)

    sketch_25 = sourmash_lib.MinHash(
        n=1, ksize=25, track_abundance=track_abundance)
    second = SourmashSignature(sketch_25)

    serialized = save_signatures([first, second])
    loaded = list(load_signatures(serialized))

    print(serialized)

    assert len(loaded) == 2
    # order not guaranteed, note.
    assert first in loaded
    assert second in loaded

    assert first != second
def test_roundtrip_seed(track_abundance):
    """The estimator seed is unchanged by a save/load round trip."""
    estimator = sourmash_lib.Estimators(
        n=1, ksize=20, track_abundance=track_abundance, seed=10)
    estimator.mh.add_hash(5)
    original = SourmashSignature('*****@*****.**', estimator)

    serialized = save_signatures([original])
    loaded = list(load_signatures(serialized))
    reloaded = loaded[0]
    reloaded_estimator = reloaded.estimator

    assert estimator.seed == reloaded_estimator.seed

    # Similarity is checked in both directions.
    assert original.similarity(reloaded) == 1.0
    assert reloaded.similarity(original) == 1.0
def test_roundtrip_max_hash(track_abundance):
    """max_hash is unchanged by a save/load round trip."""
    sketch = sourmash_lib.MinHash(
        n=0, ksize=20, track_abundance=track_abundance, max_hash=10)
    sketch.add_hash(5)
    original = SourmashSignature(sketch)

    serialized = save_signatures([original])
    loaded = list(load_signatures(serialized))
    reloaded = loaded[0]
    reloaded_sketch = reloaded.minhash

    assert sketch.max_hash == reloaded_sketch.max_hash

    # Similarity is checked in both directions.
    assert original.similarity(reloaded) == 1.0
    assert reloaded.similarity(original) == 1.0
def test_save_load_multisig_json():
    """Two signatures saved via the JSON helpers are both present on load."""
    first_sketch = sourmash_lib.MinHash(n=1, ksize=20)
    first = SourmashSignature('*****@*****.**', first_sketch)

    second_sketch = sourmash_lib.MinHash(n=1, ksize=20)
    second = SourmashSignature('*****@*****.**', second_sketch)

    serialized = save_signatures_json([first, second])
    loaded = list(load_signatures_json(serialized))

    print(serialized)

    assert len(loaded) == 2
    # order not guaranteed, note.
    assert first in loaded
    assert second in loaded

    assert first != second
def test_load_one_succeed(track_abundance):
    """load_one_signature returns a signature equal to the one saved."""
    sketch = sourmash_lib.MinHash(
        n=1, ksize=20, track_abundance=track_abundance)
    original = SourmashSignature(sketch)

    serialized = save_signatures([original])
    reloaded = load_one_signature(serialized)

    assert original == reloaded
def test_hashable(track_abundance):
    """Signatures can be stored in sets and used as dictionary keys."""
    # check: can we use signatures as keys in dictionaries and sets?
    e = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    e.add("AT" * 10)

    sig = SourmashSignature(e)

    x = set()
    x.add(sig)
    # Previously nothing was asserted: the test only proved that hashing did
    # not raise. Confirm the stored signature can be found again.
    assert sig in x

    # The dictionary-key case named in the comment above was never exercised.
    lookup = {sig: 'value'}
    assert lookup[sig] == 'value'
def test_save_load_multisig(track_abundance):
    """Two signatures saved together are both present after loading."""
    first_estimator = sourmash_lib.Estimators(
        n=1, ksize=20, track_abundance=track_abundance)
    first = SourmashSignature('*****@*****.**', first_estimator)

    second_estimator = sourmash_lib.Estimators(
        n=1, ksize=20, track_abundance=track_abundance)
    second = SourmashSignature('*****@*****.**', second_estimator)

    serialized = save_signatures([first, second])
    loaded = list(load_signatures(serialized))

    print(serialized)

    assert len(loaded) == 2
    # order not guaranteed, note.
    assert first in loaded
    assert second in loaded

    assert first != second
def sbt_index(client, db, cell, query, ksize, nsketch, key, file):
    '''Create a sequence Bloom tree from a cell/ database cursor.

    1. select seqs for tree
    2. assign common id (field derivative.minhash.sbt.ids)
    3. minhash seqs, name == UUID, md5? (think about SBT reuse)
    4. query a different collection/ metagenome against this

    --index {raw, minhash}
    input: all of cell or cursor

    \b
    $ zoo sbt_index --db ref --cell ref --ksize 16 --nsketch 1000 \\
    reference
    Initialize SBT.
    Compute minhash signatures for selected documents.
    k-mer size: 16, sketch size: 1000
    \\ 9158 Elapsed Time: 0:01:45
    Save SBT.
    Done.

    \b
    $ sourmash sbt_search --ksize 16 reference survey.fa.sig
    # running sourmash subcommand: sbt_search
    loaded query: survey.fa... (k=16, DNA)
    0.11 0ef85591-d464-4953-915f-f673907b7e8e (Zika reference genome)

    TODO: add query
    TODO: --key arg not working?
    '''
    # NOTE(review): `query` is accepted but not used anywhere in this body
    # (see the TODO in the docstring).
    collection = MongoClient(client)[db][cell]

    print('Initialize SBT.')
    # init SBT
    factory = GraphFactory(ksize=ksize, starting_size=1e5, n_tables=4)  # 4 .. nt?
    tree = SBT(factory, d=2)  # d .. see "n-ary " in notebook

    print('Compute minhash signatures for selected documents.')
    print('{}{}{}{}'.format(
        'k-mer size: ', ksize, ', sketch size: ', nsketch
        ))

    bar = ProgressBar(max_value=UnknownLength)
    # Every document in the collection becomes one leaf of the tree; the
    # running count (starting at 1) drives the progress bar.
    for processed, document in enumerate(collection.find(), 1):
        estimator = Estimators(ksize=ksize, n=nsketch)
        estimator.add_sequence(document['sequence'], force=True)
        sig = SourmashSignature(
            email='', estimator=estimator, name=deep_get(document, key))
        leaf = SigLeaf(metadata=deep_get(document, key), data=sig)
        tree.add_node(node=leaf)
        bar.update(processed)

    print('\nSave SBT.')
    tree.save(file)
    print('Done.')
def test_similarity_downsample(track_abundance):
    """similarity() raises on mismatched max_hash unless downsample=True."""
    wide = sourmash_lib.MinHash(
        n=0, ksize=20, track_abundance=track_abundance, max_hash=2**63)
    narrow = sourmash_lib.MinHash(
        n=0, ksize=20, track_abundance=track_abundance, max_hash=2**2)

    wide.add_hash(1)
    wide.add_hash(5)
    assert len(wide.get_mins()) == 2

    narrow.add_hash(1)
    narrow.add_hash(5)  # should be discarded due to max_hash
    assert len(narrow.get_mins()) == 1

    wide_sig = SourmashSignature(wide)
    narrow_sig = SourmashSignature(narrow)

    with pytest.raises(ValueError):  # mismatch in max_hash
        wide_sig.similarity(narrow_sig)

    downsampled = wide_sig.similarity(narrow_sig, downsample=True)
    assert round(downsampled, 1) == 1.0
def minhash(client, db, cell, query, ksize, nsketch, key, file):
    '''Minhash a cell/ database cursor.

    just plain old sigs for collection
    '''
    # NOTE(review): `query` is accepted but not used anywhere in this body.
    collection = MongoClient(client)[db][cell]
    bar = ProgressBar(max_value=UnknownLength)
    signatures = []

    print('Compute minhash signatures for selected documents.')
    print('{}{}{}{}'.format(
        'k-mer size: ', ksize, ', sketch size: ', nsketch
        ))

    # One signature per document; the running count (starting at 1) drives
    # the progress bar.
    for processed, document in enumerate(collection.find(), 1):
        estimator = Estimators(ksize=ksize, n=nsketch)
        estimator.add_sequence(document['sequence'], force=True)
        sig = SourmashSignature(
            email='', estimator=estimator, name=deep_get(document, key))
        signatures.append(sig)
        bar.update(processed)

    print('\nSave signatures.')
    signature.save_signatures(signatures, fp=file)
    print('Done.')
def test_md5(track_abundance):
    """md5sum() of a one-hash signature equals the pinned digest."""
    estimator = sourmash_lib.Estimators(
        n=1, ksize=20, track_abundance=track_abundance)
    estimator.mh.add_hash(5)
    sig = SourmashSignature('*****@*****.**', estimator)

    # Dump the saved form so a failure shows what was hashed.
    print(sig._save())

    digest = sig.md5sum()
    assert digest == 'eae27d77ca20db309e056e3d2dcd7d69', digest
def test_md5(track_abundance):
    """md5sum() of a one-hash signature equals the pinned digest."""
    sketch = sourmash_lib.MinHash(
        n=1, ksize=20, track_abundance=track_abundance)
    sketch.add_hash(5)
    sig = SourmashSignature(sketch)

    # Dump the saved form so a failure shows what was hashed.
    print(sig._save())

    digest = sig.md5sum()
    assert digest == 'eae27d77ca20db309e056e3d2dcd7d69', digest
def test_name_3(track_abundance):
    """name() reports the explicit name when a filename is also given."""
    sketch = sourmash_lib.MinHash(
        n=1, ksize=20, track_abundance=track_abundance)
    signature_obj = SourmashSignature(
        sketch, name='foo', filename='foo.txt')

    assert signature_obj.name() == 'foo'
def test_name_4(track_abundance):
    """With no name or filename, name() is the first 8 chars of md5sum()."""
    estimator = sourmash_lib.Estimators(
        n=1, ksize=20, track_abundance=track_abundance)
    signature_obj = SourmashSignature('*****@*****.**', estimator)

    assert signature_obj.name() == signature_obj.md5sum()[:8]
def test_name_4(track_abundance):
    """With no name or filename, name() is the first 8 chars of md5sum()."""
    sketch = sourmash_lib.MinHash(
        n=1, ksize=20, track_abundance=track_abundance)
    signature_obj = SourmashSignature(sketch)

    assert signature_obj.name() == signature_obj.md5sum()[:8]
N = 1000 # init SBT factory = GraphFactory(ksize=KSIZE, starting_size=1e5, n_tables=4) # 4 .. nt? tree = SBT(factory, d=2) # d .. see "n-ary " in notebook bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength) cursor = db.ref.find() c = 0 for i in cursor: key = deep_get(i, 'metadata.alt_id.gb') seq = i['sequence'] # db.ref.find_one()['sequence'] # 'ACTG...' e = Estimators(ksize=KSIZE, n=N) e.add_sequence(seq, force=True) # e.get_hashes() s = SourmashSignature(email='', estimator=e, name=key) leaf = SigLeaf(metadata=key, data=s) tree.add_node(node=leaf) c += 1 bar.update(c) # \ 9158 Elapsed Time: 0:01:49 # search the last fasta entry against the SBT (">0.95") # filtered = tree.find(search_minhashes, s, 0.1) # matches = [(str(i.metadata), i.data.similarity(s)) for i in filtered] # [('0.95', 1.0)] # fasta header, similarity tree.save('ref') ''' sourmash sbt_search -k 16 ref ~/repos/zoo/zoo/data/zika/survey.sig