def test_roundtrip(track_abundance):
    """A signature that is saved and reloaded is fully similar to the original."""
    mh = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    mh.add("AT" * 10)
    original = SourmashSignature(mh)

    serialized = save_signatures([original])
    restored = list(load_signatures(serialized))[0]
    # Access the minhash of the reloaded signature (its value is not checked).
    restored_mh = restored.minhash

    # Similarity is checked in both directions.
    assert original.similarity(restored) == 1.0
    assert restored.similarity(original) == 1.0
def test_compare_ne(track_abundance):
    """Signatures with the same content but different names compare unequal."""
    mh_foo = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    mh_foo.add("AT" * 10)
    named_foo = SourmashSignature(mh_foo, name='foo')

    mh_bar = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    mh_bar.add("AT" * 10)
    named_bar = SourmashSignature(mh_bar, name='bar')

    assert named_foo != named_bar
def test_load_one_fail_multisig(track_abundance):
    """load_one_signature raises ValueError on data holding two signatures."""
    first = SourmashSignature(
        sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    )
    second = SourmashSignature(
        sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    )

    serialized = save_signatures([first, second])

    with pytest.raises(ValueError):
        load_one_signature(serialized)
def test_compare(track_abundance):
    """Same content and same name: sketches and signatures both compare equal."""
    e = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    e.add("AT" * 10)
    sig1 = SourmashSignature(e, name='foo')

    f = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    f.add("AT" * 10)
    sig2 = SourmashSignature(f, name='foo')

    # The underlying sketches were built identically.
    assert e == f
    # The signatures wrap equal sketches under the same name, so they must
    # be equal too; this is the counterpart of the `!=` checks in the
    # test_compare_ne* tests.
    assert sig1 == sig2
def test_roundtrip_empty(track_abundance):
    """Edge case: a signature over an empty minhash survives a save/load cycle."""
    empty_mh = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    original = SourmashSignature(empty_mh)

    serialized = save_signatures([original])
    restored = list(load_signatures(serialized))[0]
    # Access the minhash of the reloaded signature (its value is not checked).
    restored_mh = restored.minhash

    # Nothing was added to the sketch, so similarity is 0 in both directions.
    assert original.similarity(restored) == 0
    assert restored.similarity(original) == 0
def test_memmap():
    """to_memmap returns the array contents unchanged plus a '.mmap' filename."""
    sig_k20 = SourmashSignature(sourmash.MinHash(n=1, ksize=20))
    sig_k25 = SourmashSignature(sourmash.MinHash(n=1, ksize=25))
    siglist = [sig_k20, sig_k25]

    as_array = np.array(siglist)
    memmapped, filename = to_memmap(as_array)

    # Assert that the data didn't change as a result of memory-mapping
    np.testing.assert_array_equal(memmapped, siglist)
    assert filename.endswith(".mmap")
def test_compare_ne2_reverse(track_abundance):
    """Same content, one built with a name and one with a filename: unequal.

    The inequality is asserted with the operands in both orders.
    """
    mh_named = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    mh_named.add("AT" * 10)
    with_name = SourmashSignature(mh_named, name='foo')

    mh_filed = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    mh_filed.add("AT" * 10)
    with_filename = SourmashSignature(mh_filed, filename='b')

    assert with_filename != with_name
    assert with_name != with_filename
def test_str(track_abundance):
    """Signatures should be printable, with and without a name.

    NOTE(review): another ``test_str`` is defined later in this file; if both
    stay in one module, the later definition replaces this one.
    """
    mh = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    mh.add("AT" * 10)
    sig = SourmashSignature(mh)

    print(sig)

    unnamed_text = 'SourmashSignature(59502a74)'
    assert str(sig) == unnamed_text
    assert repr(sig) == unnamed_text

    # Set the name through the signature's `d` mapping and check it shows up.
    sig.d['name'] = 'fizbar'

    named_text = 'SourmashSignature(\'fizbar\', 59502a74)'
    assert str(sig) == named_text
    assert repr(sig) == named_text
def test_str(track_abundance):
    """Signatures should be printable, with and without a name."""
    mh = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    mh.add("AT" * 10)
    sig = SourmashSignature(mh)

    print(sig)

    unnamed_text = 'SourmashSignature(59502a74)'
    assert str(sig) == unnamed_text
    assert repr(sig) == unnamed_text

    # Set the name through the private `_name` attribute and check it shows up.
    sig._name = 'fizbar'

    named_text = 'SourmashSignature(\'fizbar\', 59502a74)'
    assert str(sig) == named_text
    assert repr(sig) == named_text
def test_roundtrip_max_hash(track_abundance):
    """max_hash is preserved when a signature is saved and reloaded."""
    mh = sourmash.MinHash(
        n=0, ksize=20, track_abundance=track_abundance, max_hash=10
    )
    mh.add_hash(5)
    original = SourmashSignature(mh)

    serialized = save_signatures([original])
    restored = list(load_signatures(serialized))[0]
    restored_mh = restored.minhash

    assert mh.max_hash == restored_mh.max_hash

    # Similarity is checked in both directions.
    assert original.similarity(restored) == 1.0
    assert restored.similarity(original) == 1.0
def test_save_minified(track_abundance):
    """save_signatures output has no newlines and still loads both signatures."""
    mh_k20 = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    sig_foo = SourmashSignature(mh_k20, name="foo")

    mh_k25 = sourmash.MinHash(n=1, ksize=25, track_abundance=track_abundance)
    sig_bar_baz = SourmashSignature(mh_k25, name="bar baz")

    serialized = save_signatures([sig_foo, sig_bar_baz])

    # The output must be a single line.
    assert '\n' not in serialized
    assert len(serialized.split('\n')) == 1

    loaded = list(load_signatures(serialized))
    assert len(loaded) == 2

    # Both names must be present among the loaded signatures.
    assert any(item.name() == 'foo' for item in loaded)
    assert any(item.name() == 'bar baz' for item in loaded)
def test_save_load_multisig_json():
    """Two signatures saved with the JSON helpers are both present after loading."""
    sig_k20 = SourmashSignature(sourmash.MinHash(n=1, ksize=20))
    sig_k25 = SourmashSignature(sourmash.MinHash(n=1, ksize=25))

    serialized = save_signatures_json([sig_k20, sig_k25])
    loaded = list(load_signatures_json(serialized))

    print(serialized)

    assert len(loaded) == 2
    # Membership checks rather than positional ones: order not guaranteed, note.
    assert sig_k20 in loaded
    assert sig_k25 in loaded
    assert sig_k20 != sig_k25
def test_save_load_multisig(track_abundance):
    """Two signatures saved together are both present after loading."""
    sig_k20 = SourmashSignature(
        sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    )
    sig_k25 = SourmashSignature(
        sourmash.MinHash(n=1, ksize=25, track_abundance=track_abundance)
    )

    serialized = save_signatures([sig_k20, sig_k25])
    loaded = list(load_signatures(serialized))

    print(serialized)

    assert len(loaded) == 2
    # Membership checks rather than positional ones: order not guaranteed, note.
    assert sig_k20 in loaded
    assert sig_k25 in loaded
    assert sig_k20 != sig_k25
def test_binary_fp(tmpdir, track_abundance):
    """save_signatures accepts a file object opened in binary mode."""
    mh = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    mh.add("AT" * 10)

    sig_path = str(tmpdir.join("1.sig"))
    with open(sig_path, 'wb') as out_fp:
        sig = SourmashSignature(mh)
        # Only the absence of an exception is checked; the result is unused.
        result = save_signatures([sig], out_fp)
def test_load_one_succeed(track_abundance):
    """load_one_signature returns an equal signature when exactly one was saved."""
    mh = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    original = SourmashSignature(mh)

    serialized = save_signatures([original])
    loaded = load_one_signature(serialized)

    assert original == loaded
def test_hashable(track_abundance):
    """Check that a signature can be stored in a set (i.e. it is hashable)."""
    mh = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    mh.add("AT" * 10)
    sig = SourmashSignature(mh)

    # Adding to a set raises TypeError for unhashable objects.
    bucket = set()
    bucket.add(sig)
def test_load_compressed(track_abundance):
    """Compressed signature data loads, both from memory and from a .gz file."""
    e1 = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    sig1 = SourmashSignature(e1)

    # Round-trip through save_signatures with compression enabled.
    x = save_signatures([sig1], compression=5)

    y = load_one_signature(x)
    assert sig1 == y

    sigfile = utils.get_test_data('genome-s10+s11.sig.gz')
    # Materialize the result so the file is actually read: elsewhere in this
    # file load_signatures is always wrapped in list(), which suggests it is
    # lazy, and an unconsumed iterator would never touch the compressed file.
    sigs = list(load_signatures(sigfile))
    # NOTE(review): assumes the fixture holds at least one signature - confirm.
    assert sigs
def test_similarity_downsample(track_abundance):
    """similarity() rejects mismatched max_hash unless downsample=True is given."""
    wide = sourmash.MinHash(
        n=0, ksize=20, track_abundance=track_abundance, max_hash=2**63
    )
    narrow = sourmash.MinHash(
        n=0, ksize=20, track_abundance=track_abundance, max_hash=2**2
    )

    wide.add_hash(1)
    wide.add_hash(5)
    assert len(wide.get_mins()) == 2

    narrow.add_hash(1)
    narrow.add_hash(5)  # should be discarded due to max_hash
    assert len(narrow.get_mins()) == 1

    wide_sig = SourmashSignature(wide)
    narrow_sig = SourmashSignature(narrow)

    # mismatch in max_hash
    with pytest.raises(ValueError):
        wide_sig.similarity(narrow_sig)

    downsampled = wide_sig.similarity(narrow_sig, downsample=True)
    assert round(downsampled, 1) == 1.0
def test_md5(track_abundance):
    """md5sum() of a signature holding the single hash 5 matches a fixed digest.

    NOTE(review): another ``test_md5`` is defined later in this file; if both
    stay in one module, the later definition replaces this one.
    """
    mh = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    mh.add_hash(5)
    sig = SourmashSignature(mh)

    # Print the saved form of the signature as a debugging aid.
    print(sig._save())

    expected_digest = 'eae27d77ca20db309e056e3d2dcd7d69'
    assert sig.md5sum() == expected_digest, sig.md5sum()
def test_name_3(track_abundance):
    """When both name and filename are given, name() returns the name."""
    mh = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    sig = SourmashSignature(mh, name='foo', filename='foo.txt')

    reported = sig.name()
    assert reported == 'foo'
def test_name_4(track_abundance):
    """With no name or filename, name() is the first 8 characters of md5sum()."""
    mh = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    sig = SourmashSignature(mh)

    reported = sig.name()
    digest_prefix = sig.md5sum()[:8]
    assert reported == digest_prefix
def test_md5(track_abundance):
    """md5sum() of a signature holding the single hash 5 matches a fixed digest."""
    mh = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    mh.add_hash(5)
    sig = SourmashSignature(mh)

    expected_digest = 'eae27d77ca20db309e056e3d2dcd7d69'
    # The actual digest is the assertion message, so a failure shows it.
    assert sig.md5sum() == expected_digest, sig.md5sum()
def sketch(name, sequence, params=None):
    """Build a named SourmashSignature from a sequence.

    Args:
        name: name given to the resulting signature.
        sequence: sequence passed to ``MinHash.add_sequence``.
        params: keyword arguments forwarded to the ``MinHash`` constructor.
            When omitted (or ``None``), ``{'ksize': 21, 'n': 0, 'scaled': 100}``
            is used.

    Returns:
        A ``SourmashSignature`` wrapping the populated sketch.
    """
    if params is None:
        # Build the defaults per call instead of sharing a single mutable
        # dict object across every invocation of this function.
        params = {'ksize': 21, 'n': 0, 'scaled': 100}

    mh = MinHash(**params)
    # "force" will sketch Ns in DNA sequences as well
    mh.add_sequence(sequence, force=True)
    return SourmashSignature(mh, name=name)