def test_mh_similarity_downsample_errors(track_abundance):
    """MinHash.similarity must refuse sketches whose max_hash differs.

    ``downsample`` is never passed here (the default, False), so every
    comparison between the two sketches is expected to raise ``ValueError``,
    whichever sketch is the receiver and whatever ``ignore_abundance`` is.
    """
    # Two sketches that differ only in max_hash (50 vs. 100).
    small = MinHash(0, 20, max_hash=50, track_abundance=track_abundance)
    large = MinHash(0, 20, max_hash=100, track_abundance=track_abundance)

    small_counts = {1: 5, 3: 3, 5: 2, 8: 2}
    large_counts = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}
    if not track_abundance:
        # Without abundance tracking only the hash values themselves matter.
        small.add_many(small_counts.keys())
        large.add_many(large_counts.keys())
    else:
        small.set_abundances(small_counts)
        large.set_abundances(large_counts)

    expected_message = 'mismatch in scaled; comparison fail'

    # Both directions, with and without abundance: all must fail the same way.
    for left, right in ((small, large), (large, small)):
        for ignore in (True, False):
            with pytest.raises(ValueError) as excinfo:
                left.similarity(right, ignore_abundance=ignore)
            # The captured exception is inspected once the ``with`` exits.
            assert expected_message in str(excinfo.value)
def test_mh_similarity_downsample_true(track_abundance):
    """With downsample=True, similarity must be symmetric: sim(a, b) == sim(b, a).

    The two sketches have different max_hash values (50 vs. 100); passing
    ``downsample=True`` is expected to make the comparison succeed instead of
    raising. Symmetry is checked both with and without ``ignore_abundance``.
    """
    small = MinHash(0, 20, max_hash=50, track_abundance=track_abundance)
    large = MinHash(0, 20, max_hash=100, track_abundance=track_abundance)

    small_counts = {1: 5, 3: 3, 5: 2, 8: 2}
    large_counts = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}
    if not track_abundance:
        # Without abundance tracking only the hash values themselves matter.
        small.add_many(small_counts.keys())
        large.add_many(large_counts.keys())
    else:
        small.set_abundances(small_counts)
        large.set_abundances(large_counts)

    # No error is expected; the value must not depend on argument order.
    for ignore in (True, False):
        forward = small.similarity(
            large, ignore_abundance=ignore, downsample=True)
        backward = large.similarity(
            small, ignore_abundance=ignore, downsample=True)
        assert forward == backward
def test_mh_jaccard_similarity():
    """Check the actual Jaccard value for a non-trivial pair of sketches.

    The sketches share 4 hashes (1, 3, 5, 8) out of 6 distinct hashes in
    total (1, 3, 5, 6, 8, 10), so the expected similarity is 4/6.
    """
    import math  # local import keeps this test self-contained

    a = MinHash(0, 20, max_hash=50, track_abundance=False)
    b = MinHash(0, 20, max_hash=50, track_abundance=False)
    a.add_many([1, 3, 5, 8])
    b.add_many([1, 3, 5, 6, 8, 10])

    # Compare with a tolerance: exact float equality would make the test
    # depend on how the implementation happens to round its division.
    assert math.isclose(a.similarity(b), 4 / 6)
def test_mh_similarity_downsample_jaccard_value():
    """Check the Jaccard value obtained after downsampling.

    The sketches use different max_hash values (50 vs. 100) and both are fed
    the hash 70. The expected value of 4/6 corresponds to 70 being left out
    of the comparison: 4 shared hashes (1, 3, 5, 8) out of 6 distinct ones
    (1, 3, 5, 6, 8, 10).
    """
    import math  # local import keeps this test self-contained

    a = MinHash(0, 20, max_hash=50, track_abundance=False)
    b = MinHash(0, 20, max_hash=100, track_abundance=False)
    a.add_many([1, 3, 5, 8, 70])
    b.add_many([1, 3, 5, 6, 8, 10, 70])

    # Compare with a tolerance: exact float equality would make the test
    # depend on how the implementation happens to round its division.
    assert math.isclose(a.similarity(b, downsample=True), 4 / 6)
def test_add_many(track_abundance):
    """add_many must build the same sketch as repeated add_hash calls.

    Every even number below 100 is inserted twice through each API; both
    sketches are expected to report 50 entries and to compare equal.
    """
    evens = range(0, 100, 2)

    bulk = MinHash(0, 10, track_abundance=track_abundance, max_hash=5000)
    single = MinHash(0, 10, track_abundance=track_abundance, max_hash=5000)

    # Bulk insertion, done twice with a fresh list each time.
    for _ in range(2):
        bulk.add_many(list(evens))

    assert len(bulk) == 50
    assert all(value % 2 == 0 for value in bulk.get_mins())

    # One-at-a-time insertion, each hash added twice in a row.
    for hashval in evens:
        for _ in range(2):
            single.add_hash(hashval)

    assert len(single) == 50
    assert bulk == single
def test_remove_many(track_abundance):
    """remove_many must drop the requested hashes and change the signature md5.

    The sketch starts with every even number below 100; every multiple of 3
    below 100 is then removed. The md5 of the wrapping signature is pinned
    both before and after the removal.
    """
    sketch = MinHash(0, 10, track_abundance=track_abundance, max_hash=5000)
    sketch.add_many(list(range(0, 100, 2)))

    # Take the digest *before* mutating the sketch.
    md5_before = signature.SourmashSignature(sketch).md5sum()

    sketch.remove_many(list(range(0, 100, 3)))
    md5_after = signature.SourmashSignature(sketch).md5sum()

    assert md5_before == "f1cc295157374f5c07cfca5f867188a1"
    assert md5_after == "dd93fa319ef57f4a019c59ee1a8c73e2"
    assert md5_before != md5_after

    # 50 even numbers minus the 17 multiples of 6 leaves 33 hashes.
    assert len(sketch) == 33
    assert all(value % 6 != 0 for value in sketch.get_mins())