def test_size_limit(track_abundance):
    """Check behavior of a sketch built with a size limit of 3.

    ``track_abundance`` is forwarded unchanged to the ``MinHash`` constructor.
    """
    sketch = MinHash(3, 4, track_abundance=track_abundance)
    for hashval in (10, 20, 30):
        sketch.add_hash(hashval)
    assert sketch.get_mins() == [10, 20, 30]

    # Adding a smaller hash should push 30 off the end.
    sketch.add_hash(5)
    assert sketch.get_mins() == [5, 10, 20]
def test_abundance_simple():
    """Adding the same sequence twice raises its abundance, not the hash count."""
    kmer_hash = 2110480117637990133
    sketch = MinHash(20, 5, False, track_abundance=True)

    # After the first add the abundance is 1, after the second it is 2;
    # the plain hash list is the same both times.
    for expected_abundance in (1, 2):
        sketch.add_sequence('AAAAA')
        assert sketch.get_mins() == [kmer_hash]
        assert sketch.get_mins(with_abundance=True) == {
            kmer_hash: expected_abundance,
        }
def test_basic_dna_bad_force(track_abundance):
    """Check behavior on bad DNA when sequences are added with ``True``.

    Every ``add_sequence`` call passes ``True`` as its second positional
    argument (presumably ``force`` -- confirm against ``MinHash``).  The
    sketch is created with 100 so that multiple hashes can be added.
    """
    sketch = MinHash(100, 4, track_abundance=track_abundance)
    assert len(sketch.get_mins()) == 0

    # (sequence to add, number of hashes expected afterwards)
    steps = (
        ('ATGN', 0),   # ambiguous k-mer is skipped
        ('AATGN', 1),  # but good k-mers are still used
        ('AATG', 1),   # same k-mer again: a duplicate, so still one hash
    )
    for sequence, expected_count in steps:
        sketch.add_sequence(sequence, True)
        assert len(sketch.get_mins()) == expected_count
def test_basic_dna_bad_force_2(track_abundance):
    """Check behavior on bad DNA with longer sequences containing an ``N``.

    Every ``add_sequence`` call passes ``True`` as its second positional
    argument (presumably ``force`` -- confirm against ``MinHash``).
    """
    sketch = MinHash(100, 4, track_abundance=track_abundance)
    assert len(sketch.get_mins()) == 0

    # Ambiguous k-mers are skipped; here nothing is added.
    sketch.add_sequence('AAGNCGG', True)
    assert len(sketch.get_mins()) == 0

    # Ambiguous k-mers are skipped; two hashes are expected to remain.
    sketch.add_sequence('AATGNGCGG', True)
    assert len(sketch.get_mins()) == 2

    # Re-adding the two good k-mers checks that the right ones were kept:
    # only 2 hashes should be there.
    for kmer in ('AATG', 'GCGG'):
        sketch.add_sequence(kmer, True)
    assert len(sketch.get_mins()) == 2
def test_add_hash_with_abundance():
    """``add_hash_with_abundance`` accumulates counts per hash value."""
    sketch = MinHash(20, 5, False, track_abundance=True)

    # (hash, abundance to add, expected abundance mapping afterwards)
    steps = (
        (10, 1, {10: 1}),
        (20, 2, {10: 1, 20: 2}),
        (10, 2, {10: 3, 20: 2}),  # second add for hash 10 sums to 3
    )
    for hashval, abundance, expected in steps:
        sketch.add_hash_with_abundance(hashval, abundance)
        assert sketch.get_mins(with_abundance=True) == expected
def test_max_hash(track_abundance):
    """Check behavior of a sketch built with ``max_hash=35``."""
    sketch = MinHash(0, 4, track_abundance=track_abundance, max_hash=35)

    kept = [10, 20, 30]
    for hashval in kept:
        sketch.add_hash(hashval)
    assert sketch.get_mins() == [10, 20, 30]

    # Hashes above max_hash must leave the stored mins unchanged.
    for too_large in (40, 36):
        sketch.add_hash(too_large)
        assert sketch.get_mins() == [10, 20, 30]
def test_basic_dna(track_abundance):
    """Verify that sketches of size 1 stay size 1 and act as bottom sketches."""
    sketch = MinHash(1, 4, track_abundance=track_abundance)

    sketch.add_sequence('ATGC')
    mins_before = sketch.get_mins()

    # This will not get added; its hash is larger than that of ATGC.
    sketch.add_sequence('GCAT')
    mins_after = sketch.get_mins()

    print(mins_before, mins_after)
    assert mins_before == mins_after
    assert len(mins_after) == 1
def test_minhash_abund_add():
    """Add hashes in descending order to an abundance-tracking sketch.

    This targets part of bug #319, a segfault caused by invalidation of
    std::vector iterators upon vector resizing -- in this case, there was
    also a bug in inserting into the middle of mins when scaled was set.
    """
    sketch = MinHash(0, 10, track_abundance=True, max_hash=5000)

    # Every added hash is smaller than all previous ones; the number of
    # stored mins must grow by one each time.
    for added_so_far, hashval in enumerate(range(10, 0, -1), start=1):
        sketch.add_hash(hashval)
        assert len(sketch.get_mins()) == added_so_far
        print(len(sketch.get_mins()))
def test_set_abundance_clear_4():
    """With ``clear=False``, setting an already-set hash adds the abundances."""
    sketch = MinHash(20, 5, False, track_abundance=True)

    first_batch = {20: 2, 10: 1}
    sketch.set_abundances(first_batch, clear=False)
    # The hashes should also come back sorted.
    assert sketch.get_mins(with_abundance=True) == {10: 1, 20: 2}

    second_batch = {20: 1, 10: 2}
    sketch.set_abundances(second_batch, clear=False)
    assert sketch.get_mins(with_abundance=True) == {10: 3, 20: 3}
def test_dayhoff(track_abundance):
    """Verify hashing to dayhoff-encoded protein/aa sequences.

    The dayhoff-encoded hashes must differ from the plain protein hashes of
    the same input.
    """
    sequence = 'ACTGAC'

    dayhoff_sketch = MinHash(
        10, 6,
        is_protein=True,
        dayhoff=True,
        hp=False,
        track_abundance=track_abundance,
    )
    dayhoff_sketch.add_sequence(sequence)
    assert len(dayhoff_sketch.get_mins()) == 2

    protein_sketch = MinHash(
        10, 6,
        is_protein=True,
        track_abundance=track_abundance,
    )
    protein_sketch.add_sequence(sequence)
    assert len(protein_sketch.get_mins()) == 2

    assert protein_sketch.get_mins() != dayhoff_sketch.get_mins()
def test_protein_hp(track_abundance, hp):
    """Verify that protein/aa sequences can be hashed, with or without ``hp``."""
    sketch = MinHash(
        10, 6, True,
        dayhoff=False,
        hp=hp,
        track_abundance=track_abundance,
    )
    sketch.add_protein('AGYYG')

    # One hash is expected with hp encoding, four without it.
    expected_count = 1 if hp else 4
    assert len(sketch.get_mins()) == expected_count
def test_scaled(track_abundance):
    """Check behavior with ``scaled`` (the alternative to ``max_hash``)."""
    scaled_value = get_scaled_for_max_hash(35)
    print('XX', scaled_value, get_max_hash_for_scaled(scaled_value))

    sketch = MinHash(
        0, 4, track_abundance=track_abundance, scaled=scaled_value,
    )
    assert sketch.max_hash == 35

    for hashval in (10, 20, 30):
        sketch.add_hash(hashval)
    assert sketch.get_mins() == [10, 20, 30]

    # Hashes above the derived max_hash must leave the mins unchanged.
    for too_large in (40, 36):
        sketch.add_hash(too_large)
        assert sketch.get_mins() == [10, 20, 30]
def test_hp(track_abundance):
    """Verify hashing to hp-encoded protein/aa sequences.

    The hp-encoded hashes must differ from the plain protein hashes of the
    same input.
    """
    sequence = 'ACTGAC'

    hp_sketch = MinHash(
        10, 6,
        is_protein=True,
        dayhoff=False,
        hp=True,
        track_abundance=track_abundance,
    )
    assert hp_sketch.moltype == 'hp'
    hp_sketch.add_sequence(sequence)
    assert len(hp_sketch.get_mins()) == 2

    protein_sketch = MinHash(
        10, 6,
        is_protein=True,
        track_abundance=track_abundance,
    )
    protein_sketch.add_sequence(sequence)
    assert len(protein_sketch.get_mins()) == 2

    assert protein_sketch.get_mins() != hp_sketch.get_mins()
def test_bytes_dna(track_abundance):
    """DNA given as str, bytes or a ``u''`` literal yields the same size-1 sketch."""
    # NOTE(review): this file appears to define `test_bytes_dna` twice; the
    # later definition would replace this one at import time -- confirm and
    # rename or remove one of them.
    sketch = MinHash(1, 4, track_abundance=track_abundance)

    for sequence in ('ATGC', b'ATGC', u'ATGC'):
        sketch.add_sequence(sequence)
    mins_before = sketch.get_mins()

    # None of these will get added; their hash is larger than that of ATGC.
    for sequence in ('GCAT', b'GCAT', u'GCAT'):
        sketch.add_sequence(sequence)
    mins_after = sketch.get_mins()

    print(mins_before, mins_after)
    assert mins_before == mins_after
    assert len(mins_after) == 1
def test_bytes_dna(track_abundance):
    """DNA given as str or bytes yields the same size-1 sketch."""
    # NOTE(review): this file appears to define `test_bytes_dna` twice; this
    # later definition would replace the earlier one at import time --
    # confirm and rename or remove one of them.
    sketch = MinHash(1, 4, track_abundance=track_abundance)

    for sequence in ('ATGC', b'ATGC', 'ATGC'):
        sketch.add_sequence(sequence)
    mins_before = sketch.get_mins()

    # None of these will get added; their hash is larger than that of ATGC.
    for sequence in ('GCAT', b'GCAT', 'GCAT'):
        sketch.add_sequence(sequence)
    mins_after = sketch.get_mins()

    print(mins_before, mins_after)
    assert mins_before == mins_after
    assert len(mins_after) == 1
def test_bytes_protein_hp(track_abundance, hp):
    """Verify hashing of protein/aa sequences given as str, ``u''`` and bytes."""
    sketch = MinHash(
        10, 6, True,
        dayhoff=False,
        hp=hp,
        track_abundance=track_abundance,
    )

    expected_moltype = 'hp' if hp else 'protein'
    assert sketch.moltype == expected_moltype

    for sequence in ('AGYYG', u'AGYYG', b'AGYYG'):
        sketch.add_protein(sequence)

    # One hash is expected with hp encoding, four without it.
    expected_count = 1 if hp else 4
    assert len(sketch.get_mins()) == expected_count
def test_abundance_count_common():
    """``count_common`` between an abundance-tracking and a plain sketch."""
    hash_aaaaa = 2110480117637990133
    hash_other = 10798773792509008305

    with_abund = MinHash(20, 5, False, track_abundance=True)
    without_abund = MinHash(20, 5, False, track_abundance=False)

    for _ in range(2):
        with_abund.add_sequence('AAAAA')
    assert with_abund.get_mins() == [hash_aaaaa]
    assert with_abund.get_mins(with_abundance=True) == {hash_aaaaa: 2}

    for sequence in ('AAAAA', 'GGGGG'):
        without_abund.add_sequence(sequence)

    assert with_abund.count_common(without_abund) == 1
    assert (with_abund.count_common(without_abund)
            == without_abund.count_common(with_abund))

    # The sketch that does not track abundance is expected to hand back a
    # plain list even when abundances are requested.
    assert without_abund.get_mins(with_abundance=True) == [
        hash_aaaaa,
        hash_other,
    ]
def test_mh_asymmetric_merge(track_abundance):
    """Merge two asymmetric (different size) sketches and compare them."""
    # NOTE(review): this file appears to define `test_mh_asymmetric_merge`
    # twice; the later definition would replace this one at import time --
    # confirm and rename or remove one of them.
    larger = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        larger.add_hash(hashval)

    # different size: 10
    smaller = MinHash(10, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        smaller.add_hash(hashval)

    merged_into_larger = larger.merge(smaller)
    merged_into_smaller = smaller.merge(larger)

    assert len(larger) == 20
    assert len(smaller) == 10
    assert len(merged_into_larger) == len(larger)
    assert len(merged_into_smaller) == len(smaller)

    # can't compare different sizes without downsampling
    with pytest.raises(TypeError):
        merged_into_smaller.compare(larger)

    larger = larger.downsample_n(merged_into_smaller.num)
    print(larger.get_mins())
    print(merged_into_smaller.get_mins())
    assert merged_into_smaller.compare(larger) == 1.0

    merged_into_larger = merged_into_larger.downsample_n(smaller.num)
    assert merged_into_larger.compare(smaller) == 1.0
def test_bytes_protein(track_abundance):
    """Verify hashing of protein/aa sequences given as str, ``u''`` and bytes."""
    sketch = MinHash(10, 6, True, track_abundance=track_abundance)

    for sequence in ('AGYYG', u'AGYYG', b'AGYYG'):
        sketch.add_protein(sequence)

    assert len(sketch.get_mins()) == 4
def test_reset_abundance_initialized():
    """Turning abundance tracking off on a populated sketch must not fail."""
    sketch = MinHash(1, 4, track_abundance=True)
    sketch.add_sequence('ATGC')

    # If we had a minhash with abundances and drop it, this shouldn't fail.
    # Convert from an abundance sketch to a regular one.
    sketch.track_abundance = False

    expected_mins = [12415348535738636339]
    assert sketch.get_mins(with_abundance=True) == expected_mins
def test_protein_dayhoff(track_abundance, dayhoff):
    """Verify that protein/aa sequences can be hashed for either ``dayhoff``."""
    sketch = MinHash(
        10, 6, True,
        dayhoff=dayhoff,
        hp=False,
        track_abundance=track_abundance,
    )
    sketch.add_protein('AGYYG')

    hash_count = len(sketch.get_mins())
    assert hash_count == 4
def test_bytes_protein_dayhoff(track_abundance, dayhoff):
    """Verify hashing of protein/aa sequences given as str and bytes."""
    sketch = MinHash(
        10, 6, True,
        dayhoff=dayhoff,
        hp=False,
        track_abundance=track_abundance,
    )

    expected_moltype = 'dayhoff' if dayhoff else 'protein'
    assert sketch.moltype == expected_moltype

    for sequence in ('AGYYG', 'AGYYG', b'AGYYG'):
        sketch.add_protein(sequence)

    assert len(sketch.get_mins()) == 4
def test_add_many(track_abundance):
    """``add_many`` must produce the same sketch as repeated ``add_hash``."""
    bulk = MinHash(0, 10, track_abundance=track_abundance, max_hash=5000)
    one_by_one = MinHash(0, 10, track_abundance=track_abundance, max_hash=5000)
    even_hashes = range(0, 100, 2)

    # Add the full batch twice, passing a fresh list each time.
    for _ in range(2):
        bulk.add_many(list(even_hashes))

    assert len(bulk) == 50
    assert all(hashval % 2 == 0 for hashval in bulk.get_mins())

    # Mirror the double insertion with individual add_hash calls.
    for hashval in even_hashes:
        one_by_one.add_hash(hashval)
        one_by_one.add_hash(hashval)

    assert len(one_by_one) == 50
    assert bulk == one_by_one
def test_pickle_scaled(track_abundance):
    """A sketch built with ``scaled`` survives a pickle round trip."""
    original = MinHash(
        0, 10,
        track_abundance=track_abundance,
        scaled=922337203685477632,
    )
    for hashval in range(0, 40, 2):
        original.add_hash(hashval)

    payload = pickle.dumps(original)
    restored = pickle.loads(payload)

    assert original.ksize == restored.ksize
    assert restored.num == original.num
    assert restored.max_hash == original.max_hash
    assert restored.max_hash == 20
    assert not restored.is_protein
    assert restored.track_abundance == track_abundance
    assert restored.seed == original.seed
    assert len(restored.get_mins()) == len(original.get_mins())
    assert len(restored.get_mins()) == 11
    assert original.scaled == restored.scaled
    assert restored.scaled != 0
def test_pickle_max_hash(track_abundance):
    """A sketch built with ``max_hash`` survives a pickle round trip."""
    original = MinHash(0, 10, track_abundance=track_abundance, max_hash=20)
    for hashval in range(0, 40, 2):
        original.add_hash(hashval)

    payload = pickle.dumps(original)
    restored = pickle.loads(payload)

    assert original.ksize == restored.ksize
    assert restored.num == original.num
    assert restored.max_hash == original.max_hash
    assert restored.max_hash == 20
    assert not restored.is_protein
    assert restored.track_abundance == track_abundance
    assert restored.seed == original.seed
    assert len(restored.get_mins()) == len(original.get_mins())
    assert len(restored.get_mins()) == 11
    assert original.scaled == restored.scaled
    assert restored.scaled != 0
def test_remove_many(track_abundance):
    """``remove_many`` drops the listed hashes and changes the signature md5."""
    sketch = MinHash(0, 10, track_abundance=track_abundance, max_hash=5000)
    sketch.add_many(list(range(0, 100, 2)))

    md5_before = signature.SourmashSignature(sketch).md5sum()

    # Removing every multiple of 3 takes out the even multiples of 6.
    sketch.remove_many(list(range(0, 100, 3)))

    md5_after = signature.SourmashSignature(sketch).md5sum()

    assert md5_before == "f1cc295157374f5c07cfca5f867188a1"
    assert md5_after == "dd93fa319ef57f4a019c59ee1a8c73e2"
    assert md5_before != md5_after

    assert len(sketch) == 33
    assert all(hashval % 6 != 0 for hashval in sketch.get_mins())
class TimeMinAbundanceSuite(TimeMinHashSuite):
    """Timing suite for abundance-tracking ``MinHash`` operations."""

    def setup(self):
        """Run the parent setup, then build an empty and a populated sketch."""
        TimeMinHashSuite.setup(self)
        self.mh = MinHash(500, 21, track_abundance=True)

        # self.sequences is presumably provided by the parent setup --
        # confirm against TimeMinHashSuite.
        populated = MinHash(500, 21, track_abundance=True)
        self.populated_mh = populated
        for sequence in self.sequences:
            populated.add_sequence(sequence)

    def time_get_mins_abundance(self):
        """Call ``get_mins(with_abundance=True)`` 500 times."""
        sketch = self.populated_mh
        for _ in range(500):
            sketch.get_mins(with_abundance=True)

    def time_set_abundances(self):
        """Call ``set_abundances`` 500 times with the populated sketch's mins."""
        target = self.mh
        abundances = self.populated_mh.get_mins(with_abundance=True)
        for _ in range(500):
            target.set_abundances(abundances)
def test_intersection_errors(track_abundance):
    """Check ``intersection`` with ``in_common=False`` and with bad arguments.

    Two sketches of the same size receive the same sequence; a third sketch
    has a different size (30 instead of 20).  The test expects:

    * ``a.intersection(b, in_common=False)`` to return an empty set together
      with a combined size of 3;
    * ``TypeError`` when the argument is a plain ``set``;
    * ``TypeError`` when the argument is the differently sized sketch.
    """
    a = MinHash(20, 10, track_abundance=track_abundance)
    b = MinHash(20, 10, track_abundance=track_abundance)
    c = MinHash(30, 10, track_abundance=track_abundance)

    a.add_sequence("TGCCGCCCAGCA")
    b.add_sequence("TGCCGCCCAGCA")

    combined_size = 3

    intersection, size = a.intersection(b, in_common=False)
    assert intersection == set()
    assert combined_size == size

    # A non-MinHash argument is rejected.
    with pytest.raises(TypeError):
        a.intersection(set())

    # So is a sketch built with a different size.
    with pytest.raises(TypeError):
        a.intersection(c)
def test_mh_asymmetric_merge(track_abundance):
    """Merge two asymmetric (different size) sketches and check similarity."""
    # NOTE(review): this file appears to define `test_mh_asymmetric_merge`
    # twice; this later definition would replace the earlier one at import
    # time -- confirm and rename or remove one of them.
    larger = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        larger.add_hash(hashval)

    # different size: 10
    smaller = MinHash(10, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        smaller.add_hash(hashval)

    merged_into_larger = larger.merge(smaller)
    merged_into_smaller = smaller.merge(larger)

    assert len(larger) == 20
    assert len(smaller) == 10
    assert len(merged_into_larger) == len(larger)
    assert len(merged_into_smaller) == len(smaller)

    # can't use jaccard on different nums without downsampling
    with pytest.raises(TypeError):
        merged_into_smaller.jaccard(larger)

    larger = larger.downsample_n(merged_into_smaller.num)
    print(larger.get_mins())
    print(merged_into_smaller.get_mins())

    # The expected rounded similarity depends on abundance tracking.
    expected_rounded = 0.91 if track_abundance else 1.0
    assert round(merged_into_smaller.similarity(larger), 3) == expected_rounded

    merged_into_larger = merged_into_larger.downsample_n(smaller.num)
    if track_abundance:
        assert round(merged_into_larger.similarity(smaller), 3) == 0.91
    else:
        # Without abundance the unrounded value is compared directly.
        assert merged_into_larger.similarity(smaller) == 1.0
def test_short_sequence(track_abundance):
    """Adding a sequence shorter than the k-mer size should fail silently."""
    sketch = MinHash(20, 5, track_abundance=track_abundance)

    # 'GGGG' has four bases, one fewer than the 5 given to the constructor.
    sketch.add_sequence('GGGG')

    stored = sketch.get_mins()
    assert len(stored) == 0
def test_mh_unsigned_long_long(track_abundance):
    """A hash value too big for a C long int can be stored and read back."""
    big_hash = 9227159859419181011  # too big for a C long int.

    sketch = MinHash(20, 10, track_abundance=track_abundance)
    sketch.add_hash(big_hash)

    assert big_hash in sketch.get_mins()
def test_mh_len(track_abundance):
    """Twenty even hashes added to a size-20 sketch are all returned in order."""
    sketch = MinHash(20, 10, track_abundance=track_abundance)

    even_hashes = range(0, 40, 2)
    for hashval in even_hashes:
        sketch.add_hash(hashval)

    assert sketch.get_mins() == list(range(0, 40, 2))
def test_intersection_1(track_abundance):
    """Check ``intersection(..., in_common=True)`` for every pairing of two sketches."""
    # NOTE(review): this file appears to define `test_intersection_1` twice;
    # the later definition would replace this one at import time -- confirm
    # and rename or remove one of them.
    a = MinHash(20, 10, track_abundance=track_abundance)
    b = MinHash(20, 10, track_abundance=track_abundance)

    a.add_sequence('TGCCGCCCAGCA')
    b.add_sequence('TGCCGCCCAGCA')

    common = set(a.get_mins())
    combined_size = 3

    # Order matters only for reproducing the original call sequence.
    all_pairings = ((a, b), (b, b), (b, a), (a, a))

    def assert_every_pairing_matches():
        # Each pairing must report the shared hashes and the combined size.
        for left, right in all_pairings:
            intersection, size = left.intersection(right, in_common=True)
            assert intersection == common
            assert combined_size == size

    assert_every_pairing_matches()

    # add same sequence again
    b.add_sequence('TGCCGCCCAGCA')
    assert_every_pairing_matches()

    # Now make the two sketches diverge.
    a.add_sequence('GTCCGCCCAGTGA')
    b.add_sequence('GTCCGCCCAGTGG')

    new_in_common = set(a.get_mins()).intersection(set(b.get_mins()))
    new_combined_size = 8

    for left, right in ((a, b), (b, a)):
        intersection, size = left.intersection(right, in_common=True)
        assert intersection == new_in_common
        assert size == new_combined_size

    # A sketch intersected with itself yields all of its own hashes.
    for sketch in (a, b):
        intersection, size = sketch.intersection(sketch, in_common=True)
        assert intersection == set(sketch.get_mins())
def test_intersection_1(track_abundance):
    """Check ``intersection`` for every pairing of two sketches."""
    # NOTE(review): this file appears to define `test_intersection_1` twice;
    # this later definition would replace the earlier one at import time --
    # confirm and rename or remove one of them.
    a = MinHash(20, 10, track_abundance=track_abundance)
    b = MinHash(20, 10, track_abundance=track_abundance)

    a.add_sequence('TGCCGCCCAGCA')
    b.add_sequence('TGCCGCCCAGCA')

    common = set(a.get_mins())
    combined_size = 3

    # Order matters only for reproducing the original call sequence.
    all_pairings = ((a, b), (b, b), (b, a), (a, a))

    def assert_every_pairing_matches():
        # Each pairing must report the shared hashes and the combined size.
        for left, right in all_pairings:
            intersection, size = left.intersection(right)
            assert intersection == common
            assert combined_size == size

    assert_every_pairing_matches()

    # add same sequence again
    b.add_sequence('TGCCGCCCAGCA')
    assert_every_pairing_matches()

    # Now make the two sketches diverge.
    a.add_sequence('GTCCGCCCAGTGA')
    b.add_sequence('GTCCGCCCAGTGG')

    new_in_common = set(a.get_mins()).intersection(set(b.get_mins()))
    new_combined_size = 8

    for left, right in ((a, b), (b, a)):
        intersection, size = left.intersection(right)
        assert intersection == new_in_common
        assert size == new_combined_size

    # A sketch intersected with itself yields all of its own hashes.
    for sketch in (a, b):
        intersection, size = sketch.intersection(sketch)
        assert intersection == set(sketch.get_mins())
def test_protein_short(track_abundance):
    """A two-residue protein added to a sketch built with 9 yields no hashes."""
    sketch = MinHash(10, 9, True, track_abundance=track_abundance)

    sketch.add_protein('AG')

    # On failure, the assertion message shows whatever was stored.
    assert len(sketch.get_mins()) == 0, sketch.get_mins()