def test_div_zero_contained(track_abundance):
    """Check contained_by against an empty sketch in both directions.

    Guards against divide-by-zero errors when an empty MinHash takes part
    in a contained_by calculation; both directions must evaluate to 0.
    """
    populated = MinHash(1, 4, track_abundance=track_abundance)
    # The cleared copy is taken before any sequence is added.
    empty = populated.copy_and_clear()
    populated.add_sequence('ATGC')

    assert populated.contained_by(empty) == 0
    assert empty.contained_by(populated) == 0
def test_basic_dna_bad(track_abundance):
    """Check the error raised when a sequence holds an invalid DNA character.

    Adding 'ATGR' must raise ValueError, and the exception message must
    name the offending k-mer.
    """
    mh = MinHash(1, 4, track_abundance=track_abundance)

    with pytest.raises(ValueError) as excinfo:
        mh.add_sequence('ATGR')

    # Inspect the raised exception itself (excinfo.value). str() of the
    # ExceptionInfo wrapper is not the exception message on current pytest.
    message = str(excinfo.value)
    print(message)

    assert 'invalid DNA character in input k-mer: ATGR' in message
def test_abundance_simple():
    """Adding 'AAAAA' twice keeps a single hash and raises its count to 2."""
    aaaaa_hash = 2110480117637990133
    sketch = MinHash(20, 5, False, track_abundance=True)

    # Each pass adds the same 5-mer once more; only the abundance changes.
    for expected_count in (1, 2):
        sketch.add_sequence('AAAAA')
        assert sketch.get_mins() == [aaaaa_hash]
        assert sketch.get_mins(with_abundance=True) == {
            aaaaa_hash: expected_count,
        }
def test_consume_lowercase(track_abundance):
    """A lowercased sequence compares as identical to its uppercase form."""
    sequence = 'TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA'

    lower_mh = MinHash(20, 10, track_abundance=track_abundance)
    upper_mh = MinHash(20, 10, track_abundance=track_abundance)
    lower_mh.add_sequence(sequence.lower())
    upper_mh.add_sequence(sequence)

    # Every pairing, including each sketch against itself, must score 1.0.
    pairings = (
        (lower_mh, upper_mh),
        (upper_mh, upper_mh),
        (upper_mh, lower_mh),
        (lower_mh, lower_mh),
    )
    for left, right in pairings:
        assert left.compare(right) == 1.0
class TimeMinHashSuite:
    """Timing routines for MinHash operations with abundance tracking off.

    ``setup`` creates a fresh sketch (``self.mh``), a list of sequences
    (``self.sequences``) and a sketch that has had every one of those
    sequences added (``self.populated_mh``). Each ``time_*`` method then
    repeats a single MinHash operation in a loop.
    """

    def setup(self):
        """Build the fresh sketch, the sequence list and the filled sketch."""
        self.mh = MinHash(500, 21, track_abundance=False)

        # The loaded sequence list is repeated ten times over.
        genes = load_sequences(get_test_data('ecoli.genes.fna'))
        self.sequences = genes * 10

        filled = MinHash(500, 21, track_abundance=False)
        for record in self.sequences:
            filled.add_sequence(record)
        self.populated_mh = filled

    def time_add_sequence(self):
        """Add every sequence in self.sequences to self.mh."""
        sketch = self.mh
        for record in self.sequences:
            sketch.add_sequence(record)

    def time_get_mins(self):
        """Call get_mins() 500 times on the filled sketch."""
        sketch = self.populated_mh
        for _ in range(500):
            sketch.get_mins()

    def time_add_hash(self):
        """Add the integers 0..9999 to self.mh as hashes."""
        sketch = self.mh
        for value in range(10000):
            sketch.add_hash(value)

    def time_compare(self):
        """Compare self.mh with the filled sketch 500 times."""
        sketch, other = self.mh, self.populated_mh
        for _ in range(500):
            sketch.compare(other)

    def time_count_common(self):
        """Run count_common between self.mh and the filled sketch 500 times."""
        sketch, other = self.mh, self.populated_mh
        for _ in range(500):
            sketch.count_common(other)

    def time_merge(self):
        """Merge the filled sketch into self.mh 500 times."""
        sketch, other = self.mh, self.populated_mh
        for _ in range(500):
            sketch.merge(other)

    def time_copy(self):
        """Call __copy__() 500 times on the filled sketch."""
        sketch = self.populated_mh
        for _ in range(500):
            sketch.__copy__()

    def time_concat(self):
        """Apply ``+=`` with the filled sketch 500 times."""
        # The augmented assignment rebinds only this local name; self.mh is
        # never reassigned here.
        sketch, other = self.mh, self.populated_mh
        for _ in range(500):
            sketch += other
def test_basic_dna(track_abundance):
    """A size-1 MinHash stays at size 1 and acts as a bottom sketch."""
    sketch = MinHash(1, 4, track_abundance=track_abundance)

    sketch.add_sequence('ATGC')
    mins_before = sketch.get_mins()

    # GCAT will not get added: its hash is greater than ATGC's.
    sketch.add_sequence('GCAT')
    mins_after = sketch.get_mins()

    print(mins_before, mins_after)
    assert mins_before == mins_after
    assert len(mins_after) == 1
def test_abundance_compare():
    """Compare an abundance-tracking sketch with a non-tracking one.

    Both sketches score 1.0 against each other after the same sequence is
    added, and still do after the non-tracking sketch receives it a second
    time. Once the non-tracking sketch gets a different sequence, the
    cross-comparisons must be at least 0.3 while self-comparisons stay 1.0.
    """
    first_seq = 'TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA'
    second_seq = 'GATTGGTGCACACTTAACTGGGTGCCGCGCTGGTGCTGATCCATGAAGTT'

    tracked = MinHash(20, 10, track_abundance=True)
    untracked = MinHash(20, 10, track_abundance=False)
    every_pairing = (
        (tracked, untracked),
        (untracked, untracked),
        (untracked, tracked),
        (tracked, tracked),
    )

    tracked.add_sequence(first_seq)
    untracked.add_sequence(first_seq)
    for left, right in every_pairing:
        assert left.compare(right) == 1.0

    # add same sequence again
    untracked.add_sequence(first_seq)
    for left, right in every_pairing:
        assert left.compare(right) == 1.0

    untracked.add_sequence(second_seq)
    for left, right in ((tracked, untracked), (untracked, tracked)):
        similarity = left.compare(right)
        assert similarity >= 0.3, similarity

    for sketch in (tracked, untracked):
        assert sketch.compare(sketch) == 1.0
def test_abundance_count_common():
    """count_common between a tracking and a non-tracking sketch is symmetric.

    The tracking sketch holds 'AAAAA' twice; the non-tracking sketch holds
    'AAAAA' and 'GGGGG'. Exactly one hash is shared, whichever side asks.
    """
    aaaaa_hash = 2110480117637990133
    other_hash = 10798773792509008305

    tracked = MinHash(20, 5, False, track_abundance=True)
    untracked = MinHash(20, 5, False, track_abundance=False)

    for _ in range(2):
        tracked.add_sequence('AAAAA')
    assert tracked.get_mins() == [aaaaa_hash]
    assert tracked.get_mins(with_abundance=True) == {aaaaa_hash: 2}

    for kmer in ('AAAAA', 'GGGGG'):
        untracked.add_sequence(kmer)

    assert tracked.count_common(untracked) == 1
    assert tracked.count_common(untracked) == untracked.count_common(tracked)

    # On the non-tracking sketch, with_abundance=True still gives a list.
    assert untracked.get_mins(with_abundance=True) == [aaaaa_hash, other_hash]
class TimeMinAbundanceSuite(TimeMinHashSuite):
    """TimeMinHashSuite variant whose sketches track abundance.

    Inherits every ``time_*`` method of the parent and adds two timings
    that use the abundance-aware API.
    """

    def setup(self):
        """Run the parent setup, then rebuild both sketches with abundance."""
        # The parent call provides self.sequences; both sketches it created
        # are replaced below by abundance-tracking ones.
        TimeMinHashSuite.setup(self)

        self.mh = MinHash(500, 21, track_abundance=True)

        filled = MinHash(500, 21, track_abundance=True)
        for record in self.sequences:
            filled.add_sequence(record)
        self.populated_mh = filled

    def time_get_mins_abundance(self):
        """Call get_mins(with_abundance=True) 500 times on the filled sketch."""
        sketch = self.populated_mh
        for _ in range(500):
            sketch.get_mins(with_abundance=True)

    def time_set_abundances(self):
        """Feed the filled sketch's abundances to self.mh 500 times."""
        sketch = self.mh
        # Fetched once, outside the loop, so only set_abundances is repeated.
        abundances = self.populated_mh.get_mins(with_abundance=True)
        for _ in range(500):
            sketch.set_abundances(abundances)
def test_bytes_dna(track_abundance):
    """add_sequence accepts plain, bytes and u-prefixed string literals.

    A size-1 sketch must hold the same single hash after 'GCAT' is added
    in each of the three literal forms as it did after 'ATGC'.
    """
    sketch = MinHash(1, 4, track_abundance=track_abundance)

    for kept in ('ATGC', b'ATGC', u'ATGC'):
        sketch.add_sequence(kept)
    mins_before = sketch.get_mins()

    # None of these will get added; hash > ATGC.
    for rejected in ('GCAT', b'GCAT', u'GCAT'):
        sketch.add_sequence(rejected)
    mins_after = sketch.get_mins()

    print(mins_before, mins_after)
    assert mins_before == mins_after
    assert len(mins_after) == 1
def test_basic_dna_bad_force(track_abundance):
    """Bad DNA added with the second add_sequence argument set to True.

    Ambiguous k-mers are skipped while valid k-mers from the same sequence
    are still used. The sketch size is 100 so multiple hashes can be added.
    """
    sketch = MinHash(100, 4, track_abundance=track_abundance)
    assert len(sketch.get_mins()) == 0

    steps = (
        ('ATGN', 0),   # ambiguous kmer skipped.
        ('AATGN', 1),  # but good k-mers still used.
        ('AATG', 1),   # checking that right kmer was added (this is a dup)
    )
    for sequence, expected_size in steps:
        sketch.add_sequence(sequence, True)
        assert len(sketch.get_mins()) == expected_size
def test_basic_dna_bad_force_2(track_abundance):
    """Bad DNA in mid-sequence, added with the second argument set to True.

    Only the k-mers free of the ambiguous character contribute hashes.
    """
    sketch = MinHash(100, 4, track_abundance=track_abundance)
    assert len(sketch.get_mins()) == 0

    # Ambiguous kmers skipped.
    sketch.add_sequence('AAGNCGG', True)
    assert len(sketch.get_mins()) == 0

    # Ambiguous kmers skipped.
    sketch.add_sequence('AATGNGCGG', True)
    assert len(sketch.get_mins()) == 2

    # Checking that right kmers were added: re-adding them changes nothing.
    for kmer in ('AATG', 'GCGG'):
        sketch.add_sequence(kmer, True)
    assert len(sketch.get_mins()) == 2  # (only 2 hashes should be there)
def test_intersection_1(track_abundance):
    """Check the (common hashes, combined size) pair given by intersection().

    Covers two sketches holding the same sequence, the same state after one
    of them receives that sequence again, and finally the state after each
    sketch receives a different extra sequence.
    """
    first = MinHash(20, 10, track_abundance=track_abundance)
    second = MinHash(20, 10, track_abundance=track_abundance)
    first.add_sequence('TGCCGCCCAGCA')
    second.add_sequence('TGCCGCCCAGCA')

    common = set(first.get_mins())
    combined_size = 3
    every_pairing = (
        (first, second),
        (second, second),
        (second, first),
        (first, first),
    )

    for left, right in every_pairing:
        shared, size = left.intersection(right)
        assert shared == common
        assert combined_size == size

    # add same sequence again
    second.add_sequence('TGCCGCCCAGCA')
    for left, right in every_pairing:
        shared, size = left.intersection(right)
        assert shared == common
        assert combined_size == size

    first.add_sequence('GTCCGCCCAGTGA')
    second.add_sequence('GTCCGCCCAGTGG')

    new_in_common = set(first.get_mins()) & set(second.get_mins())
    new_combined_size = 8
    for left, right in ((first, second), (second, first)):
        shared, size = left.intersection(right)
        assert shared == new_in_common
        assert size == new_combined_size

    # A sketch intersected with itself gives back all of its own hashes.
    for sketch in (first, second):
        shared, size = sketch.intersection(sketch)
        assert shared == set(sketch.get_mins())
def test_basic_dna_bad(track_abundance):
    """Adding the sequence 'ATGR' must raise ValueError."""
    sketch = MinHash(1, 4, track_abundance=track_abundance)

    with pytest.raises(ValueError):
        sketch.add_sequence('ATGR')
def test_short_sequence(track_abundance):
    """A sequence shorter than the k-mer size adds nothing and does not raise."""
    sketch = MinHash(20, 5, track_abundance=track_abundance)

    # 'GGGG' has 4 bases but the sketch was built with 5 as its second
    # argument; adding it should fail silently.
    sketch.add_sequence('GGGG')

    assert len(sketch.get_mins()) == 0
def test_mh_copy(track_abundance):
    """The result of __copy__() compares as 1.0 against the source sketch."""
    source = MinHash(20, 10, track_abundance=track_abundance)
    source.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA')

    duplicate = source.__copy__()

    assert duplicate.compare(source) == 1.0
def test_mh_len(track_abundance):
    """len() of the sketch is 20 both before and after adding a sequence."""
    sketch = MinHash(20, 10, track_abundance=track_abundance)
    assert len(sketch) == 20

    sketch.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA')
    assert len(sketch) == 20