def test_mh_count_common(track_abundance):
    """Check that count_common is symmetric and reports the shared hashes.

    Two sketches are built with identical constructor arguments; one
    receives every second integer below 40, the other every fourth
    integer below 80.
    """
    evens = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        evens.add_hash(hashval)

    fours = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        fours.add_hash(hashval)

    # Only the multiples of 4 below 40 were added to both sketches (10 values).
    assert evens.count_common(fours) == 10
    assert fours.count_common(evens) == 10
def test_mh_count_common(track_abundance):
    """Check that count_common is symmetric and reports the shared hashes.

    Two sketches are built with identical constructor arguments; one
    receives every second integer below 40, the other every fourth
    integer below 80.
    """
    evens = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        evens.add_hash(hashval)

    fours = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        fours.add_hash(hashval)

    # Only the multiples of 4 below 40 were added to both sketches (10 values).
    assert evens.count_common(fours) == 10
    assert fours.count_common(evens) == 10
def test_abundance_count_common():
    """Check count_common between an abundance-tracking sketch and a plain one.

    The first sketch tracks abundance and sees 'AAAAA' twice; the second
    does not track abundance and sees 'AAAAA' and 'GGGGG' once each.
    """
    # Value the assertions below expect get_mins() to report after 'AAAAA'.
    aaaaa_hash = 2110480117637990133

    tracked = MinHash(20, 5, False, track_abundance=True)
    untracked = MinHash(20, 5, False, track_abundance=False)

    tracked.add_sequence('AAAAA')
    tracked.add_sequence('AAAAA')
    assert tracked.get_mins() == [aaaaa_hash]
    assert tracked.get_mins(with_abundance=True) == {aaaaa_hash: 2}

    untracked.add_sequence('AAAAA')
    untracked.add_sequence('GGGGG')
    assert tracked.count_common(untracked) == 1
    assert tracked.count_common(untracked) == untracked.count_common(tracked)

    # Without abundance tracking a plain list is expected, even though
    # with_abundance=True is requested.
    assert untracked.get_mins(with_abundance=True) == [
        aaaaa_hash,
        10798773792509008305,
    ]
def test_mh_asymmetric(track_abundance):
    """Check compare() on sketches built with different sizes.

    count_common is expected to work across the two sketches, while
    compare() is expected to raise TypeError until the larger sketch has
    been downsampled to the smaller one's size.
    """
    larger = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        larger.add_hash(hashval)

    # Built with a different size: 10 rather than 20.
    smaller = MinHash(10, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        smaller.add_hash(hashval)

    assert larger.count_common(smaller) == 10
    assert smaller.count_common(larger) == 10

    with pytest.raises(TypeError):
        larger.compare(smaller)

    # After downsampling to 10 the comparison is expected to succeed.
    matched = larger.downsample_n(10)
    assert matched.compare(smaller) == 0.5
    assert smaller.compare(matched) == 0.5
def test_mh_asymmetric(track_abundance):
    """Check compare() on sketches built with different sizes.

    count_common is expected to work across the two sketches, while
    compare() is expected to raise TypeError until the larger sketch has
    been downsampled to the smaller one's size.
    """
    larger = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        larger.add_hash(hashval)

    # Built with a different size: 10 rather than 20.
    smaller = MinHash(10, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        smaller.add_hash(hashval)

    assert larger.count_common(smaller) == 10
    assert smaller.count_common(larger) == 10

    with pytest.raises(TypeError):
        larger.compare(smaller)

    # After downsampling to 10 the comparison is expected to succeed.
    matched = larger.downsample_n(10)
    assert matched.compare(smaller) == 0.5
    assert smaller.compare(matched) == 0.5
def test_mh_jaccard_asymmetric_num(track_abundance):
    """Check jaccard() on sketches built with different num values.

    count_common is expected to work across the two sketches, while
    jaccard() is expected to raise TypeError until the larger sketch has
    been downsampled to the smaller one's num.
    """
    larger = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        larger.add_hash(hashval)

    # Built with a different size: 10 rather than 20.
    smaller = MinHash(10, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        smaller.add_hash(hashval)

    assert larger.count_common(smaller) == 10
    assert smaller.count_common(larger) == 10

    # 'jaccard' raises an error here because the two nums differ.
    with pytest.raises(TypeError):
        larger.jaccard(smaller)

    matched = larger.downsample_n(10)

    # CTB note: this used to be 'compare', is now 'jaccard'.
    assert matched.jaccard(smaller) == 0.5
    assert smaller.jaccard(matched) == 0.5
def test_abundance_simple_2():
    """Check that re-adding a sequence bumps its abundance, not the hash list.

    Also checks that count_common between two abundance-tracking sketches
    counts the shared hash once.
    """
    # Value the assertions below expect get_mins() to report after 'AAAAA'.
    aaaaa_hash = 2110480117637990133

    first = MinHash(20, 5, False, track_abundance=True)
    second = MinHash(20, 5, False, track_abundance=True)

    first.add_sequence('AAAAA')
    assert first.get_mins() == [aaaaa_hash]
    assert first.get_mins(with_abundance=True) == {aaaaa_hash: 1}

    # Second insertion of the same sequence: same hash, abundance of 2.
    first.add_sequence('AAAAA')
    assert first.get_mins() == [aaaaa_hash]
    assert first.get_mins(with_abundance=True) == {aaaaa_hash: 2}

    second.add_sequence('AAAAA')
    assert first.count_common(second) == 1
def test_abundance_simple_2():
    """Check that re-adding a sequence bumps its abundance, not the hash list.

    Also checks that count_common between two abundance-tracking sketches
    counts the shared hash once.
    """
    # Value the assertions below expect get_mins() to report after 'AAAAA'.
    aaaaa_hash = 2110480117637990133

    first = MinHash(20, 5, False, track_abundance=True)
    second = MinHash(20, 5, False, track_abundance=True)

    first.add_sequence('AAAAA')
    assert first.get_mins() == [aaaaa_hash]
    assert first.get_mins(with_abundance=True) == {aaaaa_hash: 1}

    # Second insertion of the same sequence: same hash, abundance of 2.
    first.add_sequence('AAAAA')
    assert first.get_mins() == [aaaaa_hash]
    assert first.get_mins(with_abundance=True) == {aaaaa_hash: 2}

    second.add_sequence('AAAAA')
    assert first.count_common(second) == 1
def test_mh_count_common_notmh(track_abundance):
    """Check that count_common raises TypeError for a non-MinHash argument."""
    sketch = MinHash(20, 5, track_abundance=track_abundance)
    not_a_minhash = set()

    with pytest.raises(TypeError):
        sketch.count_common(not_a_minhash)
def test_mh_count_common_diff_ksize(track_abundance):
    """Check that count_common raises ValueError when ksizes differ.

    The two sketches differ only in their second positional argument
    (5 versus 6).
    """
    ksize_five = MinHash(20, 5, track_abundance=track_abundance)
    ksize_six = MinHash(20, 6, track_abundance=track_abundance)

    with pytest.raises(ValueError):
        ksize_five.count_common(ksize_six)
def test_mh_count_common_diff_seed(track_abundance):
    """Check that count_common raises ValueError when seeds differ."""
    # NOTE(review): besides the seed, the third positional argument also
    # differs (False vs True), so the ValueError may not be caused by the
    # seed mismatch alone -- confirm this is intended.
    seed_one = MinHash(20, 5, False, track_abundance=track_abundance, seed=1)
    seed_two = MinHash(20, 5, True, track_abundance=track_abundance, seed=2)

    with pytest.raises(ValueError):
        seed_one.count_common(seed_two)
def test_mh_count_common_diff_maxhash(track_abundance):
    """Check that count_common raises ValueError when max_hash values differ."""
    # NOTE(review): besides max_hash, the third positional argument also
    # differs (False vs True), so the ValueError may not be caused by the
    # max_hash mismatch alone -- confirm this is intended.
    max_one = MinHash(0, 5, False, track_abundance=track_abundance, max_hash=1)
    max_two = MinHash(0, 5, True, track_abundance=track_abundance, max_hash=2)

    with pytest.raises(ValueError):
        max_one.count_common(max_two)
def test_mh_count_common_diff_ksize(track_abundance):
    """Check that count_common raises ValueError when ksizes differ.

    The two sketches differ only in their second positional argument
    (5 versus 6).
    """
    ksize_five = MinHash(20, 5, track_abundance=track_abundance)
    ksize_six = MinHash(20, 6, track_abundance=track_abundance)

    with pytest.raises(ValueError):
        ksize_five.count_common(ksize_six)
def test_mh_count_common_diff_seed(track_abundance):
    """Check that count_common raises ValueError when seeds differ."""
    # NOTE(review): besides the seed, the third positional argument also
    # differs (False vs True), so the ValueError may not be caused by the
    # seed mismatch alone -- confirm this is intended.
    seed_one = MinHash(20, 5, False, track_abundance=track_abundance, seed=1)
    seed_two = MinHash(20, 5, True, track_abundance=track_abundance, seed=2)

    with pytest.raises(ValueError):
        seed_one.count_common(seed_two)
def test_mh_count_common_diff_maxhash(track_abundance):
    """Check that count_common raises ValueError when max_hash values differ."""
    # NOTE(review): besides max_hash, the third positional argument also
    # differs (False vs True), so the ValueError may not be caused by the
    # max_hash mismatch alone -- confirm this is intended.
    max_one = MinHash(0, 5, False, track_abundance=track_abundance, max_hash=1)
    max_two = MinHash(0, 5, True, track_abundance=track_abundance, max_hash=2)

    with pytest.raises(ValueError):
        max_one.count_common(max_two)