def test_mh_asymmetric_merge(track_abundance):
    """Check that merging sketches of different sizes keeps the receiver's size.

    NOTE(review): another test with this exact name is defined further down
    in this file; if both live in the same module, the later definition
    replaces this one and this version is never collected -- confirm which
    of the two is intended.
    """
    # larger sketch (size 20), fed the even hashes 0, 2, ..., 38
    big = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        big.add_hash(hashval)

    # smaller sketch (size 10), fed every fourth hash 0, 4, ..., 76
    small = MinHash(10, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        small.add_hash(hashval)

    big_then_small = big.merge(small)
    small_then_big = small.merge(big)

    assert len(big) == 20
    assert len(small) == 10
    assert len(big_then_small) == len(big)
    assert len(small_then_big) == len(small)

    # sketches of different sizes cannot be compared until one is downsampled
    with pytest.raises(TypeError):
        small_then_big.compare(big)

    big_downsampled = big.downsample_n(small_then_big.num)
    print(big_downsampled.get_mins())
    print(small_then_big.get_mins())
    assert small_then_big.compare(big_downsampled) == 1.0

    merged_downsampled = big_then_small.downsample_n(small.num)
    assert merged_downsampled.compare(small) == 1.0
def test_mh_inplace_concat_asymmetric(track_abundance):
    """Check in-place concatenation (``+=``) of sketches of different sizes.

    Each sketch is copied and the other one is added into the copy; the
    copy is expected to keep the length of the sketch it was copied from.
    """
    a = MinHash(20, 10, track_abundance=track_abundance)
    for i in range(0, 40, 2):
        a.add_hash(i)

    # different size: 10
    b = MinHash(10, 10, track_abundance=track_abundance)
    for i in range(0, 80, 4):
        b.add_hash(i)

    c = a.__copy__()
    c += b
    d = b.__copy__()
    d += a

    assert len(a) == 20
    assert len(b) == 10
    assert len(c) == len(a)
    assert len(d) == len(b)

    # Comparing sketches of different sizes must fail.  pytest.raises is used
    # (rather than a bare try/except) so the test fails if no exception is
    # raised at all, instead of passing silently.
    with pytest.raises(TypeError) as exc_info:
        d.compare(a)
    assert 'must have same num' in str(exc_info.value)

    a = a.downsample_n(d.num)
    assert d.compare(a) == 1.0  # see: d += a, above.

    c = c.downsample_n(b.num)
    assert c.compare(b) == 0.5
def test_mh_asymmetric(track_abundance):
    """Check count_common and compare on two sketches of different sizes."""
    # larger sketch (size 20), fed the even hashes 0, 2, ..., 38
    big = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        big.add_hash(hashval)

    # smaller sketch (size 10), fed every fourth hash 0, 4, ..., 76
    small = MinHash(10, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        small.add_hash(hashval)

    # counting shared hashes works in both directions despite the size gap
    assert big.count_common(small) == 10
    assert small.count_common(big) == 10

    # compare refuses mismatched sizes
    with pytest.raises(TypeError):
        big.compare(small)

    # once downsampled to the same size, compare gives the same answer
    # in both directions
    big_downsampled = big.downsample_n(10)
    assert big_downsampled.compare(small) == 0.5
    assert small.compare(big_downsampled) == 0.5
def test_mh_jaccard_asymmetric_num(track_abundance):
    """Check count_common and jaccard on two sketches with different num."""
    # larger sketch (size 20), fed the even hashes 0, 2, ..., 38
    big = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        big.add_hash(hashval)

    # smaller sketch (size 10), fed every fourth hash 0, 4, ..., 76
    small = MinHash(10, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        small.add_hash(hashval)

    assert big.count_common(small) == 10
    assert small.count_common(big) == 10

    # 'jaccard' raises an error here because the two nums differ
    with pytest.raises(TypeError):
        big.jaccard(small)

    big_downsampled = big.downsample_n(10)

    # CTB note: this used to be 'compare', is now 'jaccard'
    assert big_downsampled.jaccard(small) == 0.5
    assert small.jaccard(big_downsampled) == 0.5
def test_mh_asymmetric_merge(track_abundance):
    """Check merge results for two sketches of different sizes.

    The merged sketch is expected to keep the length of the sketch that
    merge was called on; similarity after downsampling is checked with
    different expected values depending on ``track_abundance``.
    """
    # larger sketch (size 20), fed the even hashes 0, 2, ..., 38
    big = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        big.add_hash(hashval)

    # smaller sketch (size 10), fed every fourth hash 0, 4, ..., 76
    small = MinHash(10, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        small.add_hash(hashval)

    big_then_small = big.merge(small)
    small_then_big = small.merge(big)

    assert len(big) == 20
    assert len(small) == 10
    assert len(big_then_small) == len(big)
    assert len(small_then_big) == len(small)

    # jaccard cannot be used on different nums without downsampling
    with pytest.raises(TypeError):
        small_then_big.jaccard(big)

    big_downsampled = big.downsample_n(small_then_big.num)
    print(big_downsampled.get_mins())
    print(small_then_big.get_mins())

    # both abundance modes compare the rounded value; only the target differs
    expected_similarity = 0.91 if track_abundance else 1.0
    assert round(small_then_big.similarity(big_downsampled), 3) == expected_similarity

    merged_downsampled = big_then_small.downsample_n(small.num)
    if not track_abundance:
        # without abundance the match is checked exactly, not rounded
        assert merged_downsampled.similarity(small) == 1.0
    else:
        assert round(merged_downsampled.similarity(small), 3) == 0.91
def test_mh_downsample_n_error(track_abundance):
    """Check that downsample_n(30) on a MinHash(20, 10) raises ValueError."""
    sketch = MinHash(20, 10, track_abundance=track_abundance)

    # the requested size (30) is larger than the size the sketch was built with
    with pytest.raises(ValueError):
        sketch.downsample_n(30)