Пример #1
0
def test_mh_similarity_downsample_errors(track_abundance):
    """Check that similarity() rejects mismatched sketches by default.

    Two sketches are built with different ``max_hash`` values (50 and 100).
    With the default ``downsample=False``, every comparison -- in either
    direction and with either ``ignore_abundance`` setting -- is expected
    to raise ``ValueError`` with a "mismatch in scaled" message.
    """
    small = MinHash(0, 20, max_hash=50, track_abundance=track_abundance)
    large = MinHash(0, 20, max_hash=100, track_abundance=track_abundance)

    small_abunds = {1: 5, 3: 3, 5: 2, 8: 2}
    large_abunds = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}
    if not track_abundance:
        # without abundance tracking only the hash values can be loaded
        small.add_many(small_abunds.keys())
        large.add_many(large_abunds.keys())
    else:
        small.set_abundances(small_abunds)
        large.set_abundances(large_abunds)

    expected_msg = 'mismatch in scaled; comparison fail'

    # both argument orders, each with and without abundance weighting
    for lhs, rhs in ((small, large), (large, small)):
        for ignore in (True, False):
            with pytest.raises(ValueError) as excinfo:
                lhs.similarity(rhs, ignore_abundance=ignore)
            assert expected_msg in str(excinfo.value)
Пример #2
0
def test_set_abundance():
    """set_abundances() must fail on a sketch built with track_abundance=False.

    The expected failure is a ``RuntimeError`` whose first argument tells the
    user to pass ``track_abundance=True`` at construction time.
    """
    sketch = MinHash(20, 10, track_abundance=False)
    abundances = {1: 3, 2: 4}

    with pytest.raises(RuntimeError) as excinfo:
        sketch.set_abundances(abundances)

    message = excinfo.value.args[0]
    assert "track_abundance=True when constructing" in message
Пример #3
0
def test_set_abundance():
    """Setting abundances without abundance tracking raises RuntimeError.

    The sketch is created with ``track_abundance=False``; the error text is
    checked for the hint about constructing with ``track_abundance=True``.
    """
    no_abund_mh = MinHash(20, 10, track_abundance=False)

    with pytest.raises(RuntimeError) as err:
        no_abund_mh.set_abundances({1: 3, 2: 4})

    first_arg = err.value.args[0]
    assert "track_abundance=True when constructing" in first_arg
Пример #4
0
def test_mh_similarity_downsample_true(track_abundance):
    """Check that similarity() is symmetric when downsample=True.

    The two sketches use different ``max_hash`` values (50 and 100). With
    ``downsample=True`` no error is expected, and ``sim(a, b)`` must equal
    ``sim(b, a)`` both with and without ``ignore_abundance``.
    """
    small_abunds = {1: 5, 3: 3, 5: 2, 8: 2}
    large_abunds = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}

    small = MinHash(0, 20, max_hash=50, track_abundance=track_abundance)
    large = MinHash(0, 20, max_hash=100, track_abundance=track_abundance)

    if not track_abundance:
        # without abundance tracking only the hash values can be loaded
        small.add_many(small_abunds.keys())
        large.add_many(large_abunds.keys())
    else:
        small.set_abundances(small_abunds)
        large.set_abundances(large_abunds)

    # downsample=True => no error; values should match in both directions
    for ignore in (True, False):
        forward = small.similarity(large, ignore_abundance=ignore,
                                   downsample=True)
        backward = large.similarity(small, ignore_abundance=ignore,
                                    downsample=True)
        assert forward == backward
Пример #5
0
def test_set_abundances_huge():
    """Load four million hashes, each with abundance 2, in a single call.

    There are no assertions: the test passes as long as set_abundances()
    completes without raising.
    """
    max_hash = 4000000
    sketch = MinHash(0, 10, track_abundance=True, max_hash=max_hash)

    # every hash in [0, max_hash) mapped to the same abundance
    uniform_abunds = dict.fromkeys(range(max_hash), 2)

    sketch.set_abundances(uniform_abunds)
Пример #6
0
def test_set_abundance_clear_3():
    """set_abundances(..., clear=False) keeps hashes that were already there."""
    sketch = MinHash(20, 5, False, track_abundance=True)

    # seed the sketch with one hash; add_hash records abundance 1
    sketch.add_hash(10)
    seeded = sketch.get_mins(with_abundance=True)
    assert seeded == {10: 1}

    # clear=False: the new entries are merged in next to the existing one
    sketch.set_abundances({20: 1, 30: 4}, clear=False)
    merged = sketch.get_mins(with_abundance=True)
    assert merged == {10: 1, 20: 1, 30: 4}
Пример #7
0
def test_set_abundance_clear_2():
    """set_abundances() without a clear argument replaces existing content.

    This pins the default to ``clear=True``: the hash added beforehand must
    be gone after the call.
    """
    sketch = MinHash(20, 5, False, track_abundance=True)

    sketch.add_hash(10)
    before = sketch.get_mins(with_abundance=True)
    assert before == {10: 1}

    replacement = {20: 2}
    sketch.set_abundances(replacement)
    after = sketch.get_mins(with_abundance=True)
    assert after == {20: 2}
Пример #8
0
def test_set_abundance_clear():
    """On an empty sketch, the ``clear`` flag of set_abundances() is a no-op.

    Two identical empty sketches receive the same abundances, one with
    ``clear=True`` and one with ``clear=False``; they must end up with the
    same hashes and the same abundances.
    """
    # on empty minhash, clear should have no effect
    a = MinHash(20, 5, False, track_abundance=True)
    b = MinHash(20, 5, False, track_abundance=True)

    a.set_abundances({1: 3, 2: 4}, clear=True)
    b.set_abundances({1: 3, 2: 4}, clear=False)

    # same hash values ...
    assert a.get_mins() == b.get_mins()
    # ... and same abundances; comparing hashes alone would not notice a
    # difference in the stored counts, which is what this test is about
    assert a.get_mins(with_abundance=True) == b.get_mins(with_abundance=True)
Пример #9
0
def test_set_abundance_clear_4():
    """Repeated set_abundances(..., clear=False) accumulates abundances.

    Setting the abundance of a hash that is already present should add the
    new count to the existing one.
    """
    sketch = MinHash(20, 5, False, track_abundance=True)

    # (abundances to apply, expected sketch content afterwards); the hashes
    # are deliberately given out of order -- the original test notes that
    # the call should also sort them
    steps = (
        ({20: 2, 10: 1}, {10: 1, 20: 2}),
        ({20: 1, 10: 2}, {10: 3, 20: 3}),
    )

    for new_abunds, expected in steps:
        sketch.set_abundances(new_abunds, clear=False)
        assert sketch.get_mins(with_abundance=True) == expected
Пример #10
0
def test_mh_angular_similarity_2():
    """Pin the angular similarity value for a second non-trivial case.

    Both sketches share ``max_hash=100`` and a heavily weighted hash (70).
    The abundance-weighted similarity is checked to four decimals; with
    ``ignore_abundance=True`` the result must equal the Jaccard value 5/7.
    """
    settings = dict(max_hash=100, track_abundance=True)
    first = MinHash(0, 20, **settings)
    second = MinHash(0, 20, **settings)

    first.set_abundances({1: 5, 3: 3, 5: 2, 8: 2, 70: 70})
    second.set_abundances({1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1, 70: 70})

    weighted = first.similarity(second)
    assert round(weighted, 4) == 0.9728

    # ignore_abundance => jaccard
    unweighted = first.similarity(second, ignore_abundance=True)
    assert unweighted == 5.0 / 7.0
Пример #11
0
def test_mh_angular_similarity():
    """Pin the angular similarity for a worked cosine-similarity example.

    The example is taken from:
    https://www.sciencedirect.com/topics/computer-science/cosine-similarity

    Angular similarity is 1 - 2*(acos(sim) / pi) when elements are always
    positive (https://en.wikipedia.org/wiki/Cosine_similarity).
    """
    first = MinHash(0, 20, max_hash=50, track_abundance=True)
    second = MinHash(0, 20, max_hash=50, track_abundance=True)
    first.set_abundances({1: 5, 3: 3, 5: 2, 8: 2})
    second.set_abundances({1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1})

    # reference cosine similarity, converted to angular similarity
    expected = round(1 - 2 * math.acos(0.9356) / math.pi, 4)
    assert expected == 0.7703

    observed = round(first.similarity(second), 4)
    assert observed == expected
Пример #12
0
def test_mh_similarity_downsample_angular_value():
    """Pin similarity values when downsample=True reconciles max_hash.

    One sketch uses ``max_hash=50`` and the other ``max_hash=100``; both are
    given a hash of 70, which the test expects downsampling to drop.
    """
    small = MinHash(0, 20, max_hash=50, track_abundance=True)
    large = MinHash(0, 20, max_hash=100, track_abundance=True)

    small.set_abundances({1: 5, 3: 3, 5: 2, 8: 2, 70: 70})
    large.set_abundances({1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1, 70: 70})

    # the hash=70 will be truncated by downsampling
    angular = small.similarity(large, downsample=True)
    assert round(angular, 4) == 0.7703

    # with ignore_abundance, will be equal to jaccard
    unweighted = small.similarity(large, downsample=True,
                                  ignore_abundance=True)
    assert unweighted == 4.0 / 6.0
Пример #13
0
def test_set_abundance_num():
    """set_abundances() round-trips through get_mins(with_abundance=True).

    The sketch is constructed as ``MinHash(2, 10, track_abundance=True)`` and
    loaded with two hashes; both must come back with their abundances.
    """
    expected = {1: 3, 2: 4}
    sketch = MinHash(2, 10, track_abundance=True)

    sketch.set_abundances({1: 3, 2: 4})

    stored = sketch.get_mins(with_abundance=True)
    assert stored == expected