Exemplo n.º 1
0
def test_jaccard_integers(data):
    """Check ``embed.jaccard`` against a directly computed Jaccard index.

    ``set1`` is a drawn, non-empty set of integers. ``set2`` is a random
    subset of ``set1`` (possibly empty) united with a second drawn set.
    The expected score is the size of the intersection divided by the size
    of the union, and must lie in ``[0, 1]``.

    Args:
        data: Object exposing ``draw(strategy)``; presumably Hypothesis'
            interactive data object (the decorator is not visible here).
    """
    # NOTE(review): ``average_size`` was deprecated in later Hypothesis
    # releases -- confirm against the pinned Hypothesis version.
    set1 = data.draw(st.sets(elements=st.integers(), average_size=100, min_size=1))
    overlap_size = data.draw(st.integers(min_value=0, max_value=len(set1)))

    # random.sample() needs a sequence: sampling straight from a set is
    # deprecated since Python 3.9 and raises TypeError from 3.11 on. Sorting
    # also gives the population a stable order. ``min_size=1`` guarantees
    # set1 is non-empty, so no empty-set fallback is required.
    set2 = set(random.sample(sorted(set1), overlap_size))
    set2 = set2.union(data.draw(st.sets(elements=st.integers(), average_size=50, min_size=1)))

    # Coerce to float *before* dividing so the ratio is never truncated by
    # integer division (the original multiplied by 1.0 after dividing).
    score = float(len(set1.intersection(set2))) / len(set1.union(set2))

    # Evaluate the function under test once; report and assert the same value.
    computed = embed.jaccard(set1, set2)
    note("{}, {}".format(computed, score))
    assert computed == score
    assert score <= 1
    assert score >= 0
Exemplo n.º 2
0
def test_pairs_list_similarity(context, label, data, metric):
    """Exercise ``embed.pairs_list_similarity`` for the given ``metric``.

    Checks performed:
      * a non-empty list compared with itself scores exactly 1;
      * the returned context and label echo the ones passed in;
      * the score lies in ``[0, 1]``;
      * for ``'jaccard'`` and ``'adapted-ktau'`` the score equals the value
        returned by the corresponding ``embed`` function.

    Args:
        context: Passed through to ``embed.pairs_list_similarity`` and
            expected back unchanged.
        label: Passed through and expected back unchanged.
        data: Object exposing ``draw(strategy)`` used to draw test values.
        metric: Forwarded as ``how=``; selects the similarity measure.
    """
    first = data.draw(st.lists(elements=st.integers(), unique=True, average_size=100))

    # Self-similarity: only meaningful when there is at least one element.
    if len(first) > 0:
        _ctx, (_lbl, self_score) = embed.pairs_list_similarity(context, label, first, first, how=metric)
        note("{} score of list with itself: {}".format(metric, self_score))
        note(first)
        assert self_score == 1

    # Second list = optional random sample of the first + freshly drawn
    # integers. When a sample is taken, the fresh integers are drawn strictly
    # above max(first) so they cannot collide with the sampled ones.
    overlap_size = data.draw(st.integers(min_value=0, max_value=len(first)))
    if not first or not overlap_size:
        shared, lower_bound = [], None
    else:
        shared = random.sample(first, overlap_size)
        lower_bound = max(first) + 1
    fresh = data.draw(st.lists(elements=st.integers(min_value=lower_bound), unique=True, average_size=50))
    second = shared + fresh

    result = embed.pairs_list_similarity(context, label, first, second, how=metric)
    ret_context, (ret_label, score) = result
    assert label == ret_label
    assert context == ret_context

    # The similarity must always be a value between 0 and 1.
    assert score <= 1
    assert score >= 0

    # Cross-check against the underlying metric implementation.
    if metric == 'adapted-ktau':
        assert score == embed.adaptedKendallTau(first, second)
    elif metric == 'jaccard':
        assert score == embed.jaccard(set(first), set(second))
Exemplo n.º 3
0
def test_akt_vs_jaccard(data):
    """Compare ``embed.adaptedKendallTau`` with ``embed.jaccard`` on related lists.

    ``list2`` is built from a shuffled prefix of ``list1`` (the overlap) plus
    freshly drawn integers. The test asserts that:
      * with a non-empty overlap, the adapted Kendall tau score is at least
        the Jaccard score; with no overlap it is at most the Jaccard score;
      * for a shuffled copy of ``list2`` compared with ``list2`` itself, the
        adapted Kendall tau score does not exceed the Jaccard score, which is
        1 for a non-empty list and 0 for an empty one.

    Args:
        data: Object exposing ``draw(strategy)`` used to draw test values.
    """
    list1 = data.draw(st.lists(elements=st.integers(), unique=True, average_size=100))

    # The top elements of list1 are the ones that overlap with list2, so
    # akt >= jac. The overlap covers at most 80% of list1's elements.
    overlap_size = data.draw(st.integers(min_value=0, max_value=int(len(list1)*.8)))

    if list1 and overlap_size:
        # Slicing copies, so shuffling list2 leaves list1's order untouched.
        list2 = list1[:overlap_size]
        random.shuffle(list2)
    else:
        list2 = []

    # Fresh elements are drawn strictly above max(list1) so they never
    # collide with list1; the bound is left open when list1 is empty.
    min_value = max(list1) + 1 if list1 else None

    list2 = list2 + data.draw(st.lists(elements=st.integers(min_value=min_value), unique=True, average_size=50))
    note((list1, list2, list1==list2))

    jac_score = embed.jaccard(set(list1), set(list2))
    akt_score = embed.adaptedKendallTau(list1, list2)
    if overlap_size > 0:
        assert akt_score >= jac_score
    else:
        assert akt_score <= jac_score

    # Shuffling the elements of a list must give an akt_score <= 1 and a
    # Jaccard score of 1. Shuffle a *copy*: the original aliased list2, so
    # both names referred to the same (shuffled) list and the comparison
    # never saw two different orderings.
    list3 = list(list2)
    random.shuffle(list3)
    jac_score = embed.jaccard(set(list3), set(list2))
    akt_score = embed.adaptedKendallTau(list3, list2)
    assert akt_score <= jac_score
    if list2:
        assert jac_score == 1
    else:
        assert jac_score == 0