def test_jaccard_integers(data):
    """Check ``embed.jaccard`` against a directly computed Jaccard index.

    Two sets of integers are built so that the second one contains a randomly
    sized sample of the first plus freshly drawn elements.  The expected score
    is ``|A & B| / |A | B|`` and must lie in ``[0, 1]``.

    ``data`` must expose ``draw`` (presumably Hypothesis' interactive data
    object; the decorator is not visible in this part of the file).
    """
    set1 = data.draw(
        st.sets(elements=st.integers(), average_size=100, min_size=1)
    )
    overlap_size = data.draw(st.integers(min_value=0, max_value=len(set1)))

    # random.sample() requires a sequence: passing a set was deprecated in
    # Python 3.9 and raises TypeError from 3.11 on.  Sampling 0 items from an
    # empty sequence yields [], so no separate empty-set branch is needed.
    set2 = set(random.sample(sorted(set1), overlap_size))
    set2 |= data.draw(
        st.sets(elements=st.integers(), average_size=50, min_size=1)
    )

    shared = set1 & set2
    combined = set1 | set2
    # Coerce to float *before* dividing so the ratio is never truncated by
    # integer division (the original multiplied by 1.0 after dividing).
    score = float(len(shared)) / len(combined)

    actual = embed.jaccard(set1, set2)
    note("{}, {}".format(actual, score))
    assert actual == score
    assert score <= 1
    assert score >= 0
def test_pairs_list_similarity(context, label, data, metric):
    """Exercise ``embed.pairs_list_similarity`` for the given ``metric``.

    Asserts that:
      * a non-empty list compared with itself scores exactly 1;
      * the returned context and label are the ones passed in;
      * the score lies in ``[0, 1]``;
      * for ``'jaccard'`` and ``'adapted-ktau'`` the score equals the value
        returned by the corresponding ``embed`` function.

    ``data`` must expose ``draw`` (presumably Hypothesis' interactive data
    object; the decorator is not visible in this part of the file).
    """
    list1 = data.draw(
        st.lists(elements=st.integers(), unique=True, average_size=100)
    )

    # Self-similarity: only meaningful when there is something to compare.
    if list1:
        self_result = embed.pairs_list_similarity(
            context, label, list1, list1, how=metric
        )
        self_score = self_result[1][1]
        note("{} score of list with itself: {}".format(metric, self_score))
        note(list1)
        assert self_score == 1

    # Build a second list sharing `overlap_size` elements with the first.
    overlap_size = data.draw(st.integers(min_value=0, max_value=len(list1)))
    has_overlap = bool(list1) and overlap_size > 0
    shared = random.sample(list1, overlap_size) if has_overlap else []
    # With an overlap, new elements are drawn above max(list1) so they cannot
    # duplicate anything sampled from list1; otherwise they are unbounded.
    lower_bound = max(list1) + 1 if has_overlap else None
    extras = data.draw(
        st.lists(
            elements=st.integers(min_value=lower_bound),
            unique=True,
            average_size=50,
        )
    )
    list2 = shared + extras

    ret_context, (ret_label, score) = embed.pairs_list_similarity(
        context, label, list1, list2, how=metric
    )
    assert label == ret_label
    assert context == ret_context
    assert score <= 1
    assert score >= 0

    # The wrapper must agree with the underlying metric implementation.
    if metric == 'jaccard':
        assert score == embed.jaccard(set(list1), set(list2))
    elif metric == 'adapted-ktau':
        assert score == embed.adaptedKendallTau(list1, list2)
def test_akt_vs_jaccard(data):
    """Compare ``embed.adaptedKendallTau`` with ``embed.jaccard`` scores.

    First part: ``list2`` starts with a shuffled prefix of ``list1`` (the
    overlap) followed by elements that do not occur in ``list1``.  With a
    non-empty overlap the test asserts ``akt >= jaccard``; with no overlap it
    asserts ``akt <= jaccard``.

    Second part: ``list2`` is compared with a shuffled copy of itself; the
    test asserts ``akt <= jaccard`` and that the Jaccard score is 1 for a
    non-empty list and 0 for an empty one.

    ``data`` must expose ``draw`` (presumably Hypothesis' interactive data
    object; the decorator is not visible in this part of the file).
    """
    list1 = data.draw(
        st.lists(elements=st.integers(), unique=True, average_size=100)
    )

    # The top elements of list1 are the ones that overlap with list2.
    # The overlap covers at most 80% of list1's elements.
    overlap_size = data.draw(
        st.integers(min_value=0, max_value=int(len(list1) * .8))
    )
    if list1 and overlap_size:
        list2 = list1[:overlap_size]  # slicing copies, list1 stays untouched
        random.shuffle(list2)
    else:
        list2 = []
    # Whenever list1 has elements, the extra items are drawn above max(list1)
    # so they can never collide with it; an empty list1 imposes no bound.
    min_value = max(list1) + 1 if list1 else None
    list2 = list2 + data.draw(
        st.lists(
            elements=st.integers(min_value=min_value),
            unique=True,
            average_size=50,
        )
    )
    note((list1, list2, list1 == list2))

    jac_score = embed.jaccard(set(list1), set(list2))
    akt_score = embed.adaptedKendallTau(list1, list2)
    if overlap_size > 0:
        assert akt_score >= jac_score
    else:
        assert akt_score <= jac_score

    # Check that shuffling the elements of a list gives an akt score <= 1 and
    # a Jaccard score of 1.  Shuffle a *copy*: random.shuffle works in place,
    # so aliasing list2 would reorder both names and the comparison would
    # degenerate to a list against itself.
    list3 = list(list2)
    random.shuffle(list3)
    jac_score = embed.jaccard(set(list3), set(list2))
    akt_score = embed.adaptedKendallTau(list3, list2)
    assert akt_score <= jac_score
    if list2:
        assert jac_score == 1
    else:
        assert jac_score == 0