def _min_hash_values_for_w_shingles(word_generator):
    """
    Shingle ``word_generator`` as w-shingles and min-hash the result.

    Every shingle yielded by ``shingle_generator`` is printed and passed to
    ``run``; only the value returned for the last shingle is kept, followed
    by a short printed summary of that value.

    :param word_generator: input handed straight to ``shingle_generator``
    :return: whatever ``run`` returned for the last shingle yielded
    """
    min_values = None
    for shingle, _original_document in shingle_generator(word_generator, type=ShingleType.W_SHINGLES):
        print(shingle)
        min_values = run(shingle)

    # Fail with a readable message instead of "object of type 'NoneType' has no len()".
    assert min_values is not None, "shingle_generator yielded no w-shingles"

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("number of min_hash values -> %s" % str(len(min_values)))
    print(min_values)
    print("")  # blank separator line; a bare print() would show "()" on Python 2
    return min_values


def test_similarity_of_two_sets_using_w_shingles():
    # End-to-end check: shingle two word streams, min-hash each of them and
    # compare the resulting value sets with jaccard_similarity.
    print(".....Testing w-shingles (shingling, minhash & calc jaccard similarity)\n")

    min_values_list_w_shingles = _min_hash_values_for_w_shingles(faux_generator_string_words())
    min_values_list_w_shingles_2 = _min_hash_values_for_w_shingles(faux_generator_string_words_2())

    # calculate jaccard similarity - should be approx 44% similar
    similarity_ratio = jaccard_similarity(set(min_values_list_w_shingles), set(min_values_list_w_shingles_2))

    print("Asserting jaccard similarity should be ~44%\n")
    # NOTE(review): this is only a lower bound; a ratio well above 44% also passes.
    assert similarity_ratio >= .44, "jaccard similarity was %s, expected >= 0.44" % similarity_ratio
def test_sets_exact_match_returns_1():
    # Identical single-member sets must score exactly 1.0.

    # arrange
    left = {"abcdef"}
    right = {"abcdef"}

    # act + assert
    nt.eq_(jaccard_similarity(left, right), 1.0)
def test_both_sets_empty():
    # Two empty sets are expected to score 0.

    # arrange
    empty_left = set()
    empty_right = set()

    # act
    score = jaccard_similarity(empty_left, empty_right)

    # assert
    nt.eq_(score, 0)
def test_sets_not_similar_returns_0():
    # Sets with no member in common must score 0.0.

    # arrange
    left = {"abcdef"}
    right = {"test_set_not_similar"}

    # act + assert
    nt.eq_(jaccard_similarity(left, right), 0.0)
def test_sets_at_least_50_percent_similar():
    # One shared member out of two distinct members overall should reach
    # a score of at least 0.5.

    # arrange
    smaller = {"abcdef"}
    larger = {"abcd", "abcdef"}

    # act
    score = jaccard_similarity(smaller, larger)

    # assert
    assert score >= 0.5
def test_both_sets_empty():
    # jaccard_similarity of two empty sets is expected to be 0.
    # NOTE(review): a test with this exact name and body also appears
    # earlier in this source; if both live in the same module, this later
    # definition shadows the earlier one - confirm and rename or remove one.
    first, second = set(), set()
    nt.eq_(jaccard_similarity(first, second), 0)
def _calculate_similarity_score(self, document_1, document_2):
    """
    Compute the Jaccard similarity between the shingle sets of two documents.

    :param document_1: first document; must provide ``get_shingles_as_set()``
    :param document_2: second document; must provide ``get_shingles_as_set()``
    :return: 0.0 when either document is falsy (e.g. ``None``), otherwise the
             value returned by ``jaccard_similarity`` for the two shingle sets
    """
    # Guard clause: nothing to compare unless both documents are present.
    if not (document_1 and document_2):
        return 0.0

    return jaccard_similarity(
        document_1.get_shingles_as_set(),
        document_2.get_shingles_as_set(),
    )