def test_cluster_threshold():
    """Check that the separating threshold stays close to Jaccard similarity.

    For each of ``n_tests`` random pairs of sets, thresholds 0.01, 0.06, ...,
    0.96 are tried in increasing order. The first threshold for which a fresh
    ``Cluster`` holding both sets reports exactly two entries from
    ``get_sets()`` is compared with the pair's Jaccard similarity. The mean
    absolute difference over all pairs must not exceed ``expected_error``.
    """
    n_tests = 50
    dim = 15
    expected_error = 0.20

    # Thresholds to sweep, lowest first: 0.01, 0.06, ..., 0.96.
    candidate_thresholds = [float(pct) / 100 for pct in range(1, 100, 5)]

    tot_err = 0
    for _ in range(n_tests):
        # Draw a random pair and measure its true similarity.
        first, second = randset(), randset()
        jsim = jaccard_sim(first, second)

        # Walk the thresholds until the cluster reports two entries.
        for cutoff in candidate_thresholds:
            cluster = Cluster(dim, cutoff)
            cluster.add_set(first)
            cluster.add_set(second)
            if len(cluster.get_sets()) != 2:
                continue
            tot_err += abs(jsim - cutoff)
            break
        # NOTE: a pair that never yields two entries adds nothing to tot_err
        # but is still counted in the average below.

    avg_err = float(tot_err) / n_tests
    assert avg_err <= expected_error
def test_signature_similarity():
    """Check that signature similarity approximates Jaccard similarity.

    For ``n_tests`` random pairs of sets, the value of ``sigsim`` on the
    pair's MinHash signatures is compared with ``jaccard_sim`` on the sets
    themselves. The mean absolute difference must stay within
    ``1 / sqrt(dim)``.
    """
    dim = 100
    n_tests = 100
    # Expected error is O(1/sqrt(dim))
    expected_error = 1 / sqrt(dim)
    mh = MinHashSignature(dim)

    err = 0.0
    for _ in range(n_tests):
        # Random pair of sets and the signatures derived from them.
        pair = (randset(), randset())
        signatures = map(mh.sign, pair)

        # Gap between the true similarity and the signature-based estimate.
        err += abs(jaccard_sim(*pair) - sigsim(*signatures, dim=dim))

    # Averaged over many trials, the gap should stay within the bound.
    avg_err = err / n_tests
    assert avg_err <= expected_error, "Accuracy test failed. (avg error: %f)" % avg_err