def test_cluster_threshold(self):
    """Average gap between split threshold and Jaccard similarity is small.

    For each of ``n_tests`` random pairs of sets, sweep the threshold over
    0.01, 0.06, ..., 0.96 and record the first value at which a fresh
    ``Cluster`` holding both sets reports exactly two clusters.  The
    absolute difference between that threshold and the pair's Jaccard
    similarity is accumulated, and the mean over all pairs must not
    exceed 0.30.  A pair that never yields two clusters contributes
    nothing to the total.
    """
    n_tests = 50
    dim = 15
    error_sum = 0

    for _ in range(n_tests):
        # Draw a random pair and compute its true similarity.
        first, second = randset(), randset()
        jsim = jaccard_similarity(first, second)

        # Sweep thresholds (in percent) until the pair lands in two clusters.
        for percent in range(1, 100, 5):
            threshold = float(percent) / 100
            bandwidth = get_bandwidth(dim, threshold)
            num_bands = int(dim / bandwidth)

            # Width is trimmed to a whole number of bands.
            cluster = Cluster(width=num_bands * bandwidth,
                              bandwidth=bandwidth)
            for item in (first, second):
                cluster.add_item(item)

            if len(cluster.get_clusters()) != 2:
                continue
            error_sum += abs(jsim - threshold)
            break

    avg_error = float(error_sum) / n_tests
    self.assertLessEqual(avg_error, 0.30)
def test_signature_similarity(self):
    """Signature similarity should approximate the Jaccard similarity.

    The probability that two sets' signatures match at a given index is
    expected to equal the Jaccard similarity of the two sets, so the
    signature similarity reported by ``sigsim`` should track
    ``jaccard_similarity``.  The mean absolute difference over
    ``n_tests`` random pairs must not exceed ``expected_error``.
    """
    n_tests = 100
    # Single source of truth for the signature length; it is passed to
    # both the signer and the comparison so the two cannot drift apart.
    dim = 10 * 10
    expected_error = 1.0 / 10  # Expected error is O(1/sqrt(dim))
    mh = MinHashSignature(dim)
    err = 0.0

    # `range` (not `xrange`) so the test runs on Python 2 and Python 3.
    for _ in range(n_tests):
        # Create random sets and their signatures
        sets = (randset(), randset())
        sigs = [mh.get_signature(s) for s in sets]

        # Calculate true Jaccard similarity, and sim of signatures
        jsim = jaccard_similarity(*sets)
        ssim = sigsim(*sigs, dim=dim)

        # Accumulate error
        err += abs(jsim - ssim)

    # Over n_tests large, we should be within upper bound of expected error
    avg_err = err / n_tests
    self.assertGreaterEqual(
        expected_error,
        avg_err,
        msg="Accuracy test failed. (avg error: %f)" % avg_err)
def test_jaccard_nan():
    """Jaccard similarity of two empty collections should be NaN."""
    assert_true(np.isnan(jaccard_similarity([], [])))