예제 #1
0
    def test_cluster_threshold(self):
        """Average |Jaccard similarity - split threshold| should stay small.

        For each of ``n_tests`` random pairs of sets, sweep a grid of
        thresholds in increasing order and stop at the first one for which
        a ``Cluster`` holding both sets reports exactly two clusters.  The
        gap between that threshold and the pair's Jaccard similarity is
        accumulated, and the mean gap must not exceed 0.30.
        """
        n_tests = 50
        dim = 15
        # 0.01, 0.06, ..., 0.96 -- the same grid is reused for every pair.
        threshold_grid = [float(pct) / 100 for pct in range(1, 100, 5)]

        tot_err = 0
        for _ in range(n_tests):
            # Draw a random pair and measure its true similarity.
            first, second = randset(), randset()
            jsim = jaccard_similarity(first, second)

            for threshold in threshold_grid:
                bandwidth = get_bandwidth(dim, threshold)
                # Use a width that is a whole multiple of the bandwidth.
                num_bands = int(dim / bandwidth)
                cluster = Cluster(width=num_bands * bandwidth,
                                  bandwidth=bandwidth)
                cluster.add_item(first)
                cluster.add_item(second)

                if len(cluster.get_clusters()) != 2:
                    continue
                # First threshold yielding two clusters: record the gap and
                # move on to the next pair.  A pair that never yields two
                # clusters contributes nothing to the total.
                tot_err += abs(jsim - threshold)
                break

        avg_error = float(tot_err) / n_tests
        self.assertLessEqual(avg_error, 0.30)
예제 #2
0
    def test_signature_similarity(self):
        """Signature similarity should approximate Jaccard similarity.

        For ``n_tests`` random pairs of sets, compare the true Jaccard
        similarity of the sets with the similarity of their MinHash
        signatures (``sigsim``).  The mean absolute difference must not
        exceed ``expected_error``.
        """
        n_tests = 100
        # Single source of truth for the signature length; it is passed to
        # both MinHashSignature and sigsim so the two cannot drift apart.
        dim = 10 * 10
        expected_error = 1.0 / 10  # Expected error is O(1/sqrt(dim))
        mh = MinHashSignature(dim)
        err = 0.0

        # range (not xrange) so the test runs on both Python 2 and Python 3.
        for _ in range(n_tests):
            # Create random sets and their signatures
            sets = (randset(), randset())
            sigs = [mh.get_signature(s) for s in sets]

            # Calculate true Jaccard similarity, and sim of signatures
            jsim = jaccard_similarity(*sets)
            ssim = sigsim(*sigs, dim=dim)

            # Accumulate error
            err += abs(jsim - ssim)

        # Over n_tests large, we should be within upper bound of expected error
        avg_err = err / n_tests
        self.assertGreaterEqual(
            expected_error,
            avg_err,
            msg="Accuracy test failed. (avg error: %f)" % avg_err)
예제 #3
0
def test_jaccard_nan():
    """Jaccard similarity of two empty collections should be NaN."""
    empty_a, empty_b = [], []
    assert_true(np.isnan(jaccard_similarity(empty_a, empty_b)))