Пример #1
0
def test_cluster_threshold():
    """Check the mean gap between Jaccard similarity and the first
    threshold at which a two-set cluster reports two entries.

    For each random pair, thresholds 0.01, 0.06, ..., 0.96 are tried in
    increasing order with a fresh ``Cluster`` each time.  The first
    threshold for which ``get_sets()`` has length 2 contributes
    ``abs(jsim - threshold)`` to the total; a pair for which no threshold
    qualifies contributes nothing.  The mean over all trials must not
    exceed 0.20.
    """
    trials = 50
    width = 15
    max_mean_error = 0.20

    # Candidate thresholds are the same for every trial, so build them once.
    candidate_thresholds = [float(percent) / 100 for percent in range(1, 100, 5)]

    total_error = 0
    for _ in range(trials):
        # Draw a random pair and measure its exact similarity.
        first, second = randset(), randset()
        similarity = jaccard_sim(first, second)

        # Scan upwards until the cluster reports two entries.
        for cutoff in candidate_thresholds:
            grouping = Cluster(width, cutoff)
            grouping.add_set(first)
            grouping.add_set(second)
            if len(grouping.get_sets()) != 2:
                continue
            total_error += abs(similarity - cutoff)
            break

    mean_error = float(total_error) / trials
    assert mean_error <= max_mean_error
Пример #2
0
def test_cluster_threshold():
    """Average |similarity - threshold| over random pairs must stay small.

    Each trial draws two random sets, computes their Jaccard similarity,
    and then looks for the lowest threshold in 0.01, 0.06, ..., 0.96 at
    which a new ``Cluster`` holding both sets returns two entries from
    ``get_sets()``.  Trials where no such threshold exists add no error.
    The accumulated error divided by the number of trials must be at
    most 0.20.
    """
    n_tests = 50
    dim = 15
    expected_error = 0.20

    def first_qualifying_threshold(pair):
        # Return the lowest scanned threshold giving two entries, else None.
        for step in range(1, 100, 5):
            level = step / 100.0
            cluster = Cluster(dim, level)
            for member in pair:
                cluster.add_set(member)
            if len(cluster.get_sets()) == 2:
                return level
        return None

    tot_err = 0
    for _ in range(n_tests):
        pair = (randset(), randset())
        jsim = jaccard_sim(*pair)

        level = first_qualifying_threshold(pair)
        if level is not None:
            tot_err += abs(jsim - level)

    avg_err = float(tot_err) / n_tests
    assert avg_err <= expected_error
Пример #3
0
def test_signature_similarity():
    """Check that signature similarity approximates Jaccard similarity.

    Over 100 random set pairs, the mean absolute difference between
    ``jaccard_sim`` of the sets and ``sigsim`` of their signatures (as
    produced by ``MinHashSignature(dim).sign``) must not exceed
    ``1 / sqrt(dim)``.
    """
    dim = 100
    n_tests = 100
    tolerance = 1 / sqrt(dim)  # Expected error is O(1/sqrt(dim))
    signer = MinHashSignature(dim)
    total_error = 0.0

    for _ in range(n_tests):
        # Two random sets and their signatures.
        pair = (randset(), randset())
        signatures = map(signer.sign, pair)

        # Exact similarity versus the signature-based estimate.
        exact = jaccard_sim(*pair)
        estimate = sigsim(*signatures, dim=dim)

        total_error += abs(exact - estimate)

    # Averaged over many trials the error should sit within the tolerance.
    avg_err = total_error / n_tests
    assert avg_err <= tolerance, "Accuracy test failed. (avg error: %f)" % avg_err
Пример #4
0
def test_signature_similarity():
    """Signature-based similarity should stay close to Jaccard similarity.

    Runs 100 trials.  Each trial signs two random sets with a shared
    ``MinHashSignature(dim)`` and records the absolute difference between
    ``jaccard_sim`` of the sets and ``sigsim`` of the signatures.  The
    mean difference must be no greater than ``1 / sqrt(dim)``.
    """
    dim = 100
    n_tests = 100
    expected_error = 1 / sqrt(dim)  # Expected error is O(1/sqrt(dim))
    mh = MinHashSignature(dim)

    def trial_error():
        # One trial: absolute gap between exact and signature similarity.
        sets = (randset(), randset())
        sigs = map(mh.sign, sets)
        jsim = jaccard_sim(*sets)
        ssim = sigsim(*sigs, dim=dim)
        return abs(jsim - ssim)

    err = 0.0
    for _ in range(n_tests):
        err += trial_error()

    # With n_tests large, the mean error should fall under the bound.
    avg_err = err / n_tests
    assert expected_error >= avg_err, "Accuracy test failed. (avg error: %f)" % avg_err