Exemplo n.º 1
0
def test_parallel_get_pairwise_comparisons(neighbors_object_list, data, metric):
    """Check that the parallel pairwise comparison agrees with the serial one.

    ``embed.get_pairwise_comparisons`` and
    ``embed.parallel_get_pairwise_comparisons`` are run on the same neighbor
    data with the same ``n_neighbors`` and ``how``. Every serial result must
    have at least one parallel result with the same leading index pair, and
    each such parallel result must satisfy ``comparison_equality`` on the
    third element.

    Args:
        neighbors_object_list: objects exposing ``neighbors`` and
            ``n_neighbors`` attributes.
        data: object whose ``draw`` method picks ``n_neighbors``
            (presumably a Hypothesis ``st.data()`` value -- confirm against
            the test's decorator, which is not visible here).
        metric: forwarded to both functions as ``how``.
    """
    neighbors_list = []
    n_neighbors_list = []
    for neighbors_object in neighbors_object_list:
        neighbors_list.append(neighbors_object.neighbors)
        n_neighbors_list.append(neighbors_object.n_neighbors)

    n_neighbors = data.draw(st.integers(min_value=2, max_value=min(n_neighbors_list)))

    # set parameters so we don't hang on small examples
    num_comparisons = len(list(embed.knn_iterator(neighbors_list)))
    if num_comparisons < 2000:
        n_proc = 1
        chunksize = None
    else:
        n_proc = 2
        chunksize = 1000

    comparisons = embed.get_pairwise_comparisons(neighbors_list, n_neighbors=n_neighbors, how=metric)
    p_comparisons = embed.parallel_get_pairwise_comparisons(neighbors_list, n_neighbors=n_neighbors, how=metric, n_proc=n_proc, chunksize=chunksize)

    for comparison in comparisons:
        i1, i2 = comparison[0], comparison[1]
        matched = False
        for p_comparison in p_comparisons:
            if i1 == p_comparison[0] and i2 == p_comparison[1]:
                matched = True
                assert comparison_equality(comparison[2], p_comparison[2])
        # Without this check the test passes vacuously whenever the parallel
        # output is empty or uses different index pairs.
        assert matched, "no parallel comparison found for pair ({}, {})".format(i1, i2)
Exemplo n.º 2
0
def test_parallel_get_pairwise_comparisons_varying_k(neighbors_object_list, data, metric):
    """Check each varying-k parallel comparison against a direct comparison.

    Every entry returned by
    ``embed.parallel_get_pairwise_comparisons_varying_k`` carries two indices
    and a ``k`` in its first three positions. Its fourth element must satisfy
    ``comparison_equality`` with ``embed.compare_neighbors`` called on the
    first elements of the two indexed ``neighbors_list`` entries, using that
    ``k`` and the same ``how``.

    Args:
        neighbors_object_list: objects exposing ``neighbors`` and
            ``n_neighbors`` attributes.
        data: object whose ``draw`` method picks the neighbor bounds
            (presumably a Hypothesis ``st.data()`` value -- confirm against
            the test's decorator, which is not visible here).
        metric: forwarded as ``how``.
    """
    neighbors_list = [obj.neighbors for obj in neighbors_object_list]
    smallest_n = min(obj.n_neighbors for obj in neighbors_object_list)

    # Two draws, in this order: an upper bound first, then max_k beneath it.
    n_neighbors = data.draw(st.integers(min_value=2, max_value=smallest_n))
    max_k = data.draw(st.integers(min_value=2, max_value=n_neighbors))

    # Stay single-process on small inputs so the test does not hang.
    num_comparisons = sum(1 for _ in embed.knn_iterator(neighbors_list, max_k))
    n_proc, chunksize = (1, None) if num_comparisons < 2000 else (2, 1000)

    observed = embed.parallel_get_pairwise_comparisons_varying_k(
        neighbors_list, max_k, how=metric, n_proc=n_proc, chunksize=chunksize
    )

    for entry in observed:
        first, second, k = entry[:3]
        expected = embed.compare_neighbors(
            neighbors_list[first][0],
            neighbors_list[second][0],
            n_neighbors=k,
            how=metric,
        )
        assert comparison_equality(entry[3], expected)
Exemplo n.º 3
0
def test_knn_iterate(neighbors_object_list, data):
    """Check the number and basic shape of items yielded by ``embed.knn_iterator``.

    The expected item count is the sum, over every index pair ``a < b``, of
    the number of keys shared by the first elements of ``neighbors_list[a]``
    and ``neighbors_list[b]``. Each yielded item must have ordered indices and
    neighbor sequences equal to the ``[1:n_neighbors]`` slice of the
    corresponding stored entry.

    Args:
        neighbors_object_list: objects exposing ``neighbors`` and
            ``n_neighbors`` attributes.
        data: object whose ``draw`` method picks ``n_neighbors``
            (presumably a Hypothesis ``st.data()`` value -- confirm against
            the test's decorator, which is not visible here).
    """
    neighbors_list = [obj.neighbors for obj in neighbors_object_list]
    smallest_n = min(obj.n_neighbors for obj in neighbors_object_list)
    n_neighbors = data.draw(st.integers(min_value=2, max_value=smallest_n))

    yielded = list(embed.knn_iterator(neighbors_list, n_neighbors=n_neighbors))

    # Count the keys shared by each ordered pair of entries.
    expected_length = 0
    for a, b in comb(range(len(neighbors_list)), 2):
        if not a < b:
            continue
        (keyed_a, _), (keyed_b, _) = neighbors_list[a], neighbors_list[b]
        shared = set(keyed_a.keys()).intersection(keyed_b.keys())
        expected_length += len(shared)

    assert expected_length == len(yielded)

    # Each item is ((i1, i2), word, n1, n2).
    for (i1, i2), word, n1, n2 in yielded:
        assert i1 < i2
        note(neighbors_list[i1][0][word])
        note(n1)
        assert neighbors_list[i1][0][word][1:n_neighbors] == n1
        assert neighbors_list[i2][0][word][1:n_neighbors] == n2
Exemplo n.º 4
0
def test_aggregate_comparisons_stats(neighbors_object_list, data, metric):
    """Check ``embed.aggregate_comparison_stats`` against direct statistics.

    The aggregate is tested on two inputs: the output of
    ``embed.get_pairwise_comparisons`` (indexed by ``i, j``) and the output of
    ``embed.parallel_get_pairwise_comparisons_varying_k`` (indexed by
    ``i, j, k``). In both cases the mean and variance tables must be symmetric
    in ``i``/``j`` and must equal the values computed directly from each
    similarity sample with ``scipy.stats.describe`` and ``numpy.mean``.

    Args:
        neighbors_object_list: objects exposing ``neighbors`` and
            ``n_neighbors`` attributes.
        data: object whose ``draw`` method picks ``n_neighbors`` and ``max_k``
            (presumably a Hypothesis ``st.data()`` value -- confirm against
            the test's decorator, which is not visible here).
        metric: forwarded as ``how``.
    """
    # Imported locally: the root-namespace ``scipy.mean`` alias is deprecated,
    # and numpy is not known to be imported at module level in this file.
    import numpy as np

    neighbors_list = [obj.neighbors for obj in neighbors_object_list]
    n_neighbors_list = [obj.n_neighbors for obj in neighbors_object_list]

    n_neighbors = data.draw(st.integers(min_value=2, max_value=min(n_neighbors_list)))

    # test get_pairwise_comparisons output
    comparisons = embed.get_pairwise_comparisons(neighbors_list, n_neighbors=n_neighbors, how=metric)

    results, results_mean, results_variance = embed.aggregate_comparison_stats(comparisons)

    for i, j, (_labels, similarity) in comparisons:
        # check symmetry
        assert results_mean[i, j] == results_mean[j, i]
        assert results_variance[i, j] == results_variance[j, i]
        # check against statistics computed directly from the sample
        describe = scipy.stats.describe(similarity)
        assert describe == results[i, j]
        assert results_mean[i, j] == np.mean(similarity)
        assert describe.variance == results_variance[i, j]

    # test on parallel_get_pairwise_comparisons_varying_k output
    max_k = data.draw(st.integers(min_value=2, max_value=n_neighbors))

    # set parameters so we don't hang on small examples
    num_comparisons = len(list(embed.knn_iterator(neighbors_list, max_k)))
    if num_comparisons < 2000:
        n_proc = 1
        chunksize = None
    else:
        n_proc = 2
        chunksize = 1000

    comparisons = embed.parallel_get_pairwise_comparisons_varying_k(neighbors_list, max_k,
                                                                    n_proc=n_proc, chunksize=chunksize, how=metric)
    results, results_mean, results_variance = embed.aggregate_comparison_stats(comparisons, max_k=max_k)

    for i, j, k, (_labels, similarity) in comparisons:
        # every reported k must respect the requested bound
        assert k <= max_k
        # check symmetry
        assert results_mean[i, j, k] == results_mean[j, i, k]
        assert results_variance[i, j, k] == results_variance[j, i, k]
        # check against statistics computed directly from the sample
        describe = scipy.stats.describe(similarity)
        assert describe == results[i, j, k]
        assert results_mean[i, j, k] == np.mean(similarity)
        assert describe.variance == results_variance[i, j, k]
Exemplo n.º 5
0
def _save_then_read_comparisons(basefilename, data_path, comparisons, metadata, **save_kwargs):
    """Save comparisons, read them back, and always delete the files written.

    Args:
        basefilename: base name passed to ``embed.save_comparisons`` and
            ``embed.read_comparisons``.
        data_path: directory passed to both calls as ``data_path``.
        comparisons: comparisons to save.
        metadata: metadata to save alongside the comparisons.
        **save_kwargs: extra keyword arguments for ``embed.save_comparisons``
            (for example ``max_k``).

    Returns:
        Whatever ``embed.read_comparisons`` returns; the caller unpacks it as
        ``(comparisons, metadata)``.
    """
    written = [
        os.path.join(data_path, basefilename + ".comparisons"),
        os.path.join(data_path, basefilename + ".comparisons_metadata"),
    ]
    try:
        embed.save_comparisons(basefilename, comparisons=comparisons, metadata=metadata,
                               data_path=data_path, **save_kwargs)
        loaded = embed.read_comparisons(basefilename, data_path=data_path)
        # Both files are expected to exist under these names after a save.
        for path in written:
            assert os.path.exists(path), "expected file was not written: {}".format(path)
        return loaded
    finally:
        # Clean up even when saving or reading fails, so a failing example
        # does not leave files behind for later ones.
        for path in written:
            if os.path.exists(path):
                os.remove(path)


def test_save_and_read_comparisons(neighbors_object_list, data, metric):
    """Check that saved comparisons and metadata read back unchanged.

    The round trip is tested on two inputs: the output of
    ``embed.get_pairwise_comparisons`` and the output of
    ``embed.parallel_get_pairwise_comparisons_varying_k`` (saved with
    ``max_k``). The files are written under ``test_dir`` and removed after
    each round trip, whether or not it succeeds.

    Args:
        neighbors_object_list: objects exposing ``neighbors``,
            ``n_neighbors`` and ``metadata`` attributes.
        data: object whose ``draw`` method picks ``n_neighbors`` and ``max_k``
            (presumably a Hypothesis ``st.data()`` value -- confirm against
            the test's decorator, which is not visible here).
        metric: forwarded as ``how``.
    """
    basefilename = "test_comparisons"
    data_path = test_dir
    neighbors_list = []
    n_neighbors_list = []
    metadata_list = []
    for neighbors_object in neighbors_object_list:
        neighbors_list.append(neighbors_object.neighbors)
        n_neighbors_list.append(neighbors_object.n_neighbors)
        metadata_list.append(neighbors_object.metadata)

    n_neighbors = data.draw(st.integers(min_value=2, max_value=min(n_neighbors_list)))

    # test out return format of get_pairwise_comparisons
    comparisons = embed.get_pairwise_comparisons(neighbors_list, n_neighbors=n_neighbors, how=metric)
    read_comparisons, read_metadata = _save_then_read_comparisons(
        basefilename, data_path, comparisons, metadata_list)

    assert comparisons == read_comparisons
    assert metadata_list == read_metadata

    # test save/read from parallel_get_pairwise_comparisons_varying_k
    max_k = data.draw(st.integers(min_value=2, max_value=n_neighbors))

    # set parameters so we don't hang on small examples
    num_comparisons = len(list(embed.knn_iterator(neighbors_list, max_k)))
    if num_comparisons < 2000:
        n_proc = 1
        chunksize = None
    else:
        n_proc = 2
        chunksize = 1000

    comparisons = embed.parallel_get_pairwise_comparisons_varying_k(neighbors_list, max_k,
                                                                    n_proc=n_proc, chunksize=chunksize, how=metric)
    read_comparisons, read_metadata = _save_then_read_comparisons(
        basefilename, data_path, comparisons, metadata_list, max_k=max_k)

    assert comparisons == read_comparisons
    assert metadata_list == read_metadata