Example #1
def test_init_random():
    current_graph = utils.make_heap(data.shape[0], n_neighbors)
    pynndescent_.init_random(
        n_neighbors,
        data,
        current_graph,
        dist,
        dist_args,
        new_rng_state(),
        seed_per_row=True,
    )
    parallel = joblib.Parallel(n_jobs=2, prefer="threads")
    current_graph_threaded = utils.make_heap(data.shape[0], n_neighbors)
    threaded.init_random(
        current_graph_threaded,
        data,
        dist,
        dist_args,
        n_neighbors,
        chunk_size=chunk_size,
        rng_state=new_rng_state(),
        parallel=parallel,
        seed_per_row=True,
    )

    assert_allclose(current_graph_threaded, current_graph)
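
This test, like the others below, leans on module-level fixtures that the snippet does not show. A minimal sketch of such a preamble follows; every name and value in it is an assumption for illustration, not the actual test file:

import joblib
import numpy as np
from numpy.testing import assert_allclose

from pynndescent import distances, pynndescent_, threaded, utils

# Hypothetical fixtures (guessed for illustration): a dense dataset, a
# numba-compatible distance function, and a fixed RNG state so the serial
# and threaded runs see identical randomness.
np.random.seed(42)
data = np.random.rand(100, 128).astype(np.float32)
n_neighbors = 25
max_candidates = 10
chunk_size = data.shape[0] // 8
dist = distances.euclidean
dist_args = ()

def new_rng_state():
    return np.array([42, 123, 4561], dtype=np.int64)
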
Example #2
def initialize_heaps(data,
                     n_neighbors,
                     leaf_array,
                     dist=dist.euclidean,
                     dist_args=()):
    graph_heap = make_heap(data.shape[0], 10)
    search_heap = make_heap(data.shape[0], n_neighbors * 2)
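    # sentinel pair: fixes the set's element type under numba and can never match a real index pair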
    tried = set([(-1, -1)])
    for n in range(leaf_array.shape[0]):
        for i in range(leaf_array.shape[1]):
            if leaf_array[n, i] < 0:
                break
            for j in range(i + 1, leaf_array.shape[1]):
                if leaf_array[n, j] < 0:
                    break
                if (leaf_array[n, i], leaf_array[n, j]) in tried:
                    continue

                d = dist(data[leaf_array[n, i]], data[leaf_array[n, j]],
                         *dist_args)
                unchecked_heap_push(graph_heap, leaf_array[n, i], d,
                                    leaf_array[n, j], 1)
                unchecked_heap_push(graph_heap, leaf_array[n, j], d,
                                    leaf_array[n, i], 1)
                unchecked_heap_push(search_heap, leaf_array[n, i], d,
                                    leaf_array[n, j], 1)
                unchecked_heap_push(search_heap, leaf_array[n, j], d,
                                    leaf_array[n, i], 1)
                tried.add((leaf_array[n, i], leaf_array[n, j]))
                tried.add((leaf_array[n, j], leaf_array[n, i]))

    return graph_heap, search_heap
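
For orientation, this is roughly how the heap returned by make_heap behaves under the older array-based API most of these snippets use; a sketch under that assumption, not library documentation:

import numpy as np
from pynndescent import utils

# A heap holds one fixed-size neighbor list per row: indices, distances, flags.
heap = utils.make_heap(5, 3)          # 5 rows, 3 neighbor slots each
utils.heap_push(heap, 0, 0.25, 2, 1)  # row 0: point 2 at distance 0.25, flagged "new"
utils.heap_push(heap, 0, 0.10, 4, 1)  # once full, closer points displace the worst entry
nbr_indices, nbr_dists = utils.deheap_sort(heap)  # per-row neighbors sorted by distance
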
Example #3
def test_new_build_candidates():
    np.random.seed(42)
    N = 100
    D = 128
    chunk_size = N // 8
    n_neighbors = 25
    data = np.random.rand(N, D).astype(np.float32)
    n_vertices = data.shape[0]

    current_graph = utils.make_heap(data.shape[0], n_neighbors)
    pynndescent_.init_random(
        n_neighbors,
        data,
        current_graph,
        dist,
        dist_args,
        new_rng_state(),
        seed_per_row=True,
    )
    new_candidate_neighbors, old_candidate_neighbors = utils.new_build_candidates(
        current_graph,
        n_vertices,
        n_neighbors,
        max_candidates,
        rng_state=new_rng_state(),
        seed_per_row=True,
    )

    current_graph = utils.make_heap(data.shape[0], n_neighbors)
    pynndescent_.init_random(
        n_neighbors,
        data,
        current_graph,
        dist,
        dist_args,
        new_rng_state(),
        seed_per_row=True,
    )
    parallel = joblib.Parallel(n_jobs=2, prefer="threads")
    (
        new_candidate_neighbors_threaded,
        old_candidate_neighbors_threaded,
    ) = threaded.new_build_candidates(
        current_graph,
        n_vertices,
        n_neighbors,
        max_candidates,
        chunk_size=chunk_size,
        rng_state=new_rng_state(),
        parallel=parallel,
        seed_per_row=True,
    )

    assert_allclose(new_candidate_neighbors_threaded, new_candidate_neighbors)
    assert_allclose(old_candidate_neighbors_threaded, old_candidate_neighbors)
Example #4
def search(
    query_inds,
    query_indptr,
    query_data,
    k,
    inds,
    indptr,
    data,
    forest,
    search_indptr,
    search_indices,
    epsilon,
    n_neighbors,
    tried,
    sparse_dist,
    rng_state,
):

    n_query_points = query_indptr.shape[0] - 1

    result = make_heap(n_query_points, k)
    for i in range(n_query_points):
        tried[:] = 0
        current_query_inds = query_inds[query_indptr[i] : query_indptr[i + 1]]
        current_query_data = query_data[query_indptr[i] : query_indptr[i + 1]]

        heap_priorities, heap_indices = search_init(
            current_query_inds,
            current_query_data,
            k,
            inds,
            indptr,
            data,
            forest,
            n_neighbors,
            tried,
            sparse_dist,
            rng_state,
        )
        heap_priorities, heap_indices = search_from_init(
            current_query_inds,
            current_query_data,
            inds,
            indptr,
            data,
            search_indptr,
            search_indices,
            heap_priorities,
            heap_indices,
            epsilon,
            tried,
            sparse_dist,
        )

        result[0][i] = heap_indices
        result[1][i] = heap_priorities

    return result
Example #5
def initialise_search(forest, data, query_points, n_neighbors,
                      init_from_random, init_from_tree, rng_state):
    results = make_heap(query_points.shape[0], n_neighbors)
    init_from_random(n_neighbors, data, query_points, results, rng_state)
    if forest is not None:
        for tree in forest:
            init_from_tree(tree, data, query_points, results, rng_state)

    return results
Example #6
def nn_descent(
    inds,
    indptr,
    data,
    n_neighbors,
    rng_state,
    max_candidates=50,
    dist=sparse_euclidean,
    n_iters=10,
    delta=0.001,
    rp_tree_init=True,
    leaf_array=None,
    low_memory=False,
    verbose=False,
    seed_per_row=False,
):

    n_samples = indptr.shape[0] - 1
    current_graph = make_heap(n_samples, n_neighbors)

    if rp_tree_init:
        init_rp_tree(inds, indptr, data, dist, current_graph, leaf_array)

    init_random(n_neighbors, inds, indptr, data, current_graph, dist, rng_state)

    if low_memory:
        nn_descent_internal_low_memory_parallel(
            current_graph,
            inds,
            indptr,
            data,
            n_neighbors,
            rng_state,
            max_candidates=max_candidates,
            dist=dist,
            n_iters=n_iters,
            delta=delta,
            verbose=verbose,
            seed_per_row=seed_per_row,
        )
    else:
        nn_descent_internal_high_memory_parallel(
            current_graph,
            inds,
            indptr,
            data,
            n_neighbors,
            rng_state,
            max_candidates=max_candidates,
            dist=dist,
            n_iters=n_iters,
            delta=delta,
            verbose=verbose,
            seed_per_row=seed_per_row,
        )

    return deheap_sort(current_graph)
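
The (inds, indptr, data) triple taken by the sparse variants is a CSR matrix unpacked into its components. A small illustration; the scipy usage is an assumption of this sketch, not shown in the snippet:

import numpy as np
from scipy.sparse import csr_matrix

X = csr_matrix(np.random.rand(100, 128).astype(np.float32))
inds, indptr, data = X.indices, X.indptr, X.data

# the slicing pattern used throughout: nonzero columns and values of row i
i = 3
row_inds = inds[indptr[i]:indptr[i + 1]]
row_data = data[indptr[i]:indptr[i + 1]]
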
Example #7
def test_mark_candidate_results():

    np.random.seed(42)
    N = 100
    D = 128
    chunk_size = N // 8
    n_neighbors = 25
    data = np.random.rand(N, D).astype(np.float32)
    n_vertices = data.shape[0]

    current_graph = utils.make_heap(data.shape[0], n_neighbors)
    pynndescent_.init_random(
        n_neighbors,
        data,
        current_graph,
        dist,
        new_rng_state(),
        seed_per_row=True,
    )
    pynndescent_.nn_descent_internal_low_memory_parallel(current_graph,
                                                         data,
                                                         n_neighbors,
                                                         new_rng_state(),
                                                         n_iters=2,
                                                         seed_per_row=True)
    current_graph_threaded = utils.Heap(
        current_graph[0].copy(),
        current_graph[1].copy(),
        current_graph[2].copy(),
    )
    new_candidate_neighbors, old_candidate_neighbors = utils.new_build_candidates(
        current_graph,
        n_vertices,
        n_neighbors,
        max_candidates,
        rng_state=new_rng_state(),
        seed_per_row=True,
    )

    parallel = joblib.Parallel(n_jobs=2, prefer="threads")
    (
        new_candidate_neighbors_threaded,
        old_candidate_neighbors_threaded,
    ) = threaded.new_build_candidates(
        current_graph_threaded,
        n_vertices,
        n_neighbors,
        max_candidates,
        chunk_size=chunk_size,
        rng_state=new_rng_state(),
        parallel=parallel,
        seed_per_row=True,
    )

    assert_allclose(current_graph_threaded, current_graph)
Example #8
def test_init_rp_tree():

    # Use more data than the other tests; otherwise init_rp_tree has nothing to do
    np.random.seed(42)
    N = 100
    D = 128
    chunk_size = N // 8
    n_neighbors = 25
    data = np.random.rand(N, D).astype(np.float32)

    rng_state = new_rng_state()
    random_state = check_random_state(42)
    current_graph = utils.make_heap(data.shape[0], n_neighbors)
    _rp_forest = make_forest(
        data,
        n_neighbors,
        n_trees=8,
        leaf_size=None,
        rng_state=rng_state,
        random_state=random_state,
    )
    leaf_array = rptree_leaf_array(_rp_forest)
    pynndescent_.init_rp_tree(data, dist, current_graph, leaf_array)

    rng_state = new_rng_state()
    random_state = check_random_state(42)
    current_graph_threaded = utils.make_heap(data.shape[0], n_neighbors)
    _rp_forest = make_forest(
        data,
        n_neighbors,
        n_trees=8,
        leaf_size=None,
        rng_state=rng_state,
        random_state=random_state,
    )
    leaf_array = rptree_leaf_array(_rp_forest)
    parallel = joblib.Parallel(n_jobs=2, prefer="threads")
    threaded.init_rp_tree(data, dist, current_graph_threaded, leaf_array,
                          chunk_size, parallel)

    assert_allclose(current_graph_threaded, current_graph)
Example #9
def init_current_graph(
    data, dist, dist_args, n_neighbors, rng_state, seed_per_row=False
):
    current_graph = make_heap(data.shape[0], n_neighbors)
    for i in range(data.shape[0]):
        if seed_per_row:
            seed(rng_state, i)
        indices = rejection_sample(n_neighbors, data.shape[0], rng_state)
        for j in range(indices.shape[0]):
            d = dist(data[i], data[indices[j]], *dist_args)
            heap_push(current_graph, i, d, indices[j], 1)
            heap_push(current_graph, indices[j], d, i, 1)
    return current_graph
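
A hypothetical call, reusing the fixtures sketched after Example #1 (all names there are assumptions):

# each row of the returned heap holds n_neighbors randomly sampled candidates
graph = init_current_graph(data, dist, dist_args, n_neighbors,
                           new_rng_state(), seed_per_row=True)
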
Example #10
def sparse_initialise_search(
    forest,
    inds,
    indptr,
    data,
    query_inds,
    query_indptr,
    query_data,
    n_neighbors,
    rng_state,
    sparse_dist,
    dist_args,
):
    results = make_heap(query_indptr.shape[0] - 1, n_neighbors)
    sparse_init_from_random(
        n_neighbors,
        inds,
        indptr,
        data,
        query_inds,
        query_indptr,
        query_data,
        results,
        rng_state,
        sparse_dist,
        dist_args,
    )
    if forest is not None:
        for tree in forest:
            sparse_init_from_tree(
                tree,
                inds,
                indptr,
                data,
                query_inds,
                query_indptr,
                query_data,
                results,
                rng_state,
                sparse_dist,
                dist_args,
            )

    return results
Example #11
    def nn_descent(
        inds,
        indptr,
        data,
        n_vertices,
        n_neighbors,
        rng_state,
        max_candidates=50,
        n_iters=10,
        delta=0.001,
        rho=0.5,
        rp_tree_init=True,
        leaf_array=None,
        verbose=False,
    ):
        current_graph = make_heap(n_vertices, n_neighbors)
        for i in range(n_vertices):
            indices = rejection_sample(n_neighbors, n_vertices, rng_state)
            for j in range(indices.shape[0]):

                from_inds = inds[indptr[i]:indptr[i + 1]]
                from_data = data[indptr[i]:indptr[i + 1]]

                to_inds = inds[indptr[indices[j]]:indptr[indices[j] + 1]]
                to_data = data[indptr[indices[j]]:indptr[indices[j] + 1]]

                d = sparse_dist(from_inds, from_data, to_inds, to_data,
                                *dist_args)

                heap_push(current_graph, i, d, indices[j], 1)
                heap_push(current_graph, indices[j], d, i, 1)

        if rp_tree_init:
            for n in range(leaf_array.shape[0]):
                for i in range(leaf_array.shape[1]):
                    if leaf_array[n, i] < 0:
                        break
                    for j in range(i + 1, leaf_array.shape[1]):
                        if leaf_array[n, j] < 0:
                            break

                        from_inds = inds[indptr[leaf_array[
                            n, i]]:indptr[leaf_array[n, i] + 1]]
                        from_data = data[indptr[leaf_array[
                            n, i]]:indptr[leaf_array[n, i] + 1]]

                        to_inds = inds[indptr[leaf_array[
                            n, j]]:indptr[leaf_array[n, j] + 1]]
                        to_data = data[indptr[leaf_array[
                            n, j]]:indptr[leaf_array[n, j] + 1]]

                        d = sparse_dist(from_inds, from_data, to_inds, to_data,
                                        *dist_args)

                        heap_push(current_graph, leaf_array[n, i], d,
                                  leaf_array[n, j], 1)
                        heap_push(current_graph, leaf_array[n, j], d,
                                  leaf_array[n, i], 1)

        for n in range(n_iters):
            if verbose:
                print("\t", n, " / ", n_iters)

            candidate_neighbors = build_candidates(current_graph, n_vertices,
                                                   n_neighbors, max_candidates,
                                                   rng_state)

            c = 0
            for i in range(n_vertices):
                for j in range(max_candidates):
                    p = int(candidate_neighbors[0, i, j])
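                    # random subsampling: skip this candidate with probability rho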
                    if p < 0 or tau_rand(rng_state) < rho:
                        continue
                    for k in range(max_candidates):
                        q = int(candidate_neighbors[0, i, k])
                        if (q < 0 or not candidate_neighbors[2, i, j]
                                and not candidate_neighbors[2, i, k]):
                            continue

                        from_inds = inds[indptr[p]:indptr[p + 1]]
                        from_data = data[indptr[p]:indptr[p + 1]]

                        to_inds = inds[indptr[q]:indptr[q + 1]]
                        to_data = data[indptr[q]:indptr[q + 1]]

                        d = sparse_dist(from_inds, from_data, to_inds, to_data,
                                        *dist_args)

                        c += heap_push(current_graph, p, d, q, 1)
                        c += heap_push(current_graph, q, d, p, 1)

            if c <= delta * n_neighbors * n_vertices:
                break

        return deheap_sort(current_graph)
Example #12
def sparse_nn_descent(
    inds,
    indptr,
    data,
    n_vertices,
    n_neighbors,
    rng_state,
    max_candidates=50,
    sparse_dist=sparse_euclidean,
    dist_args=(),
    n_iters=10,
    delta=0.001,
    rho=0.5,
    rp_tree_init=True,
    leaf_array=None,
    verbose=False,
):

    tried = set([(-1, -1)])

    current_graph = make_heap(n_vertices, n_neighbors)
    for i in range(n_vertices):
        indices = rejection_sample(n_neighbors, n_vertices, rng_state)
        for j in range(indices.shape[0]):

            from_inds = inds[indptr[i]:indptr[i + 1]]
            from_data = data[indptr[i]:indptr[i + 1]]

            to_inds = inds[indptr[indices[j]]:indptr[indices[j] + 1]]
            to_data = data[indptr[indices[j]]:indptr[indices[j] + 1]]

            d = sparse_dist(from_inds, from_data, to_inds, to_data, *dist_args)

            heap_push(current_graph, i, d, indices[j], 1)
            heap_push(current_graph, indices[j], d, i, 1)
            tried.add((i, indices[j]))
            tried.add((indices[j], i))

    if rp_tree_init:
        sparse_init_rp_tree(
            inds,
            indptr,
            data,
            sparse_dist,
            dist_args,
            current_graph,
            leaf_array,
            tried=tried,
        )

    for n in range(n_iters):
        if verbose:
            print("\t", n, " / ", n_iters)

        (new_candidate_neighbors,
         old_candidate_neighbors) = new_build_candidates(
             current_graph,
             n_vertices,
             n_neighbors,
             max_candidates,
             rng_state,
             rho,
             False,
         )

        c = 0
        for i in range(n_vertices):
            for j in range(max_candidates):
                p = int(new_candidate_neighbors[0, i, j])
                if p < 0:
                    continue
                for k in range(j, max_candidates):
                    q = int(new_candidate_neighbors[0, i, k])
                    if q < 0 or (p, q) in tried:
                        continue

                    from_inds = inds[indptr[p]:indptr[p + 1]]
                    from_data = data[indptr[p]:indptr[p + 1]]

                    to_inds = inds[indptr[q]:indptr[q + 1]]
                    to_data = data[indptr[q]:indptr[q + 1]]

                    d = sparse_dist(from_inds, from_data, to_inds, to_data,
                                    *dist_args)

                    c += unchecked_heap_push(current_graph, p, d, q, 1)
                    tried.add((p, q))
                    if p != q:
                        c += unchecked_heap_push(current_graph, q, d, p, 1)
                        tried.add((q, p))

                for k in range(max_candidates):
                    q = int(old_candidate_neighbors[0, i, k])
                    if q < 0 or (p, q) in tried:
                        continue

                    from_inds = inds[indptr[p]:indptr[p + 1]]
                    from_data = data[indptr[p]:indptr[p + 1]]

                    to_inds = inds[indptr[q]:indptr[q + 1]]
                    to_data = data[indptr[q]:indptr[q + 1]]

                    d = sparse_dist(from_inds, from_data, to_inds, to_data,
                                    *dist_args)

                    c += unchecked_heap_push(current_graph, p, d, q, 1)
                    tried.add((p, q))
                    if p != q:
                        c += unchecked_heap_push(current_graph, q, d, p, 1)
                        tried.add((q, p))

        if c <= delta * n_neighbors * n_vertices:
            break

    return deheap_sort(current_graph)
Example #13
def nn_descent(
    inds,
    indptr,
    data,
    n_neighbors,
    rng_state,
    max_candidates=50,
    dist=sparse_euclidean,
    n_iters=10,
    delta=0.001,
    init_graph=EMPTY_GRAPH,
    rp_tree_init=True,
    leaf_array=None,
    low_memory=False,
    verbose=False,
):

    n_samples = indptr.shape[0] - 1

    if init_graph[0].shape[0] == 1:  # EMPTY_GRAPH
        current_graph = make_heap(n_samples, n_neighbors)

        if rp_tree_init:
            init_rp_tree(inds, indptr, data, dist, current_graph, leaf_array)

        init_random(n_neighbors, inds, indptr, data, current_graph, dist, rng_state)
    elif init_graph[0].shape[0] == n_samples and init_graph[0].shape[1] == n_neighbors:
        current_graph = init_graph
    else:
        raise ValueError("Invalid initial graph specified!")

    if low_memory:
        nn_descent_internal_low_memory_parallel(
            current_graph,
            inds,
            indptr,
            data,
            n_neighbors,
            rng_state,
            max_candidates=max_candidates,
            dist=dist,
            n_iters=n_iters,
            delta=delta,
            verbose=verbose,
        )
    else:
        nn_descent_internal_high_memory_parallel(
            current_graph,
            inds,
            indptr,
            data,
            n_neighbors,
            rng_state,
            max_candidates=max_candidates,
            dist=dist,
            n_iters=n_iters,
            delta=delta,
            verbose=verbose,
        )

    return deheap_sort(current_graph[0], current_graph[1])
Example #14
import locale

import numba
import numpy as np

from pynndescent.utils import (
    tau_rand_int,
    make_heap,
    new_build_candidates,
    deheap_sort,
    checked_flagged_heap_push,
    apply_graph_updates_high_memory,
    apply_graph_updates_low_memory,
)

from pynndescent.sparse import sparse_euclidean

locale.setlocale(locale.LC_NUMERIC, "C")

EMPTY_GRAPH = make_heap(1, 1)
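# one-row sentinel heap; nn_descent recognizes it via init_graph[0].shape[0] == 1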


@numba.njit(parallel=True, cache=False)
def generate_leaf_updates(leaf_block, dist_thresholds, inds, indptr, data, dist):

    updates = [[(-1, -1, np.inf)] for i in range(leaf_block.shape[0])]

    for n in numba.prange(leaf_block.shape[0]):
        for i in range(leaf_block.shape[1]):
            p = leaf_block[n, i]
            if p < 0:
                break

            for j in range(i + 1, leaf_block.shape[1]):
                q = leaf_block[n, j]
                if q < 0:
                    break

                from_inds = inds[indptr[p] : indptr[p + 1]]
                from_data = data[indptr[p] : indptr[p + 1]]
                to_inds = inds[indptr[q] : indptr[q + 1]]
                to_data = data[indptr[q] : indptr[q + 1]]
                d = dist(from_inds, from_data, to_inds, to_data)

                # record the pair only if it improves the current worst
                # distance at either endpoint
                if d < dist_thresholds[p] or d < dist_thresholds[q]:
                    updates[n].append((p, q, d))

    return updates
Example #15
def nn_descent(data,
               n_neighbors,
               rng_state,
               max_candidates=50,
               dist=dist.euclidean,
               dist_args=(),
               n_iters=10,
               delta=0.001,
               rho=0.5,
               rp_tree_init=True,
               leaf_array=None,
               verbose=False):
    n_vertices = data.shape[0]

    current_graph = make_heap(data.shape[0], n_neighbors)
    for i in range(data.shape[0]):
        indices = rejection_sample(n_neighbors, data.shape[0], rng_state)
        for j in range(indices.shape[0]):
            d = dist(data[i], data[indices[j]], *dist_args)
            heap_push(current_graph, i, d, indices[j], 1)
            heap_push(current_graph, indices[j], d, i, 1)

    if rp_tree_init:
        for n in range(leaf_array.shape[0]):
            tried = set([(-1, -1)])
            for i in range(leaf_array.shape[1]):
                if leaf_array[n, i] < 0:
                    break
                for j in range(i + 1, leaf_array.shape[1]):
                    if leaf_array[n, j] < 0:
                        break
                    if (leaf_array[n, i], leaf_array[n, j]) in tried:
                        continue
                    d = dist(data[leaf_array[n, i]], data[leaf_array[n, j]],
                             *dist_args)
                    heap_push(current_graph, leaf_array[n, i], d,
                              leaf_array[n, j], 1)
                    heap_push(current_graph, leaf_array[n, j], d,
                              leaf_array[n, i], 1)
                    tried.add((leaf_array[n, i], leaf_array[n, j]))
                    tried.add((leaf_array[n, j], leaf_array[n, i]))

    for n in range(n_iters):

        (new_candidate_neighbors, old_candidate_neighbors) = build_candidates(
            current_graph, n_vertices, n_neighbors, max_candidates, rng_state,
            rho)

        c = 0
        for i in range(n_vertices):
            for j in range(max_candidates):
                p = int(new_candidate_neighbors[0, i, j])
                if p < 0:
                    continue
                for k in range(j, max_candidates):
                    q = int(new_candidate_neighbors[0, i, k])
                    if q < 0:
                        continue

                    d = dist(data[p], data[q], *dist_args)
                    c += heap_push(current_graph, p, d, q, 1)
                    c += heap_push(current_graph, q, d, p, 1)

                for k in range(max_candidates):
                    q = int(old_candidate_neighbors[0, i, k])
                    if q < 0:
                        continue

                    d = dist(data[p], data[q], *dist_args)
                    c += heap_push(current_graph, p, d, q, 1)
                    c += heap_push(current_graph, q, d, p, 1)

        if c <= delta * n_neighbors * data.shape[0]:
            break

    return deheap_sort(current_graph)
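
A hypothetical end-to-end run of the dense variant above; the data and RNG state are assumptions of this sketch:

import numpy as np
from pynndescent import distances

data = np.random.rand(200, 16).astype(np.float32)
rng_state = np.array([42, 123, 4561], dtype=np.int64)

# rp_tree_init=False sidesteps the need for a leaf_array in this toy run
knn_indices, knn_dists = nn_descent(data, n_neighbors=10, rng_state=rng_state,
                                    dist=distances.euclidean, rp_tree_init=False)
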
Example #16
def new_build_candidates(
    current_graph,
    n_vertices,
    n_neighbors,
    max_candidates,
    chunk_size,
    rng_state,
    parallel,
    seed_per_row=False,
):

    n_tasks = int(math.ceil(float(n_vertices) / chunk_size))

    new_candidate_neighbors = make_heap(n_vertices, max_candidates)
    old_candidate_neighbors = make_heap(n_vertices, max_candidates)

    # store the updates in an array
    max_heap_update_count = chunk_size * n_neighbors * 2
    heap_updates = np.zeros((n_tasks, max_heap_update_count, 4),
                            dtype=np.float32)
    heap_update_counts = np.zeros((n_tasks, ), dtype=np.int64)
    rng_state_threads = per_thread_rng_state(n_tasks, rng_state)

    def candidates_map(index):
        rows = chunk_rows(chunk_size, index, n_vertices)
        return (
            index,
            candidates_map_jit(
                rows,
                n_neighbors,
                current_graph,
                heap_updates[index],
                offset=0,
                rng_state=rng_state_threads[index],
                seed_per_row=seed_per_row,
            ),
        )

    def candidates_reduce(index):
        return candidates_reduce_jit(
            n_tasks,
            current_graph,
            new_candidate_neighbors,
            old_candidate_neighbors,
            heap_updates,
            offsets,
            index,
        )

    # run map functions
    for index, count in parallel(parallel_calls(candidates_map, n_tasks)):
        heap_update_counts[index] = count

    # sort and chunk heap updates so they can be applied in the reduce
    max_count = heap_update_counts.max()
    offsets = np.zeros((n_tasks, max_count), dtype=np.int64)

    def shuffle(index):
        return shuffle_jit(heap_updates, heap_update_counts, offsets,
                           chunk_size, n_vertices, index)

    parallel(parallel_calls(shuffle, n_tasks))

    # then run reduce functions
    parallel(parallel_calls(candidates_reduce, n_tasks))

    def mark_candidate_results(index):
        rows = chunk_rows(chunk_size, index, n_vertices)
        return mark_candidate_results_map(rows, current_graph, n_neighbors,
                                          max_candidates,
                                          new_candidate_neighbors)

    # finally, update the flags on current_graph entries that were sampled as new candidates
    parallel(parallel_calls(mark_candidate_results, n_tasks))

    return new_candidate_neighbors, old_candidate_neighbors
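
The map/shuffle/reduce choreography above hinges on a small helper; a plausible reading of parallel_calls, inferred from how it is used here rather than quoted from the library:

import joblib

def parallel_calls(fn, n_tasks):
    # one delayed invocation of fn per task index; joblib fans them out
    return [joblib.delayed(fn)(index) for index in range(n_tasks)]

parallel = joblib.Parallel(n_jobs=2, prefer="threads")
results = parallel(parallel_calls(lambda i: i * i, 4))  # -> [0, 1, 4, 9]
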
Example #17
def nn_descent(
    data,
    n_neighbors,
    rng_state,
    max_candidates=50,
    dist=dist.euclidean,
    dist_args=(),
    n_iters=10,
    delta=0.001,
    rho=0.5,
    rp_tree_init=True,
    leaf_array=None,
    verbose=False,
    seed_per_row=False,
):
    n_vertices = data.shape[0]
    tried = set([(-1, -1)])

    current_graph = make_heap(data.shape[0], n_neighbors)
    for i in range(data.shape[0]):
        if seed_per_row:
            seed(rng_state, i)
        indices = rejection_sample(n_neighbors, data.shape[0], rng_state)
        for j in range(indices.shape[0]):
            d = dist(data[i], data[indices[j]], *dist_args)
            heap_push(current_graph, i, d, indices[j], 1)
            heap_push(current_graph, indices[j], d, i, 1)
            tried.add((i, indices[j]))
            tried.add((indices[j], i))

    if rp_tree_init:
        init_rp_tree(data,
                     dist,
                     dist_args,
                     current_graph,
                     leaf_array,
                     tried=tried)

    for n in range(n_iters):
        if verbose:
            print("\t", n, " / ", n_iters)

        (new_candidate_neighbors,
         old_candidate_neighbors) = new_build_candidates(
             current_graph,
             n_vertices,
             n_neighbors,
             max_candidates,
             rng_state,
             rho,
             seed_per_row,
         )

        c = 0
        for i in range(n_vertices):
            for j in range(max_candidates):
                p = int(new_candidate_neighbors[0, i, j])
                if p < 0:
                    continue
                for k in range(j, max_candidates):
                    q = int(new_candidate_neighbors[0, i, k])
                    if q < 0 or (p, q) in tried:
                        continue

                    d = dist(data[p], data[q], *dist_args)
                    c += unchecked_heap_push(current_graph, p, d, q, 1)
                    tried.add((p, q))
                    if p != q:
                        c += unchecked_heap_push(current_graph, q, d, p, 1)
                        tried.add((q, p))

                for k in range(max_candidates):
                    q = int(old_candidate_neighbors[0, i, k])
                    if q < 0 or (p, q) in tried:
                        continue

                    d = dist(data[p], data[q], *dist_args)
                    c += unchecked_heap_push(current_graph, p, d, q, 1)
                    tried.add((p, q))
                    if p != q:
                        c += unchecked_heap_push(current_graph, q, d, p, 1)
                        tried.add((q, p))

        if c <= delta * n_neighbors * data.shape[0]:
            break

    return deheap_sort(current_graph)
Example #18
    def custom_search_closure(query_points, candidate_indices, k, epsilon,
                              visited):
        result = make_heap(query_points.shape[0], k)
        distance_scale = 1.0 + epsilon

        for i in range(query_points.shape[0]):
            visited[:] = 0
            if dist == alternative_dot or dist == alternative_cosine:
                norm = np.sqrt((query_points[i]**2).sum())
                if norm > 0.0:
                    current_query = query_points[i] / norm
                else:
                    continue
            else:
                current_query = query_points[i]

            heap_priorities = result[1][i]
            heap_indices = result[0][i]
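            # zero-length comprehension: an empty list numba can type as (float32, int32) pairs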
            seed_set = [(np.float32(np.inf), np.int32(-1)) for j in range(0)]

            ############ Init ################
            n_initial_points = candidate_indices.shape[0]

            for j in range(n_initial_points):
                candidate = np.int32(candidate_indices[j])
                d = dist(data[candidate], current_query)
                # indices are guaranteed different
                simple_heap_push(heap_priorities, heap_indices, d, candidate)
                heapq.heappush(seed_set, (d, candidate))
                mark_visited(visited, candidate)

            ############ Search ##############
            distance_bound = distance_scale * heap_priorities[0]

            # Find smallest seed point
            d_vertex, vertex = heapq.heappop(seed_set)

            while d_vertex < distance_bound:

                for j in range(indptr[vertex], indptr[vertex + 1]):

                    candidate = indices[j]

                    if has_been_visited(visited, candidate) == 0:
                        mark_visited(visited, candidate)

                        d = dist(data[candidate], current_query)

                        if d < distance_bound:
                            simple_heap_push(heap_priorities, heap_indices, d,
                                             candidate)
                            heapq.heappush(seed_set, (d, candidate))
                            # Update bound
                            distance_bound = distance_scale * heap_priorities[0]

                # find new smallest seed point
                if len(seed_set) == 0:
                    break
                else:
                    d_vertex, vertex = heapq.heappop(seed_set)

        return result
Example #19
def nn_descent(
    data,
    n_neighbors,
    rng_state,
    max_candidates=50,
    dist=dist.euclidean,
    dist_args=(),
    n_iters=10,
    delta=0.001,
    rho=0.5,
    rp_tree_init=True,
    leaf_array=None,
    low_memory=False,
    verbose=False,
    seed_per_row=False,
):
    tried = set([(-1, -1)])

    current_graph = make_heap(data.shape[0], n_neighbors)
    for i in range(data.shape[0]):
        if seed_per_row:
            seed(rng_state, i)
        indices = rejection_sample(n_neighbors, data.shape[0], rng_state)
        for j in range(indices.shape[0]):
            d = dist(data[i], data[indices[j]], *dist_args)
            heap_push(current_graph, i, d, indices[j], 1)
            heap_push(current_graph, indices[j], d, i, 1)
            tried.add((i, indices[j]))
            tried.add((indices[j], i))

    if rp_tree_init:
        init_rp_tree(data,
                     dist,
                     dist_args,
                     current_graph,
                     leaf_array,
                     tried=tried)

    if low_memory:
        nn_descent_internal_low_memory(
            current_graph,
            data,
            n_neighbors,
            rng_state,
            max_candidates=max_candidates,
            dist=dist,
            dist_args=dist_args,
            n_iters=n_iters,
            delta=delta,
            rho=rho,
            verbose=verbose,
            seed_per_row=seed_per_row,
        )
    else:
        nn_descent_internal_high_memory(
            current_graph,
            data,
            n_neighbors,
            rng_state,
            tried,
            max_candidates=max_candidates,
            dist=dist,
            dist_args=dist_args,
            n_iters=n_iters,
            delta=delta,
            rho=rho,
            verbose=verbose,
            seed_per_row=seed_per_row,
        )

    return deheap_sort(current_graph)
Example #20
def sparse_nn_descent(
    inds,
    indptr,
    data,
    n_vertices,
    n_neighbors,
    rng_state,
    max_candidates=50,
    sparse_dist=sparse_euclidean,
    dist_args=(),
    n_iters=10,
    delta=0.001,
    rho=0.5,
    low_memory=False,
    rp_tree_init=True,
    leaf_array=None,
    verbose=False,
):

    tried = set([(-1, -1)])

    current_graph = make_heap(n_vertices, n_neighbors)
    for i in range(n_vertices):
        indices = rejection_sample(n_neighbors, n_vertices, rng_state)
        for j in range(indices.shape[0]):

            from_inds = inds[indptr[i]:indptr[i + 1]]
            from_data = data[indptr[i]:indptr[i + 1]]

            to_inds = inds[indptr[indices[j]]:indptr[indices[j] + 1]]
            to_data = data[indptr[indices[j]]:indptr[indices[j] + 1]]

            d = sparse_dist(from_inds, from_data, to_inds, to_data, *dist_args)

            heap_push(current_graph, i, d, indices[j], 1)
            heap_push(current_graph, indices[j], d, i, 1)
            tried.add((i, indices[j]))
            tried.add((indices[j], i))

    if rp_tree_init:
        sparse_init_rp_tree(
            inds,
            indptr,
            data,
            sparse_dist,
            dist_args,
            current_graph,
            leaf_array,
            tried=tried,
        )

    if low_memory:
        sparse_nn_descent_internal_low_memory(
            current_graph,
            inds,
            indptr,
            data,
            n_vertices,
            n_neighbors,
            rng_state,
            max_candidates=max_candidates,
            sparse_dist=sparse_dist,
            dist_args=dist_args,
            n_iters=n_iters,
            delta=delta,
            rho=rho,
            verbose=verbose,
        )
    else:
        sparse_nn_descent_internal_high_memory(
            current_graph,
            inds,
            indptr,
            data,
            n_vertices,
            n_neighbors,
            rng_state,
            tried,
            max_candidates=max_candidates,
            sparse_dist=sparse_dist,
            dist_args=dist_args,
            n_iters=n_iters,
            delta=delta,
            rho=rho,
            verbose=verbose,
        )

    return deheap_sort(current_graph)
Example #21
def nn_descent(
    data,
    n_neighbors,
    rng_state,
    max_candidates=50,
    dist=dst.euclidean,
    dist_args=(),
    n_iters=10,
    delta=0.001,
    rp_tree_init=False,
    leaf_array=None,
    verbose=False,
    n_jobs=None,
    seed_per_row=False,
):

    if rng_state is None:
        rng_state = new_rng_state()

    with joblib.Parallel(prefer="threads", n_jobs=n_jobs) as parallel:

        n_vertices = data.shape[0]
        n_tasks = effective_n_jobs_with_context(n_jobs)
        chunk_size = int(math.ceil(n_vertices / n_tasks))

        current_graph = make_heap(data.shape[0], n_neighbors)

        if rp_tree_init:
            init_rp_tree(data, dist, dist_args, current_graph, leaf_array,
                         chunk_size, parallel)

        init_random(
            current_graph,
            data,
            dist,
            dist_args,
            n_neighbors,
            chunk_size,
            rng_state,
            parallel,
            seed_per_row=seed_per_row,
        )

        # store the updates in an array
        # note that the factor here is `n_neighbors * n_neighbors`, not `max_candidates * max_candidates`
        # since no more than `n_neighbors` candidates are added for each row
        max_heap_update_count = chunk_size * n_neighbors * n_neighbors * 4
        heap_updates = np.zeros((n_tasks, max_heap_update_count, 4),
                                dtype=np.float32)
        heap_update_counts = np.zeros((n_tasks, ), dtype=np.int64)

        for n in range(n_iters):
            if verbose:
                print("\t", n, " / ", n_iters)

            (new_candidate_neighbors,
             old_candidate_neighbors) = new_build_candidates(
                 current_graph,
                 n_vertices,
                 n_neighbors,
                 max_candidates,
                 chunk_size,
                 rng_state,
                 parallel,
                 seed_per_row=seed_per_row,
             )

            def nn_descent_map(index):
                rows = chunk_rows(chunk_size, index, n_vertices)
                return (
                    index,
                    nn_descent_map_jit(
                        rows,
                        max_candidates,
                        data,
                        new_candidate_neighbors,
                        old_candidate_neighbors,
                        heap_updates[index],
                        offset=0,
                        dist=dist,
                        dist_args=dist_args,
                    ),
                )

            def nn_decent_reduce(index):
                return nn_decent_reduce_jit(n_tasks, current_graph,
                                            heap_updates, offsets, index)

            # run map functions
            for index, count in parallel(
                    parallel_calls(nn_descent_map, n_tasks)):
                heap_update_counts[index] = count

            # sort and chunk heap updates so they can be applied in the reduce
            max_count = heap_update_counts.max()
            offsets = np.zeros((n_tasks, max_count), dtype=np.int64)

            def shuffle(index):
                return shuffle_jit(
                    heap_updates,
                    heap_update_counts,
                    offsets,
                    chunk_size,
                    n_vertices,
                    index,
                )

            parallel(parallel_calls(shuffle, n_tasks))

            # then run reduce functions
            c = 0
            for c_part in parallel(parallel_calls(nn_decent_reduce, n_tasks)):
                c += c_part

            if c <= delta * n_neighbors * data.shape[0]:
                break

        def deheap_sort_map(index):
            rows = chunk_rows(chunk_size, index, n_vertices)
            return index, deheap_sort_map_jit(rows, current_graph)

        parallel(parallel_calls(deheap_sort_map, n_tasks))
        return current_graph[0].astype(np.int64), current_graph[1]
Example #22
def init_current_graph(
    data,
    dist,
    dist_args,
    n_neighbors,
    chunk_size,
    rng_state,
    parallel,
    seed_per_row=False,
):

    n_vertices = data.shape[0]
    n_tasks = int(math.ceil(float(n_vertices) / chunk_size))

    current_graph = make_heap(n_vertices, n_neighbors)

    # store the updates in an array
    max_heap_update_count = chunk_size * n_neighbors * 2
    heap_updates = np.zeros((n_tasks, max_heap_update_count, 4),
                            dtype=np.float32)
    heap_update_counts = np.zeros((n_tasks, ), dtype=np.int64)
    rng_state_threads = per_thread_rng_state(n_tasks, rng_state)

    def current_graph_map(index):
        rows = chunk_rows(chunk_size, index, n_vertices)
        return (
            index,
            current_graph_map_jit(
                rows,
                n_vertices,
                n_neighbors,
                data,
                heap_updates[index],
                rng_state_threads[index],
                seed_per_row=seed_per_row,
                dist=dist,
                dist_args=dist_args,
            ),
        )

    def current_graph_reduce(index):
        return current_graph_reduce_jit(n_tasks, current_graph, heap_updates,
                                        offsets, index)

    # run map functions
    for index, count in parallel(parallel_calls(current_graph_map, n_tasks)):
        heap_update_counts[index] = count

    # sort and chunk heap updates so they can be applied in the reduce
    max_count = heap_update_counts.max()
    offsets = np.zeros((n_tasks, max_count), dtype=np.int64)

    def shuffle(index):
        return shuffle_jit(heap_updates, heap_update_counts, offsets,
                           chunk_size, n_vertices, index)

    parallel(parallel_calls(shuffle, n_tasks))

    # then run reduce functions
    parallel(parallel_calls(current_graph_reduce, n_tasks))

    return current_graph