示例#1
0
def sparse_init_rp_tree(inds, indptr, data, dist, dist_args, current_graph,
                        leaf_array, chunk_size, parallel):
    """Run the RP-tree initialisation as a parallel map / shuffle / reduce.

    Parameters
    ----------
    inds, indptr, data
        Sparse input arrays; forwarded unchanged to the map kernel.
    dist, dist_args
        Distance callable and its extra arguments; forwarded to the map
        kernel.
    current_graph
        Heap structure handed to the reduce kernel.
    leaf_array
        Array with at least two dimensions; the size of its second
        dimension is used to size the per-task update buffer.
    chunk_size
        Number of rows assigned to each task.
    parallel
        Callable that executes the task list built by ``parallel_calls``
        and returns an iterable of the task results.

    Returns
    -------
    None
    """
    # NOTE(review): the row/task count is taken from data.shape[0].  If
    # `data` is the values array of a CSR matrix (as the inds/indptr/data
    # naming suggests) that is the number of stored entries, not the number
    # of rows (indptr.shape[0] - 1) -- confirm against the callers.
    n_vertices = data.shape[0]
    n_tasks = int(math.ceil(float(n_vertices) / chunk_size))

    # One fixed-capacity scratch buffer per task, plus the number of
    # entries each map task reports back.
    leaf_width = leaf_array.shape[1]
    updates_per_task = chunk_size * leaf_width * leaf_width * 2
    heap_updates = np.zeros((n_tasks, updates_per_task, 4), dtype=np.float32)
    heap_update_counts = np.zeros(n_tasks, dtype=np.int64)

    def map_task(task):
        task_rows = chunk_rows(chunk_size, task, n_vertices)
        reported = sparse_init_rp_tree_map_jit(
            task_rows,
            leaf_array,
            inds,
            indptr,
            data,
            heap_updates[task],
            dist,
            dist_args,
        )
        return task, reported

    # Map stage: record the count returned by every task.
    for task, reported in parallel(parallel_calls(map_task, n_tasks)):
        heap_update_counts[task] = reported

    # Shuffle stage: sort and chunk the heap updates so the reduce stage
    # can apply them; `offsets` is filled in by the shuffle kernel.
    widest = heap_update_counts.max()
    offsets = np.zeros((n_tasks, widest), dtype=np.int64)

    def shuffle_task(task):
        return shuffle_jit(heap_updates, heap_update_counts, offsets,
                           chunk_size, n_vertices, task)

    parallel(parallel_calls(shuffle_task, n_tasks))

    # Reduce stage.
    def reduce_task(task):
        return init_rp_tree_reduce_jit(n_tasks, current_graph, heap_updates,
                                       offsets, task)

    parallel(parallel_calls(reduce_task, n_tasks))
示例#2
0
def sparse_init_random(
    current_graph,
    inds,
    indptr,
    data,
    dist,
    dist_args,
    n_neighbors,
    chunk_size,
    rng_state,
    parallel,
    seed_per_row=False,
):
    """Run the random-initialisation map kernel over every row chunk.

    The kernel receives ``current_graph`` directly, so no intermediate
    update buffers are needed here (the previous implementation allocated
    them but never read them).

    Parameters
    ----------
    current_graph
        Heap structure passed to the map kernel.
    inds, indptr, data
        Sparse input arrays; forwarded unchanged to the map kernel.
    dist, dist_args
        Distance callable and its extra arguments; forwarded to the map
        kernel as ``sparse_dist`` / ``dist_args``.
    n_neighbors
        Neighbour count forwarded to the map kernel.
    chunk_size
        Number of rows assigned to each task.
    rng_state
        Base random state from which one state per task is derived via
        ``per_thread_rng_state``.
    parallel
        Callable that executes the task list built by ``parallel_calls``
        and returns an iterable of ``(index, status)`` results.
    seed_per_row
        Forwarded to the map kernel.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If any map task reports a status that is exactly ``False``.
    """
    # NOTE(review): the row/task count is taken from data.shape[0].  If
    # `data` is the values array of a CSR matrix (as the inds/indptr/data
    # naming suggests) that is the number of stored entries, not the number
    # of rows (indptr.shape[0] - 1) -- confirm against the callers.
    n_vertices = data.shape[0]
    n_tasks = int(math.ceil(float(n_vertices) / chunk_size))

    # Each task gets its own RNG state so tasks do not share one stream.
    rng_state_threads = per_thread_rng_state(n_tasks, rng_state)

    def current_graph_map(index):
        rows = chunk_rows(chunk_size, index, n_vertices)
        return (
            index,
            sparse_current_graph_map_jit(
                current_graph,
                rows,
                n_neighbors,
                inds,
                indptr,
                data,
                rng_state_threads[index],
                seed_per_row=seed_per_row,
                sparse_dist=dist,
                dist_args=dist_args,
            ),
        )

    # run map functions; only an explicit False counts as failure
    for _, status in parallel(parallel_calls(current_graph_map, n_tasks)):
        if status is False:
            raise ValueError("Failed in random initialization")
示例#3
0
def sparse_nn_descent(
    inds,
    indptr,
    data,
    n_vertices,
    n_neighbors,
    rng_state,
    max_candidates=50,
    dist=sparse.sparse_euclidean,
    n_iters=10,
    delta=0.001,
    rp_tree_init=False,
    leaf_array=None,
    verbose=False,
    n_jobs=None,
    seed_per_row=False,
):
    """Threaded NN-descent over sparse input.

    A heap of ``n_vertices`` x ``n_neighbors`` entries is created,
    optionally initialised from ``leaf_array``, filled in by the random
    initialiser, and then refined for up to ``n_iters`` rounds of
    map / shuffle / reduce executed through a thread-based
    ``joblib.Parallel`` pool.

    Parameters
    ----------
    inds, indptr, data
        Sparse input arrays; forwarded unchanged to the kernels.
    n_vertices
        Number of rows in the graph; sizes the heap and the row chunks and
        scales the early-stopping threshold.
    n_neighbors
        Number of neighbours kept per row.
    rng_state
        Random state; a fresh one is created with ``new_rng_state()`` when
        ``None``.
    max_candidates
        Forwarded to candidate building and to the map kernel.
    dist
        Sparse distance callable.
    n_iters
        Maximum number of refinement rounds.
    delta
        Early-stopping tolerance: iteration stops once the summed reduce
        results of a round are at most
        ``delta * n_neighbors * n_vertices``.
    rp_tree_init
        When true, ``sparse_init_rp_tree`` is run with ``leaf_array``
        before the random initialisation.
    leaf_array
        Leaf array used only when ``rp_tree_init`` is true.
    verbose
        When true, prints the round counter for each iteration.
    n_jobs
        Passed to ``joblib.Parallel`` and used to derive the task count.
    seed_per_row
        Forwarded to the initialiser and candidate builder.

    Returns
    -------
    tuple
        ``(current_graph[0].astype(np.int64), current_graph[1])`` after the
        heap has been de-heap-sorted.
    """
    if rng_state is None:
        rng_state = new_rng_state()

    with joblib.Parallel(prefer="threads", n_jobs=n_jobs) as parallel:

        n_tasks = effective_n_jobs_with_context(n_jobs)
        chunk_size = int(math.ceil(n_vertices / n_tasks))

        current_graph = make_heap(n_vertices, n_neighbors)

        if rp_tree_init:
            sparse_init_rp_tree(
                inds,
                indptr,
                data,
                dist,
                current_graph,
                leaf_array,
                chunk_size,
                parallel,
            )

        sparse_init_random(
            current_graph,
            inds,
            indptr,
            data,
            dist,
            n_neighbors,
            chunk_size,
            rng_state,
            parallel,
            seed_per_row=seed_per_row,
        )

        # store the updates in an array
        # note that the factor here is `n_neighbors * n_neighbors`, not
        # `max_candidates * max_candidates`, since no more than
        # `n_neighbors` candidates are added for each row
        max_heap_update_count = chunk_size * n_neighbors * n_neighbors * 4
        heap_updates = np.zeros((n_tasks, max_heap_update_count, 4),
                                dtype=np.float32)
        heap_update_counts = np.zeros((n_tasks, ), dtype=np.int64)

        # The stopping threshold is expressed relative to the size of the
        # graph being refined (n_vertices rows x n_neighbors slots).  It
        # must use `n_vertices`, the same row count that sizes the heap and
        # the row chunks, rather than the length of `data`.
        update_threshold = delta * n_neighbors * n_vertices

        for n in range(n_iters):
            if verbose:
                print("\t", n, " / ", n_iters)

            (new_candidate_neighbors,
             old_candidate_neighbors) = new_build_candidates(
                 current_graph,
                 n_vertices,
                 n_neighbors,
                 max_candidates,
                 chunk_size,
                 rng_state,
                 parallel,
                 seed_per_row=seed_per_row,
             )

            def nn_descent_map(index):
                rows = chunk_rows(chunk_size, index, n_vertices)
                return (
                    index,
                    sparse_nn_descent_map_jit(
                        rows,
                        max_candidates,
                        inds,
                        indptr,
                        data,
                        new_candidate_neighbors,
                        old_candidate_neighbors,
                        heap_updates[index],
                        offset=0,
                        sparse_dist=dist,
                    ),
                )

            # run map functions
            for index, count in parallel(
                    parallel_calls(nn_descent_map, n_tasks)):
                heap_update_counts[index] = count

            # sort and chunk heap updates so they can be applied in the reduce
            max_count = heap_update_counts.max()
            offsets = np.zeros((n_tasks, max_count), dtype=np.int64)

            def shuffle(index):
                return shuffle_jit(
                    heap_updates,
                    heap_update_counts,
                    offsets,
                    chunk_size,
                    n_vertices,
                    index,
                )

            parallel(parallel_calls(shuffle, n_tasks))

            def nn_descent_reduce(index):
                return nn_decent_reduce_jit(n_tasks, current_graph,
                                            heap_updates, offsets, index)

            # then run reduce functions and total their per-task results
            c = sum(parallel(parallel_calls(nn_descent_reduce, n_tasks)))

            if c <= update_threshold:
                break

        def deheap_sort_map(index):
            rows = chunk_rows(chunk_size, index, n_vertices)
            return index, deheap_sort_map_jit(rows, current_graph)

        parallel(parallel_calls(deheap_sort_map, n_tasks))
        return current_graph[0].astype(np.int64), current_graph[1]
示例#4
0
def sparse_init_current_graph(
    inds,
    indptr,
    data,
    dist,
    dist_args,
    n_neighbors,
    chunk_size,
    rng_state,
    parallel,
    seed_per_row=False,
):
    """Build and return a freshly initialised heap via map / shuffle / reduce.

    Parameters
    ----------
    inds, indptr, data
        Sparse input arrays; forwarded unchanged to the map kernel.
    dist, dist_args
        Distance callable and its extra arguments; forwarded to the map
        kernel as ``sparse_dist`` / ``dist_args``.
    n_neighbors
        Number of neighbour slots per row in the created heap.
    chunk_size
        Number of rows assigned to each task.
    rng_state
        Base random state from which one state per task is derived via
        ``per_thread_rng_state``.
    parallel
        Callable that executes the task list built by ``parallel_calls``
        and returns an iterable of the task results.
    seed_per_row
        Forwarded to the map kernel.

    Returns
    -------
    The heap created by ``make_heap(n_vertices, n_neighbors)`` after the
    reduce kernels have run on it.
    """
    # NOTE(review): the row/task count is taken from data.shape[0].  If
    # `data` is the values array of a CSR matrix (as the inds/indptr/data
    # naming suggests) that is the number of stored entries, not the number
    # of rows (indptr.shape[0] - 1) -- confirm against the callers.
    n_vertices = data.shape[0]
    n_tasks = int(math.ceil(float(n_vertices) / chunk_size))

    current_graph = make_heap(n_vertices, n_neighbors)

    # One fixed-capacity scratch buffer per task, the count each map task
    # reports back, and an independent RNG state per task.
    updates_per_task = chunk_size * n_neighbors * 2
    heap_updates = np.zeros((n_tasks, updates_per_task, 4), dtype=np.float32)
    heap_update_counts = np.zeros(n_tasks, dtype=np.int64)
    rng_state_threads = per_thread_rng_state(n_tasks, rng_state)

    def map_task(task):
        task_rows = chunk_rows(chunk_size, task, n_vertices)
        reported = sparse_current_graph_map_jit(
            task_rows,
            n_vertices,
            n_neighbors,
            inds,
            indptr,
            data,
            heap_updates[task],
            rng_state_threads[task],
            seed_per_row=seed_per_row,
            sparse_dist=dist,
            dist_args=dist_args,
        )
        return task, reported

    # Map stage: record the count returned by every task.
    for task, reported in parallel(parallel_calls(map_task, n_tasks)):
        heap_update_counts[task] = reported

    # Shuffle stage: sort and chunk the heap updates so the reduce stage
    # can apply them; `offsets` is filled in by the shuffle kernel.
    widest = heap_update_counts.max()
    offsets = np.zeros((n_tasks, widest), dtype=np.int64)

    def shuffle_task(task):
        return shuffle_jit(heap_updates, heap_update_counts, offsets,
                           chunk_size, n_vertices, task)

    parallel(parallel_calls(shuffle_task, n_tasks))

    # Reduce stage.
    def reduce_task(task):
        return current_graph_reduce_jit(n_tasks, current_graph, heap_updates,
                                        offsets, task)

    parallel(parallel_calls(reduce_task, n_tasks))

    return current_graph