Example #1
# Assumed imports for this snippet; the original listing does not show them.
import joblib
from numpy.testing import assert_equal

from pynndescent import threaded


def test_effective_n_jobs_with_context():
    assert_equal(threaded.effective_n_jobs_with_context(), 1,
                 "Default to 1 job")
    assert_equal(
        threaded.effective_n_jobs_with_context(-1),
        joblib.cpu_count(),
        "Use all cores with num_jobs=-1",
    )
    assert_equal(threaded.effective_n_jobs_with_context(2), 2,
                 "Use n_jobs if specified")
    with joblib.parallel_backend("threading"):
        assert_equal(
            threaded.effective_n_jobs_with_context(),
            joblib.cpu_count(),
            "Use all cores with context manager",
        )
    with joblib.parallel_backend("threading", n_jobs=3):
        assert_equal(
            threaded.effective_n_jobs_with_context(),
            3,
            "Use n_jobs from context manager",
        )
    with joblib.parallel_backend("threading", n_jobs=3):
        assert_equal(
            threaded.effective_n_jobs_with_context(2),
            2,
            "Use n_jobs specified rather than from context manager",
        )
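
The test above pins down a simple resolution order: an explicit n_jobs wins, an enclosing joblib.parallel_backend() context is consulted next, and the default is a single job. Below is a minimal sketch of such a helper built only on joblib's public pieces; the get_active_backend import path is an assumption, and this is not necessarily pynndescent's actual implementation:

import joblib
from joblib.parallel import get_active_backend  # assumed import path

def effective_n_jobs_with_context(n_jobs=None):
    # No explicit n_jobs: fall back to whatever an enclosing
    # joblib.parallel_backend(...) context has installed (default: 1).
    if n_jobs is None:
        _, backend_n_jobs = get_active_backend()
        n_jobs = backend_n_jobs
    # joblib resolves -1 (and other negative values) to the CPU count.
    return joblib.effective_n_jobs(n_jobs=n_jobs)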
Example #2
    def __init__(
        self,
        data,
        metric="euclidean",
        metric_kwds=None,
        n_neighbors=15,
        n_trees=None,
        leaf_size=None,
        pruning_level=0,
        tree_init=True,
        random_state=np.random,
        algorithm="standard",
        max_candidates=20,
        n_iters=None,
        delta=0.001,
        rho=0.5,
        n_jobs=None,
        seed_per_row=False,
        verbose=False,
    ):

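        # heuristic defaults: forest size grows like sqrt(n_samples),
        # iteration count like log2(n_samples)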
        if n_trees is None:
            n_trees = 5 + int(round((data.shape[0])**0.5 / 20.0))
        if n_iters is None:
            n_iters = max(5, int(round(np.log2(data.shape[0]))))

        self.n_trees = n_trees
        self.n_neighbors = n_neighbors
        self.metric = metric
        self.metric_kwds = metric_kwds
        self.leaf_size = leaf_size
        self.prune_level = pruning_level
        self.max_candidates = max_candidates
        self.n_iters = n_iters
        self.delta = delta
        self.rho = rho
        self.dim = data.shape[1]
        self.verbose = verbose

        data = check_array(data, dtype=np.float32, accept_sparse="csr")
        self._raw_data = data

        if not tree_init or n_trees == 0:
            self.tree_init = False
        else:
            self.tree_init = True

        metric_kwds = metric_kwds or {}
        self._dist_args = tuple(metric_kwds.values())

        self.random_state = check_random_state(random_state)

        if callable(metric):
            self._distance_func = metric
        elif metric in dist.named_distances:
            self._distance_func = dist.named_distances[metric]
        else:
            raise ValueError("Metric is neither callable, " +
                             "nor a recognised string")

        if metric in ("cosine", "correlation", "dice", "jaccard"):
            self._angular_trees = True
        else:
            self._angular_trees = False

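        # three int64 seed words for the internal random number generator
        # shared by tree construction and neighbor sampling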
        self.rng_state = self.random_state.randint(INT32_MIN, INT32_MAX,
                                                   3).astype(np.int64)

        if self.tree_init:
            if verbose:
                print(ts(), "Building RP forest with", str(n_trees), "trees")
            self._rp_forest = make_forest(
                data,
                n_neighbors,
                n_trees,
                leaf_size,
                self.rng_state,
                self._angular_trees,
            )
            leaf_array = rptree_leaf_array(self._rp_forest)
        else:
            self._rp_forest = None
            leaf_array = np.array([[-1]])

        if threaded.effective_n_jobs_with_context(n_jobs) != 1:
            if algorithm != "standard":
                raise ValueError(
                    "Algorithm {} not supported in parallel mode".format(
                        algorithm))
            if verbose:
                print(ts(), "parallel NN descent for", str(n_iters),
                      "iterations")

            if isspmatrix_csr(self._raw_data):
                # Sparse case
                self._is_sparse = True
                if metric in sparse.sparse_named_distances:
                    self._distance_func = sparse.sparse_named_distances[metric]
                    if metric in sparse.sparse_need_n_features:
                        metric_kwds["n_features"] = self._raw_data.shape[1]
                    self._dist_args = tuple(metric_kwds.values())
                else:
                    raise ValueError(
                        "Metric {} not supported for sparse data".format(
                            metric))
                self._neighbor_graph = sparse_threaded.sparse_nn_descent(
                    self._raw_data.indices,
                    self._raw_data.indptr,
                    self._raw_data.data,
                    self._raw_data.shape[0],
                    self.n_neighbors,
                    self.rng_state,
                    self.max_candidates,
                    self._distance_func,
                    self._dist_args,
                    self.n_iters,
                    self.delta,
                    self.rho,
                    rp_tree_init=self.tree_init,
                    leaf_array=leaf_array,
                    verbose=verbose,
                    n_jobs=n_jobs,
                    seed_per_row=seed_per_row,
                )
            else:
                # Regular case
                self._is_sparse = False
                self._neighbor_graph = threaded.nn_descent(
                    self._raw_data,
                    self.n_neighbors,
                    self.rng_state,
                    self.max_candidates,
                    self._distance_func,
                    self._dist_args,
                    self.n_iters,
                    self.delta,
                    self.rho,
                    rp_tree_init=self.tree_init,
                    leaf_array=leaf_array,
                    verbose=verbose,
                    n_jobs=n_jobs,
                    seed_per_row=seed_per_row,
                )
        elif algorithm == "standard" or leaf_array.shape[0] == 1:
            if isspmatrix_csr(self._raw_data):
                self._is_sparse = True

                if metric in sparse.sparse_named_distances:
                    self._distance_func = sparse.sparse_named_distances[metric]
                    if metric in sparse.sparse_need_n_features:
                        metric_kwds["n_features"] = self._raw_data.shape[1]
                    self._dist_args = tuple(metric_kwds.values())
                else:
                    raise ValueError(
                        "Metric {} not supported for sparse data".format(
                            metric))

                if verbose:
                    print(ts(), "metric NN descent for", str(n_iters),
                          "iterations")

                self._neighbor_graph = sparse_nnd.sparse_nn_descent(
                    self._raw_data.indices,
                    self._raw_data.indptr,
                    self._raw_data.data,
                    self._raw_data.shape[0],
                    self.n_neighbors,
                    self.rng_state,
                    self.max_candidates,
                    sparse_dist=self._distance_func,
                    dist_args=self._dist_args,
                    n_iters=self.n_iters,
                    rp_tree_init=False,
                    leaf_array=leaf_array,
                    verbose=verbose,
                )

            else:
                self._is_sparse = False

                if verbose:
                    print(ts(), "NN descent for", str(n_iters), "iterations")

                self._neighbor_graph = nn_descent(
                    self._raw_data,
                    self.n_neighbors,
                    self.rng_state,
                    self.max_candidates,
                    self._distance_func,
                    self._dist_args,
                    self.n_iters,
                    self.delta,
                    self.rho,
                    rp_tree_init=True,
                    leaf_array=leaf_array,
                    verbose=verbose,
                    seed_per_row=seed_per_row,
                )
        elif algorithm == "alternative":

            self._is_sparse = False

            if verbose:
                print(ts(), "Using alternative algorithm")

            graph_heap, search_heap = initialize_heaps(
                self._raw_data,
                self.n_neighbors,
                leaf_array,
                self._distance_func,
                self._dist_args,
            )
            graph = lil_matrix((data.shape[0], data.shape[0]))
            graph.rows, graph.data = deheap_sort(graph_heap)
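            # symmetrize the kNN graph (element-wise max with its transpose);
            # the result is CSR, whose indptr/indices feed the search below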
            graph = graph.maximum(graph.transpose())
            self._neighbor_graph = deheap_sort(
                initialized_nnd_search(
                    self._raw_data,
                    graph.indptr,
                    graph.indices,
                    search_heap,
                    self._raw_data,
                    self._distance_func,
                    self._dist_args,
                ))
        else:
            raise ValueError("Unknown algorithm selected")

        if np.any(self._neighbor_graph[0] < 0):
            warn("Failed to correctly find n_neighbors for some samples."
                 "Results may be less than ideal. Try re-running with"
                 "different parameters.")
Example #3
def sparse_nn_descent(
    inds,
    indptr,
    data,
    n_vertices,
    n_neighbors,
    rng_state,
    max_candidates=50,
    dist=sparse.sparse_euclidean,
    n_iters=10,
    delta=0.001,
    rp_tree_init=False,
    leaf_array=None,
    verbose=False,
    n_jobs=None,
    seed_per_row=False,
):

    if rng_state is None:
        rng_state = new_rng_state()

    with joblib.Parallel(prefer="threads", n_jobs=n_jobs) as parallel:

        n_tasks = effective_n_jobs_with_context(n_jobs)
        chunk_size = int(math.ceil(n_vertices / n_tasks))

        current_graph = make_heap(n_vertices, n_neighbors)

        if rp_tree_init:
            sparse_init_rp_tree(
                inds,
                indptr,
                data,
                dist,
                current_graph,
                leaf_array,
                chunk_size,
                parallel,
            )

        sparse_init_random(
            current_graph,
            inds,
            indptr,
            data,
            dist,
            n_neighbors,
            chunk_size,
            rng_state,
            parallel,
            seed_per_row=seed_per_row,
        )

        # store the updates in an array
        # note that the factor here is `n_neighbors * n_neighbors`, not `max_candidates * max_candidates`
        # since no more than `n_neighbors` candidates are added for each row
        max_heap_update_count = chunk_size * n_neighbors * n_neighbors * 4
        heap_updates = np.zeros((n_tasks, max_heap_update_count, 4),
                                dtype=np.float32)
        heap_update_counts = np.zeros((n_tasks, ), dtype=np.int64)

        for n in range(n_iters):
            if verbose:
                print("\t", n, " / ", n_iters)

            (new_candidate_neighbors,
             old_candidate_neighbors) = new_build_candidates(
                 current_graph,
                 n_vertices,
                 n_neighbors,
                 max_candidates,
                 chunk_size,
                 rng_state,
                 parallel,
                 seed_per_row=seed_per_row,
             )

            def nn_descent_map(index):
                rows = chunk_rows(chunk_size, index, n_vertices)
                return (
                    index,
                    sparse_nn_descent_map_jit(
                        rows,
                        max_candidates,
                        inds,
                        indptr,
                        data,
                        new_candidate_neighbors,
                        old_candidate_neighbors,
                        heap_updates[index],
                        offset=0,
                        sparse_dist=dist,
                    ),
                )

            def nn_descent_reduce(index):
                return nn_decent_reduce_jit(n_tasks, current_graph,
                                            heap_updates, offsets, index)

            # run map functions
            for index, count in parallel(
                    parallel_calls(nn_descent_map, n_tasks)):
                heap_update_counts[index] = count

            # sort and chunk heap updates so they can be applied in the reduce
            max_count = heap_update_counts.max()
            offsets = np.zeros((n_tasks, max_count), dtype=np.int64)

            def shuffle(index):
                return shuffle_jit(
                    heap_updates,
                    heap_update_counts,
                    offsets,
                    chunk_size,
                    n_vertices,
                    index,
                )

            parallel(parallel_calls(shuffle, n_tasks))

            # then run reduce functions
            c = 0
            for c_part in parallel(parallel_calls(nn_descent_reduce, n_tasks)):
                c += c_part

            # convergence: stop once updates fall below delta per neighbor slot;
            # note `data` here is the CSR value array, so count vertices instead
            if c <= delta * n_neighbors * n_vertices:
                break

        def deheap_sort_map(index):
            rows = chunk_rows(chunk_size, index, n_vertices)
            return index, deheap_sort_map_jit(rows, current_graph)

        parallel(parallel_calls(deheap_sort_map, n_tasks))
        return current_graph[0].astype(np.int64), current_graph[1]
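
The loop above leans on two small helpers whose behaviour can be inferred from their call sites. The sketches below are assumptions for readability, not the library's own code:

import joblib

def chunk_rows(chunk_size, index, n_vertices):
    # rows handled by task `index`: a half-open range clipped at n_vertices
    return range(chunk_size * index, min(chunk_size * (index + 1), n_vertices))

def parallel_calls(fn, n_tasks):
    # one delayed call per task index, for consumption by joblib.Parallel
    return [joblib.delayed(fn)(i) for i in range(n_tasks)]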