예제 #1
0
def test_init_rp_tree():
    """Check that threaded ``init_rp_tree`` agrees with the sequential version.

    Two graphs are initialised the same way (fresh ``new_rng_state()``,
    ``seed_per_row=True``, a forest of 8 trees); one is updated by
    ``pynndescent_.init_rp_tree`` and the other by ``threaded.init_rp_tree``
    running on two threads. The results are compared with ``assert_allclose``.
    """
    # A bigger data set than the other tests use is required here; otherwise
    # init_rp_tree has nothing to do.
    np.random.seed(42)
    n_samples, n_dims = 100, 128
    n_neighbors = 25
    chunk_size = n_samples // 8
    data = np.random.rand(n_samples, n_dims).astype(np.float32)

    def seeded_graph_and_leaves():
        # Shared setup: one rng state feeds both the graph init and the forest.
        state = new_rng_state()
        graph = pynndescent_.init_current_graph(
            data, dist, dist_args, n_neighbors, rng_state=state, seed_per_row=True
        )
        forest = make_forest(data, n_neighbors, n_trees=8, rng_state=state)
        return graph, rptree_leaf_array(forest)

    # Sequential reference.
    current_graph, leaves = seeded_graph_and_leaves()
    pynndescent_.init_rp_tree(data, dist, dist_args, current_graph, leaves)

    # Threaded run built from an identically constructed starting point.
    current_graph_threaded, leaves = seeded_graph_and_leaves()
    thread_pool = joblib.Parallel(n_jobs=2, prefer="threads")
    threaded.init_rp_tree(
        data,
        dist,
        dist_args,
        current_graph_threaded,
        leaves,
        chunk_size,
        thread_pool,
    )

    assert_allclose(current_graph_threaded, current_graph)
예제 #2
0
    def _init_pynndescent(self, distances):
        """Create ``self._nnd_idx`` (an ``NNDescent`` index) seeded from ``distances``.

        The initial graph has one row per row of ``distances``: the row's own
        index followed by the column indices stored in that row.
        ``np.stack`` requires every row to hold the same number of entries.

        Parameters
        ----------
        distances
            Object exposing ``.shape`` and ``.tolil()`` -- presumably a scipy
            sparse distance matrix (TODO confirm against callers).
        """
        from pynndescent import NNDescent

        self._use_pynndescent = True

        # Column of each sample's own index, prepended to its stored neighbors.
        n_rows = distances.shape[0]
        self_column = np.arange(n_rows)[:, None]
        neighbor_columns = np.stack(distances.tolil().rows)
        init_indices = np.hstack((self_column, neighbor_columns))

        index = NNDescent(
            data=self._rep,
            metric=self._metric,
            metric_kwds=self._metric_kwds,
            n_neighbors=self._n_neighbors,
            init_graph=init_indices,
            random_state=self._neigh_random_state,
        )
        self._nnd_idx = index

        # temporary hack for the broken forest storage
        from pynndescent.rp_trees import make_forest

        # Rebuild the forest from the index's own settings and attach it.
        current_random_state = check_random_state(index.random_state)
        rebuilt_forest = make_forest(
            index._raw_data,
            index.n_neighbors,
            index.n_search_trees,
            index.leaf_size,
            index.rng_state,
            current_random_state,
            index.n_jobs,
            index._angular_trees,
        )
        index._rp_forest = rebuilt_forest
예제 #3
0
def test_init_rp_tree():
    """Check that threaded ``init_rp_tree`` agrees with the sequential version.

    Each run starts from an empty heap (``utils.make_heap``) and a forest of
    8 trees built with a fresh ``new_rng_state()`` and
    ``check_random_state(42)``. One heap is filled by
    ``pynndescent_.init_rp_tree``, the other by ``threaded.init_rp_tree`` on
    two threads, and the results are compared with ``assert_allclose``.
    """
    # A bigger data set than the other tests use is required here; otherwise
    # init_rp_tree has nothing to do.
    np.random.seed(42)
    n_samples, n_dims = 100, 128
    n_neighbors = 25
    chunk_size = n_samples // 8
    data = np.random.rand(n_samples, n_dims).astype(np.float32)

    def empty_heap_and_leaves():
        # Shared setup; the call order matches the two original sequences.
        state = new_rng_state()
        seeded_random = check_random_state(42)
        heap = utils.make_heap(data.shape[0], n_neighbors)
        forest = make_forest(
            data,
            n_neighbors,
            n_trees=8,
            leaf_size=None,
            rng_state=state,
            random_state=seeded_random,
        )
        return heap, rptree_leaf_array(forest)

    # Sequential reference.
    current_graph, leaves = empty_heap_and_leaves()
    pynndescent_.init_rp_tree(data, dist, current_graph, leaves)

    # Threaded run built from an identically constructed starting point.
    current_graph_threaded, leaves = empty_heap_and_leaves()
    thread_pool = joblib.Parallel(n_jobs=2, prefer="threads")
    threaded.init_rp_tree(
        data, dist, current_graph_threaded, leaves, chunk_size, thread_pool
    )

    assert_allclose(current_graph_threaded, current_graph)
예제 #4
0
    def __init__(
        self,
        data,
        metric="euclidean",
        metric_kwds=None,
        n_neighbors=15,
        n_trees=None,
        leaf_size=None,
        pruning_level=0,
        tree_init=True,
        random_state=np.random,
        algorithm="standard",
        max_candidates=20,
        n_iters=None,
        delta=0.001,
        rho=0.5,
        n_jobs=None,
        seed_per_row=False,
        verbose=False,
    ):
        """Validate the input and build the neighbor graph for ``data``.

        The constructor optionally builds a random projection forest, then
        runs one of the descent routines (parallel, sequential dense,
        sequential sparse, or the "alternative" heap-initialised search) and
        stores the result in ``self._neighbor_graph``.

        Parameters
        ----------
        data : array-like or sparse matrix
            Validated with ``check_array(dtype=np.float32,
            accept_sparse="csr")`` and stored as ``self._raw_data``.
        metric : str or callable
            A callable is used as the distance function directly; a string
            must be a key of ``dist.named_distances`` (and, for sparse input,
            of ``sparse.sparse_named_distances``).
        metric_kwds : dict or None
            Extra metric arguments. Only the values are used, passed as a
            tuple in the dict's iteration order.
        n_neighbors : int
            Stored and forwarded to the forest builder and descent routines.
        n_trees : int or None
            Number of trees in the forest. ``None`` selects
            ``5 + round(sqrt(n_samples) / 20)``; ``0`` disables tree
            initialisation.
        leaf_size
            Forwarded to ``make_forest``.
        pruning_level
            Stored as ``self.prune_level``; not otherwise used here.
        tree_init : bool
            When falsy, no forest is built.
        random_state
            Anything accepted by ``check_random_state``.
        algorithm : {"standard", "alternative"}
            Which construction strategy to use in sequential mode.
        max_candidates, delta, rho
            Forwarded to the descent routines (``delta`` and ``rho`` are not
            passed to the sequential sparse routine).
        n_iters : int or None
            ``None`` selects ``max(5, round(log2(n_samples)))``.
        n_jobs
            Passed to ``threaded.effective_n_jobs_with_context``; any result
            other than 1 selects the parallel code path.
        seed_per_row : bool
            Forwarded to the dense and parallel descent routines.
        verbose : bool
            Print progress messages.

        Raises
        ------
        ValueError
            If ``metric`` is neither callable nor a known name, if the metric
            is unsupported for sparse data, if parallel mode is requested with
            a non-"standard" algorithm or sparse input, or if ``algorithm`` is
            unknown.
        """
        # Validate before touching ``data.shape`` so that list-like input,
        # which ``check_array`` accepts, does not fail on attribute access.
        data = check_array(data, dtype=np.float32, accept_sparse="csr")
        self._raw_data = data

        if n_trees is None:
            n_trees = 5 + int(round((data.shape[0]) ** 0.5 / 20.0))
        if n_iters is None:
            n_iters = max(5, int(round(np.log2(data.shape[0]))))

        self.n_trees = n_trees
        self.n_neighbors = n_neighbors
        self.metric = metric
        self.metric_kwds = metric_kwds
        self.leaf_size = leaf_size
        self.prune_level = pruning_level
        self.max_candidates = max_candidates
        self.n_iters = n_iters
        self.delta = delta
        self.rho = rho
        self.dim = data.shape[1]
        self.verbose = verbose

        # Tree initialisation needs both the flag and at least one tree.
        self.tree_init = bool(tree_init and n_trees != 0)

        # NOTE(review): when the caller passes a non-empty dict it is used
        # as-is, so the sparse branches below add "n_features" to the caller's
        # own dict. Confirm whether that is intended.
        metric_kwds = metric_kwds or {}
        self._dist_args = tuple(metric_kwds.values())

        self.random_state = check_random_state(random_state)

        if callable(metric):
            self._distance_func = metric
        elif metric in dist.named_distances:
            self._distance_func = dist.named_distances[metric]
        else:
            raise ValueError("Metric is neither callable, " +
                             "nor a recognised string")

        self._angular_trees = metric in (
            "cosine", "correlation", "dice", "jaccard")

        self.rng_state = self.random_state.randint(INT32_MIN, INT32_MAX,
                                                   3).astype(np.int64)

        if self.tree_init:
            if verbose:
                print(ts(), "Building RP forest with", str(n_trees), "trees")
            self._rp_forest = make_forest(
                data,
                n_neighbors,
                n_trees,
                leaf_size,
                self.rng_state,
                self._angular_trees,
            )
            leaf_array = rptree_leaf_array(self._rp_forest)
        else:
            # Placeholder with a single row; the sequential branch below uses
            # ``leaf_array.shape[0] == 1`` to fall back to standard descent.
            self._rp_forest = None
            leaf_array = np.array([[-1]])

        if threaded.effective_n_jobs_with_context(n_jobs) != 1:
            if algorithm != "standard":
                raise ValueError(
                    "Algorithm {} not supported in parallel mode".format(
                        algorithm))
            if isspmatrix_csr(self._raw_data):
                raise ValueError(
                    "Sparse input is not currently supported in parallel mode")
            if verbose:
                print(ts(), "parallel NN descent for", str(n_iters),
                      "iterations")

            # NOTE(review): the sparse branch below is unreachable, because
            # the check above already raises for CSR input. Either the guard
            # or this branch is stale; left unchanged pending confirmation.
            if isspmatrix_csr(self._raw_data):
                # Sparse case
                self._is_sparse = True
                if metric in sparse.sparse_named_distances:
                    self._distance_func = sparse.sparse_named_distances[metric]
                    if metric in sparse.sparse_need_n_features:
                        metric_kwds["n_features"] = self._raw_data.shape[1]
                    self._dist_args = tuple(metric_kwds.values())
                else:
                    raise ValueError(
                        "Metric {} not supported for sparse data".format(
                            metric))
                self._neighbor_graph = sparse_threaded.sparse_nn_descent(
                    self._raw_data.indices,
                    self._raw_data.indptr,
                    self._raw_data.data,
                    self._raw_data.shape[0],
                    self.n_neighbors,
                    self.rng_state,
                    self.max_candidates,
                    self._distance_func,
                    self._dist_args,
                    self.n_iters,
                    self.delta,
                    self.rho,
                    rp_tree_init=self.tree_init,
                    leaf_array=leaf_array,
                    verbose=verbose,
                    n_jobs=n_jobs,
                    seed_per_row=seed_per_row,
                )
            else:
                # Regular case
                self._is_sparse = False
                self._neighbor_graph = threaded.nn_descent(
                    self._raw_data,
                    self.n_neighbors,
                    self.rng_state,
                    self.max_candidates,
                    self._distance_func,
                    self._dist_args,
                    self.n_iters,
                    self.delta,
                    self.rho,
                    rp_tree_init=self.tree_init,
                    leaf_array=leaf_array,
                    verbose=verbose,
                    n_jobs=n_jobs,
                    seed_per_row=seed_per_row,
                )
        elif algorithm == "standard" or leaf_array.shape[0] == 1:
            # A single-row leaf array (e.g. the placeholder used when tree
            # initialisation is off) routes every algorithm choice here.
            if isspmatrix_csr(self._raw_data):

                self._is_sparse = True

                if metric in sparse.sparse_named_distances:
                    self._distance_func = sparse.sparse_named_distances[metric]
                    if metric in sparse.sparse_need_n_features:
                        metric_kwds["n_features"] = self._raw_data.shape[1]
                    self._dist_args = tuple(metric_kwds.values())
                else:
                    raise ValueError(
                        "Metric {} not supported for sparse data".format(
                            metric))

                if verbose:
                    print(ts(), "metric NN descent for", str(n_iters),
                          "iterations")

                self._neighbor_graph = sparse_nnd.sparse_nn_descent(
                    self._raw_data.indices,
                    self._raw_data.indptr,
                    self._raw_data.data,
                    self._raw_data.shape[0],
                    self.n_neighbors,
                    self.rng_state,
                    self.max_candidates,
                    sparse_dist=self._distance_func,
                    dist_args=self._dist_args,
                    n_iters=self.n_iters,
                    rp_tree_init=False,
                    leaf_array=leaf_array,
                    verbose=verbose,
                )

            else:

                self._is_sparse = False

                if verbose:
                    print(ts(), "NN descent for", str(n_iters), "iterations")

                # NOTE(review): rp_tree_init is hard-coded to True here even
                # when self.tree_init is False; presumably the placeholder
                # leaf_array makes that harmless -- confirm.
                self._neighbor_graph = nn_descent(
                    self._raw_data,
                    self.n_neighbors,
                    self.rng_state,
                    self.max_candidates,
                    self._distance_func,
                    self._dist_args,
                    self.n_iters,
                    self.delta,
                    self.rho,
                    rp_tree_init=True,
                    leaf_array=leaf_array,
                    verbose=verbose,
                    seed_per_row=seed_per_row,
                )
        elif algorithm == "alternative":

            self._is_sparse = False

            if verbose:
                print(ts(), "Using alternative algorithm")

            graph_heap, search_heap = initialize_heaps(
                self._raw_data,
                self.n_neighbors,
                leaf_array,
                self._distance_func,
                self._dist_args,
            )
            # Build a symmetric graph from the sorted heap, then search the
            # data against itself starting from ``search_heap``.
            graph = lil_matrix((data.shape[0], data.shape[0]))
            graph.rows, graph.data = deheap_sort(graph_heap)
            graph = graph.maximum(graph.transpose())
            self._neighbor_graph = deheap_sort(
                initialized_nnd_search(
                    self._raw_data,
                    graph.indptr,
                    graph.indices,
                    search_heap,
                    self._raw_data,
                    self._distance_func,
                    self._dist_args,
                ))
        else:
            raise ValueError("Unknown algorithm selected")

        # Negative entries in the first element of the result are treated as
        # unfilled neighbor slots (presumably index -1 -- confirm).
        if np.any(self._neighbor_graph[0] < 0):
            # Trailing spaces keep the concatenated sentences separated.
            warn("Failed to correctly find n_neighbors for some samples. "
                 "Results may be less than ideal. Try re-running with "
                 "different parameters.")