def test_init_rp_tree():
    """Threaded ``init_rp_tree`` must produce the same graph as the serial one.

    Uses more data than the other tests, since otherwise ``init_rp_tree``
    has nothing to do.
    """
    np.random.seed(42)
    n_samples, n_dims = 100, 128
    n_neighbors = 25
    chunk_size = n_samples // 8
    data = np.random.rand(n_samples, n_dims).astype(np.float32)

    def seeded_graph_and_leaves():
        # Both runs go through exactly the same setup, each starting from
        # its own ``new_rng_state()`` value.
        state = new_rng_state()
        graph = pynndescent_.init_current_graph(
            data,
            dist,
            dist_args,
            n_neighbors,
            rng_state=state,
            seed_per_row=True,
        )
        forest = make_forest(data, n_neighbors, n_trees=8, rng_state=state)
        return graph, rptree_leaf_array(forest)

    # Serial reference result.
    serial_graph, serial_leaves = seeded_graph_and_leaves()
    pynndescent_.init_rp_tree(data, dist, dist_args, serial_graph, serial_leaves)

    # Same work, split into chunks and run on a two-thread joblib pool.
    threaded_graph, threaded_leaves = seeded_graph_and_leaves()
    pool = joblib.Parallel(n_jobs=2, prefer="threads")
    threaded.init_rp_tree(
        data,
        dist,
        dist_args,
        threaded_graph,
        threaded_leaves,
        chunk_size,
        pool,
    )

    assert_allclose(threaded_graph, serial_graph)
def _init_pynndescent(self, distances):
    """Create the pynndescent index, seeded with the neighbours in ``distances``.

    Sets ``self._use_pynndescent`` and stores the new ``NNDescent`` object in
    ``self._nnd_idx``. The index's ``_rp_forest`` is then rebuilt explicitly
    with ``make_forest``.

    Parameters
    ----------
    distances
        Object exposing ``shape`` and ``tolil()`` (presumably a scipy sparse
        neighbour-distance matrix; verify against caller). ``np.stack``
        requires every row of ``tolil().rows`` to have the same length.
    """
    from pynndescent import NNDescent

    self._use_pynndescent = True

    # Initial graph: each point's own index first, followed by the column
    # indices stored in its row of ``distances``.
    n_points = distances.shape[0]
    self_column = np.arange(n_points)[:, None]
    neighbor_columns = np.stack(distances.tolil().rows)
    seed_graph = np.hstack((self_column, neighbor_columns))

    index = NNDescent(
        data=self._rep,
        metric=self._metric,
        metric_kwds=self._metric_kwds,
        n_neighbors=self._n_neighbors,
        init_graph=seed_graph,
        random_state=self._neigh_random_state,
    )
    self._nnd_idx = index

    # temporary hack for the broken forest storage
    from pynndescent.rp_trees import make_forest

    forest_random_state = check_random_state(index.random_state)
    index._rp_forest = make_forest(
        index._raw_data,
        index.n_neighbors,
        index.n_search_trees,
        index.leaf_size,
        index.rng_state,
        forest_random_state,
        index.n_jobs,
        index._angular_trees,
    )
def test_init_rp_tree():
    """Threaded ``init_rp_tree`` must fill the heap exactly like the serial one.

    Uses more data than the other tests, since otherwise ``init_rp_tree``
    has nothing to do.
    """
    np.random.seed(42)
    n_samples, n_dims = 100, 128
    n_neighbors = 25
    chunk_size = n_samples // 8
    data = np.random.rand(n_samples, n_dims).astype(np.float32)

    def empty_heap_and_leaves():
        # Identical setup for both runs: a new rng state, a RandomState
        # seeded with 42, an empty heap and an 8-tree forest.
        state = new_rng_state()
        seeded = check_random_state(42)
        heap = utils.make_heap(data.shape[0], n_neighbors)
        forest = make_forest(
            data,
            n_neighbors,
            n_trees=8,
            leaf_size=None,
            rng_state=state,
            random_state=seeded,
        )
        return heap, rptree_leaf_array(forest)

    # Serial reference result.
    serial_graph, serial_leaves = empty_heap_and_leaves()
    pynndescent_.init_rp_tree(data, dist, serial_graph, serial_leaves)

    # Same work, split into chunks and run on a two-thread joblib pool.
    threaded_graph, threaded_leaves = empty_heap_and_leaves()
    pool = joblib.Parallel(n_jobs=2, prefer="threads")
    threaded.init_rp_tree(
        data, dist, threaded_graph, threaded_leaves, chunk_size, pool
    )

    assert_allclose(threaded_graph, serial_graph)
def __init__(
    self,
    data,
    metric="euclidean",
    metric_kwds=None,
    n_neighbors=15,
    n_trees=None,
    leaf_size=None,
    pruning_level=0,
    tree_init=True,
    random_state=np.random,
    algorithm="standard",
    max_candidates=20,
    n_iters=None,
    delta=0.001,
    rho=0.5,
    n_jobs=None,
    seed_per_row=False,
    verbose=False,
):
    """Build the approximate nearest-neighbour graph for ``data``.

    Parameters
    ----------
    data
        Input samples; validated with ``check_array`` as float32, CSR
        sparse input accepted. ``data.shape`` is read before validation,
        so the argument must already expose ``shape``.
    metric
        Either a callable, or a key of ``dist.named_distances``.
    metric_kwds
        Optional mapping whose values (in order) become the extra
        distance arguments. The caller's mapping is not modified.
    n_neighbors
        Number of neighbours handed to the forest and descent routines.
    n_trees
        Number of random-projection trees. ``None`` selects
        ``5 + round(sqrt(n_samples) / 20)``. ``0`` disables tree
        initialisation.
    leaf_size
        Forwarded to ``make_forest``.
    pruning_level
        Stored as ``self.prune_level``; not otherwise used here.
    tree_init
        Whether to build an RP forest to seed the search.
    random_state
        Anything accepted by ``check_random_state``.
    algorithm
        ``"standard"`` or ``"alternative"``. Only ``"standard"`` is
        accepted in parallel mode.
    max_candidates, delta, rho
        Forwarded to the NN-descent routines (the serial sparse routine
        receives only ``max_candidates``).
    n_iters
        Number of descent iterations. ``None`` selects
        ``max(5, round(log2(n_samples)))``.
    n_jobs
        Passed to ``threaded.effective_n_jobs_with_context``; any result
        other than 1 selects the parallel code path.
    seed_per_row
        Forwarded to the dense and parallel NN-descent routines.
    verbose
        When true, progress messages are printed.

    Raises
    ------
    ValueError
        If the metric is not recognised, if the metric is unsupported for
        sparse data, if a non-standard algorithm or sparse input is used
        in parallel mode, or if ``algorithm`` is unknown.
    """
    n_samples = data.shape[0]
    if n_trees is None:
        n_trees = 5 + int(round(n_samples ** 0.5 / 20.0))
    if n_iters is None:
        n_iters = max(5, int(round(np.log2(n_samples))))

    self.n_trees = n_trees
    self.n_neighbors = n_neighbors
    self.metric = metric
    self.metric_kwds = metric_kwds
    self.leaf_size = leaf_size
    self.prune_level = pruning_level
    self.max_candidates = max_candidates
    self.n_iters = n_iters
    self.delta = delta
    self.rho = rho
    self.dim = data.shape[1]
    self.verbose = verbose

    data = check_array(data, dtype=np.float32, accept_sparse="csr")
    self._raw_data = data

    self.tree_init = bool(tree_init) and n_trees != 0

    # Work on a private copy: the sparse branches below may add an
    # "n_features" entry, and that must not leak into the caller's dict.
    metric_kwds = dict(metric_kwds) if metric_kwds else {}
    self._dist_args = tuple(metric_kwds.values())

    self.random_state = check_random_state(random_state)

    if callable(metric):
        self._distance_func = metric
    elif metric in dist.named_distances:
        self._distance_func = dist.named_distances[metric]
    else:
        raise ValueError("Metric is neither callable, nor a recognised string")

    self._angular_trees = metric in ("cosine", "correlation", "dice", "jaccard")

    self.rng_state = self.random_state.randint(INT32_MIN, INT32_MAX, 3).astype(
        np.int64
    )

    if self.tree_init:
        if verbose:
            print(ts(), "Building RP forest with", str(n_trees), "trees")
        self._rp_forest = make_forest(
            data,
            n_neighbors,
            n_trees,
            leaf_size,
            self.rng_state,
            self._angular_trees,
        )
        leaf_array = rptree_leaf_array(self._rp_forest)
    else:
        self._rp_forest = None
        # Single-row placeholder; the serial branch below keys off its shape.
        leaf_array = np.array([[-1]])

    if threaded.effective_n_jobs_with_context(n_jobs) != 1:
        if algorithm != "standard":
            raise ValueError(
                "Algorithm {} not supported in parallel mode".format(algorithm)
            )
        if isspmatrix_csr(self._raw_data):
            raise ValueError(
                "Sparse input is not currently supported in parallel mode"
            )
        if verbose:
            print(ts(), "parallel NN descent for", str(n_iters), "iterations")

        if isspmatrix_csr(self._raw_data):
            # Sparse case.
            # NOTE(review): unreachable while the guard above raises for
            # CSR input.
            self._is_sparse = True
            if metric in sparse.sparse_named_distances:
                self._distance_func = sparse.sparse_named_distances[metric]
                if metric in sparse.sparse_need_n_features:
                    metric_kwds["n_features"] = self._raw_data.shape[1]
                self._dist_args = tuple(metric_kwds.values())
            else:
                raise ValueError(
                    "Metric {} not supported for sparse data".format(metric)
                )
            self._neighbor_graph = sparse_threaded.sparse_nn_descent(
                self._raw_data.indices,
                self._raw_data.indptr,
                self._raw_data.data,
                self._raw_data.shape[0],
                self.n_neighbors,
                self.rng_state,
                self.max_candidates,
                self._distance_func,
                self._dist_args,
                self.n_iters,
                self.delta,
                self.rho,
                rp_tree_init=self.tree_init,
                leaf_array=leaf_array,
                verbose=verbose,
                n_jobs=n_jobs,
                seed_per_row=seed_per_row,
            )
        else:
            # Regular (dense) case.
            self._is_sparse = False
            self._neighbor_graph = threaded.nn_descent(
                self._raw_data,
                self.n_neighbors,
                self.rng_state,
                self.max_candidates,
                self._distance_func,
                self._dist_args,
                self.n_iters,
                self.delta,
                self.rho,
                rp_tree_init=self.tree_init,
                leaf_array=leaf_array,
                verbose=verbose,
                n_jobs=n_jobs,
                seed_per_row=seed_per_row,
            )
    elif algorithm == "standard" or leaf_array.shape[0] == 1:
        # Also taken for any ``algorithm`` value when the leaf array has a
        # single row (e.g. the placeholder used without tree init).
        if isspmatrix_csr(self._raw_data):
            self._is_sparse = True
            if metric in sparse.sparse_named_distances:
                self._distance_func = sparse.sparse_named_distances[metric]
                if metric in sparse.sparse_need_n_features:
                    metric_kwds["n_features"] = self._raw_data.shape[1]
                self._dist_args = tuple(metric_kwds.values())
            else:
                raise ValueError(
                    "Metric {} not supported for sparse data".format(metric)
                )
            if verbose:
                print(ts(), "metric NN descent for", str(n_iters), "iterations")

            # NOTE(review): rp_tree_init is hard-coded to False here and
            # delta/rho/seed_per_row are not forwarded -- confirm intended.
            self._neighbor_graph = sparse_nnd.sparse_nn_descent(
                self._raw_data.indices,
                self._raw_data.indptr,
                self._raw_data.data,
                self._raw_data.shape[0],
                self.n_neighbors,
                self.rng_state,
                self.max_candidates,
                sparse_dist=self._distance_func,
                dist_args=self._dist_args,
                n_iters=self.n_iters,
                rp_tree_init=False,
                leaf_array=leaf_array,
                verbose=verbose,
            )
        else:
            self._is_sparse = False
            if verbose:
                print(ts(), "NN descent for", str(n_iters), "iterations")
            # NOTE(review): rp_tree_init is hard-coded to True even when
            # self.tree_init is False -- confirm intended.
            self._neighbor_graph = nn_descent(
                self._raw_data,
                self.n_neighbors,
                self.rng_state,
                self.max_candidates,
                self._distance_func,
                self._dist_args,
                self.n_iters,
                self.delta,
                self.rho,
                rp_tree_init=True,
                leaf_array=leaf_array,
                verbose=verbose,
                seed_per_row=seed_per_row,
            )
    elif algorithm == "alternative":
        self._is_sparse = False
        if verbose:
            print(ts(), "Using alternative algorithm")

        graph_heap, search_heap = initialize_heaps(
            self._raw_data,
            self.n_neighbors,
            leaf_array,
            self._distance_func,
            self._dist_args,
        )
        # Load the sorted heap into a LIL matrix, then symmetrise it with
        # an element-wise maximum against its transpose.
        graph = lil_matrix((data.shape[0], data.shape[0]))
        graph.rows, graph.data = deheap_sort(graph_heap)
        graph = graph.maximum(graph.transpose())
        self._neighbor_graph = deheap_sort(
            initialized_nnd_search(
                self._raw_data,
                graph.indptr,
                graph.indices,
                search_heap,
                self._raw_data,
                self._distance_func,
                self._dist_args,
            )
        )
    else:
        raise ValueError("Unknown algorithm selected")

    # A negative entry in the first component of the result means some
    # neighbour slot was never filled.
    if np.any(self._neighbor_graph[0] < 0):
        warn(
            "Failed to correctly find n_neighbors for some samples. "
            "Results may be less than ideal. Try re-running with "
            "different parameters."
        )