def query(self, query, k_NN):

    ## Check if the index has already been built
    try:
        _ = self.index.kneighbors()
    except NotFittedError as NFE:
        err_str = "Cannot query the kNN graph because it has not been"
        err_str += " built! (Run kNNIndex.fit first!)"
        ## `args` is a tuple, so prepend our message by reassigning it.
        NFE.args = (err_str + "\n\n" + NFE.args[0],) + NFE.args[1:]
        raise NFE

    if self.verbose:
        timer_str = f"Finding {k_NN} nearest neighbors in an existing kNN"
        timer_str += f" graph using an exact search and the {self.metric}"
        timer_str += " metric..."
        timer = utl.Timer(timer_str, verbose=self.verbose)
        timer.__enter__()

    ## Find the indices and distances to the nearest neighbors of the
    ## queried points
    distances, NN_idx = self.index.kneighbors(query, n_neighbors=k_NN)

    ## Stop the watch
    if self.verbose:
        timer.__exit__()

    ## Return the indices of the nearest neighbors to the queried points
    ## *in the original graph* and the distances to those points.
    return NN_idx[:, :k_NN], distances[:, :k_NN]
def fit(self, X, k_NN):

    if self.verbose:
        timer_str = f"Finding {k_NN} nearest neighbors using an exact"
        timer_str += f" search and the {self.metric} metric..."
        timer = utl.Timer(timer_str, verbose=self.verbose)
        timer.__enter__()

    ## Get the data shape
    self.n_samples, self.n_features = X.shape

    ## Check k_NN
    k_NN = self._check_k(k_NN, self.n_samples)

    ## "Fit" the index of the kNN to the data
    self.index.fit(X)

    ## Find the indices and distances of the k_NN nearest neighbors.
    distances, NN_idx = self.index.kneighbors(n_neighbors=k_NN)

    if self.verbose:
        timer.__exit__()

    self.kNN_idx = NN_idx[:, :]
    self.kNN_dst = distances[:, :]

    ## Return the indices of the nearest neighbors and the distances
    ## to those neighbors.
    return self.kNN_idx.copy(), self.kNN_dst.copy()
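## A minimal usage sketch of the fit-then-query pattern implemented by the
## two methods above.  NOTE: the class name `ExactNNIndex` and its
## constructor arguments are assumptions for illustration, not the
## module's actual API.
import numpy as np

X = np.random.randn(1000, 50)

index = ExactNNIndex(metric='euclidean', verbose=1)  ## Hypothetical name.

## Build the kNN graph on the data...
kNN_idx, kNN_dst = index.fit(X, k_NN=15)

## ...then look up neighbors of new points in the existing graph.
NN_idx, distances = index.query(np.random.randn(10, 50), k_NN=15)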
def query(self, query, k_NN):

    if self.index is None:
        err_str = "Cannot 'query' the kNN graph because it has not been"
        err_str += " constructed! (Run kNNIndex.fit(X, k_NN))"
        raise ValueError(err_str)

    if self.verbose:
        timer_str = f"Finding {k_NN} nearest neighbors to query points in"
        timer_str += " the existing kNN graph using an approximate search"
        timer_str += f" and the '{self.metric}' metric..."
        timer = utl.Timer(timer_str, verbose=self.verbose)
        timer.__enter__()

    ## Check the query shape; if it's a 1-D array, reshape it.
    if query.ndim == 1:
        query = query.copy().reshape(1, -1)
    elif query.ndim != 2:
        err_str = "Input argument 'query' has an invalid shape: expected"
        err_str += f" 2-D array, got {query.shape}."
        raise ValueError(err_str)

    ## Get the number of query points.
    n_query = query.shape[0]

    ## Initialize the NN indices and distances output.
    NN_idx = np.zeros((n_query, k_NN)).astype(int)
    distances = np.zeros((n_query, k_NN))

    ## Define a helper function to get the NNs of a single point.
    def getnns(ii):
        NN_idx_ii, distances_ii = self.index.get_nns_by_vector(
            query[ii], k_NN, include_distances=True)
        NN_idx[ii] = NN_idx_ii[:]
        distances[ii] = distances_ii[:]

    ## Loop over the query points...
    ## ...serially, if only one processor is available...
    if self.n_jobs == 1:
        for ii in range(n_query):
            getnns(ii)
    ## ...or in parallel, if more than one processor is available.
    else:
        from joblib import Parallel, delayed
        Parallel(n_jobs=self.n_jobs, require='sharedmem')(
            delayed(getnns)(ii) for ii in range(n_query))

    if self.verbose:
        timer.__exit__()

    return NN_idx, distances
def spectral_init(P, n_components=2, scaling=_init_scaling, tol=1.e-4,
                  max_iter=None, random_state=None, verbose=1):
    """Generate an initial embedding from a spectral decomposition of the
    kNN graph.

    As noted in openTSNE, this method treats `P` as a hopping probability
    map and then diffuses samples based on their affinities.
    """
    if verbose >= 2:
        timer = utl.Timer("Generating spectral initialization!")
        timer.__enter__()

    P = check_array(P, accept_sparse=True, ensure_2d=True)
    if P.shape[0] != P.shape[1]:
        err_str = "The graph adjacency matrix (affinity matrix, `P`) must"
        raise ValueError(err_str + " be square!")

    ## Get the row sums as a diagonal matrix.
    row_sums = sp.diags(np.ravel(np.sum(P, axis=1)))

    ## Get the leading eigenvectors.
    v0 = np.ones(P.shape[0]) / np.sqrt(P.shape[0])
    evals, evecs = sp.linalg.eigsh(P, M=row_sums, k=n_components + 1,
                                   tol=tol, maxiter=max_iter, which='LM',
                                   v0=v0)

    ## Make sure the eigenvalues are sorted in descending order.
    sort_idx = np.argsort(evals)[::-1]
    evecs = evecs[:, sort_idx]

    ## Multiply the eigenvectors by their eigenvalues.
    evecs *= evals[sort_idx]

    ## Drop the leading (trivial) eigenvector.
    embedding = evecs[:, 1:]

    if verbose >= 2:
        timer.__exit__()

    return rescale(embedding, scaling=scaling)
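## A quick way to exercise `spectral_init` on a toy affinity matrix.  This
## sketch assumes the module-level imports implied above (`numpy` as `np`,
## `scipy.sparse` as `sp`, plus `check_array`, `rescale`, and `utl`).
rng = np.random.default_rng(42)
X = rng.normal(size=(100, 5))

## Build a small, symmetric, normalized Gaussian affinity matrix.
D2 = np.sum((X[:, None] - X[None, :])**2, axis=-1)
P = np.exp(-D2)
np.fill_diagonal(P, 0)
P = sp.csr_matrix(P / P.sum())

Y_init = spectral_init(P, n_components=2)
print(Y_init.shape)  ## (100, 2)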
def pca_init(X, n_components=2, scaling=_init_scaling, random_state=None,
             verbose=1):

    if verbose >= 2:
        timer = utl.Timer("Generating PCA initialization!")
        timer.__enter__()

    pca = PCA(n_components=n_components, random_state=random_state)
    Y = pca.fit_transform(X)

    if verbose >= 2:
        timer.__exit__()

    return rescale(Y, scaling=scaling)
def kernel(self, kNNIndex):

    timer_str = "Calculating fixed-entropy Gaussian affinity matrix!"
    timer = utl.Timer(timer_str, verbose=self.verbose)
    timer.__enter__()

    ## Keep only the parameters that `fit_Gaussian_toEntropy` accepts.
    tmp_kernel_params = self.kernel_params.copy()
    good_keys = ['perp_tol', 'max_iter', 'tau_init', 'tau_min', 'tau_max']
    tmp_kernel_params = {k: tmp_kernel_params[k] for k in good_keys}

    k_NN = self.n_neighbors
    P, taus, rows = fit_Gaussian_toEntropy(kNNIndex.kNN_dst[:, :k_NN],
                                           self._perp_arr,
                                           **tmp_kernel_params)
    timer.__exit__()

    self.kernel_params['precisions'] = taus.copy()
    self.kernel_params['row_sums'] = rows.copy()

    return P
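## `fit_Gaussian_toEntropy` is not shown here.  For intuition only, the
## fixed-entropy calibration it performs looks roughly like the per-row
## bisection below: find the precision `tau` at which the entropy of the
## row's Gaussian affinities equals log(perplexity).  This is a simplified
## sketch, not the actual implementation (which also honors `tau_init`,
## `tau_min`, `tau_max`, and `perp_tol`).
import numpy as np

def _calibrate_row(dists, perplexity, tol=1.e-8, max_iter=200):
    target = np.log(perplexity)
    tau, tau_lo, tau_hi = 1., 0., np.inf
    for _ in range(max_iter):
        W = np.exp(-dists**2 * tau / 2)
        row_sum = W.sum()
        ## Shannon entropy of the normalized affinities.
        H = np.log(row_sum) + tau * np.sum(dists**2 * W) / (2 * row_sum)
        if abs(H - target) < tol:
            break
        if H > target:  ## Entropy too high: sharpen the kernel...
            tau_lo, tau = tau, (2 * tau if np.isinf(tau_hi)
                                else (tau + tau_hi) / 2)
        else:           ## ...entropy too low: widen it.
            tau_hi, tau = tau, (tau + tau_lo) / 2
    return W / row_sum, tau, row_sum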
def query(self, query, k_NN):

    if self.index is None:
        err_str = "Cannot 'query' the kNN graph because it has not been"
        err_str += " constructed! (Run kNNIndex.fit(X, k_NN))"
        raise ValueError(err_str)

    if self.verbose:
        timer_str = f"Finding {k_NN} approximate nearest neighbors to"
        timer_str += " query points in the existing NN graph using"
        timer_str += f" `pynndescent` and the '{self.metric}' metric..."
        timer = utl.Timer(timer_str, verbose=self.verbose)
        timer.__enter__()

    NN_idx, distances = self.index.query(query, k=k_NN)

    if self.verbose:
        timer.__exit__()

    return NN_idx, distances
def fit(self, X, k_NN):

    if self.verbose:
        timer_str = f"Finding {k_NN} approximate nearest neighbors using"
        timer_str += f" NNDescent and the '{self.metric}' metric..."
        timer = utl.Timer(timer_str, verbose=self.verbose)
        timer.__enter__()

    ## Get the data shape
    self.n_samples, self.n_features = X.shape

    k_NN = self._check_k(k_NN, self.n_samples)

    ## > These values were taken from UMAP, which we assume to be sensible
    ## > defaults [because the UMAP and pynndescent authors are the same.]
    ## - Pavlin Policar
    if self.n_trees is None:
        self.n_trees = 5 + int(round((self.n_samples**0.5) / 20))
    if self.n_iters is None:
        self.n_iters = max(5, int(round(np.log2(self.n_samples))))

    ## If `k_NN` > 15, use just the first 15 NNs to build the approximate
    ## NN graph, then use query() to get the rest of the desired neighbors.
    if k_NN <= 15:
        k_build = k_NN + 1
    else:
        k_build = 15

    import pynndescent
    self.index = pynndescent.NNDescent(X,
                                       n_neighbors=k_build,
                                       metric=self.metric,
                                       metric_kwds=self.metric_params,
                                       random_state=self.random_state,
                                       n_trees=self.n_trees,
                                       n_iters=self.n_iters,
                                       n_jobs=self.n_jobs,
                                       verbose=self.verbose,
                                       **self.pynnd_kws)

    ## If k_NN <= 15, we're in the clear!
    NN_idx, distances = self.index.neighbor_graph

    ## ... Except when pynndescent fails; then it puts a -1 in the index.
    n_failures = np.sum(NN_idx == -1)

    ## If k_NN > 15, use query() to get the indices and distances.
    if k_NN > 15:
        self.index.prepare()
        NN_idx, distances = self.index.query(X, k=k_NN + 1)

    ## If pynndescent failed to find neighbors for some points, raise an
    ## error.
    if n_failures > 0:
        err_str = "ERROR: `pynndescent` failed to find neighbors for all"
        err_str += " points in the data."

        if self.verbose >= 4:
            print_opts = np.get_printoptions()
            np.set_printoptions(threshold=np.inf)

            err_str += " The indices of the failed points are:"
            err_str += f"\n{np.where(np.sum(NN_idx == -1, axis=1))[0]}"

            np.set_printoptions(**print_opts)
        else:
            err_str += " Set verbose >= 4 to see the indices of the"
            err_str += " failed points."

        raise ValueError(err_str)

    if self.verbose:
        timer.__exit__()

    ## The first neighbor is the query point itself, so drop it.
    self.kNN_idx = NN_idx[:, 1:]
    self.kNN_dst = distances[:, 1:]

    ## Return the indices of the nearest neighbors and the distances
    ## to those neighbors.
    return self.kNN_idx.copy(), self.kNN_dst.copy()
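## For concreteness, the UMAP-derived defaults above evaluate to the
## following at a typical scale:
n_samples = 10000
n_trees = 5 + int(round((n_samples**0.5) / 20))   ## = 10
n_iters = max(5, int(round(np.log2(n_samples))))  ## = 13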
def fit(self, X, k_NN):

    if self.verbose:
        timer_str = f"Finding {k_NN} nearest neighbors using an"
        timer_str += f" approximate search and the {self.metric} metric..."
        timer = utl.Timer(timer_str, verbose=self.verbose)
        timer.__enter__()

    X = check_array(X, accept_sparse=True, ensure_2d=True)

    ## Get the data shape
    self.n_samples, self.n_features = X.shape

    ## Initialize the tree.
    self.index = self._initialize_ANNOY_index(self.n_features)

    ## Set the random seed.
    ## ANNOY uses only a 32-bit integer as a random seed.
    seed = self.random_state.get_state()[1][0] % (2**31)
    self.index.set_seed(seed)

    ## Add the data to the tree.
    for ii in range(self.n_samples):
        self.index.add_item(ii, X[ii])

    ## Build the requested number of trees.  Default is 50.
    self.index.build(self.n_trees)

    ## Initialize the output: NN indices and distances.
    NN_idx = np.zeros((self.n_samples, k_NN)).astype(int)
    distances = np.zeros((self.n_samples, k_NN))

    ## Define a helper function to get the NNs of a single point.
    def getnns(ii):
        ## Annoy returns the query point as the first element, so we ask
        ## for k_NN + 1 neighbors.
        [aa, bb] = self.index.get_nns_by_item(ii, k_NN + 1,
                                              include_distances=True)
        ## Don't save the closest neighbor (the query point itself).
        NN_idx[ii] = aa[1:]
        distances[ii] = bb[1:]

    if self.n_jobs == 1:
        for ii in range(self.n_samples):
            getnns(ii)
    else:
        from joblib import Parallel, delayed
        Parallel(n_jobs=self.n_jobs, require="sharedmem")(
            delayed(getnns)(ii) for ii in range(self.n_samples))

    if self.verbose:
        timer.__exit__()

    self.kNN_idx = NN_idx[:, :]
    self.kNN_dst = distances[:, :]

    ## Return the indices of the nearest neighbors and the distances
    ## to those neighbors.
    return self.kNN_idx.copy(), self.kNN_dst.copy()
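## The seed derivation in `fit` deserves a note: `get_state()[1]` is the
## Mersenne Twister key array of a numpy RandomState, so its first word,
## folded into the signed 32-bit range, gives Annoy a deterministic seed.
## For example:
import numpy as np

random_state = np.random.RandomState(12345)
seed = random_state.get_state()[1][0] % (2**31)  ## Always < 2**31.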
def optimize(self, P, n_iter, exaggeration, momentum):

    ## If there is any exaggeration, modify `P`.
    if exaggeration != 1:
        P *= exaggeration

    ## Initialize the gradient and gains arrays.
    self._gradient = np.zeros_like(self.embedding, dtype=np.float64,
                                   order='C')
    if self._gains is None:
        self._gains = np.ones_like(self.embedding, dtype=np.float64,
                                   order='C')

    ## Initialize the update array to look like the gradient.
    update = np.zeros_like(self._gradient)

    ## Callbacks can have an initialization method; call that here.
    do_callbacks = True
    if isinstance(self._callbacks, Iterable):
        for cb in self._callbacks:
            getattr(cb, "optimization_about_to_start", lambda: ...)()
    else:
        do_callbacks = False

    ## Start the timer if we're worried about printing information.
    if self.verbose >= 1:
        timer_str = f"Fitting t-SNE for up to {n_iter} iterations with"
        timer_str += f" exaggeration = {exaggeration:.1f} and learning"
        timer_str += f" rate = {self.learning_rate:.1f}."
        timer = utl.Timer(timer_str)
        timer.__enter__()
        start_time = time()

    ## START THE LOOP!
    for ii in range(n_iter):

        ## Determine whether to do callbacks in this iteration.
        do_cbs_now = do_callbacks and \
            ((ii + 1) % self.iter_per_callback == 0)

        ## Determine whether to calculate the error (D_KL) in this iter.
        calc_error = do_cbs_now or ((ii + 1) % self.iter_per_log == 0)

        ## Calculate the gradient and error.
        if self.neg_grad_method.lower() in ['bh', 'barnes-hut']:
            d_kl = self._fit_bh(P, return_DKL=calc_error)
        elif self.neg_grad_method.lower() in ['fft', 'fit-sne', 'fitsne']:
            d_kl = self._fit_fft(P, return_DKL=calc_error)
        else:
            err_str = "Currently, only Barnes-Hut and FIt-SNE methods"
            err_str += " are supported for calculating t-SNE gradients."
            err_str += " (`neg_grad_method` = 'BH' or 'FIt-SNE'.)"
            raise ValueError(err_str)

        ## If we are applying exaggeration, adjust the error (D_KL).
        if calc_error:
            if exaggeration != 1:
                d_kl = d_kl / exaggeration - np.log(exaggeration)
            self._errors.append([ii, d_kl])

        ## To avoid samples flying off, we clip the gradient.
        if self.max_grad_norm is not None:
            norm = np.linalg.norm(self._gradient, axis=1)
            coeff = self.max_grad_norm / (norm + 1.e-6)
            mask = coeff < 1  ## Anywhere that the norm > max_grad_norm...
            self._gradient[mask] *= coeff[mask, None]

        ## If it's a callback iteration...
        if do_cbs_now:
            ## ...do all the callbacks.
            cb_out = np.array([cb(ii + 1, d_kl, self.embedding)
                               for cb in self._callbacks]).astype(bool)

            ## If any of the callbacks say to stop...
            if np.any(cb_out):
                ## ...fix the affinity matrix...
                if exaggeration != 1:
                    P /= exaggeration
                ## ...report the runtime...
                if self.verbose >= 1:
                    timer.__exit__()
                ## ...and quit the loop!
                raise OptimizationInterrupt(error=d_kl,
                                            final_embedding=self.embedding)

        ## Get where the last update and current gradient have different
        ## signs, and adjust the gains accordingly.
        grad_dir_flip = np.sign(update) != np.sign(self._gradient)
        grad_dir_same = np.invert(grad_dir_flip)
        self._gains[grad_dir_flip] += 0.2
        self._gains[grad_dir_same] = self._gains[grad_dir_same] * 0.8
        self._gains[grad_dir_same] += 0.01  ## Minimum gain

        ## Get the update.
        update = momentum * update
        update -= self.learning_rate * self._gains * self._gradient

        ## To avoid samples flying off, we clip the update.
        if self.max_step_norm is not None:
            update_norm = np.linalg.norm(update, axis=1, keepdims=True)
            mask = update_norm.squeeze() > self.max_step_norm
            update[mask] /= update_norm[mask]
            update[mask] *= self.max_step_norm

        ## Update the embedding!
        self.embedding += update

        ## Recenter the embedding.
        self.embedding -= np.mean(self.embedding, axis=0)

        ## Display progress!
        if (self.verbose >= 1) and ((ii + 1) % self.iter_per_log == 0):
            stop_time = time()
            dt = stop_time - start_time
            print(f"Itr {ii + 1:4d}, DKL {d_kl:6.4f},\t"
                  f"{self.iter_per_log} iterations in {dt:.4f} sec")
            start_time = time()

    if self.verbose >= 1:
        timer.__exit__()

    ## Before returning, fix the affinity matrix for future optimizations.
    if exaggeration != 1:
        P /= exaggeration

    ## We also need to calculate the error one more time for the final
    ## iteration.
    d_kl = self._fit_bh(P, return_DKL=True)
    self._errors.append([ii, d_kl])

    return
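## A plausible calling pattern for `optimize` is the standard two-phase
## t-SNE schedule.  NOTE: the `tsne` object and the specific iteration /
## exaggeration / momentum values below are illustrative assumptions, not
## defaults taken from this module.
try:
    ## Early-exaggeration phase: inflate P so clusters form quickly...
    tsne.optimize(P, n_iter=250, exaggeration=12., momentum=0.5)
    ## ...then relax to the true affinities for the final layout.
    tsne.optimize(P, n_iter=750, exaggeration=1., momentum=0.8)
except OptimizationInterrupt as OI:
    ## A callback requested early termination; keep its embedding.
    embedding = OI.final_embedding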