Example #1
    def query(self, query, k_NN):

        ## Check if the index has already been built
        try:
            _ = self.index.kneighbors()
        except NotFittedError as NFE:
            err_str = f"Cannot query the kNN graph because it has not been"
            err_str += f" built! (Run kNNIndex.fit first!)"
            NFE.args[0] = err_str + "\n\n" + NFE.args[0]
            raise NFE

        if self.verbose:
            timer_str = f"Finding {k_NN} nearest neighbors in an existing kNN"
            timer_str += f" graph using an exact search and the {self.metric}"
            timer_str += f" metric..."
            timer = utl.Timer(timer_str, verbose=self.verbose)
            timer.__enter__()

        ## Find the indices and distances to the nearest neighbors of the
        ## queried points
        distances, NN_idx = self.index.kneighbors(query, n_neighbors=k_NN)

        ## Stop the watch
        if self.verbose:
            timer.__exit__()

        ## Return the indices of the nearest neighbors to the queried points
        ## *in the original graph* and the distances to those points.
        return NN_idx[:, :k_NN], distances[:, :k_NN]
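
A minimal, runnable sketch of the underlying scikit-learn pattern this method wraps (the data and parameter values here are illustrative, not from the original code):

import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.random.rand(100, 8)                      ## Reference data
index = NearestNeighbors(metric="euclidean").fit(X)

## kneighbors() on new points returns (distances, indices), each of shape
## (n_query, k) -- the same (distances, NN_idx) order used above.
query_pts = np.random.rand(5, 8)
distances, NN_idx = index.kneighbors(query_pts, n_neighbors=10)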
Example #2
    def fit(self, X, k_NN):

        if self.verbose:
            timer_str = f"Finding {k_NN} nearest neighbors using an exact"
            timer_str += f" search and the {self.metric} metric..."
            timer = utl.Timer(timer_str, verbose=self.verbose)
            timer.__enter__()

        ## Get the data shape
        self.n_samples, self.n_features = X.shape[0], X.shape[1]

        ## Check k_NN
        k_NN = self._check_k(k_NN, self.n_samples)

        ## "Fit" the indices of the kNN to data
        self.index.fit(X)

        ## Get the indices and distances of the k_NN nearest neighbors.
        distances, NN_idx = self.index.kneighbors(n_neighbors=k_NN)

        if self.verbose:
            timer.__exit__()

        self.kNN_idx = NN_idx[:, :]
        self.kNN_dst = distances[:, :]

        ## Return the indices of the nearest neighbors and the distances
        ## to those neighbors.
        return self.kNN_idx.copy(), self.kNN_dst.copy()
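
Note that `self.index.kneighbors(n_neighbors=k_NN)` is called without a query array: scikit-learn then returns neighbors of the training points themselves, excluding each point from its own neighbor list. A small sketch of that behavior (illustrative values):

import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.random.rand(100, 8)
nn = NearestNeighbors(metric="euclidean").fit(X)

## With no query array, point i is excluded from its own neighbor list,
## so no column of zero self-distances appears.
distances, NN_idx = nn.kneighbors(n_neighbors=5)
assert not np.any(NN_idx == np.arange(100)[:, None])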
Example #3
    def query(self, query, k_NN):

        if self.index is None:
            err_str = f"Cannot 'query' the kNN graph because it has not been"
            err_str += f" constructed!  (Run kNNIndex.fit(X, k_NN))"
            raise ValueError(err_str)

        if self.verbose:
            timer_str = f"Finding {k_NN} nearest neighbors to query points in"
            timer_str += f" existing kNN graph using an approximate search and"
            timer_str += f" the '{self.metric}'' metric..."
            timer = utl.Timer(timer_str, verbose=self.verbose)
            timer.__enter__()

        ## Check the query shape; reshape 1-D arrays to 2-D.
        if query.ndim == 1:
            query = query.copy().reshape(1, -1)
        elif query.ndim != 2:
            err_str = f"Input argument 'query' has an invalid shape: expected"
            err_str += f"2-D array, got {query.shape}."
            raise ValueError(err_str)

        ## Get number of query points.
        n_query = query.shape[0]

        ## Initialize NN indices and distances output
        NN_idx = np.zeros((n_query, k_NN)).astype(int)
        distances = np.zeros((n_query, k_NN))

        ## Define helper function to get NNs
        def getnns(ii):
            NN_idx_ii, distances_ii = self.index.get_nns_by_vector(
                query[ii], k_NN, include_distances=True)

            NN_idx[ii] = NN_idx_ii[:]
            distances[ii] = distances_ii[:]

        ## Loop over query points
        ## If only one processor...
        if self.n_jobs == 1:
            for ii in range(n_query):
                getnns(ii)
        ## If more than one processor...
        else:
            from joblib import Parallel, delayed

            Parallel(n_jobs=self.n_jobs,
                     require='sharedmem')(delayed(getnns)(ii)
                                          for ii in range(n_query))

        if self.verbose:
            timer.__exit__()

        return NN_idx, distances
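
The `require='sharedmem'` argument is what allows the workers to fill the preallocated NN_idx and distances arrays in place. The same pattern in a standalone, runnable form (illustrative function and sizes):

import numpy as np
from joblib import Parallel, delayed

out = np.zeros(1000)

def fill(ii):
    ## Each call writes to a distinct slot, so no locking is needed.
    out[ii] = ii ** 0.5

## require='sharedmem' forces a thread-based backend, so every worker
## sees the same `out` array rather than a per-process copy.
Parallel(n_jobs=4, require="sharedmem")(delayed(fill)(ii)
                                        for ii in range(1000))
assert out[4] == 2.0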
Example #4
def spectral_init(P,
                  n_components=2,
                  scaling=_init_scaling,
                  tol=1.e-4,
                  max_iter=None,
                  random_state=None,
                  verbose=1):
    """Generate an initial embedding based on spectral decomp of the kNN graph.

    As noted in openTSNE, this method treats `P` as a hopping probability map
    and then diffuses samples based on their affinities.

    """

    if verbose >= 2:
        timer = utl.Timer("Generating spectral initialization!")
        timer.__enter__()

    P = check_array(P, accept_sparse=True, ensure_2d=True)
    if P.shape[0] != P.shape[1]:
        err_str = f"The graph adjacency matrix (affinity matrix, `P`) must be"
        raise ValueError(err_str + f" square!")

    ## Get the row sums as a diagonal matrix
    row_sums = sp.diags(np.ravel(np.sum(P, axis=1)))

    ## Get the leading eigenvectors
    v0 = np.ones(P.shape[0]) / np.sqrt(P.shape[0])
    evals, evecs = sp.linalg.eigsh(P,
                                   M=row_sums,
                                   k=n_components + 1,
                                   tol=tol,
                                   maxiter=max_iter,
                                   which='LM',
                                   v0=v0)

    ## Make sure the eigenvalues are sorted
    sort_idx = np.argsort(evals)[::-1]
    evecs = evecs[:, sort_idx]

    ## Multiply the eigenvectors by their eigenvalues
    evecs *= evals[sort_idx]

    ## Drop the leading eigenvector
    embedding = evecs[:, 1:]

    if verbose >= 2:
        timer.__exit__()

    return rescale(embedding, scaling=scaling)
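
The call to eigsh with M=row_sums solves the generalized problem P v = lambda D v, i.e. the spectrum of the random-walk transition matrix D^-1 P; its leading eigenvector is the trivial constant mode, which is why it is dropped above. A runnable sketch on a toy graph (illustrative, not from the original module):

import numpy as np
import scipy.sparse as sp
import scipy.sparse.linalg as spla

## Adjacency (affinity) matrix of a 10-node ring graph.
n = 10
P = sp.lil_matrix((n, n))
for ii in range(n):
    P[ii, (ii + 1) % n] = P[(ii + 1) % n, ii] = 1.0
P = P.tocsr()

D = sp.diags(np.ravel(P.sum(axis=1)))   ## Diagonal matrix of row sums
evals, evecs = spla.eigsh(P, M=D, k=3, which='LM')

## Eigenvalues are returned in ascending order; the top of the spectrum
## of D^-1 P is always 1, paired with a constant eigenvector.
print(evals[-1])                        ## ~1.0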
Example #5
def pca_init(X,
             n_components=2,
             scaling=_init_scaling,
             random_state=None,
             verbose=1):

    if verbose >= 2:
        timer = utl.Timer("Generating PCA initialization!")
        timer.__enter__()

    pca = PCA(n_components=n_components, random_state=random_state)
    Y = pca.fit_transform(X)

    if verbose >= 2:
        timer.__exit__()

    return rescale(Y, scaling=scaling)
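
Only the PCA step is specific to this function; `rescale` (shared with spectral_init above) then scales the result, since t-SNE implementations conventionally shrink initializations to a small standard deviation (commonly on the order of 1e-4). A minimal sketch of the PCA call itself:

import numpy as np
from sklearn.decomposition import PCA

X = np.random.rand(500, 30)
Y = PCA(n_components=2, random_state=42).fit_transform(X)
assert Y.shape == (500, 2)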
Example #6
    def kernel(self, kNNIndex):

        timer_str = f"Calculating fixed-entropy Gaussian affinity matrix!"
        timer = utl.Timer(timer_str, verbose=self.verbose)
        timer.__enter__()

        tmp_kernel_params = self.kernel_params.copy()
        good_keys = ['perp_tol', 'max_iter', 'tau_init', 'tau_min', 'tau_max']
        tmp_kernel_params = {k: tmp_kernel_params[k] for k in good_keys}

        k_NN = self.n_neighbors
        P, taus, rows = fit_Gaussian_toEntropy(kNNIndex.kNN_dst[:, :k_NN],
                                               self._perp_arr,
                                               **tmp_kernel_params)
        timer.__exit__()

        self.kernel_params['precisions'] = taus.copy()
        self.kernel_params['row_sums'] = rows.copy()

        return P
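
fit_Gaussian_toEntropy is defined elsewhere in the package; the standard technique its name describes is a per-point search for the Gaussian precision tau whose row entropy matches log(perplexity). A generic sketch of that calibration for a single row (an illustration of the technique, not the package's implementation):

import numpy as np

def calibrate_tau(sq_dists, perplexity, tol=1.e-8, max_iter=200):
    """Binary-search the precision `tau` of a Gaussian kernel on one row
    of squared kNN distances until its entropy matches log(perplexity)."""
    target = np.log(perplexity)
    tau, tau_lo, tau_hi = 1.0, 0.0, np.inf
    for _ in range(max_iter):
        W = np.exp(-sq_dists * tau)
        row_sum = np.sum(W)
        ## Shannon entropy of the normalized kernel row.
        H = np.log(row_sum) + tau * np.sum(sq_dists * W) / row_sum
        if abs(H - target) < tol:
            break
        if H > target:   ## Entropy too high: kernel too wide, raise tau.
            tau_lo = tau
            tau = 2. * tau if np.isinf(tau_hi) else 0.5 * (tau_lo + tau_hi)
        else:            ## Entropy too low: kernel too narrow, lower tau.
            tau_hi = tau
            tau = 0.5 * (tau_lo + tau_hi)
    return W / row_sum, tau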
Example #7
    def query(self, query, k_NN):

        if self.index is None:
            err_str = f"Cannot 'query' the kNN graph because it has not been"
            err_str += f" constructed!  (Run kNNIndex.fit(X, k_NN))"
            raise ValueError(err_str)

        if self.verbose:
            timer_str = f"Finding {k_NN} approximate nearest neighbors to"
            timer_str += f" query points in the existing NN graph using"
            timer_str += f" `pynndescent` and the '{self.metric}' metric..."
            timer = utl.Timer(timer_str, verbose=self.verbose)
            timer.__enter__()

        NN_idx, distances = self.index.query(query, k=k_NN)

        if self.verbose:
            timer.__exit__()

        return NN_idx, distances
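
A standalone sketch of the pynndescent calls this method wraps (illustrative sizes):

import numpy as np
import pynndescent

X = np.random.rand(2000, 20)
index = pynndescent.NNDescent(X, n_neighbors=15, metric="euclidean")
index.prepare()                    ## Build the search structures for queries.

query_pts = np.random.rand(5, 20)
NN_idx, distances = index.query(query_pts, k=10)   ## Each (5, 10)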
Example #8
    def fit(self, X, k_NN):

        if self.verbose:
            timer_str = f"Finding {k_NN} approximate nearest neighbors using"
            timer_str += f" NNDescent and the '{self.metric}' metric..."
            timer = utl.Timer(timer_str, verbose=self.verbose)
            timer.__enter__()

        ## Get the data shape
        self.n_samples, self.n_features = X.shape[0], X.shape[1]

        k_NN = self._check_k(k_NN, self.n_samples)

        ## > These values were taken from UMAP, which we assume to be sensible
        ## > defaults [because the UMAP and pynndescent authors are the same.]
        ## - Pavlin Policar
        if self.n_trees is None:
            self.n_trees = 5 + int(round((self.n_samples**0.5) / 20))
        if self.n_iters is None:
            self.n_iters = max(5, int(round(np.log2(self.n_samples))))

        ## If `k_NN` > 15, use just the first 15 NNs to build the approximate
        ## NN graph, then use query() to find the rest of the desired
        ## neighbors.
        if k_NN <= 15:
            k_build = k_NN + 1
        else:
            k_build = 15

        import pynndescent
        self.index = pynndescent.NNDescent(X,
                                           n_neighbors=k_build,
                                           metric=self.metric,
                                           metric_kwds=self.metric_params,
                                           random_state=self.random_state,
                                           n_trees=self.n_trees,
                                           n_iters=self.n_iters,
                                           n_jobs=self.n_jobs,
                                           verbose=self.verbose,
                                           **self.pynnd_kws)

        ## If k_NN <= 15, we're in the clear!
        NN_idx, distances = self.index.neighbor_graph

        ## ... Except when pynndescent fails, then it puts a -1 in the index.
        n_failures = np.sum(NN_idx == -1)

        ## If k_NN > 15, use query() to get the indices and distances
        if k_NN > 15:
            self.index.prepare()
            NN_idx, distances = self.index.query(X, k=k_NN + 1)

        ## If pynndescent fails to find neighbors for some points, raise ERROR.
        if n_failures > 0:
            err_str = "WARNING: `pynndescent` failed to find neighbors for all"
            err_str += " points in the data."

            if self.verbose >= 4:
                print_opt = np.get_printoptions()
                np.set_printoptions(threshold=np.inf)
                err_str += " The indices of the failed points are: "
                err_str += f"\n{np.where(np.sum(NN_idx == -1, axis=1))[0]}"
                np.set_printoptions(**print_opt)
            else:
                err_str += " Set verbose >= 4 to see the indices of the"
                err_str += " failed points."

            raise ValueError(err_str)

        if self.verbose:
            timer.__exit__()

        self.kNN_idx = NN_idx[:, 1:]
        self.kNN_dst = distances[:, 1:]

        ## Return the indices of the nearest neighbors and the distances
        ## to those neighbors.
        return self.kNN_idx.copy(), self.kNN_dst.copy()
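
After construction, index.neighbor_graph already holds the (indices, distances) neighbor lists of the training points, with each point (almost always, since the search is approximate) appearing as its own first neighbor at distance zero; that is why a column is sliced off above. A small sketch:

import numpy as np
import pynndescent

X = np.random.rand(1000, 10)
index = pynndescent.NNDescent(X, n_neighbors=16, metric="euclidean")

NN_idx, distances = index.neighbor_graph
## The first column is (nearly always) the point itself, so dropping it
## leaves 15 true neighbors per point.
print(np.mean(NN_idx[:, 0] == np.arange(1000)))    ## ~1.0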
Example #9
    def fit(self, X, k_NN):

        if self.verbose:
            timer_str = f"Finding {k_NN} nearest neighbors using an"
            timer_str += f" approximate search and the {self.metric} metric..."
            timer = utl.Timer(timer_str, verbose=self.verbose)
            timer.__enter__()

        X = check_array(X, accept_sparse=True, ensure_2d=True)

        ## Get the data shape
        self.n_samples, self.n_features = X.shape[0], X.shape[1]

        ## Initialize the tree.
        self.index = self._initialize_ANNOY_index(self.n_features)

        ## Set the random seed.
        ## ANNOY uses only a 32-bit integer as a random seed.
        seed = self.random_state.get_state()[1][0] % (2**31)
        self.index.set_seed(seed)

        ## Add the data to the tree
        for ii in range(self.n_samples):
            self.index.add_item(ii, X[ii])

        ## Build the requested number of trees.  Default is 50.
        self.index.build(self.n_trees)

        ## Initialize output: NN indices and distances
        NN_idx = np.zeros((self.n_samples, k_NN)).astype(int)
        distances = np.zeros((self.n_samples, k_NN))

        ## Define helper function to get NNs
        def getnns(ii):
            ## Annoy returns the query point as the first element, so we ask
            ## for k_NN + 1 neighbors.
            [aa, bb] = self.index.get_nns_by_item(ii,
                                                  k_NN + 1,
                                                  include_distances=True)

            ## Don't save the closest neighbor (the query point itself)
            NN_idx[ii] = aa[1:]
            distances[ii] = bb[1:]

        if self.n_jobs == 1:
            for ii in range(self.n_samples):
                getnns(ii)
        else:
            from joblib import Parallel, delayed

            Parallel(n_jobs=self.n_jobs,
                     require="sharedmem")(delayed(getnns)(ii)
                                          for ii in range(self.n_samples))

        if self.verbose:
            timer.__exit__()

        self.kNN_idx = NN_idx[:, :]
        self.kNN_dst = distances[:, :]

        ## Return the indices of the nearest neighbors and the distances
        ## to those neighbors.
        return self.kNN_idx.copy(), self.kNN_dst.copy()
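
The Annoy calls used above, in standalone runnable form (illustrative sizes):

import numpy as np
from annoy import AnnoyIndex

X = np.random.rand(100, 8)
index = AnnoyIndex(8, "euclidean")    ## (n_features, metric)
for ii in range(100):
    index.add_item(ii, X[ii])
index.build(50)                       ## Build 50 random-projection trees.

## Annoy returns the query point itself as the first neighbor, so ask
## for k_NN + 1 and drop the first element, as in fit() above.
idx, dst = index.get_nns_by_item(0, 6, include_distances=True)
print(idx[0], dst[0])                 ## 0  0.0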
Example #10
    def optimize(self,
                 P,
                 n_iter,
                 exaggeration,
                 momentum):

        ## If there is any exaggeration, modify `P`.
        if exaggeration != 1:
            P *= exaggeration

        ## Initialize the gradient and gains arrays.
        self._gradient = np.zeros_like(self.embedding,
                                       dtype=np.float64,
                                       order='C')
        if self._gains is None:
            self._gains = np.ones_like(self.embedding,
                                       dtype=np.float64,
                                       order='C')

        ## Initialize the update array to look like the gradient.
        update = np.zeros_like(self._gradient)

        ## Callbacks can have an initialization method; call it here.
        do_callbacks = isinstance(self._callbacks, Iterable)
        if do_callbacks:
            for cb in self._callbacks:
                getattr(cb, "optimization_about_to_start", lambda: ...)()

        ## Start the timer if we're printing progress information.
        if self.verbose >= 1:
            timer_str  = f"Fitting t-SNE for up to {n_iter} iterations with"
            timer_str += f" exaggeration = {exaggeration:.1f} and learning"
            timer_str += f" rate = {self.learning_rate:.1f}."
            timer = utl.Timer(timer_str)
            timer.__enter__()

        start_time = time()

        ## START THE LOOP!
        for ii in range(n_iter):

            ## Determine whether to do callbacks in this iteration.
            do_cbs_now = do_callbacks and \
                ((ii + 1) % self.iter_per_callback == 0)
            ## Determine whether to calculate the error (D_KL) in this iter.
            calc_error = do_cbs_now or ((ii + 1) % self.iter_per_log == 0)

            ## Calculate the gradient and error
            if self.neg_grad_method.lower() in ['bh', 'barnes-hut']:
                d_kl = self._fit_bh(P, return_DKL=calc_error)

            elif self.neg_grad_method.lower() in ['fft', 'fit-sne', 'fitsne']:
                d_kl = self._fit_fft(P, return_DKL=calc_error)

            else:
                err_str  = f"Currently, only Barnes-Hut and FIt-SNE methods"
                err_str += f" are supported for calculating t-SNE gradients."
                err_str += f" (`neg_grad_method` = 'BH' or 'FIt-SNE'.)"
                raise ValueError(err_str)

            ## If we are applying exaggeration, adjust the error (D_KL).
            if calc_error:
                if exaggeration != 1:
                    d_kl = d_kl / exaggeration - np.log(exaggeration)

                self._errors.append([ii, d_kl])

            ## To avoid samples flying off, we clip the gradient.
            if self.max_grad_norm is not None:
                norm  = np.linalg.norm(self._gradient, axis=1)
                coeff = self.max_grad_norm / (norm + 1.e-6)
                mask  = coeff < 1  ## Anywhere that the norm > max_grad_norm...
                self._gradient[mask] *= coeff[mask, None]

            ## If it's a callback iteration...
            if do_cbs_now:

                ## Do all the callbacks.

                cb_out = np.array([cb(ii + 1, d_kl, self.embedding)
                                   for cb in self._callbacks]).astype(bool)

                ## If any of the callbacks say to stop...
                if np.any(cb_out):
                    ## ... fix the affinity matrix...
                    if exaggeration != 1:
                        P /= exaggeration
                    ## ... report the runtime...
                    if self.verbose >= 1:
                        timer.__exit__()
                    ## ... and quit the loop!
                    raise OptimizationInterrupt(error=d_kl,
                                                final_embedding=self.embedding)

            ## Get where the last update and current gradient have diff signs
            grad_dir_flip = np.sign(update) != np.sign(self._gradient)
            grad_dir_same = np.invert(grad_dir_flip)
            self._gains[grad_dir_flip] += 0.2
            self._gains[grad_dir_same]  = self._gains[grad_dir_same] * 0.8
            self._gains[grad_dir_same] += 0.01  ## Minimum gain

            ## Get the update
            update  = momentum * update
            update -= self.learning_rate * self._gains * self._gradient

            ## To avoid samples flying off, we clip the update.
            if self.max_step_norm is not None:
                update_norm  = np.linalg.norm(update, axis=1, keepdims=True)
                mask = update_norm.squeeze() > self.max_step_norm
                update[mask] /= update_norm[mask]
                update[mask] *= self.max_step_norm

            ## Update the embedding!
            self.embedding += update

            ## Recenter the embedding
            self.embedding -= np.mean(self.embedding, axis=0)

            ## Display progress!
            if (self.verbose >= 1) and ((ii + 1) % self.iter_per_log == 0):
                stop_time = time()
                dt = stop_time - start_time
                print(f"Itr {ii + 1:4d}, DKL {d_kl:6.4f},\t"
                      f"{self.iter_per_log} iterations in {dt:.4f} sec")
                start_time = time()

        if self.verbose >= 1:
            timer.__exit__()

        ## Before returning, fix the affinity matrix for future optimizations.
        if exaggeration != 1:
            P /= exaggeration

        ## We also need to calculate the error one more time after the last
        ## loop, using the same gradient method as above.
        if self.neg_grad_method.lower() in ['fft', 'fit-sne', 'fitsne']:
            d_kl = self._fit_fft(P, return_DKL=True)
        else:
            d_kl = self._fit_bh(P, return_DKL=True)

        self._errors.append([ii, d_kl])

        return
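
The gain bookkeeping in the loop above is the delta-bar-delta heuristic from the original t-SNE implementation: where the new gradient disagrees in sign with the last update, the gain grows additively; where they agree, it decays multiplicatively. The update rule in isolation (a sketch with illustrative defaults):

import numpy as np

def gradient_step(embedding, gradient, update, gains,
                  learning_rate=200., momentum=0.8):
    """One t-SNE parameter update with per-coordinate gains and momentum."""
    flip = np.sign(update) != np.sign(gradient)
    gains[flip] += 0.2                        ## Signs differ: additive boost.
    gains[~flip] = gains[~flip] * 0.8 + 0.01  ## Signs agree: decay + floor.
    update = momentum * update - learning_rate * gains * gradient
    return embedding + update, update, gains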