Example #1
    def __init__(
        self,
        data,
        sigma,
        k=30,
        method="auto",
        metric="euclidean",
        metric_params=None,
        symmetrize=True,
        n_jobs=1,
        random_state=None,
        verbose=False,
    ):
        self.n_samples = n_samples = data.shape[0]
        self.verbose = verbose

        if k >= self.n_samples:
            raise ValueError(
                "`k` (%d) cannot be larger than N-1 (%d)." % (k, self.n_samples - 1)
            )

        with utils.Timer(
            f"Finding {k} nearest neighbors using {method} search with {metric} metric...",
            self.verbose,
        ):
            knn_index, neighbors, distances = build_knn_index(
                data, method, k, metric, metric_params, n_jobs, random_state
            )

        self.knn_index = knn_index

        with utils.Timer("Calculating affinity matrix...", self.verbose):
            # Compute asymmetric pairwise input similarities
            conditional_P = np.exp(-(distances ** 2) / (2 * sigma ** 2))
            conditional_P /= np.sum(conditional_P, axis=1)[:, np.newaxis]

            P = sp.csr_matrix(
                (
                    conditional_P.ravel(),
                    neighbors.ravel(),
                    range(0, n_samples * k + 1, k),
                ),
                shape=(n_samples, n_samples),
            )

            # Symmetrize the probability matrix
            if symmetrize:
                P = (P + P.T) / 2

            # Convert weights to probabilities
            P /= np.sum(P)

        self.sigma = sigma
        self.k = k
        self.P = P
        self.n_jobs = n_jobs
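
The CSR assembly above is compact: each row i owns exactly k entries, so the index pointer is simply range(0, n_samples * k + 1, k). A minimal standalone sketch of the same construction, using hypothetical toy values rather than the class internals:

import numpy as np
import scipy.sparse as sp

# Toy kNN output: 4 samples with k=2 neighbors each (hypothetical values)
neighbors = np.array([[1, 2], [0, 2], [3, 0], [2, 1]])
distances = np.array([[0.5, 1.0], [0.5, 0.8], [0.4, 1.2], [0.4, 0.9]])
n_samples, k = neighbors.shape
sigma = 1.0

# Gaussian kernel on the distances, then row-normalize
conditional_P = np.exp(-distances ** 2 / (2 * sigma ** 2))
conditional_P /= conditional_P.sum(axis=1)[:, np.newaxis]

# indptr = [0, 2, 4, 6, 8]: row i spans entries [i * k, (i + 1) * k)
P = sp.csr_matrix(
    (conditional_P.ravel(), neighbors.ravel(), range(0, n_samples * k + 1, k)),
    shape=(n_samples, n_samples),
)
P = (P + P.T) / 2  # symmetrize
P /= P.sum()       # convert weights to joint probabilities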
Example #2
    def __init__(
        self,
        data,
        perplexity=30,
        method="auto",
        metric="euclidean",
        metric_params=None,
        symmetrize=True,
        n_jobs=1,
        random_state=None,
        verbose=False,
    ):
        self.n_samples = data.shape[0]
        self.perplexity = self.check_perplexity(perplexity)
        self.verbose = verbose

        k_neighbors = min(self.n_samples - 1, int(3 * self.perplexity))
        self.knn_index, self.__neighbors, self.__distances = build_knn_index(
            data, method, k_neighbors, metric, metric_params, n_jobs,
            random_state, verbose)

        with utils.Timer("Calculating affinity matrix...", self.verbose):
            self.P = joint_probabilities_nn(
                self.__neighbors,
                self.__distances,
                [self.perplexity],
                symmetrize=symmetrize,
                n_jobs=n_jobs,
            )

        self.n_jobs = n_jobs
Example #3
    def build(self):
        data, k = self.data, self.k

        timer = utils.Timer(
            f"Finding {k} nearest neighbors using Annoy approximate search with "
            f"{self.metric} distance...",
            verbose=self.verbose,
        )
        timer.__enter__()

        from openTSNE.dependencies.annoy import AnnoyIndex

        N = data.shape[0]

        annoy_metric = self.metric
        annoy_aliases = {
            "cosine": "angular",
            "l1": "manhattan",
            "l2": "euclidean",
            "taxicab": "manhattan",
        }
        if annoy_metric in annoy_aliases:
            annoy_metric = annoy_aliases[annoy_metric]

        self.index = AnnoyIndex(data.shape[1], annoy_metric)

        random_state = check_random_state(self.random_state)
        self.index.set_seed(random_state.randint(np.iinfo(np.int32).max))

        for i in range(N):
            self.index.add_item(i, data[i])

        # Number of trees. FIt-SNE uses 50 by default.
        self.index.build(50, n_jobs=self.n_jobs)

        # Return the nearest neighbors in the training set
        distances = np.zeros((N, k))
        indices = np.zeros((N, k), dtype=int)

        def getnns(i):
            # Annoy returns the query point itself as the first element
            indices_i, distances_i = self.index.get_nns_by_item(
                i, k + 1, include_distances=True
            )
            indices[i] = indices_i[1:]
            distances[i] = distances_i[1:]

        if self.n_jobs == 1:
            for i in range(N):
                getnns(i)
        else:
            from joblib import Parallel, delayed

            Parallel(n_jobs=self.n_jobs, require="sharedmem")(
                delayed(getnns)(i) for i in range(N)
            )

        timer.__exit__()

        return indices, distances
Example #4
    def query(self, query, k):
        timer = utils.Timer(
            f"Finding {k} nearest neighbors in existing embedding using Annoy "
            f"approximate search...",
            self.verbose,
        )
        timer.__enter__()

        N = query.shape[0]
        distances = np.zeros((N, k))
        indices = np.zeros((N, k), dtype=int)

        def getnns(i):
            indices[i], distances[i] = self.index.get_nns_by_vector(
                query[i], k, include_distances=True
            )

        if self.n_jobs == 1:
            for i in range(N):
                getnns(i)
        else:
            from joblib import Parallel, delayed

            Parallel(n_jobs=self.n_jobs, require="sharedmem")(
                delayed(getnns)(i) for i in range(N)
            )

        timer.__exit__()

        return indices, distances
Example #5
    def __init__(
        self,
        data=None,
        perplexities=None,
        method="auto",
        metric="euclidean",
        metric_params=None,
        symmetrize=True,
        n_jobs=1,
        random_state=None,
        verbose=False,
        knn_index=None,
    ):
        # Perplexities must be specified, but the default is set to None so that
        # the parameter order makes more sense
        if perplexities is None:
            raise ValueError("`perplexities` must be specified!")

        # This can't work if neither data nor the knn index are specified
        if data is None and knn_index is None:
            raise ValueError(
                "At least one of the parameters `data` or `knn_index` must be specified!"
            )
        # This can't work if both data and the knn index are specified
        if data is not None and knn_index is not None:
            raise ValueError(
                "Both `data` and `knn_index` were specified! Please pass only one."
            )

        # Find the nearest neighbors
        if knn_index is None:
            # We will compute the nearest neighbors to the max value of perplexity,
            # smaller values can just use indexing to truncate unneeded neighbors
            n_samples = data.shape[0]
            perplexities = self.check_perplexities(perplexities, n_samples)
            max_perplexity = np.max(perplexities)
            k_neighbors = min(n_samples - 1, int(3 * max_perplexity))

            self.knn_index = get_knn_index(data, method, k_neighbors, metric,
                                           metric_params, n_jobs, random_state,
                                           verbose)

        else:
            self.knn_index = knn_index
            log.info("KNN index provided. Ignoring KNN-related parameters.")

        self.__neighbors, self.__distances = self.knn_index.build()

        with utils.Timer("Calculating affinity matrix...", verbose):
            self.P = self._calculate_P(
                self.__neighbors,
                self.__distances,
                perplexities,
                symmetrize=symmetrize,
                n_jobs=n_jobs,
            )

        self.perplexities = perplexities
        self.n_jobs = n_jobs
        self.verbose = verbose
Example #6
def weighted_mean(X, embedding, neighbors, distances, verbose=False):
    """Initialize points onto an existing embedding by placing them in the
    weighted mean position of their nearest neighbors on the reference embedding.

    Parameters
    ----------
    X: np.ndarray
    embedding: TSNEEmbedding
    neighbors: np.ndarray
    distances: np.ndarray
    verbose: bool

    Returns
    -------
    np.ndarray

    """
    n_samples = X.shape[0]
    n_components = embedding.shape[1]

    with utils.Timer("Calculating weighted-mean initialization...", verbose):
        partial_embedding = np.zeros((n_samples, n_components), order="C")
        for i in range(n_samples):
            partial_embedding[i] = np.average(embedding[neighbors[i]],
                                              axis=0,
                                              weights=distances[i])

    return partial_embedding
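
A short usage sketch of this initializer, assuming (as in openTSNE) it is exposed as openTSNE.initialization.weighted_mean; the toy arrays below are hypothetical:

import numpy as np
from openTSNE.initialization import weighted_mean

rng = np.random.RandomState(0)
X_new = rng.normal(size=(5, 50))               # new high-dimensional points
embedding = rng.normal(size=(100, 2))          # existing reference embedding
neighbors = rng.randint(0, 100, size=(5, 3))   # 3 reference neighbors per new point
distances = rng.uniform(0.1, 1.0, size=(5, 3))

init = weighted_mean(X_new, embedding, neighbors, distances)
assert init.shape == (5, 2)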
Example #7
    def __init__(
        self,
        data,
        perplexities,
        method="auto",
        metric="euclidean",
        metric_params=None,
        symmetrize=True,
        n_jobs=1,
        random_state=None,
        verbose=False,
    ):
        self.n_samples = data.shape[0]
        self.verbose = verbose

        # We will compute the nearest neighbors to the max value of perplexity,
        # smaller values can just use indexing to truncate unneeded neighbors
        perplexities = self.check_perplexities(perplexities)
        max_perplexity = np.max(perplexities)
        k_neighbors = min(self.n_samples - 1, int(3 * max_perplexity))

        with utils.Timer(
            f"Finding {k_neighbors} nearest neighbors using {method} search "
            f"with {metric} metric...",
            self.verbose,
        ):
            self.knn_index, self.__neighbors, self.__distances = build_knn_index(
                data, method, k_neighbors, metric, metric_params, n_jobs, random_state
            )

        with utils.Timer("Calculating affinity matrix...", self.verbose):
            self.P = self._calculate_P(
                self.__neighbors,
                self.__distances,
                perplexities,
                symmetrize=symmetrize,
                n_jobs=n_jobs,
            )

        self.perplexities = perplexities
        self.n_jobs = n_jobs
Example #8
    def set_perplexity(self, new_perplexity):
        """Change the perplexity of the affinity matrix.

        Note that we only allow setting the perplexity to a value not larger
        than the number of neighbors used for the original perplexity. This
        restriction exists because setting a higher perplexity value requires
        recomputing all the nearest neighbors, which can take a long time.
        To avoid potential confusion as to why execution time is slow, this
        is not allowed. If you would like to increase the perplexity above
        that value, simply create a new instance.

        Parameters
        ----------
        new_perplexity: float
            The new perplexity.

        """
        # If the value hasn't changed, there's nothing to do
        if new_perplexity == self.perplexity:
            return
        # Verify that the perplexity isn't negative
        effective_perplexity = self.check_perplexity(new_perplexity, np.inf)
        # Verify that the perplexity isn't too large for the kNN graph
        if effective_perplexity > self.__neighbors.shape[1]:
            raise RuntimeError(
                "The desired perplexity `%.2f` is larger than the kNN graph "
                "allows. This would need to recompute the nearest neighbors, "
                "which is not efficient. Please create a new `%s` instance "
                "with the increased perplexity." %
                (effective_perplexity, self.__class__.__name__))
        # Warn if the perplexity is larger than the heuristic
        if 3 * effective_perplexity > self.__neighbors.shape[1]:
            log.warning(
                "The new perplexity is quite close to the computed number of "
                "nearest neighbors. The results may be unexpected. Consider "
                "creating a new `%s` instance with the increased perplexity." %
                self.__class__.__name__)

        # Recompute the affinity matrix
        self.perplexity = new_perplexity
        self.effective_perplexity_ = effective_perplexity
        k_neighbors = int(3 * new_perplexity)

        with utils.Timer("Perplexity changed. Recomputing affinity matrix...",
                         self.verbose):
            self.P = joint_probabilities_nn(
                self.__neighbors[:, :k_neighbors],
                self.__distances[:, :k_neighbors],
                [self.effective_perplexity_],
                symmetrize=self.symmetrize,
                n_jobs=self.n_jobs,
            )
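
The restriction is easiest to see in use. A hedged sketch, assuming the surrounding class is openTSNE.affinity.PerplexityBasedNN (which exposes this method) and using hypothetical data:

import numpy as np
from openTSNE.affinity import PerplexityBasedNN

x = np.random.RandomState(0).normal(size=(200, 10))
aff = PerplexityBasedNN(x, perplexity=30)  # stores 3 * 30 = 90 neighbors

aff.set_perplexity(15)     # allowed: uses a subset of the stored kNN graph
aff.set_perplexity(30)     # allowed: restores the original value
# aff.set_perplexity(100)  # raises RuntimeError: the 90-neighbor graph is too small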
Example #9
    def query(self, query, k):
        timer = utils.Timer(
            f"Finding {k} nearest neighbors in existing embedding using NN Descent "
            f"approximate search...",
            self.verbose,
        )
        timer.__enter__()

        indices, distances = self.index.query(query, k=k)

        timer.__exit__()

        return indices, distances
Example #10
    def build(self):
        data, k = self.data, self.k

        timer = utils.Timer(
            f"Finding {k} nearest neighbors using exact search with "
            f"{self.metric} distance...",
            verbose=self.verbose,
        )
        timer.__enter__()

        if self.metric == "cosine":
            # The nearest neighbor ranking for cosine distance is the same as
            # for euclidean distance on normalized data
            effective_metric = "euclidean"
            effective_data = data.copy()
            effective_data = (
                effective_data / np.linalg.norm(effective_data, axis=1)[:, None]
            )
            # In order to properly compute cosine distances when querying the
            # index, we need to store the original data
            self.__data = data
        else:
            effective_metric = self.metric
            effective_data = data

        self.index = neighbors.NearestNeighbors(
            algorithm="auto",
            metric=effective_metric,
            metric_params=self.metric_params,
            n_jobs=self.n_jobs,
        )
        self.index.fit(effective_data)

        # Return the nearest neighbors in the training set
        distances, indices = self.index.kneighbors(n_neighbors=k)

        # If using cosine distance, the computed distances will be wrong and
        # need to be recomputed
        if self.metric == "cosine":
            distances = np.vstack(
                [
                    cdist(np.atleast_2d(x), data[idx], metric="cosine")
                    for x, idx in zip(data, indices)
                ]
            )

        timer.__exit__()

        return indices, distances
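
The cosine shortcut in build() rests on the identity ||u - v||^2 = 2 * (1 - cos(u, v)) for unit-norm u and v: euclidean distances on normalized data are a monotone function of cosine distances, so they induce the same neighbor ranking. A quick standalone check on hypothetical data:

import numpy as np
from scipy.spatial.distance import cdist

data = np.random.RandomState(0).normal(size=(50, 8))
normed = data / np.linalg.norm(data, axis=1)[:, None]

d_cos = cdist(data, data, metric="cosine")         # 1 - cosine similarity
d_euc = cdist(normed, normed, metric="euclidean")
assert np.allclose(d_euc ** 2, 2 * d_cos)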
Example #11
def pca(X,
        n_components=2,
        svd_solver="auto",
        random_state=None,
        verbose=False):
    """Initialize an embedding using the top principal components.

    Parameters
    ----------
    X: np.ndarray
        The data matrix.

    n_components: int
        The dimension of the embedding space.

    svd_solver: str
        See sklearn.decomposition.PCA documentation.

    random_state: Union[int, RandomState]
        If the value is an int, random_state is the seed used by the random
        number generator. If the value is a RandomState instance, then it will
        be used as the random number generator. If the value is None, the random
        number generator is the RandomState instance used by `np.random`.

    verbose: bool

    Returns
    -------
    initialization: np.ndarray

    """
    timer = utils.Timer("Calculating PCA-based initialization...", verbose)
    timer.__enter__()

    pca_ = PCA(n_components=n_components,
               svd_solver=svd_solver,
               random_state=random_state)
    embedding = pca_.fit_transform(X)

    # The PCA embedding may have high variance, which leads to poor convergence
    normalization = np.std(embedding[:, 0])
    normalization /= 0.0001
    embedding /= normalization

    timer.__exit__()

    return np.ascontiguousarray(embedding)
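
A short usage sketch, assuming (as in openTSNE) the function is exposed as openTSNE.initialization.pca; the data is hypothetical:

import numpy as np
from openTSNE.initialization import pca

x = np.random.RandomState(42).normal(size=(1000, 50))
init = pca(x, n_components=2)

# The rescaling step pins the standard deviation of the first embedding
# dimension to 1e-4, a conventional scale for t-SNE initializations
print(np.std(init[:, 0]))  # ~0.0001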
Example #12
    def build(self):
        data, k = self.data, self.k

        timer = utils.Timer(
            f"Finding {k} nearest neighbors using HNSWlib approximate search with "
            f"{self.metric} distance...",
            verbose=self.verbose,
        )
        timer.__enter__()

        from hnswlib import Index

        hnsw_space = {
            "cosine": "cosine",
            "dot": "ip",
            "euclidean": "l2",
            "ip": "ip",
            "l2": "l2",
        }[self.metric]

        random_state = check_random_state(self.random_state)
        random_seed = random_state.randint(np.iinfo(np.int32).max)

        self.index = Index(space=hnsw_space, dim=data.shape[1])

        # Initialize HNSW Index
        self.index.init_index(
            max_elements=data.shape[0],
            ef_construction=200,
            M=16,
            random_seed=random_seed,
        )

        # Build index tree from data
        self.index.add_items(data, num_threads=self.n_jobs)

        # Set ef parameter for (ideal) precision/recall
        self.index.set_ef(min(2 * k, self.index.get_current_count()))

        # Query for kNN
        indices, distances = self.index.knn_query(data, k=k + 1, num_threads=self.n_jobs)

        # Stop timer
        timer.__exit__()

        # Return indices and distances, skipping the first entry, which is always the point itself
        return indices[:, 1:], distances[:, 1:]
Example #13
def median(embedding, neighbors, verbose=False):
    """Initialize points onto an existing embedding by placing them in the
    median position of their nearest neighbors on the reference embedding.

    Parameters
    ----------
    embedding: TSNEEmbedding
    neighbors: np.ndarray
    verbose: bool

    Returns
    -------
    np.ndarray

    """
    with utils.Timer("Calculating median initialization...", verbose):
        embedding = np.median(embedding[neighbors], axis=1)
    return np.ascontiguousarray(embedding)
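
Usage mirrors weighted_mean, minus the distance weights. Again assuming the openTSNE.initialization module and hypothetical toy arrays:

import numpy as np
from openTSNE.initialization import median

rng = np.random.RandomState(0)
embedding = rng.normal(size=(100, 2))          # existing reference embedding
neighbors = rng.randint(0, 100, size=(5, 3))   # 3 reference neighbors per new point

init = median(embedding, neighbors)
assert init.shape == (5, 2)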
Example #14
    def set_perplexity(self, new_perplexity):
        """Change the perplexity of the affinity matrix.

        Note that we only allow lowering the perplexity or restoring it to its
        original value. This restriction exists because setting a higher
        perplexity value requires recomputing all the nearest neighbors, which
        can take a long time. To avoid potential confusion as to why execution
        time is slow, this is not allowed. If you would like to increase the
        perplexity above the initial value, simply create a new instance.

        Parameters
        ----------
        new_perplexity: float
            The new perplexity.

        """
        # If the value hasn't changed, there's nothing to do
        if new_perplexity == self.perplexity:
            return
        # Verify that the perplexity isn't too large
        new_perplexity = self.check_perplexity(new_perplexity)
        # Recompute the affinity matrix
        k_neighbors = min(self.n_samples - 1, int(3 * new_perplexity))
        if k_neighbors > self.__neighbors.shape[1]:
            raise RuntimeError(
                "The desired perplexity `%.2f` is larger than the initial one "
                "used. This would need to recompute the nearest neighbors, "
                "which is not efficient. Please create a new `%s` instance "
                "with the increased perplexity."
                % (new_perplexity, self.__class__.__name__)
            )

        self.perplexity = new_perplexity

        with utils.Timer(
            "Perplexity changed. Recomputing affinity matrix...", self.verbose
        ):
            self.P = joint_probabilities_nn(
                self.__neighbors[:, :k_neighbors],
                self.__distances[:, :k_neighbors],
                [self.perplexity],
                symmetrize=True,
                n_jobs=self.n_jobs,
            )
Example #15
    def set_perplexities(self, new_perplexities):
        """Change the perplexities of the affinity matrix.

        Note that we only allow lowering the perplexities or restoring them to
        their original maximum value. This restriction exists because setting a
        higher perplexity value requires recomputing all the nearest neighbors,
        which can take a long time. To avoid potential confusion as to why
        execution time is slow, this is not allowed. If you would like to
        increase the perplexity above the initial value, simply create a new
        instance.

        Parameters
        ----------
        new_perplexities: List[float]
            The new list of perplexities.

        """
        if np.array_equal(self.perplexities, new_perplexities):
            return

        effective_perplexities = self.check_perplexities(
            new_perplexities, self.n_samples)
        max_perplexity = np.max(effective_perplexities)
        k_neighbors = min(self.n_samples - 1, int(3 * max_perplexity))

        if k_neighbors > self.__neighbors.shape[1]:
            raise RuntimeError(
                "The largest perplexity `%.2f` is larger than the initial one "
                "used. This would need to recompute the nearest neighbors, "
                "which is not efficient. Please create a new `%s` instance "
                "with the increased perplexity." %
                (max_perplexity, self.__class__.__name__))

        self.perplexities = new_perplexities
        self.effective_perplexities_ = effective_perplexities
        with utils.Timer("Perplexity changed. Recomputing affinity matrix...",
                         self.verbose):
            self.P = self._calculate_P(
                self.__neighbors[:, :k_neighbors],
                self.__distances[:, :k_neighbors],
                self.effective_perplexities_,
                symmetrize=self.symmetrize,
                n_jobs=self.n_jobs,
            )
Example #16
    def query(self, query, k):
        timer = utils.Timer(
            f"Finding {k} nearest neighbors in existing embedding using HNSWlib "
            f"approximate search...",
            self.verbose,
        )
        timer.__enter__()

        # Set ef parameter for (ideal) precision/recall
        self.index.set_ef(min(2 * k, self.index.get_current_count()))

        # Query for kNN
        indices, distances = self.index.knn_query(query, k=k, num_threads=self.n_jobs)

        # Stop timer
        timer.__exit__()

        # return indices and distances
        return indices, distances
Example #17
    def __init__(
        self,
        data,
        perplexity=30,
        method="auto",
        metric="euclidean",
        metric_params=None,
        symmetrize=True,
        n_jobs=1,
        random_state=None,
        verbose=False,
        k_neighbors="auto",
    ):
        self.n_samples = data.shape[0]

        if k_neighbors == "auto":
            _k_neighbors = min(self.n_samples - 1, int(3 * perplexity))
        else:
            _k_neighbors = k_neighbors

        self.perplexity = self.check_perplexity(perplexity, _k_neighbors)
        self.verbose = verbose

        if _k_neighbors > int(3 * self.perplexity):
            log.warning(
                "The k_neighbors value is over 3 times larger than the perplexity value. "
                "This may result in an unnecessary slowdown.")

        self.knn_index, self.__neighbors, self.__distances = build_knn_index(
            data, method, _k_neighbors, metric, metric_params, n_jobs,
            random_state, verbose)

        with utils.Timer("Calculating affinity matrix...", self.verbose):
            self.P = joint_probabilities_nn(
                self.__neighbors,
                self.__distances,
                [self.perplexity],
                symmetrize=symmetrize,
                n_jobs=n_jobs,
            )

        self.n_jobs = n_jobs
Example #18
    def query(self, query, k):
        timer = utils.Timer(
            f"Finding {k} nearest neighbors in existing embedding using exact search...",
            self.verbose,
        )
        timer.__enter__()

        # The nearest neighbor ranking for cosine distance is the same as for
        # euclidean distance on normalized data
        if self.metric == "cosine":
            effective_data = query.copy()
            effective_data = (
                effective_data / np.linalg.norm(effective_data, axis=1)[:, None]
            )
        else:
            effective_data = query

        distances, indices = self.index.kneighbors(effective_data, n_neighbors=k)

        # If using cosine distance, the computed distances will be wrong and
        # need to be recomputed
        if self.metric == "cosine":
            if self.__data is None:
                raise RuntimeError(
                    "The original data was unavailable when querying cosine "
                    "distance. Did you change the distance metric after "
                    "building the index? Please rebuild the index using cosine "
                    "similarity."
                )
            distances = np.vstack(
                [
                    cdist(np.atleast_2d(x), self.__data[idx], metric="cosine")
                    for x, idx in zip(query, indices)
                ]
            )

        timer.__exit__()

        return indices, distances
Example #19
    def build(self, data, k):
        timer = utils.Timer(
            f"Finding {k} nearest neighbors using NN descent approximate search with "
            f"{self.metric} distance...",
            verbose=self.verbose,
        )
        timer.__enter__()

        # These values were taken from UMAP, which we assume to be sensible defaults
        n_trees = 5 + int(round((data.shape[0])**0.5 / 20))
        n_iters = max(5, int(round(np.log2(data.shape[0]))))

        # Numba takes a while to load up, so there's little point in loading it
        # unless we're actually going to use it
        import pynndescent

        # UMAP uses the "alternative" algorithm, but that sometimes causes
        # memory corruption, so use the standard one, which seems to work fine
        self.index = pynndescent.NNDescent(
            data,
            n_neighbors=15,
            metric=self.metric,
            metric_kwds=self.metric_params,
            random_state=self.random_state,
            n_trees=n_trees,
            n_iters=n_iters,
            algorithm="standard",
            max_candidates=60,
            n_jobs=self.n_jobs,
        )

        indices, distances = self.index.query(data, k=k + 1)

        timer.__exit__()

        return indices[:, 1:], distances[:, 1:]
Example #20
def spectral(A,
             n_components=2,
             tol=1e-4,
             max_iter=None,
             random_state=None,
             verbose=False):
    """Initialize an embedding using the spectral embedding of the KNN graph.

    Specifically, we initialize data points by computing the diffusion map on
    the random walk transition matrix of the weighted graph given by the affinity
    matrix.

    Parameters
    ----------
    A: Union[sp.csr_matrix, sp.csc_matrix, ...]
        The graph adjacency matrix.

    n_components: int
        The dimension of the embedding space.

    tol: float
        See scipy.sparse.linalg.eigsh documentation.

    max_iter: int
        See scipy.sparse.linalg.eigsh documentation.

    random_state: Any
        Unused, but kept for consistency between initialization schemes.

    verbose: bool

    Returns
    -------
    initialization: np.ndarray

    """
    if A.ndim != 2:
        raise ValueError(
            "The graph adjacency matrix must be a 2-dimensional matrix.")
    if A.shape[0] != A.shape[1]:
        raise ValueError("The graph adjacency matrix must be a square matrix.")

    timer = utils.Timer("Calculating spectral initialization...", verbose)
    timer.__enter__()

    D = sp.diags(np.ravel(np.sum(A, axis=1)))

    # Find leading eigenvectors
    k = n_components + 1
    v0 = np.ones(A.shape[0]) / np.sqrt(A.shape[0])
    eigvals, eigvecs = sp.linalg.eigsh(A,
                                       M=D,
                                       k=k,
                                       tol=tol,
                                       maxiter=max_iter,
                                       which="LM",
                                       v0=v0)
    # Sort the eigenvalues in decreasing order
    order = np.argsort(eigvals)[::-1]
    eigvecs = eigvecs[:, order]

    # In diffusion maps, we multiply the eigenvectors by their eigenvalues
    eigvecs *= eigvals

    # Drop the leading eigenvector
    embedding = eigvecs[:, 1:]

    rescale(embedding, inplace=True)

    timer.__exit__()

    return embedding
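
The generalized eigenproblem solved above, A v = lambda D v, has the same eigenpairs as the random-walk transition matrix D^-1 A used by diffusion maps; its leading eigenvector is constant (eigenvalue 1), which is why the code drops it. A tiny standalone sketch on a hypothetical toy graph:

import numpy as np
import scipy.sparse as sp
from scipy.sparse.linalg import eigsh

# Two triangles joined by a single weak edge
A = sp.csr_matrix(np.array([
    [0.0, 1.0, 1.0, 0.1, 0.0, 0.0],
    [1.0, 0.0, 1.0, 0.0, 0.0, 0.0],
    [1.0, 1.0, 0.0, 0.0, 0.0, 0.0],
    [0.1, 0.0, 0.0, 0.0, 1.0, 1.0],
    [0.0, 0.0, 0.0, 1.0, 0.0, 1.0],
    [0.0, 0.0, 0.0, 1.0, 1.0, 0.0],
]))
D = sp.diags(np.ravel(A.sum(axis=1)))

eigvals, eigvecs = eigsh(A, M=D, k=3, which="LM")
# The largest eigenvalue is 1 with a constant eigenvector; the next
# eigenvector separates the two triangles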
Example #21
    def __init__(
        self,
        data=None,
        perplexity=30,
        method="auto",
        metric="euclidean",
        metric_params=None,
        symmetrize=True,
        n_jobs=1,
        random_state=None,
        verbose=False,
        k_neighbors="auto",
        knn_index=None,
    ):
        # This can't work if neither data nor the knn index are specified
        if data is None and knn_index is None:
            raise ValueError(
                "At least one of the parameters `data` or `knn_index` must be specified!"
            )
        # This can't work if both data and the knn index are specified
        if data is not None and knn_index is not None:
            raise ValueError(
                "Both `data` and `knn_index` were specified! Please pass only one."
            )

        # Find the nearest neighbors
        if knn_index is None:
            n_samples = data.shape[0]

            if k_neighbors == "auto":
                _k_neighbors = min(n_samples - 1, int(3 * perplexity))
            else:
                _k_neighbors = k_neighbors

            effective_perplexity = self.check_perplexity(
                perplexity, _k_neighbors)
            if _k_neighbors > int(3 * effective_perplexity):
                log.warning(
                    "The k_neighbors value is over 3 times larger than the perplexity value. "
                    "This may result in an unnecessary slowdown.")

            self.knn_index = get_knn_index(data, method, _k_neighbors, metric,
                                           metric_params, n_jobs, random_state,
                                           verbose)

        else:
            self.knn_index = knn_index
            effective_perplexity = self.check_perplexity(
                perplexity, self.knn_index.k)
            log.info("KNN index provided. Ignoring KNN-related parameters.")

        self.__neighbors, self.__distances = self.knn_index.build()

        with utils.Timer("Calculating affinity matrix...", verbose):
            self.P = joint_probabilities_nn(
                self.__neighbors,
                self.__distances,
                [effective_perplexity],
                symmetrize=symmetrize,
                n_jobs=n_jobs,
            )

        self.perplexity = perplexity
        self.effective_perplexity_ = effective_perplexity
        self.symmetrize = symmetrize
        self.n_jobs = n_jobs
        self.verbose = verbose
Example #22
    def build(self, data, k):
        timer = utils.Timer(
            f"Finding {k} nearest neighbors using NN descent approximate search with "
            f"{self.metric} distance...",
            verbose=self.verbose,
        )
        timer.__enter__()

        # These values were taken from UMAP, which we assume to be sensible defaults
        n_trees = 5 + int(round((data.shape[0]) ** 0.5 / 20))
        n_iters = max(5, int(round(np.log2(data.shape[0]))))

        # Numba takes a while to load up, so there's little point in loading it
        # unless we're actually going to use it
        import pynndescent

        # query() will only be used when k > 15
        if k <= 15:
            n_neighbors_build = k + 1
        else:
            n_neighbors_build = 15

        self.index = pynndescent.NNDescent(
            data,
            n_neighbors=n_neighbors_build,
            metric=self.metric,
            metric_kwds=self.metric_params,
            random_state=self.random_state,
            n_trees=n_trees,
            n_iters=n_iters,
            max_candidates=60,
            n_jobs=self.n_jobs,
            verbose=self.verbose > 1,
        )

        # -1 in indices means that pynndescent failed
        indices, distances = self.index.neighbor_graph
        mask = np.sum(indices == -1, axis=1) > 0

        if k > 15:
            indices, distances = self.index.query(data, k=k + 1)

        # As a workaround, we let the failed points group together
        if np.sum(mask) > 0:
            if self.verbose:
                opt = np.get_printoptions()
                np.set_printoptions(threshold=np.inf)
                warnings.warn(
                    f"`pynndescent` failed to find neighbors for some of the points. "
                    f"As a workaround, openTSNE considers all such points similar to "
                    f"each other, so they will likely form a cluster in the embedding. "
                    f"The indices of the failed points are:\n{np.where(mask)[0]}"
                )
                np.set_printoptions(**opt)
            else:
                warnings.warn(
                    f"`pynndescent` failed to find neighbors for some of the points. "
                    f"As a workaround, openTSNE considers all such points similar to "
                    f"each other, so they will likely form a cluster in the embedding. "
                    f"Run with verbose=True to see the indices of the failed points."
                )
            distances[mask] = 1
            rs = check_random_state(self.random_state)
            fake_indices = rs.choice(
                np.sum(mask), size=np.sum(mask) * indices.shape[1], replace=True
            )
            fake_indices = np.where(mask)[0][fake_indices]
            indices[mask] = np.reshape(fake_indices, (np.sum(mask), indices.shape[1]))

        timer.__exit__()

        return indices[:, 1:], distances[:, 1:]
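
The fallback for failed points is subtle: each failed row is rewired to randomly chosen other failed points, so instead of getting arbitrary neighbors, the failed points cluster among themselves. A distilled sketch of just that step, with hypothetical toy arrays:

import numpy as np

rng = np.random.RandomState(0)
indices = np.array([[1, 2], [-1, -1], [0, 3], [-1, -1]])  # rows 1 and 3 failed
mask = (indices == -1).any(axis=1)

failed = np.where(mask)[0]  # -> array([1, 3])
fake = rng.choice(len(failed), size=mask.sum() * indices.shape[1], replace=True)
indices[mask] = failed[fake].reshape(mask.sum(), indices.shape[1])
# Every failed point now lists only other failed points as its neighbors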
Example #23
    def __init__(
        self,
        data=None,
        sigma=None,
        k=30,
        method="auto",
        metric="euclidean",
        metric_params=None,
        symmetrize=True,
        n_jobs=1,
        random_state=None,
        verbose=False,
        knn_index=None,
    ):
        # Sigma must be specified, but the default is set to None so that the
        # parameter order makes more sense
        if sigma is None:
            raise ValueError("`sigma` must be specified!")

        # This can't work if neither data nor the knn index are specified
        if data is None and knn_index is None:
            raise ValueError(
                "At least one of the parameters `data` or `knn_index` must be specified!"
            )
        # This can't work if both data and the knn index are specified
        if data is not None and knn_index is not None:
            raise ValueError(
                "Both `data` and `knn_index` were specified! Please pass only one."
            )

        # Find the nearest neighbors
        if knn_index is None:
            if k >= data.shape[0]:
                raise ValueError("`k` (%d) cannot be larger than N-1 (%d)." %
                                 (k, data.shape[0] - 1))

            self.knn_index = get_knn_index(data, method, k, metric,
                                           metric_params, n_jobs, random_state,
                                           verbose)

        else:
            self.knn_index = knn_index
            log.info("KNN index provided. Ignoring KNN-related parameters.")

        neighbors, distances = self.knn_index.build()

        with utils.Timer("Calculating affinity matrix...", verbose):
            # Compute asymmetric pairwise input similarities
            conditional_P = np.exp(-(distances**2) / (2 * sigma**2))
            conditional_P /= np.sum(conditional_P, axis=1)[:, np.newaxis]

            n_samples = self.knn_index.n_samples
            P = sp.csr_matrix(
                (
                    conditional_P.ravel(),
                    neighbors.ravel(),
                    range(0, n_samples * k + 1, k),
                ),
                shape=(n_samples, n_samples),
            )

            # Symmetrize the probability matrix
            if symmetrize:
                P = (P + P.T) / 2

            # Convert weights to probabilities
            P /= np.sum(P)

        self.sigma = sigma
        self.P = P
        self.n_jobs = n_jobs
        self.verbose = verbose
Example #24
    def to_new(self, data, perplexities=None, return_distances=False):
        """Compute the affinities of new samples to the initial samples.

        This is necessary for embedding new data points into an existing
        embedding.

        Please see the :ref:`parameter-guide` for more information.

        Parameters
        ----------
        data: np.ndarray
            The data points to be added to the existing embedding.

        perplexities: List[float]
            A list of perplexity values, which will be used in the multiscale
            Gaussian kernel. Perplexity can be thought of as the continuous
            :math:`k` number of nearest neighbors, for which t-SNE will attempt
            to preserve distances.

        return_distances: bool
            If needed, the function can return the indices of the nearest
            neighbors and their corresponding distances.

        Returns
        -------
        P: array_like
            An :math:`N \\times M` affinity matrix expressing interactions
            between :math:`N` new data points and the initial :math:`M` data
            samples.

        indices: np.ndarray
            Returned if ``return_distances=True``. The indices of the :math:`k`
            nearest neighbors in the existing embedding for every new data
            point.

        distances: np.ndarray
            Returned if ``return_distances=True``. The distances to the
            :math:`k` nearest neighbors in the existing embedding for every new
            data point.

        """
        perplexities = perplexities if perplexities is not None else self.perplexities
        effective_perplexities = self.check_perplexities(
            perplexities, self.n_samples)

        max_perplexity = np.max(effective_perplexities)
        k_neighbors = min(self.n_samples - 1, int(3 * max_perplexity))

        neighbors, distances = self.knn_index.query(data, k_neighbors)

        with utils.Timer("Calculating affinity matrix...", self.verbose):
            P = self._calculate_P(
                neighbors,
                distances,
                effective_perplexities,
                symmetrize=False,
                normalization="point-wise",
                n_reference_samples=self.n_samples,
                n_jobs=self.n_jobs,
            )

        if return_distances:
            return P, neighbors, distances

        return P
Example #25
    def to_new(self, data, k=None, sigma=None, return_distances=False):
        """Compute the affinities of new samples to the initial samples.

        This is necessary for embedding new data points into an existing
        embedding.

        Parameters
        ----------
        data: np.ndarray
            The data points to be added to the existing embedding.

        k: int
            The number of nearest neighbors to consider for each kernel.

        sigma: float
            The bandwidth to use for the Gaussian kernels in the ambient space.

        return_distances: bool
            If needed, the function can return the indices of the nearest
            neighbors and their corresponding distances.

        Returns
        -------
        P: array_like
            An :math:`N \\times M` affinity matrix expressing interactions
            between :math:`N` new data points and the initial :math:`M` data
            samples.

        indices: np.ndarray
            Returned if ``return_distances=True``. The indices of the :math:`k`
            nearest neighbors in the existing embedding for every new data
            point.

        distances: np.ndarray
            Returned if ``return_distances=True``. The distances to the
            :math:`k` nearest neighbors in the existing embedding for every new
            data point.

        """
        n_samples = data.shape[0]
        n_reference_samples = self.n_samples

        if k is None:
            k = self.knn_index.k
        elif k >= n_reference_samples:
            raise ValueError(
                "`k` (%d) cannot be larger than the number of reference "
                "samples (%d)." % (k, self.n_samples))

        if sigma is None:
            sigma = self.sigma

        # Find nearest neighbors and the distances to the new points
        neighbors, distances = self.knn_index.query(data, k)

        with utils.Timer("Calculating affinity matrix...", self.verbose):
            # Compute asymmetric pairwise input similarities
            conditional_P = np.exp(-(distances**2) / (2 * sigma**2))

            # Convert weights to probabilities
            conditional_P /= np.sum(conditional_P, axis=1)[:, np.newaxis]

            P = sp.csr_matrix(
                (
                    conditional_P.ravel(),
                    neighbors.ravel(),
                    range(0, n_samples * k + 1, k),
                ),
                shape=(n_samples, n_reference_samples),
            )

        if return_distances:
            return P, neighbors, distances

        return P
Example #26
import gzip
import pickle
from os import path

import openTSNE
from openTSNE import utils

with utils.Timer("Loading data...", verbose=True):
    with gzip.open(path.join("data", "macosko_2015.pkl.gz"), "rb") as f:
        data = pickle.load(f)

x = data["pca_50"]
y, cluster_ids = data["CellType1"], data["CellType2"]

# import sys; sys.path.append("FIt-SNE")
# from fast_tsne import fast_tsne
#
# with Timer("Running fast_tsne..."):
#     fast_tsne(x, nthreads=1)

affinities = openTSNE.affinity.PerplexityBasedNN(
    x,
    perplexity=30,
    metric="cosine",
    method="approx",
    n_jobs=-1,
    random_state=0,
    verbose=True,
)

init = openTSNE.initialization.spectral(affinities.P, verbose=True)
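
# One plausible continuation (not part of the original script): optimize an
# embedding from this initialization, following openTSNE's documented
# advanced-usage pattern of TSNEEmbedding plus staged optimize() calls
embedding = openTSNE.TSNEEmbedding(
    init,
    affinities,
    negative_gradient_method="fft",
    n_jobs=-1,
    verbose=True,
)
embedding = embedding.optimize(n_iter=250, exaggeration=12, momentum=0.5)
embedding = embedding.optimize(n_iter=750, momentum=0.8)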
Example #27
    def to_new(self,
               data,
               perplexity=None,
               return_distances=False,
               k_neighbors="auto"):
        """Compute the affinities of new samples to the initial samples.

        This is necessary for embedding new data points into an existing
        embedding.

        Please see the :ref:`parameter-guide` for more information.

        Parameters
        ----------
        data: np.ndarray
            The data points to be added to the existing embedding.

        perplexity: float
            Perplexity can be thought of as the continuous :math:`k` number of
            nearest neighbors, for which t-SNE will attempt to preserve
            distances.

        return_distances: bool
            If needed, the function can return the indices of the nearest
            neighbors and their corresponding distances.

        k_neighbors: int or ``auto``
            The number of neighbors to query kNN graph for. If ``auto``
            (default), it is set to three times the perplexity.

        Returns
        -------
        P: array_like
            An :math:`N \\times M` affinity matrix expressing interactions
            between :math:`N` new data points and the initial :math:`M` data
            samples.

        indices: np.ndarray
            Returned if ``return_distances=True``. The indices of the :math:`k`
            nearest neighbors in the existing embedding for every new data
            point.

        distances: np.ndarray
            Returned if ``return_distances=True``. The distances to the
            :math:`k` nearest neighbors in the existing embedding for every new
            data point.

        """

        perplexity = perplexity if perplexity is not None else self.perplexity

        if k_neighbors == "auto":
            _k_neighbors = min(self.n_samples, int(3 * perplexity))
        else:
            _k_neighbors = k_neighbors

        effective_perplexity = self.check_perplexity(perplexity, _k_neighbors)

        neighbors, distances = self.knn_index.query(data, _k_neighbors)

        with utils.Timer("Calculating affinity matrix...", self.verbose):
            P = joint_probabilities_nn(
                neighbors,
                distances,
                [effective_perplexity],
                symmetrize=False,
                normalization="point-wise",
                n_reference_samples=self.n_samples,
                n_jobs=self.n_jobs,
            )

        if return_distances:
            return P, neighbors, distances

        return P
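
A hedged usage sketch of this method, assuming the enclosing class is openTSNE.affinity.PerplexityBasedNN (which exposes to_new) and using hypothetical data:

import numpy as np
from openTSNE.affinity import PerplexityBasedNN

rng = np.random.RandomState(0)
x_train = rng.normal(size=(500, 10))
x_new = rng.normal(size=(20, 10))

aff = PerplexityBasedNN(x_train, perplexity=30)
P_new = aff.to_new(x_new)  # sparse (20, 500) affinities, normalized point-wise
P_new, idx, dist = aff.to_new(x_new, return_distances=True)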