Example #1
def _compute_connectivities_umap(
    knn_indices, knn_dists,
    n_obs, n_neighbors, set_op_mix_ratio=1.0,
    local_connectivity=1.0,
):
    """\
    This is from umap.fuzzy_simplicial_set [McInnes18]_.

    Given a set of data X, a neighborhood size, and a measure of distance
    compute the fuzzy simplicial set (here represented as a fuzzy graph in
    the form of a sparse matrix) associated to the data. This is done by
    locally approximating geodesic distance at each point, creating a fuzzy
    simplicial set for each such point, and then combining all the local
    fuzzy simplicial sets into a global one via a fuzzy union.
    """
    from scipy.sparse import coo_matrix  # missing from the original snippet
    from umap.umap_ import fuzzy_simplicial_set

    X = coo_matrix(([], ([], [])), shape=(n_obs, 1))
    connectivities = fuzzy_simplicial_set(X, n_neighbors, None, None,
                                          knn_indices=knn_indices, knn_dists=knn_dists,
                                          set_op_mix_ratio=set_op_mix_ratio,
                                          local_connectivity=local_connectivity)

    if isinstance(connectivities, tuple):
        # In umap-learn 0.4, this returns (result, sigmas, rhos)
        connectivities = connectivities[0]

    # _get_sparse_matrix_from_indices_distances_umap is the companion helper in
    # scanpy.neighbors
    distances = _get_sparse_matrix_from_indices_distances_umap(
        knn_indices, knn_dists, n_obs, n_neighbors)

    return distances, connectivities.tocsr()
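A minimal usage sketch (not part of the original source): build the kNN arrays with scikit-learn and pass them to the helper above; the companion helper _get_sparse_matrix_from_indices_distances_umap from scanpy.neighbors is assumed to be in scope.

import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.random.RandomState(0).normal(size=(100, 5))
nn = NearestNeighbors(n_neighbors=15).fit(X)
knn_dists, knn_indices = nn.kneighbors(X)  # sklearn returns (distances, indices)
distances, connectivities = _compute_connectivities_umap(
    knn_indices, knn_dists, n_obs=X.shape[0], n_neighbors=15)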
Example #2
def nearest_neighbor(features, k=15, sigma=3):  # note: sigma is unused in this snippet
    from sklearn.neighbors import NearestNeighbors
    import networkx as nx
    from umap.umap_ import fuzzy_simplicial_set
    from scipy.sparse import coo_matrix

    nbrs = NearestNeighbors(n_neighbors=k, algorithm='auto').fit(features)
    knn_dists, knn_indices = nbrs.kneighbors(features)

    X = coo_matrix(([], ([], [])), shape=(features.shape[0], 1))

    connectivities = fuzzy_simplicial_set(
        X,
        n_neighbors=k,
        metric=None,
        random_state=None,
        knn_indices=knn_indices,
        knn_dists=knn_dists,
        set_op_mix_ratio=1.0,
        local_connectivity=1.0,
    )

    if isinstance(connectivities, tuple):
        # In umap-learn 0.4, this returns (result, sigmas, rhos)
        connectivities = connectivities[0]
    connectivities = connectivities.toarray()
    G = nx.from_numpy_matrix(connectivities, create_using=nx.Graph)
    return connectivities, G
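Note: nx.from_numpy_matrix was removed in networkx 3.0. On newer networkx versions the equivalent call for a dense weight matrix is from_numpy_array:

# networkx >= 3.0: from_numpy_matrix is gone; from_numpy_array is the replacement
G = nx.from_numpy_array(connectivities, create_using=nx.Graph)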
Example #3
def compute_connectivities_umap(knn_indices,
                                knn_dists,
                                n_obs,
                                n_neighbors,
                                set_op_mix_ratio=1.0,
                                local_connectivity=1.0):
    '''
    Copied out of scanpy.neighbors

    This is from umap.fuzzy_simplicial_set [McInnes18]_.
    Given a set of data X, a neighborhood size, and a measure of distance
    compute the fuzzy simplicial set (here represented as a fuzzy graph in
    the form of a sparse matrix) associated to the data. This is done by
    locally approximating geodesic distance at each point, creating a fuzzy
    simplicial set for each such point, and then combining all the local
    fuzzy simplicial sets into a global one via a fuzzy union.
    '''
    from scipy.sparse import coo_matrix  # imports missing from the original snippet
    from umap.umap_ import fuzzy_simplicial_set

    X = coo_matrix(([], ([], [])), shape=(n_obs, 1))
    connectivities = fuzzy_simplicial_set(
        X,
        n_neighbors,
        None,
        None,
        knn_indices=knn_indices,
        knn_dists=knn_dists,
        set_op_mix_ratio=set_op_mix_ratio,
        local_connectivity=local_connectivity)
    if isinstance(connectivities, tuple):
        # In umap-learn 0.4+, this returns (result, sigmas, rhos)
        connectivities = connectivities[0]

    # get_sparse_matrix_from_indices_distances_umap is the companion helper in
    # scanpy.neighbors
    distances = get_sparse_matrix_from_indices_distances_umap(
        knn_indices, knn_dists, n_obs, n_neighbors)

    return distances, connectivities.tocsr()
Example #4
def fit_umap(X, n_neighbors, metric, n_components=2):
    sparse_graph, sigmas, rhos = fuzzy_simplicial_set(
        X=X,
        random_state=check_random_state(0),
        n_neighbors=n_neighbors,
        metric=metric)

    a, b = find_ab_params(spread=1.0, min_dist=0.1)
    return simplicial_set_embedding(
        data=X,
        graph=sparse_graph,
        n_components=n_components,
        initial_alpha=1.0,
        a=a,
        b=b,
        gamma=1.0,
        negative_sample_rate=5,
        n_epochs=0,
        init=SPECTRAL_INIT,
        random_state=check_random_state(0),
        metric=metric,
        metric_kwds={},
        output_metric=dist.named_distances_with_gradients[EUCLIDEAN],
        output_metric_kwds={},
        euclidean_output=(metric == EUCLIDEAN),
        parallel=False,
        verbose=False,
    )
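The snippet above relies on module-level names that are not shown. A hedged guess at the imports and constants it assumes (SPECTRAL_INIT and EUCLIDEAN as plain string constants are an assumption, not from the original):

# Assumed context for fit_umap (a guess from usage, not the original module):
from sklearn.utils import check_random_state
import umap.distances as dist
from umap.umap_ import (fuzzy_simplicial_set, simplicial_set_embedding,
                        find_ab_params)

SPECTRAL_INIT = "spectral"  # hypothetical constant
EUCLIDEAN = "euclidean"     # hypothetical constant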
Example #5
def _calculate_radii(X: np.ndarray,
                     n_neighbors: int = 30,
                     random_state: Optional[int] = None) -> np.ndarray:
    # assumes module-level `import numpy as np` and `from typing import Optional`
    from umap.umap_ import fuzzy_simplicial_set
    from umap.umap_ import nearest_neighbors

    # directly taken from: https://github.com/lmcinnes/umap/blob/
    # 317ce81dc64aec9e279aa1374ac809d9ced236f6/umap/umap_.py#L1190-L1243
    (
        knn_indices,
        knn_dists,
        rp_forest,
    ) = nearest_neighbors(
        X,
        n_neighbors,
        "euclidean",
        {},
        False,
        random_state,
        verbose=False,
    )

    emb_graph, emb_sigmas, emb_rhos, emb_dists = fuzzy_simplicial_set(
        X,
        n_neighbors,
        random_state,
        "euclidean",
        {},
        knn_indices,
        knn_dists,
        verbose=False,
        return_dists=True,
    )

    emb_graph = emb_graph.tocoo()
    emb_graph.sum_duplicates()
    emb_graph.eliminate_zeros()

    n_vertices = emb_graph.shape[1]

    mu_sum = np.zeros(n_vertices, dtype=np.float32)  # sum of membership strengths per node
    re = np.zeros(n_vertices, dtype=np.float32)  # weighted local-radius accumulator

    head = emb_graph.row
    tail = emb_graph.col
    for i in range(len(head)):
        j = head[i]
        k = tail[i]

        D = emb_dists[j, k]
        mu = emb_graph.data[i]

        re[j] += mu * D
        re[k] += mu * D
        mu_sum[j] += mu
        mu_sum[k] += mu

    epsilon = 1e-8
    return np.log(epsilon + (re / mu_sum))
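The pure-Python loop over the COO entries above can be slow on large graphs. A vectorized sketch of the same accumulation (assuming emb_dists is the sparse distance matrix returned by return_dists=True, converted to CSR for fancy indexing; np.add.at correctly handles repeated indices):

# same quantities as the loop above, computed without Python-level iteration
D = np.asarray(emb_dists.tocsr()[head, tail]).ravel()
mu = emb_graph.data
np.add.at(re, head, mu * D)
np.add.at(re, tail, mu * D)
np.add.at(mu_sum, head, mu)
np.add.at(mu_sum, tail, mu)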
Example #6
def test_nn_search():
    train = nn_data[100:]
    test = nn_data[:100]
    (knn_indices, knn_dists,
     rp_forest) = nearest_neighbors(train, 10, "euclidean", {}, False,
                                    np.random)

    graph = fuzzy_simplicial_set(
        nn_data,
        10,
        np.random,
        "euclidean",
        {},
        knn_indices,
        knn_dists,
        False,
        1.0,
        1.0,
        False,
    )

    search_graph = sparse.lil_matrix((train.shape[0], train.shape[0]),
                                     dtype=np.int8)
    search_graph.rows = knn_indices
    search_graph.data = (knn_dists != 0).astype(np.int8)
    search_graph = search_graph.maximum(search_graph.transpose()).tocsr()

    random_init, tree_init = make_initialisations(dist.euclidean, ())
    search = make_initialized_nnd_search(dist.euclidean, ())

    rng_state = np.random.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64)
    init = initialise_search(rp_forest, train, test, int(10 * 3), random_init,
                             tree_init, rng_state)
    result = search(train, search_graph.indptr, search_graph.indices, init,
                    test)

    indices, dists = deheap_sort(result)
    indices = indices[:, :10]

    tree = KDTree(train)
    true_indices = tree.query(test, 10, return_distance=False)

    num_correct = 0.0
    for i in range(test.shape[0]):
        num_correct += np.sum(np.in1d(true_indices[i], indices[i]))

    percent_correct = num_correct / (test.shape[0] * 10)
    assert_greater_equal(
        percent_correct,
        0.99,
        "Sparse NN-descent did not get "
        "99% accuracy on nearest "
        "neighbors",
    )
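The nndescent helpers imported by this test (make_initialisations, make_initialized_nnd_search, initialise_search) were removed when umap-learn 0.5 moved NN-descent into pynndescent. A hedged modern equivalent of the train/test neighbor search:

from pynndescent import NNDescent

# index the training data, then query the held-out points
index = NNDescent(train, n_neighbors=10, metric="euclidean")
indices, dists = index.query(test, k=10)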
Example #7
    def transform(self):

        # fuzzy_simplicial_set returns (graph, sigmas, rhos); the fuzzy graph
        # is stored in self.data_
        self.data_, self._sigmas, self._rhos = umaplib.fuzzy_simplicial_set(
            X=self.data,
            n_neighbors=self.n_neighbors,
            random_state=self.random_state,
            metric=self.metric,
            angular=self.angular,
            set_op_mix_ratio=self.set_op_mix_ratio,
            local_connectivity=self.local_connectivity,
        )
Example #8
def fit_lumap(X, n_neighbors, metric, n_components=2):
    """
    Build the fuzzy simplices UMAP-style (via fuzzy unions of local metric spaces) and then
        fit the matrix Laplacian Eigenmaps style (via graph laplacian)  
    """
    sparse_graph, sigmas, rhos = fuzzy_simplicial_set(
        X=X,
        random_state=check_random_state(0),
        n_neighbors=n_neighbors,
        metric=metric)
    return spectral_embedding(sparse_graph, n_components=n_components)
Example #9
def build_fuzzy_simplicial_set(X, y=None, n_neighbors=15):
    """
    Build nearest neighbor graph, then fuzzy simplicial set

    Parameters
    ----------
    X : [type]
        [description]
    n_neighbors : int, optional
        [description], by default 15
    """
    n_trees = 5 + int(round((X.shape[0])**0.5 / 20.0))
    n_iters = max(5, int(round(np.log2(X.shape[0]))))

    # get nearest neighbors
    nnd = NNDescent(
        X,
        n_neighbors=n_neighbors,
        metric="euclidean",
        n_trees=n_trees,
        n_iters=n_iters,
        max_candidates=60,
    )

    # get indices and distances
    knn_indices, knn_dists = nnd.neighbor_graph

    random_state = check_random_state(None)
    # build graph
    umap_graph, sigmas, rhos = fuzzy_simplicial_set(
        X=X,
        n_neighbors=n_neighbors,
        metric="euclidean",
        random_state=random_state,
        knn_indices=knn_indices,
        knn_dists=knn_dists,
    )

    if y is not None:
        # set far_dist based on the assumption that target_weight == 1
        far_dist = 1.0e12
        y_ = check_array(y, ensure_2d=False)
        umap_graph = discrete_metric_simplicial_set_intersection(
            umap_graph, y_, far_dist=far_dist)

    return umap_graph
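A hedged usage sketch for the helper above (it assumes module-level imports of NNDescent from pynndescent, plus fuzzy_simplicial_set, check_random_state, check_array and discrete_metric_simplicial_set_intersection, as the body implies):

import numpy as np

X = np.random.rand(500, 20).astype(np.float32)
graph = build_fuzzy_simplicial_set(X, n_neighbors=15)                 # unsupervised
graph_sup = build_fuzzy_simplicial_set(X, y=np.repeat([0, 1], 250))   # label-intersected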
Example #10
def umap_conn_indices_dist_embedding(X,
                                     n_neighbors=15,
                                     n_components=2,
                                     metric="euclidean",
                                     min_dist=0.1,
                                     random_state=0,
                                     verbose=False):
    """Compute connectivity graph, matrices for kNN neighbor indices, distance and low dimension embedding with UMAP.
    This code is adapted from umap-learn (https://github.com/lmcinnes/umap/blob/97d33f57459de796774ab2d7fcf73c639835676d/umap/umap_.py)

    Arguments
    ---------
    X: sparse matrix (`.X`, dtype `float32`)
        expression matrix (n_cell x n_genes)
    n_neighbors: 'int' (optional, default 15)
        The number of nearest neighbors to compute for each sample in ``X``.
    n_components: 'int' (optional, default 2)
        The dimension of the space to embed into.
    metric: 'str' or `callable` (optional, default euclidean)
        The metric to use for the computation.
    min_dist: 'float' (optional, default 0.1)
        The effective minimum distance between embedded points. Smaller values will result in a more clustered/clumped
        embedding where nearby points on the manifold are drawn closer together, while larger values will result in a
        more even dispersal of points. The value should be set relative to the ``spread`` value, which determines the
        scale at which embedded points will be spread out.
    random_state: `int`, `RandomState` instance or `None`, optional (default: 0)
        If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is
        the random number generator; If None, the random number generator is the RandomState instance used by `numpy.random`.
    verbose: `bool` (optional, default False)
        Controls verbosity of logging.

    Returns
    -------
    graph, knn_indices, knn_dists, embedding_
        A tuple of the kNN graph (`graph`), the indices of each cell's nearest neighbors (`knn_indices`), the distances
        to those neighbors (`knn_dists`) and the low dimensional embedding (`embedding_`).
    """

    from sklearn.utils import check_random_state
    from sklearn.metrics import pairwise_distances
    from umap.umap_ import nearest_neighbors, fuzzy_simplicial_set, simplicial_set_embedding, find_ab_params

    import umap.sparse as sparse
    import umap.distances as dist

    from umap.utils import tau_rand_int, deheap_sort
    from umap.rp_tree import rptree_leaf_array, make_forest
    # https://github.com/lmcinnes/umap/blob/97d33f57459de796774ab2d7fcf73c639835676d/umap/nndescent.py
    from umap.nndescent import (
        make_nn_descent,
        make_initialisations,
        make_initialized_nnd_search,
        initialise_search,
    )
    from umap.spectral import spectral_layout

    random_state = check_random_state(random_state)  # the original hard-coded 42, ignoring the argument

    _raw_data = X

    if X.shape[0] < 4096:  # small data: use exact pairwise distances
        dmat = pairwise_distances(X, metric=metric)
        graph = fuzzy_simplicial_set(X=dmat,
                                     n_neighbors=n_neighbors,
                                     random_state=random_state,
                                     metric="precomputed",
                                     verbose=verbose)
        if isinstance(graph, tuple):  # umap-learn >= 0.4 returns (graph, sigmas, rhos)
            graph = graph[0]
        # extract knn_indices, knn_dist
        g_tmp = deepcopy(graph)
        g_tmp[graph.nonzero()] = dmat[graph.nonzero()]
        knn_indices, knn_dists = extract_indices_dist_from_graph(
            g_tmp, n_neighbors=n_neighbors)

    else:
        # Standard case
        (knn_indices, knn_dists,
         rp_forest) = nearest_neighbors(X=X,
                                        n_neighbors=n_neighbors,
                                        metric=metric,
                                        metric_kwds={},
                                        angular=False,
                                        random_state=random_state,
                                        verbose=verbose)

        graph = fuzzy_simplicial_set(X=X,
                                     n_neighbors=n_neighbors,
                                     random_state=random_state,
                                     metric=metric,
                                     knn_indices=knn_indices,
                                     knn_dists=knn_dists,
                                     angular=rp_forest,  # only consulted if knn had to be recomputed
                                     verbose=verbose)
        if isinstance(graph, tuple):  # umap-learn >= 0.4 returns (graph, sigmas, rhos)
            graph = graph[0]

        _raw_data = X
        _transform_available = True
        _search_graph = scipy.sparse.lil_matrix((X.shape[0], X.shape[0]),
                                                dtype=np.int8)
        _search_graph.rows = knn_indices  # An array (self.rows) of rows, each of which is a sorted list of column indices of non-zero elements.
        _search_graph.data = (knn_dists != 0).astype(
            np.int8
        )  # The corresponding nonzero values are stored in similar fashion in self.data.
        _search_graph = _search_graph.maximum(  # Element-wise maximum between this and another matrix.
            _search_graph.transpose()).tocsr()

    if verbose:
        print("Construct embedding")

    a, b = find_ab_params(1, min_dist)

    embedding_ = simplicial_set_embedding(
        data=_raw_data,
        graph=graph,
        n_components=n_components,
        initial_alpha=1.0,  # learning_rate
        a=a,
        b=b,
        gamma=1.0,
        negative_sample_rate=5,
        n_epochs=0,
        init="spectral",
        random_state=random_state,
        metric=metric,
        metric_kwds={},
        verbose=verbose)

    return graph, knn_indices, knn_dists, embedding_
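A hedged usage sketch; note that the import list (make_nn_descent, initialise_search) pins this snippet to umap-learn < 0.4, where fuzzy_simplicial_set returned the graph alone (the isinstance guards above keep it working on later versions too):

import numpy as np

X = np.random.rand(1000, 50).astype(np.float32)  # e.g. PCA-reduced expression
graph, knn_indices, knn_dists, embedding_ = umap_conn_indices_dist_embedding(
    X, n_neighbors=15, n_components=2)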
Example #11
def cluster(data : Union[np.ndarray, pd.DataFrame], *, leiden : bool = True, resolution : Number = 2, verbose : bool = False, **kwargs) -> np.ndarray:
	"""Returns Leiden or Louvain clustering labels of the rows in the given data
	Uses PCA and UMAP to find neighbors

	Parameters
	----------
	data : np.ndarray, pd.DataFrame
		the values to cluster
		rows are individual points
		columns are values
	*
	leiden : bool = True
		whether to default to the Leiden algorithm if installed
		ignored if module `leidenalg` is not installed
	resolution : Number = 2
		the density limit that defines clusters
		all clusters are guaranteed to have density >= resolution
		only applies if using Leiden
	verbose : bool = False
		Whether or not to print what's happening
	**kwargs
		passed variously to sklearn.decomposition.PCA, umap.umap_.fuzzy_simplicial_set, leidenalg.find_partition
		extra kwargs ignored silently

	Returns
	-------
	np.ndarray (data.shape[0],)
		the cluster membership for each row

	Selected kwargs
	---------------
	n_components : int = 50
		the number of components to reduce to in PCA
	n_neighbors : int = sqrt(data.shape[0]).astype(int)
		the size of the local neighborhood
	metric : str = 'euclidean'
		the metric used to calculate distance in the high dimensional space
		many common metrics are predefined: eg. 'euclidean', 'manhattan', 'chebyshev', 'correlation'
	n_iterations : int =  -1
		number of iterations to run the Leiden algorithm
		if -1, runs until no improvement	
	seed : int = None
		seed for Leiden algorithm random number generator
		if None, leidenalg uses a random seed by default

	umap_random_state : always passed to UMAP
	pca_random_state : always passed to PCA
	random_state 
		passed to UMAP through sklearn.utils.check_random_state, but overridden by umap_random_state
		default None
	"""
	if verbose: print('Validating...')
	if isinstance(data, pd.DataFrame): data = data.values
	elif not isinstance(data, np.ndarray): raise TypeError('data must be an np.ndarray or pd.DataFrame')
	if not isinstance(resolution, (int,float)): raise TypeError('resolution must be a positive float')
	elif resolution < 0: raise ValueError('resolution must be a positive float')

	if leiden:
		if verbose: print('Trying leidenalg import')
		try: import leidenalg
		except ImportError:
			leiden = False # don't try it later
			warn('Using Louvain as leidenalg is not installed')

	LEIDEN_KWARGS = ['initial_membership', 'n_iterations', 'seed', 'node_sizes']

	if 'n_components' not in kwargs: kwargs['n_components'] = 50
	if 'n_neighbors' not in kwargs: kwargs['n_neighbors'] = np.sqrt(data.shape[0]).astype(int)
	if 'metric' not in kwargs: kwargs['metric'] = 'euclidean'
	if 'n_iterations' not in kwargs: kwargs['n_iterations'] = -1

	if 'umap_random_state' not in kwargs: 
		if 'random_state' in kwargs: kwargs['umap_random_state'] = check_random_state(kwargs.pop('random_state'))
		else: kwargs['umap_random_state'] = check_random_state(None)
	if 'pca_random_state' in kwargs: kwargs['random_state'] = kwargs['pca_random_state']


	if verbose: print('Training PCA...')
	pc = PCA(**{k:kwargs[k] for k in PCA_KWARGS if k in kwargs}).fit_transform(data)

	if verbose: print('Calculating distances...')
	kwargs['random_state'] = kwargs.pop('umap_random_state') # must be there
	del kwargs['n_components']
	adj = fuzzy_simplicial_set(pc, **{k:kwargs[k] for k in UMAP_KWARGS if k in kwargs})
	if isinstance(adj, tuple): adj = adj[0] # umap-learn >= 0.4 returns (graph, sigmas, rhos)

	sources, targets = adj.nonzero()
	g = Graph(directed=leiden) # undirected for Louvain
	g.add_vertices(adj.shape[0])  # this adds adj.shape[0] vertices
	g.add_edges(list(zip(sources, targets)))

	if verbose: print('Clustering...')
	if leiden: # now guaranteed to work
		part = leidenalg.find_partition(g, leidenalg.RBConfigurationVertexPartition, resolution_parameter=resolution,
										weights=adj[sources, targets].A1, **{k:kwargs[k] for k in LEIDEN_KWARGS if k in kwargs})
	else:
		part = g.community_multilevel(weights=adj[sources, targets].A1)

	return np.array(part.membership)
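PCA_KWARGS and UMAP_KWARGS are module-level constants referenced inside cluster() but not shown. A hypothetical definition consistent with the docstring (a guess, not the original), followed by a usage sketch:

# Hypothetical kwarg whitelists implied by the docstring above
PCA_KWARGS = ['n_components', 'random_state']
UMAP_KWARGS = ['n_neighbors', 'metric', 'random_state']

labels = cluster(data, resolution=2, verbose=True)  # one cluster label per row of `data`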
Example #12
def umap_conn_indices_dist_embedding(
    X,
    n_neighbors=30,
    n_components=2,
    metric="euclidean",
    min_dist=0.1,
    spread=1.0,
    n_epochs=0,
    alpha=1.0,
    gamma=1.0,
    negative_sample_rate=5,
    init_pos="spectral",
    random_state=0,
    densmap=False,
    dens_lambda=2.0,
    dens_frac=0.3,
    dens_var_shift=0.1,
    output_dens=False,
    return_mapper=True,
    verbose=False,
    **umap_kwargs,
):
    """Compute connectivity graph, matrices for kNN neighbor indices, distance matrix and low dimension embedding with UMAP.
    This code is adapted from umap-learn (https://github.com/lmcinnes/umap/blob/97d33f57459de796774ab2d7fcf73c639835676d/umap/umap_.py)

    Arguments
    ---------
        X: sparse matrix (`.X`, dtype `float32`)
            expression matrix (n_cell x n_genes)
        n_neighbors: 'int' (optional, default 30)
            The number of nearest neighbors to compute for each sample in ``X``.
        n_components: 'int' (optional, default 2)
            The dimension of the space to embed into.
        metric: 'str' or `callable` (optional, default `euclidean`)
            The metric to use for the computation.
        min_dist: 'float' (optional, default `0.1`)
            The effective minimum distance between embedded points. Smaller values will result in a more clustered/clumped
            embedding where nearby points on the manifold are drawn closer together, while larger values will result in a
            more even dispersal of points. The value should be set relative to the ``spread`` value, which determines the
            scale at which embedded points will be spread out.
        spread: `float` (optional, default 1.0)
            The effective scale of embedded points. In combination with min_dist this determines how clustered/clumped the
            embedded points are.
        n_epochs: 'int' (optional, default 0)
            The number of training epochs to be used in optimizing the low dimensional embedding. Larger values result in
            more accurate embeddings. If None is specified a value will be selected based on the size of the input dataset
            (200 for large datasets, 500 for small).
        alpha: `float` (optional, default 1.0)
            Initial learning rate for the SGD.
        gamma: `float` (optional, default 1.0)
            Weight to apply to negative samples. Values higher than one will result in greater weight being given to
            negative samples.
        negative_sample_rate: `float` (optional, default 5)
            The number of negative samples to select per positive sample in the optimization process. Increasing this value
             will result in greater repulsive force being applied, greater optimization cost, but slightly more accuracy.
             The number of negative edge/1-simplex samples to use per positive edge/1-simplex sample in optimizing the low
             dimensional embedding.
        init_pos: 'spectral':
            How to initialize the low dimensional embedding. Use a spectral embedding of the fuzzy 1-skeleton
        random_state: `int`, `RandomState` instance or `None`, optional (default: 0)
            If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is
            the random number generator; If None, the random number generator is the RandomState instance used by `numpy.random`.
        dens_lambda: float (optional, default 2.0)
            Controls the regularization weight of the density correlation term
            in densMAP. Higher values prioritize density preservation over the
            UMAP objective, and vice versa for values closer to zero. Setting this
            parameter to zero is equivalent to running the original UMAP algorithm.
        dens_frac: float (optional, default 0.3)
            Controls the fraction of epochs (between 0 and 1) where the
            density-augmented objective is used in densMAP. The first
            (1 - dens_frac) fraction of epochs optimize the original UMAP objective
            before introducing the density correlation term.
        dens_var_shift: float (optional, default 0.1)
            A small constant added to the variance of local radii in the
            embedding when calculating the density correlation objective to
            prevent numerical instability from dividing by a small number.
        output_dens: `bool` (optional, default False)
            Determines whether the local radii of the final embedding (an inverse
            measure of local density) are computed and returned in addition to
            the embedding. If set to True, local radii of the original data
            are also included in the output for comparison; the output is a tuple
            (embedding, original local radii, embedding local radii). This option
            can also be used when densmap=False to calculate the densities for
            UMAP embeddings.
        verbose: `bool` (optional, default False)
                Controls verbosity of logging.

    Returns
    -------
        graph, knn_indices, knn_dists, embedding_
            A tuple of the kNN graph (`graph`), the indices of each cell's nearest neighbors (`knn_indices`), the distances
            to those neighbors (`knn_dists`) and finally the low dimensional embedding (`embedding_`).
    """

    from sklearn.utils import check_random_state
    from sklearn.metrics import pairwise_distances
    from umap.umap_ import (
        nearest_neighbors,
        fuzzy_simplicial_set,
        simplicial_set_embedding,
        find_ab_params,
    )

    random_state = check_random_state(random_state)

    _raw_data = X

    if X.shape[0] < 4096:  # 1
        dmat = pairwise_distances(X, metric=metric)
        graph = fuzzy_simplicial_set(
            X=dmat,
            n_neighbors=n_neighbors,
            random_state=random_state,
            metric="precomputed",
            verbose=verbose,
        )
        if isinstance(graph, tuple):
            # umap-learn >= 0.4 returns (graph, sigmas, rhos)
            graph = graph[0]

        # extract knn_indices, knn_dist
        g_tmp = deepcopy(graph)
        g_tmp[graph.nonzero()] = dmat[graph.nonzero()]
        knn_indices, knn_dists = adj_to_knn(g_tmp, n_neighbors=n_neighbors)
    else:
        # Standard case
        (knn_indices, knn_dists, rp_forest) = nearest_neighbors(
            X=X,
            n_neighbors=n_neighbors,
            metric=metric,
            metric_kwds={},
            angular=False,
            random_state=random_state,
            verbose=verbose,
        )

        graph = fuzzy_simplicial_set(
            X=X,
            n_neighbors=n_neighbors,
            random_state=random_state,
            metric=metric,
            knn_indices=knn_indices,
            knn_dists=knn_dists,
            angular=rp_forest,
            verbose=verbose,
        )

        _raw_data = X
        _transform_available = True
        # The corresponding nonzero values are stored in similar fashion in self.data.
        _search_graph, _ = get_conn_dist_graph(knn_indices, knn_dists)
        _search_graph = _search_graph.maximum(  # Element-wise maximum between this and another matrix.
            _search_graph.transpose()
        ).tocsr()

    if verbose:
        print("Construct embedding")

    a, b = find_ab_params(spread, min_dist)
    if isinstance(graph, tuple):
        graph = graph[0]

    dens_lambda = dens_lambda if densmap else 0.0
    dens_frac = dens_frac if densmap else 0.0

    if dens_lambda < 0.0:
        raise ValueError("dens_lambda cannot be negative")
    if dens_frac < 0.0 or dens_frac > 1.0:
        raise ValueError("dens_frac must be between 0.0 and 1.0")
    if dens_var_shift < 0.0:
        raise ValueError("dens_var_shift cannot be negative")

    densmap_kwds = {
        "lambda": dens_lambda,
        "frac": dens_frac,
        "var_shift": dens_var_shift,
        "n_neighbors": n_neighbors,
    }
    embedding_, aux_data = simplicial_set_embedding(
        data=_raw_data,
        graph=graph,
        n_components=n_components,
        initial_alpha=alpha,  # learning_rate
        a=a,
        b=b,
        gamma=gamma,
        negative_sample_rate=negative_sample_rate,
        n_epochs=n_epochs,
        init=init_pos,
        random_state=random_state,
        metric=metric,
        metric_kwds={},
        verbose=verbose,
        densmap=densmap,
        densmap_kwds=densmap_kwds,
        output_dens=output_dens,
    )

    if return_mapper:
        import umap
        from .utils import update_dict

        if n_epochs == 0:
            n_epochs = None

        _umap_kwargs = {
            "angular_rp_forest": False,
            "local_connectivity": 1.0,
            "metric_kwds": None,
            "set_op_mix_ratio": 1.0,
            "target_metric": "categorical",
            "target_metric_kwds": None,
            "target_n_neighbors": -1,
            "target_weight": 0.5,
            "transform_queue_size": 4.0,
            "transform_seed": 42,
        }
        umap_kwargs = update_dict(_umap_kwargs, umap_kwargs)

        mapper = umap.UMAP(
            n_neighbors=n_neighbors,
            n_components=n_components,
            metric=metric,
            min_dist=min_dist,
            spread=spread,
            n_epochs=n_epochs,
            learning_rate=alpha,
            repulsion_strength=gamma,
            negative_sample_rate=negative_sample_rate,
            init=init_pos,
            random_state=random_state,
            verbose=verbose,
            **umap_kwargs,
        ).fit(X)

        return mapper, graph, knn_indices, knn_dists, embedding_
    else:
        return graph, knn_indices, knn_dists, embedding_
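A hedged usage sketch for the densMAP path (requires umap-learn >= 0.5 for the densmap keywords, plus the module-local helpers adj_to_knn, get_conn_dist_graph and update_dict from the same codebase):

import numpy as np

X = np.random.rand(2000, 100).astype(np.float32)  # n_cells x n_genes
mapper, graph, knn_indices, knn_dists, embedding_ = umap_conn_indices_dist_embedding(
    X, n_neighbors=30, densmap=True, dens_lambda=2.0)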
Example #13
def main():

    random.seed(rd.seed)
    umap_time = time.time()
    umap = UMAP(n_components=rd.n_dims, random_state=rd.seed).fit(rd.data)
    umap_time = time.time() - umap_time

    global MAX_E
    global MIN_E
    MAX_E = np.amax(umap.embedding_.T, 1)
    MIN_E = np.amin(umap.embedding_.T, 1)

    global v
    v = fuzzy_simplicial_set(rd.data, rd.nearest_neighbors,
                             np.random.RandomState(rd.seed),
                             "euclidean")[0].todense()
    print("UMAP Embedding Cost: {}".format(umap_cost(umap.embedding_, v)))

    print(rd.labels)

    num_classes = len(set(rd.labels))
    print("%d classes found." % num_classes)
    # distance_vector = pairwise_distances(rd.data)

    pset = gp.PrimitiveSet("MAIN", rd.num_features, prefix="f")
    pset.context["array"] = np.array
    REP.init_primitives(pset, rd.use_ercs)

    creator.create("FitnessMin", base.Fitness, weights=(-1.0, ))

    # set up toolbox
    toolbox = base.Toolbox()
    init_toolbox(toolbox, pset, rd.n_dims)

    toolbox.register("evaluate",
                     evaluate,
                     toolbox=toolbox,
                     data=rd.data,
                     metric=rd.measure,
                     embedding=umap.embedding_)

    pop = toolbox.population(n=rd.pop)
    hof = tools.HallOfFame(1)

    stats = init_stats()

    gp_time = time.time()
    pop, logbook = eaSimple(pop,
                            toolbox,
                            CXPB,
                            MUTPB,
                            ELITISM,
                            rd.gens,
                            stats,
                            halloffame=hof,
                            verbose=True)
    gp_time = time.time() - gp_time

    # TODO: re-implement outputting of run data

    for chapter in logbook.chapters:
        logbook_df = pd.DataFrame(logbook.chapters[chapter])
        logbook_df.to_csv("{}/{}_{}.csv".format(rd.outdir, chapter, rd.seed),
                          index=False)

    best = hof[0]
    res = final_evaluation(best, rd.data, rd.labels, umap, toolbox, gp_time,
                           umap_time)
    # evaluate(best, toolbox, data, num_classes, 'silhouette_pre', distance_vector=distance_vector,
    #          plot_sil=True)

    best_embedding = REP.process_data(best, toolbox, rd.data)
    write_embedding_to_file(best_embedding)

    write_ind_to_file(best, rd.seed, res)

    return pop, stats, hof
Example #14
    def fit(self, data, callback):
        encoder = self.network
        batch_size = self.batch_size

        device = self.device
        print('Device:', device)

        ua, ub = find_ab_params(SPREAD, MIN_DIST)
        print('a:', ua, 'b:', ub)

        print('calc V')
        V_csc = fuzzy_simplicial_set(data,
                                     n_neighbors=15,
                                     random_state=np.random.RandomState(42),
                                     metric='euclidean')
        if isinstance(V_csc, tuple):
            # umap-learn >= 0.4 returns (graph, sigmas, rhos); keep the graph
            V_csc = V_csc[0]

        print('Make Graph')
        graph, epochs_per_sample, epochs_per_negative_sample = make_epochs_per_sample_from_P(
            V_csc, self.n_epochs, self.neg_rate)
        epoch_of_next_negative_sample = epochs_per_negative_sample.copy()
        epoch_of_next_sample = epochs_per_sample.copy()
        head = graph.row
        tail = graph.col

        print('Trying to put X into GPU')
        X = torch.from_numpy(data).float()
        X = X.to(device)
        self.X = X

        init_lr = 1e-3
        encoder = encoder.to(device)
        optimizer = optim.RMSprop(encoder.parameters(),
                                  lr=init_lr,
                                  weight_decay=0)

        rnd_max_idx = X.shape[0]
        print('optimizing...')
        grad_log = []
        rgrad_log = []
        for epoch in range(1, self.n_epochs):

            batch_i = []
            batch_j = []

            batch_neg_i = []
            for i in range(epochs_per_sample.shape[0]):
                if epoch_of_next_sample[i] <= epoch:
                    i_idx, j_idx = head[i], tail[i]
                    batch_i.append(i_idx)
                    batch_j.append(j_idx)

                    epoch_of_next_sample[i] += epochs_per_sample[i]

                    n_neg_samples = int(
                        (epoch - epoch_of_next_negative_sample[i]) /
                        epochs_per_negative_sample[i])
                    for _ in range(n_neg_samples):
                        batch_neg_i.append(i_idx)

                    epoch_of_next_negative_sample[i] += (
                        n_neg_samples * epochs_per_negative_sample[i])
            batch_neg_j = torch.randint(0, rnd_max_idx,
                                        (len(batch_neg_i), )).tolist()
            batch_r = torch.zeros(
                len(batch_i), dtype=torch.long).tolist() + torch.ones(
                    len(batch_neg_i), dtype=torch.long).tolist()

            batch_i += batch_neg_i
            batch_j += batch_neg_j

            rnd_perm = torch.randperm(len(batch_i))
            batch_i = torch.Tensor(batch_i).long()[rnd_perm]
            batch_j = torch.Tensor(batch_j).long()[rnd_perm]
            batch_r = torch.Tensor(batch_r).long()[rnd_perm]

            loss_total = []
            update_time = []

            for i in range(0, len(batch_i), batch_size):
                start_time = timeit.default_timer()
                bi = batch_i[i:i + batch_size]
                bj = batch_j[i:i + batch_size]
                br = batch_r[i:i + batch_size]

                optimizer.zero_grad()

                Y_bi = encoder(X[bi])
                Y_bj = encoder(X[bj])
                Y_bj[br == 1] = Y_bj[br == 1].detach()

                d = (Y_bi - Y_bj).pow(2).sum(dim=1)

                def reject_outliers(data, m=2):
                    return data[(data - (data.mean())).abs() < m *
                                (data.std())]

                def hook(grad):
                    grad_clamp = grad.clamp(min=-D_GRAD_CLIP, max=D_GRAD_CLIP)
                    abs_grad = grad_clamp.clone().abs()
                    rgrad = reject_outliers(abs_grad)
                    grad_log.append([
                        abs_grad.max(),
                        abs_grad.min(),
                        abs_grad.mean(),
                        abs_grad.std()
                    ])
                    return grad_clamp

                d.register_hook(hook)
                dp = d.pow(ub)
                w = (1 / (1 + ua * (dp))).clamp(min=0, max=1)

                pw = w[br == 0]
                rw = w[br == 1]
                loss = -(torch.log(pw + EPS)).sum()
                loss += -(torch.log(1 - rw + EPS)).sum()
                loss.backward()
                loss_total.append(loss.item() / len(bi))

                torch.nn.utils.clip_grad_value_(encoder.parameters(), 4)
                optimizer.step()

                elapsed = timeit.default_timer() - start_time
                update_time.append(elapsed)

            new_lr = (1 - epoch / self.n_epochs) * init_lr
            for param_group in optimizer.param_groups:
                param_group['lr'] = new_lr

            callback(self, np.mean(update_time), epoch, np.mean(loss_total))
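make_epochs_per_sample_from_P is a project-local helper; umap-learn itself ships a related make_epochs_per_sample that converts edge weights into per-edge sampling intervals, which is what the epoch loop above consumes. A hedged illustration of the standard call:

from umap.umap_ import make_epochs_per_sample

# weights -> number of epochs between successive samples of each 1-simplex
epochs_per_sample = make_epochs_per_sample(graph.data, n_epochs)
epochs_per_negative_sample = epochs_per_sample / neg_rate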
Example #15
    def fit(self, X, y=None):
        """Generate graph to fit X into an embedded space.
        Optionally use y for supervised dimension reduction.
        Parameters
        ----------
        X : array, shape (n_samples, n_features) or (n_samples, n_samples)
            If the metric is 'precomputed' X must be a square distance
            matrix. Otherwise it contains a sample per row. If the method
            is 'exact', X may be a sparse matrix of type 'csr', 'csc'
            or 'coo'.
        y : array, shape (n_samples)
            A target array for supervised dimension reduction. How this is
            handled is determined by parameters UMAP was instantiated with.
            The relevant attributes are ``target_metric`` and
            ``target_metric_kwds``.
        """

        X = check_array(X, dtype=np.float32, accept_sparse="csr", order="C")
        self._raw_data = X
        self._sparse_data = scipy.sparse.issparse(X)  # used throughout fit below

        # Handle all the optional arguments, setting default
        if self.a is None or self.b is None:
            self._a, self._b = find_ab_params(self.spread, self.min_dist)
        else:
            self._a = self.a
            self._b = self.b

        if isinstance(self.init, np.ndarray):
            init = check_array(self.init,
                               dtype=np.float32,
                               accept_sparse=False)
        else:
            init = self.init

        self._initial_alpha = self.learning_rate

        self._validate_parameters()

        if self.verbose:
            print(str(self))

        self._original_n_threads = numba.get_num_threads()
        if self.n_jobs > 0 and self.n_jobs is not None:
            numba.set_num_threads(self.n_jobs)

        # Check if we should unique the data
        # We've already ensured that we aren't in the precomputed case
        if self.unique:
            # check if the matrix is dense
            if self._sparse_data:
                # Call a sparse unique function
                index, inverse, counts = csr_unique(X)
            else:
                index, inverse, counts = np.unique(
                    X,
                    return_index=True,
                    return_inverse=True,
                    return_counts=True,
                    axis=0,
                )[1:4]
            if self.verbose:
                print(
                    "Unique=True -> Number of data points reduced from ",
                    X.shape[0],
                    " to ",
                    X[index].shape[0],
                )
                most_common = np.argmax(counts)
                print(
                    "Most common duplicate is",
                    index[most_common],
                    " with a count of ",
                    counts[most_common],
                )
        # If we aren't asking for unique use the full index.
        # This will save special cases later.
        else:
            index = list(range(X.shape[0]))
            inverse = list(range(X.shape[0]))

        # Error check n_neighbors based on data size
        if X[index].shape[0] <= self.n_neighbors:
            if X[index].shape[0] == 1:
                self.embedding_ = np.zeros(
                    (1, self.n_components))  # needed for sklearn compatibility
                return self

            warn("n_neighbors is larger than the dataset size; truncating to "
                 "X.shape[0] - 1")
            self._n_neighbors = X[index].shape[0] - 1
            if self.densmap:
                self._densmap_kwds["n_neighbors"] = self._n_neighbors
        else:
            self._n_neighbors = self.n_neighbors

        # Note: unless it causes issues for setting 'index', could move this to
        # initial sparsity check above
        if self._sparse_data and not X.has_sorted_indices:
            X.sort_indices()

        random_state = check_random_state(self.random_state)

        if self.verbose:
            print("Construct fuzzy simplicial set")

        if self.metric == "precomputed" and self._sparse_data:
            # For sparse precomputed distance matrices, we just argsort the rows to find
            # nearest neighbors. To make this easier, we expect matrices that are
            # symmetrical (so we can find neighbors by looking at rows in isolation,
            # rather than also having to consider that sample's column too).
            print("Computing KNNs for sparse precomputed distances...")
            if sparse_tril(X).getnnz() != sparse_triu(X).getnnz():
                raise ValueError(
                    "Sparse precomputed distance matrices should be symmetrical!"
                )
            if not np.all(X.diagonal() == 0):
                raise ValueError(
                    "Non-zero distances from samples to themselves!")
            self._knn_indices = np.zeros((X.shape[0], self.n_neighbors),
                                         dtype=np.int64)
            self._knn_dists = np.zeros(self._knn_indices.shape,
                                       dtype=np.float64)
            for row_id in range(X.shape[0]):
                # Find KNNs row-by-row
                row_data = X[row_id].data
                row_indices = X[row_id].indices
                if len(row_data) < self._n_neighbors:
                    raise ValueError(
                        "Some rows contain fewer than n_neighbors distances!")
                row_nn_data_indices = np.argsort(row_data)[:self._n_neighbors]
                self._knn_indices[row_id] = row_indices[row_nn_data_indices]
                self._knn_dists[row_id] = row_data[row_nn_data_indices]
            (
                self.graph_,
                self._sigmas,
                self._rhos,
                self.graph_dists_,
            ) = fuzzy_simplicial_set(
                X[index],
                self.n_neighbors,
                random_state,
                "precomputed",
                self._metric_kwds,
                self._knn_indices,
                self._knn_dists,
                self.angular_rp_forest,
                self.set_op_mix_ratio,
                self.local_connectivity,
                True,
                self.verbose,
                self.densmap or self.output_dens,
            )
        # Handle small cases efficiently by computing all distances
        elif X[index].shape[
                0] < 4096 and not self.force_approximation_algorithm:
            self._small_data = True
            try:
                # sklearn pairwise_distances fails for callable metric on sparse data
                _m = self.metric if self._sparse_data else self._input_distance_func
                dmat = pairwise_distances(X[index],
                                          metric=_m,
                                          **self._metric_kwds)
            except (ValueError, TypeError) as e:
                # metric is numba.jit'd or not supported by sklearn,
                # fallback to pairwise special

                if self._sparse_data:
                    # Get a fresh metric since we are casting to dense
                    if not callable(self.metric):
                        _m = dist.named_distances[self.metric]
                        dmat = dist.pairwise_special_metric(
                            X[index].toarray(),
                            metric=_m,
                            kwds=self._metric_kwds,
                        )
                    else:
                        dmat = dist.pairwise_special_metric(
                            X[index],
                            metric=self._input_distance_func,
                            kwds=self._metric_kwds,
                        )
                else:
                    dmat = dist.pairwise_special_metric(
                        X[index],
                        metric=self._input_distance_func,
                        kwds=self._metric_kwds,
                    )
            (
                self.graph_,
                self._sigmas,
                self._rhos,
                self.graph_dists_,
            ) = fuzzy_simplicial_set(
                dmat,
                self._n_neighbors,
                random_state,
                "precomputed",
                self._metric_kwds,
                None,
                None,
                self.angular_rp_forest,
                self.set_op_mix_ratio,
                self.local_connectivity,
                True,
                self.verbose,
                self.densmap or self.output_dens,
            )
        else:
            # Standard case
            self._small_data = False
            if self._sparse_data and self.metric in pynn_sparse_named_distances:
                nn_metric = self.metric
            elif not self._sparse_data and self.metric in pynn_named_distances:
                nn_metric = self.metric
            else:
                nn_metric = self._input_distance_func

            (
                self._knn_indices,
                self._knn_dists,
                self._knn_search_index,
            ) = nearest_neighbors(
                X[index],
                self._n_neighbors,
                nn_metric,
                self._metric_kwds,
                self.angular_rp_forest,
                random_state,
                self.low_memory,
                use_pynndescent=True,
                n_jobs=self.n_jobs,
                verbose=self.verbose,
            )

            (
                self.graph_,
                self._sigmas,
                self._rhos,
                self.graph_dists_,
            ) = fuzzy_simplicial_set(
                X[index],
                self.n_neighbors,
                random_state,
                nn_metric,
                self._metric_kwds,
                self._knn_indices,
                self._knn_dists,
                self.angular_rp_forest,
                self.set_op_mix_ratio,
                self.local_connectivity,
                True,
                self.verbose,
                self.densmap or self.output_dens,
            )

        # Currently not checking if any duplicate points have differing labels
        # Might be worth throwing a warning...
        if y is not None:
            if self.densmap:
                raise NotImplementedError(
                    "Supervised embedding is not supported with densMAP.")

            len_X = len(X) if not self._sparse_data else X.shape[0]
            if len_X != len(y):
                raise ValueError(
                    "Length of x = {len_x}, length of y = {len_y}; they must be equal."
                    .format(len_x=len_X, len_y=len(y)))
            y_ = check_array(y, ensure_2d=False)[index]
            if self.target_metric == "categorical":
                if self.target_weight < 1.0:
                    far_dist = 2.5 * (1.0 / (1.0 - self.target_weight))
                else:
                    far_dist = 1.0e12
                self.graph_ = discrete_metric_simplicial_set_intersection(
                    self.graph_, y_, far_dist=far_dist)
            elif self.target_metric in dist.DISCRETE_METRICS:
                if self.target_weight < 1.0:
                    scale = 2.5 * (1.0 / (1.0 - self.target_weight))
                else:
                    scale = 1.0e12
                metric_kws = dist.get_discrete_params(y_, self.target_metric)

                self.graph_ = discrete_metric_simplicial_set_intersection(
                    self.graph_,
                    y_,
                    metric=self.target_metric,
                    metric_kws=metric_kws,
                    metric_scale=scale,
                )
            else:
                if len(y_.shape) == 1:
                    y_ = y_.reshape(-1, 1)
                if self.target_n_neighbors == -1:
                    target_n_neighbors = self._n_neighbors
                else:
                    target_n_neighbors = self.target_n_neighbors

                # Handle the small case as precomputed as before
                if y.shape[0] < 4096:
                    try:
                        ydmat = pairwise_distances(y_,
                                                   metric=self.target_metric,
                                                   **self._target_metric_kwds)
                    except (TypeError, ValueError):
                        ydmat = dist.pairwise_special_metric(
                            y_,
                            metric=self.target_metric,
                            kwds=self._target_metric_kwds,
                        )

                    target_graph, target_sigmas, target_rhos = fuzzy_simplicial_set(
                        ydmat,
                        target_n_neighbors,
                        random_state,
                        "precomputed",
                        self._target_metric_kwds,
                        None,
                        None,
                        False,
                        1.0,
                        1.0,
                        False,
                    )
                else:
                    # Standard case
                    target_graph, target_sigmas, target_rhos = fuzzy_simplicial_set(
                        y_,
                        target_n_neighbors,
                        random_state,
                        self.target_metric,
                        self._target_metric_kwds,
                        None,
                        None,
                        False,
                        1.0,
                        1.0,
                        False,
                    )
                self.graph_ = general_simplicial_set_intersection(
                    self.graph_, target_graph, self.target_weight)
                self.graph_ = reset_local_connectivity(self.graph_)
                self._supervised = True
        else:
            self._supervised = False

        # embed graph
        self.fit_embed_data(X, y, index, inverse)
        return self
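A hedged usage sketch of the sparse precomputed branch described above, using the released umap.UMAP (which implements the same logic): the matrix must be symmetric, have a zero diagonal, and store at least n_neighbors distances per row.

import numpy as np
import scipy.sparse
from sklearn.metrics import pairwise_distances
import umap

X = np.random.rand(200, 8)
D = scipy.sparse.csr_matrix(pairwise_distances(X))  # symmetric, zero diagonal
embedding = umap.UMAP(metric="precomputed", n_neighbors=10).fit_transform(D)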
Example #16
def umap_conn_indices_dist_embedding(
    X,
    n_neighbors=30,
    n_components=2,
    metric="euclidean",
    min_dist=0.1,
    spread=1.0,
    n_epochs=0,
    alpha=1.0,
    gamma=1.0,
    negative_sample_rate=5,
    init_pos="spectral",
    random_state=0,
    return_mapper=True,
    verbose=False,
    **umap_kwargs
):
    """Compute connectivity graph, matrices for kNN neighbor indices, distance matrix and low dimension embedding with UMAP.
    This code is adapted from umap-learn (https://github.com/lmcinnes/umap/blob/97d33f57459de796774ab2d7fcf73c639835676d/umap/umap_.py)

    Arguments
    ---------
        X: sparse matrix (`.X`, dtype `float32`)
            expression matrix (n_cell x n_genes)
        n_neighbors: 'int' (optional, default 30)
            The number of nearest neighbors to compute for each sample in ``X``.
        n_components: 'int' (optional, default 2)
            The dimension of the space to embed into.
        metric: 'str' or `callable` (optional, default `euclidean`)
            The metric to use for the computation.
        min_dist: 'float' (optional, default `0.1`)
            The effective minimum distance between embedded points. Smaller values will result in a more clustered/clumped
            embedding where nearby points on the manifold are drawn closer together, while larger values will result in a
            more even dispersal of points. The value should be set relative to the ``spread`` value, which determines the
            scale at which embedded points will be spread out.
        spread: `float` (optional, default 1.0)
            The effective scale of embedded points. In combination with min_dist this determines how clustered/clumped the
            embedded points are.
        n_epochs: 'int' (optional, default 0)
            The number of training epochs to be used in optimizing the low dimensional embedding. Larger values result in
            more accurate embeddings. If None is specified a value will be selected based on the size of the input dataset
            (200 for large datasets, 500 for small).
        alpha: `float` (optional, default 1.0)
            Initial learning rate for the SGD.
        gamma: `float` (optional, default 1.0)
            Weight to apply to negative samples. Values higher than one will result in greater weight being given to
            negative samples.
        negative_sample_rate: `float` (optional, default 5)
            The number of negative samples to select per positive sample in the optimization process. Increasing this value
             will result in greater repulsive force being applied, greater optimization cost, but slightly more accuracy.
             The number of negative edge/1-simplex samples to use per positive edge/1-simplex sample in optimizing the low
             dimensional embedding.
        init_pos: 'spectral':
            How to initialize the low dimensional embedding. Use a spectral embedding of the fuzzy 1-skeleton
        random_state: `int`, `RandomState` instance or `None`, optional (default: 0)
            If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is
            the random number generator; If None, the random number generator is the RandomState instance used by `numpy.random`.
        verbose: `bool` (optional, default False)
            Controls verbosity of logging.

    Returns
    -------
        graph, knn_indices, knn_dists, embedding_
            A tuple of the kNN graph (`graph`), the indices of each cell's nearest neighbors (`knn_indices`), the distances
            to those neighbors (`knn_dists`) and finally the low dimensional embedding (`embedding_`).
    """

    from sklearn.utils import check_random_state
    from sklearn.metrics import pairwise_distances
    from umap.umap_ import (
        nearest_neighbors,
        fuzzy_simplicial_set,
        simplicial_set_embedding,
        find_ab_params,
    )

    random_state = check_random_state(random_state)

    _raw_data = X

    if X.shape[0] < 4096:  # 1
        dmat = pairwise_distances(X, metric=metric)
        graph = fuzzy_simplicial_set(
            X=dmat,
            n_neighbors=n_neighbors,
            random_state=random_state,
            metric="precomputed",
            verbose=verbose,
        )
        if isinstance(graph, tuple):  # umap-learn >= 0.4 returns (graph, sigmas, rhos)
            graph = graph[0]

        # extract knn_indices, knn_dist
        g_tmp = deepcopy(graph)
        g_tmp[graph.nonzero()] = dmat[graph.nonzero()]
        knn_indices, knn_dists = extract_indices_dist_from_graph(
            g_tmp, n_neighbors=n_neighbors
        )
    else:
        # Standard case
        (knn_indices, knn_dists, rp_forest) = nearest_neighbors(
            X=X,
            n_neighbors=n_neighbors,
            metric=metric,
            metric_kwds={},
            angular=False,
            random_state=random_state,
            verbose=verbose,
        )

        graph = fuzzy_simplicial_set(
            X=X,
            n_neighbors=n_neighbors,
            random_state=random_state,
            metric=metric,
            knn_indices=knn_indices,
            knn_dists=knn_dists,
            angular=rp_forest,
            verbose=verbose,
        )

        _raw_data = X
        _transform_available = True
        _search_graph = scipy.sparse.lil_matrix((X.shape[0], X.shape[0]), dtype=np.int8)
        _search_graph.rows = knn_indices  # An array (self.rows) of rows, each of which is a sorted list of column indices of non-zero elements.
        _search_graph.data = (knn_dists != 0).astype(
            np.int8
        )  # The corresponding nonzero values are stored in similar fashion in self.data.
        _search_graph = _search_graph.maximum(  # Element-wise maximum between this and another matrix.
            _search_graph.transpose()
        ).tocsr()

    if verbose:
        print("Construct embedding")

    a, b = find_ab_params(spread, min_dist)
    if isinstance(graph, tuple):
        graph = graph[0]
    embedding_ = simplicial_set_embedding(
        data=_raw_data,
        graph=graph,
        n_components=n_components,
        initial_alpha=alpha,  # learning_rate
        a=a,
        b=b,
        gamma=gamma,
        negative_sample_rate=negative_sample_rate,
        n_epochs=n_epochs,
        init=init_pos,
        random_state=random_state,
        metric=metric,
        metric_kwds={},
        verbose=verbose,
    )

    if return_mapper:
        import umap
        from .utils import update_dict

        if n_epochs == 0:
            n_epochs = None

        _umap_kwargs = {
            "angular_rp_forest": False,
            "local_connectivity": 1.0,
            "metric_kwds": None,
            "set_op_mix_ratio": 1.0,
            "target_metric": "categorical",
            "target_metric_kwds": None,
            "target_n_neighbors": -1,
            "target_weight": 0.5,
            "transform_queue_size": 4.0,
            "transform_seed": 42,
        }
        umap_kwargs = update_dict(_umap_kwargs, umap_kwargs)

        mapper = umap.UMAP(
            n_neighbors=n_neighbors,
            n_components=n_components,
            metric=metric,
            min_dist=min_dist,
            spread=spread,
            n_epochs=n_epochs,
            learning_rate=alpha,
            repulsion_strength=gamma,
            negative_sample_rate=negative_sample_rate,
            init=init_pos,
            random_state=random_state,
            verbose=verbose,
            **umap_kwargs
        ).fit(X)

        return mapper, graph, knn_indices, knn_dists, embedding_
    else:
        return graph, knn_indices, knn_dists, embedding_