Example #1
    def fit(self, ds: loompy.LoomConnection, mu: np.ndarray = None, sd: np.ndarray = None, totals: np.ndarray = None) -> None:
        self.sd = sd
        self.mu = mu
        self.totals = totals

        if mu is None or sd is None:
            (self.sd, self.mu) = ds.map([np.std, np.mean], axis=0)
        if totals is None:
            self.totals = ds.map([np.sum], chunksize=100, axis=1)[0]
    def _fit(self, ds: loompy.LoomConnection,
             labels: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        logging.info("Computing enrichment statistic")
        n_labels = len(np.unique(labels))
        n_genes, n_cells = ds.shape

        # Number of cells per cluster
        sizes = np.bincount(labels, minlength=n_labels)
        # Number of nonzero values per cluster
        nnz = ds.aggregate(None, None, labels, np.count_nonzero, None)
        # Mean value per cluster
        means = ds.aggregate(None, None, labels, "mean", None)
        # Non-zeros and means over all cells
        (nnz_overall, means_overall) = ds.map([np.count_nonzero, np.mean],
                                              axis=0)
        # Scale by number of cells
        f_nnz = nnz / sizes
        f_nnz_overall = nnz_overall / n_cells

        # Means and fraction non-zero values in other clusters (per cluster)
        means_other = ((means_overall * n_cells)[None].T -
                       (means * sizes)) / (n_cells - sizes)
        f_nnz_other = ((f_nnz_overall * n_cells)[None].T -
                       (f_nnz * sizes)) / (n_cells - sizes)

        # enrichment = (f_nnz + 0.1) / (f_nnz_overall[None].T + 0.1) * (means + 0.01) / (means_overall[None].T + 0.01)
        enrichment = (f_nnz + 0.1) / (f_nnz_other + 0.1) * (means + 0.01) / (
            means_other + 0.01)

        # Select best markers
        if self.valid_genes is None:
            logging.info("Identifying valid genes")
            nnz = ds.map([np.count_nonzero], axis=0)[0]
            self.valid_genes = np.logical_and(nnz > 10,
                                              nnz < ds.shape[1] * 0.6)

        if self.mask is None:
            excluded = set(np.where(~self.valid_genes)[0])
        else:
            excluded = set(np.where(((~self.valid_genes) | self.mask))[0])

        included = np.zeros(n_genes, dtype=bool)
        for ix in range(n_labels):
            enriched = np.argsort(enrichment[:, ix])[::-1]
            n = 0
            count = 0
            while count < self.n_markers_per_cluster:
                if enriched[n] in excluded:
                    n += 1
                    continue
                included[enriched[n]] = True
                excluded.add(enriched[n])
                n += 1
                count += 1
        return (included, enrichment, means)
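A minimal standalone sketch of the enrichment statistic computed in _fit above, run on a toy matrix (the data and names here are illustrative, not part of the original code; only numpy is needed):

import numpy as np

counts = np.array([[0, 3, 5, 0],    # genes x cells
                   [2, 0, 0, 1],
                   [4, 4, 0, 0]])
labels = np.array([0, 0, 1, 1])     # cluster label per cell

n_genes, n_cells = counts.shape
n_labels = labels.max() + 1
sizes = np.bincount(labels, minlength=n_labels)

# Per-cluster means and nonzero fractions, shape (n_genes, n_labels)
means = np.stack([counts[:, labels == k].mean(axis=1) for k in range(n_labels)], axis=1)
f_nnz = np.stack([(counts[:, labels == k] > 0).mean(axis=1) for k in range(n_labels)], axis=1)

# The same statistics over all cells *excluding* each cluster in turn
means_overall = counts.mean(axis=1)
f_nnz_overall = (counts > 0).mean(axis=1)
means_other = ((means_overall * n_cells)[:, None] - means * sizes) / (n_cells - sizes)
f_nnz_other = ((f_nnz_overall * n_cells)[:, None] - f_nnz * sizes) / (n_cells - sizes)

# Regularized in-cluster vs rest-of-cells ratio; high values mark cluster-specific genes
enrichment = (f_nnz + 0.1) / (f_nnz_other + 0.1) * (means + 0.01) / (means_other + 0.01)
print(enrichment.shape)  # (3, 2)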
Example #3
	def fit(self, ds: loompy.LoomConnection, plot: str = None) -> np.ndarray:
		"""
		Fit a classifier and use it to determine cluster predictive power

		Args:
			ds		Dataset
			plot	Filename for optional plot

		Returns:
			Matrix of classification probabilities, shape (n_cells, n_labels)
		"""
		logging.info("Feature selection")
		nnz = ds.map([np.count_nonzero], axis=0)[0]
		valid_genes = np.logical_and(nnz > 5, nnz < ds.shape[1] * 0.5).astype("int")
		ds.ra._Valid = valid_genes

		logging.info("Normalization")
		normalizer = cg.Normalizer(False)
		normalizer.fit(ds)

		logging.info("Feature selection")
		(_, enrichment, _) = cg.MarkerSelection(findq=False, labels_attr="Clusters").fit(ds)
		genes = np.zeros_like(ds.ra.Gene, dtype=bool)
		for ix in range(enrichment.shape[1]):
			genes[np.argsort(-enrichment[:, ix])[:25]] = True

		logging.info("PCA projection")
		pca = cg.PCAProjection(genes, max_n_components=50)
		transformed = pca.fit_transform(ds, normalizer)

		le = LabelEncoder().fit(ds.ca.ClusterName)
		self.le = le
		labels = le.transform(ds.ca.ClusterName)

		train_X, test_X, train_Y, test_Y = train_test_split(transformed, labels, test_size=0.2)
		classifier = RandomForestClassifier(max_depth=30)
		classifier.fit(train_X, train_Y)
		self.report = classification_report(test_Y, classifier.predict(test_X), target_names=le.classes_)
		self.proba = classifier.predict_proba(transformed)

		if plot:
			agg = npg.aggregate(labels, self.proba, axis=0, func="mean")
			plt.imshow(agg, cmap="viridis")
			plt.xticks(np.arange(le.classes_.shape[0]), le.classes_, rotation="vertical", fontsize=7)
			plt.yticks(np.arange(le.classes_.shape[0]), le.classes_, rotation="horizontal", fontsize=7)
			plt.xlabel("Predicted cell type")
			plt.ylabel("Observed cell type")
			plt.title("Predictive power of cluster identities")
			cbar = plt.colorbar()
			cbar.set_label('Average classification probability', rotation=90)
			plt.savefig(plot, bbox_inches="tight")

		return self.proba
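The npg.aggregate call above averages the predicted class probabilities within each observed label; a pure-numpy equivalent on toy data (values are illustrative only):

import numpy as np

labels = np.array([0, 0, 1, 1, 2])           # observed label per cell
proba = np.array([[0.9, 0.1, 0.0],           # (n_cells, n_labels) from predict_proba
                  [0.8, 0.1, 0.1],
                  [0.2, 0.7, 0.1],
                  [0.1, 0.8, 0.1],
                  [0.0, 0.2, 0.8]])

n_labels = labels.max() + 1
agg = np.vstack([proba[labels == k].mean(axis=0) for k in range(n_labels)])
# Row k holds the average probabilities for cells observed in class k;
# a strongly diagonal matrix means the cluster identities are highly predictable.
print(agg.round(2))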
Example #4
    def fit(self,
            ds: loompy.LoomConnection,
            cells: np.ndarray = None,
            mu: np.ndarray = None,
            sd: np.ndarray = None,
            mask: np.ndarray = None) -> np.ndarray:
        """
		Fits a noise model (CV vs mean)

		Args:
			ds (LoomConnection):	Dataset
			n_genes (int):	number of genes to include
			cells (ndarray): cells to include when computing mean and CV (or None)
			mu, std: 		Precomputed mean and standard deviations (optional)

		Returns:
			ndarray of selected genes (list of ints)
		"""
        if mu is None or sd is None:
            (mu, sd) = ds.map((np.mean, np.std), axis=0, selection=cells)

        if "_Valid" in ds.ra:
            valid = ds.ra._Valid == 1
        else:
            valid = np.ones(ds.shape[0], dtype='bool')
        if mask is not None:
            valid = np.logical_and(valid, np.logical_not(mask))
        valid = np.logical_and(valid, ds.row_attrs["Gene"] != "Xist")
        valid = np.logical_and(valid, ds.row_attrs["Gene"] != "Tsix")
        valid = valid.astype('int')

        ok = np.logical_and(mu > 0, sd > 0)
        cv = sd[ok] / mu[ok]
        log2_m = np.log2(mu[ok])
        log2_cv = np.log2(cv)

        svr_gamma = 1000. / len(mu[ok])
        clf = SVR(gamma=svr_gamma)
        clf.fit(log2_m[:, np.newaxis], log2_cv)
        fitted_fun = clf.predict
        # Score is the relative position with respect of the fitted curve
        score = log2_cv - fitted_fun(log2_m[:, np.newaxis])
        score = score * valid[ok]
        self.genes = np.where(ok)[0][np.argsort(score)][-self.n_genes:]

        return self.genes
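A self-contained sketch of the same noise model on synthetic data: the SVR is fit in log2-log2 space and genes are scored by how far their CV sits above the fitted trend (parameter values are illustrative):

import numpy as np
from sklearn.svm import SVR

rng = np.random.default_rng(0)
mu = rng.lognormal(mean=0.0, sigma=2.0, size=500)         # synthetic per-gene means
cv = mu ** -0.5 * np.exp(rng.normal(0.0, 0.3, size=500))  # Poisson-like trend plus noise

log2_m = np.log2(mu)
log2_cv = np.log2(cv)

clf = SVR(gamma=1000.0 / len(mu))
clf.fit(log2_m[:, np.newaxis], log2_cv)

# Score = residual above the fitted mean-CV curve; the highest-scoring genes are selected
score = log2_cv - clf.predict(log2_m[:, np.newaxis])
top_genes = np.argsort(score)[-10:]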
Example #5
    def fit(self, ds: loompy.LoomConnection) -> None:
        # Validating genes
        logging.info("Marking invalid genes")
        nnz = ds.map([np.count_nonzero], axis=0)[0]
        valid_genes = np.logical_and(nnz > 5,
                                     nnz < ds.shape[1] * 0.5).astype("int")
        ds.ra._Valid = valid_genes
        with open(os.path.join(self.classified_dir, "genes.txt"), "w") as f:
            for ix in range(valid_genes.shape[0]):
                f.write(ds.ra.Accession[ix])
                f.write("\t")
                f.write(str(valid_genes[ix]))
                f.write("\n")

        logging.info("Normalization")
        normalizer = cg.Normalizer(True)
        normalizer.fit(ds)
        self.mu = normalizer.mu
        self.sd = normalizer.sd

        logging.info("Feature selection")
        genes = cg.FeatureSelection(2000).fit(ds)

        logging.info("PCA projection")
        self.pca = cg.PCAProjection(genes, max_n_components=50)
        transformed = self.pca.fit_transform(ds, normalizer)

        self.classes = ds.col_attrs["SubclassAssigned"]
        self.le = LabelEncoder().fit(self.classes)
        self.labels = self.le.transform(self.classes)

        train_X, test_X, train_Y, test_Y = train_test_split(transformed,
                                                            self.labels,
                                                            test_size=0.2,
                                                            random_state=0)
        self.classifier = SVC(probability=True)
        self.classifier.fit(train_X, train_Y)
        with open(os.path.join(self.classified_dir, "performance.txt"),
                  "w") as f:
            f.write(
                classification_report(test_Y,
                                      self.classifier.predict(test_X),
                                      target_names=self.le.classes_))
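A minimal sketch of the classification step above on synthetic data (toy features and labels, not the original workflow):

import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))                 # toy feature matrix
y = (X[:, 0] > 0).astype(int)                 # toy class labels

train_X, test_X, train_Y, test_Y = train_test_split(X, y, test_size=0.2, random_state=0)
clf = SVC(probability=True)                   # probability=True enables predict_proba
clf.fit(train_X, train_Y)
print(classification_report(test_Y, clf.predict(test_X)))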
Example #6
    def fit(
        self, ds: loompy.LoomConnection
    ) -> Tuple[sparse.coo_matrix, sparse.coo_matrix, np.ndarray]:
        """
		Discover the manifold

		Returns:
			knn		The knn graph as a sparse matrix
			mknn	Mutual knn subgraph
			pos		2D projection (gt-SNE) as ndarray with shape (n_cells, 2)
		"""
        n_cells = ds.shape[1]
        logging.info("Processing all %d cells", n_cells)
        logging.info("Validating genes")
        nnz = ds.map([np.count_nonzero], axis=0)[0]
        valid_genes = np.logical_and(nnz > 5,
                                     nnz < ds.shape[1] * 0.5).astype("int")
        ds.ra._Valid = valid_genes
        logging.info("%d of %d genes were valid", np.sum(ds.ra._Valid == 1),
                     ds.shape[0])

        logging.info("Normalization")
        normalizer = cg.Normalizer(False)
        normalizer.fit(ds)
        logging.info("Selecting up to %d genes", self.n_genes)
        genes = cg.FeatureSelection(self.n_genes).fit(ds,
                                                      mu=normalizer.mu,
                                                      sd=normalizer.sd)

        logging.info("Loading data for selected genes")
        data = np.zeros((n_cells, genes.shape[0]))
        for (ix, selection, view) in ds.scan(axis=1):
            data[selection, :] = view[genes, :].T  # selection holds global cell indices

        logging.info("Computing initial subspace KNN")
        subspaces = np.ones(data.shape)
        knn = subspace_knn_graph(data, subspaces)
        mknn = knn.minimum(knn.transpose()).tocoo()

        for t in range(5):
            logging.info(f"Refining subspace KNN (iteration {t + 1})")

            logging.info("Louvain clustering")
            graph = nx.from_scipy_sparse_matrix(mknn)
            partitions = community.best_partition(graph)
            labels = np.array(
                [partitions[key] for key in range(mknn.shape[0])])
            ds.ca.Clusters = labels
            n_labels = np.max(labels) + 1
            logging.info(f"Found {n_labels} clusters")

            logging.info("Marker selection")
            (_, enrichment, _) = cg.MarkerSelection(n_markers=10,
                                                    findq=False).fit(ds)
            subspaces = np.zeros(data.shape)
            for ix in range(enrichment.shape[1]):
                for j in range(n_cells):
                    top = np.argsort(-enrichment[:, ix])[:self.n_genes // n_labels]
                    subspaces[j, top] = 1
            knn = subspace_knn_graph(data, subspaces)
            mknn = knn.minimum(knn.transpose()).tocoo()

        perplexity = min(self.k, (n_cells - 1) / 3 - 1)
        logging.info("gt-SNE layout")
        # Note that perplexity argument is ignored in this case, but must still be given
        # because bhtsne will check that it has a valid value
        tsne_pos = cg.TSNE(perplexity=perplexity).layout(data, knn=knn.tocsr())

        return (knn, mknn, tsne_pos)
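The mutual-KNN step above keeps an edge only when both cells list each other as neighbors; a toy sketch of why knn.minimum(knn.transpose()) achieves that (a one-directional edge meets a zero on the transposed side):

import numpy as np
from scipy import sparse

# Toy directed KNN graph over 3 cells: row i marks the neighbors of cell i
knn = sparse.csr_matrix(np.array([[0., 1., 1.],
                                  [1., 0., 0.],
                                  [0., 1., 0.]]))
mknn = knn.minimum(knn.transpose()).tocoo()
print(mknn.toarray())  # only the mutual 0 <-> 1 edge survives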
Example #7
	def _fit(self, ds: loompy.LoomConnection) -> Tuple[np.ndarray, np.ndarray]:
		"""
		Finds n_markers genes per cluster using enrichment score

		Args:
			ds (LoomConnection):	Dataset

		Returns:
			ndarray of selected genes (list of ints)
			ndarray of enrichment scores
		"""
		labels = ds.ca[self.labels_attr]
		n_labels = max(labels) + 1
		n_cells = ds.shape[1]

		# Number of cells per cluster
		sizes = np.bincount(labels, minlength=n_labels)
		# Number of nonzero values per cluster
		nnz = cg.aggregate_loom(ds, None, None, self.labels_attr, np.count_nonzero, None, return_matrix=True)
		# Mean value per cluster
		means = cg.aggregate_loom(ds, None, None, self.labels_attr, "mean", None, return_matrix=True)
		# Non-zeros and means over all cells
		(nnz_overall, means_overall) = ds.map([np.count_nonzero, np.mean], axis=0)
		# Scale by number of cells
		f_nnz = nnz / sizes
		f_nnz_overall = nnz_overall / n_cells

		# Means and fraction non-zero values in other clusters (per cluster)
		means_other = ((means_overall * n_cells)[None].T - (means * sizes)) / (n_cells - sizes)
		f_nnz_other = ((f_nnz_overall * n_cells)[None].T - (f_nnz * sizes)) / (n_cells - sizes)

		# enrichment = (f_nnz + 0.1) / (f_nnz_overall[None].T + 0.1) * (means + 0.01) / (means_overall[None].T + 0.01)
		enrichment = (f_nnz + 0.1) / (f_nnz_other + 0.1) * (means + 0.01) / (means_other + 0.01)

		# Select best markers
		if "_Valid" not in ds.ra:
			logging.info("Recomputing the list of valid genes")
			nnz = ds.map([np.count_nonzero], axis=0)[0]
			valid_genes = np.logical_and(nnz > 10, nnz < ds.shape[1] * 0.6)
			ds.ra._Valid = valid_genes.astype('int')
			
		included: List[int] = []

		if self.mask is None:
			excluded = set(np.where(ds.row_attrs["_Valid"] == 0)[0])
		else:
			excluded = set(np.where(np.logical_or(ds.row_attrs["_Valid"] == 0, self.mask))[0])

		for ix in range(n_labels):
			enriched = np.argsort(enrichment[:, ix])[::-1]
			n = 0
			count = 0
			while count < self.n_markers:
				if enriched[n] in excluded:
					n += 1
					continue
				included.append(enriched[n])
				excluded.add(enriched[n])
				n += 1
				count += 1
		return (np.array(included), enrichment)
Example #8
    def fit(self, ds: loompy.LoomConnection) -> None:
        logging.info(f"Normalizing and selecting {self.n_genes} genes")
        normalizer = Normalizer(False)
        normalizer.fit(ds)
        genes = FeatureSelectionByVariance(self.n_genes,
                                           mask=self.mask).fit(ds)
        self.genes = genes

        if self.factorization == 'PCA' or self.factorization == 'both' or self.batch_keys is not None:
            factorization = "PCA"
        else:
            factorization = "HPF"

        if factorization == "PCA":
            n_components = min(self.n_factors, ds.shape[1])
            logging.info("PCA projection to %d components", n_components)
            pca = PCA(genes,
                      max_n_components=n_components,
                      test_significance=False,
                      batch_keys=self.batch_keys)
            transformed = pca.fit_transform(ds, normalizer)
        else:
            data = ds.sparse(rows=genes).T
            # Subsample to lowest number of UMIs
            if "TotalUMI" in ds.ca:
                totals = ds.ca.TotalUMI
            else:
                totals = ds.map([np.sum], axis=1)[0]
            min_umis = int(np.min(totals))
            logging.debug(f"Subsampling to {min_umis} UMIs")
            temp = data.toarray()
            for c in range(temp.shape[0]):
                temp[c, :] = np.random.binomial(temp[c, :].astype('int32'),
                                                min_umis / totals[c])
            data = sparse.coo_matrix(temp)

            # HPF factorization
            hpf = HPF(k=self.n_factors,
                      validation_fraction=0.05,
                      min_iter=10,
                      max_iter=200,
                      compute_X_ppv=False,
                      n_threads=self.n_threads)
            hpf.fit(data)
            transformed = (
                hpf.theta.T / hpf.theta.sum(axis=1)
            ).T  # Normalize so the sums are one because JSD requires it

        # KNN in latent space
        logging.info(f"Computing KNN (k={self.k_pooling}) in latent space")
        with warnings.catch_warnings():
            warnings.simplefilter(
                "ignore", category=NumbaPerformanceWarning
            )  # Suppress warnings about numba not being able to parallelize code
            warnings.simplefilter(
                "ignore", category=NumbaPendingDeprecationWarning
            )  # Suppress warnings about future deprecations
            warnings.simplefilter(
                "ignore", category=SparseEfficiencyWarning
            )  # Suppress warnings about setting the diagonal to 1
            nn = NNDescent(data=transformed,
                           metric=(jensen_shannon_distance
                                   if factorization == "HPF" else "euclidean"))
            indices, distances = nn.query(transformed, k=self.k_pooling)
            # Note: we convert distances to similarities here, to support Poisson smoothing below
            knn = sparse.csr_matrix(
                (np.ravel(distances), np.ravel(indices),
                 np.arange(0, distances.shape[0] * distances.shape[1] + 1,
                           distances.shape[1])),
                (transformed.shape[0], transformed.shape[0]))
            max_d = knn.data.max()
            knn.data = (max_d - knn.data) / max_d
            knn.setdiag(
                1
            )  # This causes a sparse efficiency warning, but it's not a slow step relative to everything else
            self.knn = knn
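A toy version of the CSR construction above, with arrays shaped like those returned by NNDescent.query (one row of k neighbors per cell), followed by the same distance-to-similarity conversion:

import numpy as np
from scipy import sparse

indices = np.array([[0, 1], [1, 0], [2, 1]])               # (n_cells, k) neighbor ids
distances = np.array([[0.0, 0.5], [0.0, 0.5], [0.0, 0.8]])
n, k = indices.shape

# indptr marks where each cell's k entries start in the flattened data
knn = sparse.csr_matrix((np.ravel(distances), np.ravel(indices),
                         np.arange(0, n * k + 1, k)), (n, n))

# Similarity 1 at distance 0, similarity 0 at the largest observed distance
max_d = knn.data.max()
knn.data = (max_d - knn.data) / max_d
knn.setdiag(1)  # emits a SparseEfficiencyWarning, as noted above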
Example #9
    def fit(self, ds: loompy.LoomConnection) -> None:
        logging.info(f"Running cytograph on {ds.shape[1]} cells")
        if self.config.params.factorization not in ["PCA", "HPF", "both"]:
            raise ValueError(
                "params.factorization must be either 'PCA' or 'HPF' or 'both'")
        if self.config.params.features not in ["enrichment", "variance"]:
            raise ValueError(
                "params.features must be either 'enrichment' or 'variance'")
        if self.config.params.nn_space not in ["PCA", "HPF", "auto"]:
            raise ValueError(
                "params.nn_space must be either 'PCA' or 'HPF' or 'auto'")
        if not ((self.config.params.nn_space in ["PCA", "auto"]
                 and self.config.params.factorization in ["PCA", "both"]) or
                (self.config.params.nn_space in ["HPF", "auto"]
                 and self.config.params.factorization in ["HPF", "both"])):
            raise ValueError(
                f"config.params.nn_space = '{self.config.params.nn_space}' is incompatible with config.params.factorization = '{self.config.params.factorization}'"
            )

        species = Species.detect(ds)
        logging.info(f"Species is '{species.name}'")

        logging.info("Recomputing the list of valid genes")
        nnz = ds.map([np.count_nonzero], axis=0)[0]
        valid_genes = (nnz > 10) & (nnz < ds.shape[1] * 0.6)
        ds.ra.Valid = valid_genes.astype('int')

        # Perform Poisson pooling if requested
        main_layer = ""
        if "poisson_pooling" in self.config.steps:
            logging.info(
                f"Poisson pooling with k_pooling == {self.config.params.k_pooling}"
            )
            main_layer = "pooled"  # if not in config.steps, use the main layer
            pp = PoissonPooling(self.config.params.k_pooling,
                                self.config.params.n_genes,
                                compute_velocity=False,
                                n_threads=self.config.execution.n_cpus,
                                factorization=self.config.params.factorization,
                                batch_keys=self.config.params.batch_keys)
            pp.fit_transform(ds)

        # Select features
        if self.config.params.features == "enrichment":
            logging.info(
                f"Feature selection by enrichment on preliminary clusters")
            with warnings.catch_warnings():
                warnings.simplefilter(
                    "ignore", category=NumbaPerformanceWarning
                )  # Suppress warnings about numba not being able to parallelize code
                warnings.simplefilter(
                    "ignore", category=NumbaPendingDeprecationWarning
                )  # Suppress warnings about future deprecations
                warnings.simplefilter(
                    "ignore", category=SparseEfficiencyWarning
                )  # Suppress warnings about setting the diagonal to 1
                logging.info(f"  Gene selection for PCA")
                genes = FeatureSelectionByVariance(
                    self.config.params.n_genes,
                    mask=Species.mask(ds, self.config.params.mask)).fit(ds)
                logging.info(f"  Factorization by PCA")
                normalizer = Normalizer(False)
                normalizer.fit(ds)
                logging.info("  PCA projection to %d components",
                             self.config.params.n_factors)
                pca = PCA(genes,
                          max_n_components=self.config.params.n_factors,
                          layer=main_layer,
                          test_significance=False,
                          batch_keys=self.config.params.batch_keys)
                transformed = pca.fit_transform(ds, normalizer)
                logging.info(
                    f"  Computing KNN (k={self.config.params.k}) in PCA space")
                nn = NNDescent(data=transformed, metric="euclidean")
                indices, distances = nn.query(transformed,
                                              k=self.config.params.k)
                indices = indices[:, 1:]
                distances = distances[:, 1:]
                knn = sparse.csr_matrix(
                    (np.ravel(distances), np.ravel(indices),
                     np.arange(0, distances.shape[0] * distances.shape[1] + 1,
                               distances.shape[1])),
                    (transformed.shape[0], transformed.shape[0]))

            g = nx.from_scipy_sparse_matrix(knn)
            partitions = community.best_partition(g,
                                                  resolution=1,
                                                  randomize=False)
            ds.ca.Clusters = np.array(
                [partitions[key] for key in range(knn.shape[0])])
            n_labels = ds.ca.Clusters.max() + 1
            genes = FeatureSelectionByEnrichment(
                int(self.config.params.n_genes // n_labels),
                Species.mask(ds, self.config.params.mask),
                findq=False).select(ds)
        elif self.config.params.features == "variance":
            logging.info(f"Feature selection by variance")
            genes = FeatureSelectionByVariance(
                self.config.params.n_genes, main_layer,
                Species.mask(ds, self.config.params.mask)).select(ds)
        logging.info(f"Selected {genes.sum()} genes")

        if self.config.params.factorization in ['PCA', 'both']:
            logging.info(f"Factorization by PCA")
            normalizer = Normalizer(False)
            normalizer.fit(ds)
            n_components = min(self.config.params.n_factors, ds.shape[1])
            logging.info("  PCA projection to %d components", n_components)
            pca = PCA(genes,
                      max_n_components=n_components,
                      layer=main_layer,
                      test_significance=False,
                      batch_keys=self.config.params.batch_keys)
            ds.ca.PCA = pca.fit_transform(ds, normalizer)

        if self.config.params.factorization in ['HPF', 'both']:
            logging.info(f"Factorization by HPF")
            # Load the data for the selected genes
            data = ds[main_layer].sparse(rows=genes).T
            logging.debug(f"  Data shape is {data.shape}")

            # HPF factorization
            hpf = HPF(k=self.config.params.n_factors,
                      validation_fraction=0.05,
                      min_iter=10,
                      max_iter=200,
                      compute_X_ppv=False,
                      n_threads=self.config.execution.n_cpus)
            hpf.fit(data)
            beta_all = np.zeros((ds.shape[0], hpf.beta.shape[1]))
            beta_all[genes] = hpf.beta
            # Save the unnormalized factors
            ds.ra.HPF_beta = beta_all
            ds.ca.HPF_theta = hpf.theta
            # Here we normalize so the sums over components are one, because JSD requires it
            # and because otherwise the components will be exactly proportional to cell size
            theta = (hpf.theta.T / hpf.theta.sum(axis=1)).T
            beta = (hpf.beta.T / hpf.beta.sum(axis=1)).T
            beta_all[genes] = beta
            # Save the normalized factors
            ds.ra.HPF = beta_all
            ds.ca.HPF = theta

        if "nn" in self.config.steps or "clustering" in self.config.steps:
            if self.config.params.nn_space in ["PCA", "auto"
                                               ] and "PCA" in ds.ca:
                transformed = ds.ca.PCA
                metric = "euclidean"
            elif self.config.params.nn_space in ["HPF", "auto"
                                                 ] and "HPF" in ds.ca:
                transformed = ds.ca.HPF
                metric = "js"
            logging.info(
                f"Computing balanced KNN (k = {self.config.params.k}) in {self.config.params.nn_space} space using the '{metric}' metric"
            )
            bnn = BalancedKNN(k=self.config.params.k,
                              metric=metric,
                              maxl=2 * self.config.params.k,
                              sight_k=2 * self.config.params.k,
                              n_jobs=-1)
            bnn.fit(transformed)
            knn = bnn.kneighbors_graph(mode='distance')
            knn.eliminate_zeros()
            mknn = knn.minimum(knn.transpose())
            # Convert distances to similarities
            max_d = knn.data.max()
            knn.data = (max_d - knn.data) / max_d
            mknn.data = (max_d - mknn.data) / max_d
            ds.col_graphs.KNN = knn
            ds.col_graphs.MKNN = mknn
            mknn = mknn.tocoo()
            mknn.setdiag(0)
            # Compute the effective resolution
            d = 1 - knn.data
            radius = np.percentile(d, 90)
            logging.info(f"  90th percentile radius: {radius:.02}")
            ds.attrs.radius = radius
            inside = mknn.data > 1 - radius
            rnn = sparse.coo_matrix(
                (mknn.data[inside], (mknn.row[inside], mknn.col[inside])),
                shape=mknn.shape)
            ds.col_graphs.RNN = rnn

        if "embeddings" in self.config.steps or "clustering" in self.config.steps:
            logging.info(f"Computing 2D and 3D embeddings from latent space")
            metric_f = (
                jensen_shannon_distance if metric == "js" else metric
            )  # Replace js with the actual function, since OpenTSNE doesn't understand js
            logging.info(f"  Art of tSNE with {metric} distance metric")
            ds.ca.TSNE = np.array(
                art_of_tsne(transformed, metric=metric_f)
            )  # art_of_tsne returns a TSNEEmbedding, which can be cast to an ndarray (it's actually just a subclass)
            logging.info(f"  UMAP with {metric} distance metric")
            ds.ca.UMAP = UMAP(n_components=2,
                              metric=metric_f,
                              n_neighbors=self.config.params.k // 2,
                              learning_rate=0.3,
                              min_dist=0.25).fit_transform(transformed)
            ds.ca.UMAP3D = UMAP(n_components=3,
                                metric=metric_f,
                                n_neighbors=self.config.params.k // 2,
                                learning_rate=0.3,
                                min_dist=0.25).fit_transform(transformed)

        if "clustering" in self.config.steps:
            logging.info("Clustering by polished Louvain")
            pl = PolishedLouvain(outliers=False, graph="RNN", embedding="TSNE")
            labels = pl.fit_predict(ds)
            ds.ca.ClustersModularity = labels - min(labels)  # shift so the smallest label becomes 0
            ds.ca.OutliersModularity = (labels == -1).astype('int')

            logging.info("Clustering by polished Surprise")
            ps = PolishedSurprise(graph="RNN", embedding="TSNE")
            labels = ps.fit_predict(ds)
            ds.ca.ClustersSurprise = labels - min(labels)  # shift so the smallest label becomes 0
            ds.ca.OutliersSurprise = (labels == -1).astype('int')

            if self.config.params.clusterer == "louvain":
                ds.ca.Clusters = ds.ca.ClustersModularity
                ds.ca.Outliers = ds.ca.OutliersModularity
            else:
                ds.ca.Clusters = ds.ca.ClustersSurprise
                ds.ca.Outliers = ds.ca.OutliersSurprise

            logging.info(f"Found {ds.ca.Clusters.max() + 1} clusters")

        if species.name in ["H**o sapiens", "Mus musculus"]:
            logging.info(f"Inferring cell cycle")
            CellCycleAnnotator(species).annotate(ds)
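A standalone sketch of the RNN (radius nearest neighbours) pruning used in the manifold step above: edges of the mutual-KNN graph survive only if their similarity exceeds 1 - radius, where radius is the 90th percentile of the KNN distances (toy random similarities, illustrative only):

import numpy as np
from scipy import sparse

rng = np.random.default_rng(0)
n = 30
sim = rng.uniform(0.0, 1.0, size=100)            # toy edge similarities in [0, 1]
row = rng.integers(0, n, size=100)
col = rng.integers(0, n, size=100)
mknn = sparse.coo_matrix((sim, (row, col)), shape=(n, n))

d = 1 - mknn.data                                # convert similarities back to distances
radius = np.percentile(d, 90)                    # 90th percentile distance
inside = mknn.data > 1 - radius                  # keep only edges inside the radius
rnn = sparse.coo_matrix((mknn.data[inside], (mknn.row[inside], mknn.col[inside])),
                        shape=mknn.shape)
print(mknn.nnz, "->", rnn.nnz)                   # roughly 10% of edges are dropped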