Exemplo n.º 1
0
	def fit(self, ds: loompy.LoomConnection, plot: str = None) -> np.ndarray:
		"""
		Fit a classifier and use it to determine cluster predictive power

		Side effects: writes ds.ra._Valid, and stores the label encoder, the
		classification report and the probability matrix on self (self.le,
		self.report, self.proba).

		Args:
			ds		Dataset
			plot	Filename for optional plot

		Returns:
			Matrix of classification probabilities, shape (n_cells, n_labels)
		"""
		logging.info("Feature selection")
		# A gene is valid when it is detected in more than 5 cells but in fewer than half of all cells
		nnz = ds.map([np.count_nonzero], axis=0)[0]
		valid_genes = np.logical_and(nnz > 5, nnz < ds.shape[1] * 0.5).astype("int")
		ds.ra._Valid = valid_genes

		logging.info("Normalization")
		normalizer = cg.Normalizer(False)
		normalizer.fit(ds)

		logging.info("Feature selection")
		(_, enrichment, _) = cg.MarkerSelection(findq=False, labels_attr="Clusters").fit(ds)
		# Union of the 25 highest-scoring genes of every enrichment column
		genes = np.zeros_like(ds.ra.Gene, dtype=bool)
		for ix in range(enrichment.shape[1]):
			genes[np.argsort(-enrichment[:, ix])[:25]] = True

		logging.info("PCA projection")
		pca = cg.PCAProjection(genes, max_n_components=50)
		transformed = pca.fit_transform(ds, normalizer)

		le = LabelEncoder().fit(ds.ca.ClusterName)
		self.le = le
		labels = le.transform(ds.ca.ClusterName)

		# Hold out 20% of the cells for the report; the split is not seeded, so it differs between runs
		train_X, test_X, train_Y, test_Y = train_test_split(transformed, labels, test_size=0.2)
		classifier = RandomForestClassifier(max_depth=30)
		classifier.fit(train_X, train_Y)
		self.report = classification_report(test_Y, classifier.predict(test_X), target_names=le.classes_)
		# Probabilities are computed for all cells, including those used for training
		self.proba = classifier.predict_proba(transformed)

		if plot:
			# Draw on a dedicated figure and always close it, so that repeated calls do not
			# pile images/colorbars onto the caller's current figure or leak open figures
			fig = plt.figure()
			try:
				# Mean predicted probability per observed label
				agg = npg.aggregate(labels, self.proba, axis=0, func="mean")
				plt.imshow(agg, cmap="viridis")
				ticks = np.arange(le.classes_.shape[0])
				plt.xticks(ticks, le.classes_, rotation="vertical", fontsize=7)
				plt.yticks(ticks, le.classes_, rotation="horizontal", fontsize=7)
				plt.xlabel("Predicted cell type")
				plt.ylabel("Observed cell type")
				plt.title("Predictive power of cluster identities")
				cbar = plt.colorbar()
				cbar.set_label('Average classification probability', rotation=90)
				plt.savefig(plot, bbox_inches="tight")
			finally:
				plt.close(fig)

		return self.proba
Exemplo n.º 2
0
    def fit(
        self, ds: loompy.LoomConnection
    ) -> Tuple[sparse.coo_matrix, sparse.coo_matrix, np.ndarray]:
        """
        Discover the manifold

        Side effects: writes ds.ra._Valid and overwrites ds.ca.Clusters on
        every refinement iteration. Reads self.n_genes and self.k.

        Returns:
            knn     The knn graph as a sparse matrix
            mknn    Mutual knn subgraph
            pos     2D projection (gt-SNE) as ndarray with shape (n_cells, 2)
        """
        n_cells = ds.shape[1]
        logging.info("Processing all %d cells", n_cells)
        logging.info("Validating genes")
        # A gene is valid when it is detected in more than 5 cells but in fewer than half of all cells
        nnz = ds.map([np.count_nonzero], axis=0)[0]
        valid_genes = np.logical_and(nnz > 5,
                                     nnz < ds.shape[1] * 0.5).astype("int")
        ds.ra._Valid = valid_genes
        logging.info("%d of %d genes were valid", np.sum(ds.ra._Valid == 1),
                     ds.shape[0])

        logging.info("Normalization")
        normalizer = cg.Normalizer(False)
        normalizer.fit(ds)
        logging.info("Selecting up to %d genes", self.n_genes)
        genes = cg.FeatureSelection(self.n_genes).fit(ds,
                                                      mu=normalizer.mu,
                                                      sd=normalizer.sd)

        logging.info("Loading data for selected genes")
        data = np.zeros((n_cells, genes.shape[0]))
        for (ix, selection, view) in ds.scan(axis=1):
            # NOTE(review): rows are addressed relative to the batch start (selection - ix);
            # if scan() yields absolute column indices, every batch would overwrite the
            # first rows of `data` -- confirm against the loompy version in use
            data[selection - ix, :] = view[genes, :].T

        logging.info("Computing initial subspace KNN")
        subspaces = np.ones(data.shape)
        knn = subspace_knn_graph(data, subspaces)
        mknn = knn.minimum(knn.transpose()).tocoo()

        for t in range(5):
            logging.info(f"Refining subspace KNN (iteration {t + 1})")

            logging.info("Louvain clustering")
            graph = nx.from_scipy_sparse_matrix(mknn)
            partitions = community.best_partition(graph)
            labels = np.array(
                [partitions[key] for key in range(mknn.shape[0])])
            ds.ca.Clusters = labels
            n_labels = np.max(labels) + 1
            logging.info(f"Found {n_labels} clusters")

            logging.info("Marker selection")
            (_, enrichment, _) = cg.MarkerSelection(n_markers=10,
                                                    findq=False).fit(ds)
            subspaces = np.zeros(data.shape)
            # The top genes of a cluster do not depend on the cell, so rank them once per
            # cluster and switch the columns on for all rows at once (previously the sort
            # was repeated for every single cell)
            n_top = self.n_genes // n_labels
            for ix in range(enrichment.shape[1]):
                # NOTE(review): these indices come from the rows of `enrichment`, while the
                # columns of `subspaces` follow the selected `genes` -- verify that both use
                # the same index space
                top_genes = np.argsort(-enrichment[:, ix])[:n_top]
                subspaces[:, top_genes] = 1
            knn = subspace_knn_graph(data, subspaces)
            mknn = knn.minimum(knn.transpose()).tocoo()

        perplexity = min(self.k, (n_cells - 1) / 3 - 1)
        logging.info("gt-SNE layout")
        # Note that perplexity argument is ignored in this case, but must still be given
        # because bhtsne will check that it has a valid value
        tsne_pos = cg.TSNE(perplexity=perplexity).layout(data, knn=knn.tocsr())

        return (knn, mknn, tsne_pos)
Exemplo n.º 3
0
	def fit(self, ds: loompy.LoomConnection) -> Tuple[sparse.coo_matrix, sparse.coo_matrix, np.ndarray]:
		"""
		Discover the manifold

		Only cells with ds.col_attrs["_Valid"] == 1 take part; the results are mapped
		back to the full set of cells before returning.

		Instance attributes read:
			n_genes				Number of genes to use for manifold learning (ignored if genes is not None)
			genes				List of genes to use for manifold learning
			gtsne				Use graph t-SNE for layout (otherwise standard t-SNE)
			alpha				The scale parameter for multiscale KNN
			filter_cellcycle	Optional path to a whitespace-separated list of genes to mask out
			layer				Layer passed to the PCA projection

		Side effects: sets the row attribute "_Selected" and the column attribute
		"Clusters" (-1 for excluded cells) on ds.

		Args:
			ds		Dataset

		Returns:
			knn		The multiscale knn graph as a sparse matrix, with k = min(100, n_valid - 1)
			mknn	Mutual subgraph of the knn edges with weight > 0.05
			pos		2D projection (t-SNE or gt-SNE) as ndarray with shape (n_cells, 2);
					excluded cells are placed at the minimum coordinates of the layout
		"""
		n_valid = np.sum(ds.col_attrs["_Valid"] == 1)
		n_total = ds.shape[1]
		logging.info("%d of %d cells were valid", n_valid, n_total)
		logging.info("%d of %d genes were valid", np.sum(ds.row_attrs["_Valid"] == 1), ds.shape[0])
		cells = np.where(ds.col_attrs["_Valid"] == 1)[0]

		logging.info("Normalization")
		normalizer = cg.Normalizer(False)
		normalizer.fit(ds)

		if self.filter_cellcycle is not None:
			# Read the gene list with a context manager so the file handle is always closed
			with open(self.filter_cellcycle) as gene_file:
				cell_cycle_genes = np.array(gene_file.read().split())
			mask = np.in1d(ds.ra.Gene, cell_cycle_genes)
			if np.sum(mask) == 0:
				logging.warning("None cell cycle genes where filtered, check your gene list")
		else:
			mask = None

		if self.genes is None:
			# First pass: cluster on variable genes, then use the cluster markers as the gene set
			logging.info("Selecting up to %d genes", self.n_genes)
			genes = cg.FeatureSelection(self.n_genes).fit(ds, mu=normalizer.mu, sd=normalizer.sd, mask=mask)
			temp = np.zeros(ds.shape[0])
			temp[genes] = 1
			ds.set_attr("_Selected", temp, axis=0)
			logging.info("%d genes selected", temp.sum())

			n_components = min(50, n_valid)
			logging.info("PCA projection to %d components", n_components)
			pca = cg.PCAProjection(genes, max_n_components=n_components, layer=self.layer)
			pca_transformed = pca.fit_transform(ds, normalizer, cells=cells)
			transformed = pca_transformed

			logging.info("Generating KNN graph")
			k = min(10, n_valid - 1)
			nn = NearestNeighbors(n_neighbors=k, algorithm="ball_tree", n_jobs=4)
			nn.fit(transformed)
			knn = nn.kneighbors_graph(mode='connectivity')
			knn = knn.tocoo()
			mknn = knn.minimum(knn.transpose()).tocoo()

			logging.info("Louvain-Jaccard clustering")
			lj = cg.LouvainJaccard(resolution=1)
			labels = lj.fit_predict(knn)

			# Make labels for excluded cells == -1
			labels_all = np.zeros(ds.shape[1], dtype='int') + -1
			labels_all[cells] = labels
			ds.set_attr("Clusters", labels_all, axis=1)
			n_labels = np.max(labels) + 1
			logging.info("Found " + str(n_labels) + " LJ clusters")

			logging.info("Marker selection")
			(genes, _, _) = cg.MarkerSelection(n_markers=int(500 / n_labels), mask=mask).fit(ds)
		else:
			genes = self.genes

		# Second pass: same pipeline on the chosen gene set
		temp = np.zeros(ds.shape[0])
		temp[genes] = 1
		ds.set_attr("_Selected", temp, axis=0)
		logging.info("%d genes selected", temp.sum())

		n_components = min(50, n_valid)
		logging.info("PCA projection to %d components", n_components)
		pca = cg.PCAProjection(genes, max_n_components=n_components, layer=self.layer)
		pca_transformed = pca.fit_transform(ds, normalizer, cells=cells)
		transformed = pca_transformed

		logging.info("Generating KNN graph")
		k = min(10, n_valid - 1)
		nn = NearestNeighbors(n_neighbors=k, algorithm="ball_tree", n_jobs=4)
		nn.fit(transformed)
		knn = nn.kneighbors_graph(mode='connectivity')
		knn = knn.tocoo()
		mknn = knn.minimum(knn.transpose()).tocoo()

		logging.info("Louvain-Jaccard clustering")
		lj = cg.LouvainJaccard(resolution=1)
		labels = lj.fit_predict(knn)

		# Make labels for excluded cells == -1
		labels_all = np.zeros(ds.shape[1], dtype='int') + -1
		labels_all[cells] = labels
		ds.set_attr("Clusters", labels_all, axis=1)
		n_labels = np.max(labels) + 1
		logging.info("Found " + str(n_labels) + " LJ clusters")

		logging.info("Marker selection")
		# NOTE(review): unlike the first pass, no mask is passed here, so masked genes
		# may be selected again -- confirm this is intended
		(genes, _, _) = cg.MarkerSelection(n_markers=int(500 / n_labels)).fit(ds)

		# Select cells across clusters more uniformly, preventing a single cluster from dominating the PCA
		cells_adjusted = cg.cap_select(labels, cells, int(n_valid * 0.2))
		n_components = min(50, cells_adjusted.shape[0])
		logging.info("PCA projection to %d components", n_components)
		pca = cg.PCAProjection(genes, max_n_components=n_components)
		pca.fit(ds, normalizer, cells=cells_adjusted)
		# Note that here we're transforming all cells; we just did the fit on the selection
		transformed = pca.transform(ds, normalizer, cells=cells)

		k = min(100, n_valid - 1)
		logging.info("Generating multiscale KNN graph (k = %d)", k)
		nn = NearestNeighbors(n_neighbors=k, algorithm="ball_tree", n_jobs=4)
		nn.fit(transformed)
		knn = nn.kneighbors(return_distance=False)  # shape: (n_cells, k)
		n_cells = knn.shape[0]
		# Edge list in neighbour-rank-major order: all 1st neighbours, then all 2nd neighbours, ...
		a = np.tile(np.arange(n_cells), k)
		b = np.reshape(knn.T, (n_cells * k,))
		# The edge to the r-th nearest neighbour gets weight 1 / r ** alpha
		w = np.repeat(1 / np.power(np.arange(1, k + 1), self.alpha), n_cells)
		knn = sparse.coo_matrix((w, (a, b)), shape=(n_cells, n_cells))
		threshold = w > 0.05
		mknn = sparse.coo_matrix((w[threshold], (a[threshold], b[threshold])), shape=(n_cells, n_cells))
		mknn = mknn.minimum(mknn.transpose()).tocoo()

		perplexity = min(k, (n_valid - 1) / 3 - 1)
		if self.gtsne:
			logging.info("gt-SNE layout")
			# Note that perplexity argument is ignored in this case, but must still be given
			# because bhtsne will check that it has a valid value
			tsne_pos = cg.TSNE(perplexity=perplexity).layout(transformed, knn=knn.tocsr())
		else:
			logging.info("t-SNE layout")
			tsne_pos = cg.TSNE(perplexity=perplexity).layout(transformed)
		# Excluded cells are parked at the per-axis minimum of the layout
		tsne_all = np.zeros((ds.shape[1], 2), dtype='int') + np.min(tsne_pos, axis=0)
		tsne_all[cells] = tsne_pos

		# Transform back to the full set of cells
		knn = sparse.coo_matrix((knn.data, (cells[knn.row], cells[knn.col])), shape=(n_total, n_total))
		mknn = sparse.coo_matrix((mknn.data, (cells[mknn.row], cells[mknn.col])), shape=(n_total, n_total))

		return (knn, mknn, tsne_all)
Exemplo n.º 4
0
	def fit(self, ds: loompy.LoomConnection, initial_pos: np.ndarray = None, nng: np.ndarray = None, blocked_genes: np.ndarray = None) -> Tuple[sparse.coo_matrix, sparse.coo_matrix, np.ndarray]:
		"""
		Discover the manifold

		Instance attributes read:
			n_genes				Number of genes to use for manifold learning (ignored if genes is not None)
			genes				List of genes to use for manifold learning
			gtsne				Use graph t-SNE for layout (otherwise standard t-SNE)
			alpha				The scale parameter for multiscale KNN
			k					Number of neighbours (capped at n_cells - 1)
			max_iter			Passed to the t-SNE layout
			filter_cellcycle	Optional path to a whitespace-separated list of genes to mask out

		Side effects: sets ds.ra._Selected. ds.ca.Clusters is overwritten for marker
		selection and restored afterwards only if it existed before the call.

		Args:
			ds				Dataset
			initial_pos		Use this initial layout, shape (ds.shape[1], 2)
			nng				Non-neuronal genes (mask array); accepted but not used in this method
			blocked_genes	Don't use these genes (mask array)

		Returns:
			knn		The multiscale knn graph as a sparse matrix, with k = min(self.k, n_cells - 1)
			mknn	Mutual subgraph of the knn edges with weight > 0.025
			pos		2D projection (t-SNE or gt-SNE) as ndarray with shape (n_cells, 2)
		"""
		n_cells = ds.shape[1]
		logging.info("Processing all %d cells", n_cells)
		logging.info("%d of %d genes were valid", np.sum(ds.row_attrs["_Valid"] == 1), ds.shape[0])

		logging.info("Normalization")
		normalizer = cg.Normalizer(False)
		normalizer.fit(ds)

		if self.filter_cellcycle is not None:
			# Read the gene list with a context manager so the file handle is always closed
			with open(self.filter_cellcycle) as gene_file:
				cell_cycle_genes = np.array(gene_file.read().split())
			mask = np.in1d(ds.ra.Gene, cell_cycle_genes)
			if np.sum(mask) == 0:
				logging.warning("None cell cycle genes where filtered, check your gene list")
		else:
			mask = None

		if blocked_genes is not None:
			if mask is None:
				mask = blocked_genes
			else:
				# Both masks mark genes to exclude, so a gene must be dropped when it appears
				# in either of them (an intersection would silently un-block genes)
				mask = mask | blocked_genes

		if self.genes is None:
			logging.info("Selecting up to %d genes", self.n_genes)
			genes = cg.FeatureSelection(self.n_genes).fit(ds, mu=normalizer.mu, sd=normalizer.sd, mask=mask)

			n_components = min(50, n_cells)
			logging.info("PCA projection to %d components", n_components)
			pca = cg.PCAProjection(genes, max_n_components=n_components)
			pca_transformed = pca.fit_transform(ds, normalizer)
			transformed = pca_transformed

			logging.info("Generating balanced KNN graph")
			np.random.seed(0)
			k = min(self.k, n_cells - 1)
			bnn = cg.BalancedKNN(k=k, maxl=2 * k, sight_k=2 * k)
			bnn.fit(transformed)
			knn = bnn.kneighbors_graph(mode='connectivity')
			knn = knn.tocoo()
			mknn = knn.minimum(knn.transpose()).tocoo()

			logging.info("MKNN-Louvain clustering with outliers")
			random.seed(13)
			lj = cg.LouvainJaccard(resolution=1, jaccard=False)
			labels = lj.fit_predict(knn)
			# Clusters with fewer than 10 cells become outliers (-1); the rest are renumbered 0..n-1
			bigs = np.where(np.bincount(labels) >= 10)[0]
			mapping = {big: new_label for new_label, big in enumerate(bigs)}
			labels = np.array([mapping.get(x, -1) for x in labels])

			n_labels = np.max(labels) + 1
			logging.info("Found " + str(n_labels) + " clusters")

			logging.info("Marker selection")
			# Marker selection reads the clusters from the file, so swap in the temporary
			# labels (shifted to be non-negative) and put back any previous attribute afterwards
			temp = None
			if "Clusters" in ds.ca:
				temp = ds.ca.Clusters
			ds.ca.Clusters = labels - labels.min()
			(genes, _, _) = cg.MarkerSelection(n_markers=int(500 / n_labels), mask=mask, findq=False).fit(ds)
			if temp is not None:
				ds.ca.Clusters = temp
		else:
			genes = self.genes

		temp = np.zeros(ds.shape[0], dtype='bool')
		temp[genes] = True
		ds.ra._Selected = temp.astype('int')
		logging.info("%d genes selected", temp.sum())

		if self.genes is None:
			# Select cells across clusters more uniformly, preventing a single cluster from dominating the PCA
			cells_adjusted = cg.cap_select(labels - labels.min(), np.arange(n_cells), int(n_cells * 0.2))
			n_components = min(50, cells_adjusted.shape[0])
			logging.info("PCA projection to %d components", n_components)
			pca = cg.PCAProjection(genes, max_n_components=n_components)
			pca.fit(ds, normalizer, cells=cells_adjusted)
		else:
			n_components = min(50, n_cells)
			logging.info("PCA projection to %d components", n_components)
			pca = cg.PCAProjection(genes, max_n_components=n_components)
			pca.fit(ds, normalizer)

		# Note that here we're transforming all cells; we just did the fit on the selection
		transformed = pca.transform(ds, normalizer)

		k = min(self.k, n_cells - 1)
		logging.info("Generating multiscale KNN graph (k = %d)", k)
		bnn = cg.BalancedKNN(k=k, maxl=2 * k, sight_k=2 * k)
		bnn.fit(transformed)
		# Take the neighbour indices and drop the first column
		# (presumably each cell itself -- TODO confirm against BalancedKNN.kneighbors)
		knn = bnn.kneighbors(mode='connectivity')[1][:, 1:]
		n_cells = knn.shape[0]
		# Edge list in neighbour-rank-major order: all 1st neighbours, then all 2nd neighbours, ...
		a = np.tile(np.arange(n_cells), k)
		b = np.reshape(knn.T, (n_cells * k,))
		# The edge to the r-th nearest neighbour gets weight 1 / r ** alpha
		w = np.repeat(1 / np.power(np.arange(1, k + 1), self.alpha), n_cells)
		knn = sparse.coo_matrix((w, (a, b)), shape=(n_cells, n_cells))
		threshold = w > 0.025
		mknn = sparse.coo_matrix((w[threshold], (a[threshold], b[threshold])), shape=(n_cells, n_cells))
		mknn = mknn.minimum(mknn.transpose()).tocoo()

		perplexity = min(k, (n_cells - 1) / 3 - 1)
		if self.gtsne:
			logging.info("gt-SNE layout")
			# Note that perplexity argument is ignored in this case, but must still be given
			# because bhtsne will check that it has a valid value
			tsne_pos = cg.TSNE(perplexity=perplexity, max_iter=self.max_iter).layout(transformed, knn=knn.tocsr(), initial_pos=initial_pos)
		else:
			logging.info("t-SNE layout")
			tsne_pos = cg.TSNE(perplexity=perplexity, max_iter=self.max_iter).layout(transformed, initial_pos=initial_pos)

		return (knn, mknn, tsne_pos)
Exemplo n.º 5
0
    def aggregate(self,
                  ds: loompy.LoomConnection,
                  out_file: str,
                  agg_spec: Dict[str, str] = None) -> None:
        """
        Aggregate the dataset by cluster into out_file and annotate the result

        The aggregated file gets the layers "trinaries" (plus "trinaries_<f>" for
        every additional value when self.f is a list or tuple), "enrichment" and
        "enrichment_q", and the column attributes NCells, Clusters and ClusterScore.
        Clusters are renumbered following the optimal leaf ordering of a Ward
        linkage, and both files are permuted on columns and rows.

        Side effects: ds itself is modified (ds.ca.Clusters is renumbered and the
        file is permuted on both axes).

        Args:
            ds          Dataset; must have a "Clusters" column attribute
            out_file    Path of the aggregated file to create
            agg_spec    Column attribute name -> aggregation method, passed to
                        cg.aggregate_loom; a built-in default is used when None
        """
        if agg_spec is None:
            agg_spec = {
                "Age": "tally",
                "Clusters": "first",
                "Class": "mode",
                "_Total": "mean",
                "Sex": "tally",
                "Tissue": "tally",
                "SampleID": "tally",
                "TissuePool": "first",
                "Outliers": "mean"
            }
        # Cells with a negative cluster label are left out of the counts
        cells = ds.col_attrs["Clusters"] >= 0
        labels = ds.col_attrs["Clusters"][cells]
        n_labels = len(set(labels))

        logging.info("Aggregating clusters by mean")
        cg.aggregate_loom(ds, out_file, None, "Clusters", "mean", agg_spec)
        with loompy.connect(out_file) as dsout:
            logging.info("Trinarizing")
            if isinstance(self.f, (list, tuple)):
                # The first value fills the main layer, every further value gets its own layer.
                # After the loop `trinaries` holds the result for the LAST value; that is
                # what the cluster scores below are computed from.
                for ix, f in enumerate(self.f):
                    trinaries = cg.Trinarizer(f=f).fit(ds)
                    if ix == 0:
                        dsout.layers["trinaries"] = trinaries
                    else:
                        dsout.layers[f"trinaries_{f}"] = trinaries
            else:
                trinaries = cg.Trinarizer(f=self.f).fit(ds)
                dsout.layers["trinaries"] = trinaries

            logging.info("Computing cluster gene enrichment scores")
            (markers, enrichment,
             qvals) = cg.MarkerSelection(self.n_markers).fit(ds)
            dsout.layers["enrichment"] = enrichment
            dsout.layers["enrichment_q"] = qvals

            dsout.ca.NCells = np.bincount(labels, minlength=n_labels)

            # Renumber the clusters
            logging.info(
                "Renumbering clusters by similarity, and permuting columns")
            if "_Selected" in ds.ra:
                genes = (ds.ra._Selected == 1)
            else:
                logging.info("Normalization")
                normalizer = cg.Normalizer(False)
                normalizer.fit(ds)
                logging.info("Selecting up to 1000 genes")
                genes = cg.FeatureSelection(1000).fit(ds,
                                                      mu=normalizer.mu,
                                                      sd=normalizer.sd)

            # Hierarchical clustering of the log-transformed cluster means (clusters as observations)
            data = np.log(dsout[:, :] + 1)[genes, :].T
            D = pdist(data, 'euclidean')
            Z = hc.linkage(D, 'ward')
            optimal_Z = optimal_leaf_ordering(Z, D)
            ordering = hc.leaves_list(optimal_Z)

            # Permute the aggregated file, and renumber
            dsout.permute(ordering, axis=1)
            dsout.ca.Clusters = np.arange(n_labels)

            # Renumber the original file, and permute
            d = dict(zip(ordering, np.arange(n_labels)))
            new_clusters = np.array(
                [d[x] if x in d else -1 for x in ds.ca.Clusters])
            ds.ca.Clusters = new_clusters
            ds.permute(np.argsort(ds.col_attrs["Clusters"]), axis=1)

            # Reorder the genes, markers first, ordered by enrichment in clusters
            logging.info("Permuting rows")
            mask = np.zeros(ds.shape[0], dtype=bool)
            mask[markers] = True
            # fetch enrichment from the aggregated file, so we get it already permuted on the column axis
            gene_order = np.zeros(ds.shape[0], dtype='int')
            gene_order[mask] = np.argmax(dsout.layer["enrichment"][mask, :],
                                         axis=1)
            # Non-markers are offset by the number of clusters so they sort after all markers
            gene_order[~mask] = np.argmax(dsout.layer["enrichment"][~mask, :],
                                          axis=1) + dsout.shape[1]
            gene_order = np.argsort(gene_order)
            ds.permute(gene_order, axis=0)
            dsout.permute(gene_order, axis=0)

            data = trinaries[:, ordering][gene_order, :][:self.n_markers *
                                                         n_labels, :].T
            # NOTE(review): the window of 10 genes per cluster is hard-coded, while the slice
            # above is sized by self.n_markers -- presumably this assumes n_markers == 10; confirm
            dsout.ca.ClusterScore = np.array([
                data[ix, ix * 10:(ix + 1) * 10].sum()
                for ix in range(n_labels)
            ])
Exemplo n.º 6
0
    def fit(
        self, ds: loompy.LoomConnection
    ) -> Tuple[sparse.coo_matrix, sparse.coo_matrix, np.ndarray]:
        """
        Discover the manifold

        Instance attributes read:
            n_genes             Number of genes to use for manifold learning (ignored if genes is not None)
            genes               List of genes to use for manifold learning
            gtsne               Use graph t-SNE for layout (otherwise standard t-SNE)
            alpha               The scale parameter for multiscale KNN
            k                   Number of neighbours (capped at n_cells - 1)
            filter_cellcycle    Optional path to a whitespace-separated list of genes to mask out

        Side effects: sets the row attribute "_Selected" and, when self.genes is
        None, the column attribute "Clusters" on ds.

        Args:
            ds      Dataset

        Returns:
            knn     The multiscale knn graph as a sparse matrix, with k = min(self.k, n_cells - 1)
            mknn    Mutual subgraph of the knn edges with weight > 0.05
            pos     2D projection (t-SNE or gt-SNE) as ndarray with shape (n_cells, 2)
        """
        n_cells = ds.shape[1]
        logging.info("Processing all %d cells", n_cells)
        logging.info("%d of %d genes were valid",
                     np.sum(ds.row_attrs["_Valid"] == 1), ds.shape[0])

        logging.info("Normalization")
        normalizer = cg.Normalizer(False)
        normalizer.fit(ds)

        if self.filter_cellcycle is not None:
            # Read the gene list with a context manager so the file handle is always closed
            with open(self.filter_cellcycle) as gene_file:
                cell_cycle_genes = np.array(gene_file.read().split())
            mask = np.in1d(ds.row_attrs["Gene"], cell_cycle_genes)
            if np.sum(mask) == 0:
                logging.warning(
                    "None cell cycle genes where filtered, check your gene list"
                )
        else:
            mask = None

        if self.genes is None:
            logging.info("Selecting up to %d genes", self.n_genes)
            genes = cg.FeatureSelection(self.n_genes).fit(ds,
                                                          mu=normalizer.mu,
                                                          sd=normalizer.sd,
                                                          mask=mask)

            n_components = min(50, n_cells)
            logging.info("PCA projection to %d components", n_components)
            pca = cg.PCAProjection(genes, max_n_components=n_components)
            pca_transformed = pca.fit_transform(ds, normalizer)
            transformed = pca_transformed

            logging.info("Generating balanced KNN graph")
            k = min(self.k, n_cells - 1)
            bnn = cg.BalancedKNN(k=k, maxl=2 * k)
            bnn.fit(transformed)
            knn = bnn.kneighbors_graph(mode='connectivity')
            knn = knn.tocoo()
            mknn = knn.minimum(knn.transpose()).tocoo()

            logging.info("MKNN-Louvain clustering with outliers")
            (a, b, w) = (mknn.row, mknn.col, mknn.data)
            G = igraph.Graph(list(zip(a, b)),
                             directed=False,
                             edge_attrs={'weight': w})
            VxCl = G.community_multilevel(return_levels=False,
                                          weights="weight")
            labels = np.array(VxCl.membership)
            # Clusters with fewer than 10 cells become outliers (-1); the rest are renumbered 0..n-1
            bigs = np.where(np.bincount(labels) >= 10)[0]
            mapping = {big: new_label for new_label, big in enumerate(bigs)}
            labels = np.array([mapping.get(x, -1) for x in labels])

            # Store the labels on the file (outliers are -1)
            ds.set_attr("Clusters", labels, axis=1)
            n_labels = np.max(labels) + 1
            logging.info("Found " + str(n_labels) + " clusters")

            logging.info("Marker selection")
            (genes, _,
             _) = cg.MarkerSelection(n_markers=int(500 / n_labels)).fit(ds)
        else:
            genes = self.genes

        temp = np.zeros(ds.shape[0])
        temp[genes] = 1
        ds.set_attr("_Selected", temp, axis=0)
        logging.info("%d genes selected", temp.sum())

        if self.genes is None:
            # Select cells across clusters more uniformly, preventing a single cluster from dominating the PCA
            cells_adjusted = cg.cap_select(labels - labels.min(),
                                           np.arange(n_cells),
                                           int(n_cells * 0.2))
            n_components = min(50, cells_adjusted.shape[0])
            logging.info("PCA projection to %d components", n_components)
            pca = cg.PCAProjection(genes, max_n_components=n_components)
            pca.fit(ds, normalizer, cells=cells_adjusted)
        else:
            n_components = min(50, n_cells)
            logging.info("PCA projection to %d components", n_components)
            pca = cg.PCAProjection(genes, max_n_components=n_components)
            pca.fit(ds, normalizer)

        # Note that here we're transforming all cells; we just did the fit on the selection
        transformed = pca.transform(ds, normalizer)

        k = min(self.k, n_cells - 1)
        logging.info("Generating multiscale KNN graph (k = %d)", k)
        bnn = cg.BalancedKNN(k=k, maxl=2 * k)
        bnn.fit(transformed)
        # Take the neighbour indices and drop the first column
        # (presumably each cell itself -- TODO confirm against BalancedKNN.kneighbors)
        knn = bnn.kneighbors(mode='connectivity')[1][:, 1:]
        n_cells = knn.shape[0]
        # Edge list in neighbour-rank-major order: all 1st neighbours, then all 2nd neighbours, ...
        a = np.tile(np.arange(n_cells), k)
        b = np.reshape(knn.T, (n_cells * k, ))
        # The edge to the r-th nearest neighbour gets weight 1 / r ** alpha
        w = np.repeat(1 / np.power(np.arange(1, k + 1), self.alpha), n_cells)
        knn = sparse.coo_matrix((w, (a, b)), shape=(n_cells, n_cells))
        threshold = w > 0.05
        mknn = sparse.coo_matrix((w[threshold], (a[threshold], b[threshold])),
                                 shape=(n_cells, n_cells))
        mknn = mknn.minimum(mknn.transpose()).tocoo()

        perplexity = min(k, (n_cells - 1) / 3 - 1)
        if self.gtsne:
            logging.info("gt-SNE layout")
            # Note that perplexity argument is ignored in this case, but must still be given
            # because bhtsne will check that it has a valid value
            tsne_pos = cg.TSNE(perplexity=perplexity).layout(transformed,
                                                             knn=knn.tocsr())
        else:
            logging.info("t-SNE layout")
            tsne_pos = cg.TSNE(perplexity=perplexity).layout(transformed)

        return (knn, mknn, tsne_pos)