def fit(self, ds: loompy.LoomConnection) -> Tuple[sparse.coo_matrix, sparse.coo_matrix, np.ndarray]:
    """
    Discover the manifold

    Marks valid genes, selects features, builds an initial subspace KNN graph and
    then refines it over five rounds of Louvain clustering and marker selection
    before computing a gt-SNE layout on the final graph.

    Reads ``self.n_genes`` and ``self.k``. Overwrites ``ds.ra._Valid`` and
    ``ds.ca.Clusters`` as a side effect.

    Args:
        ds      The loom dataset to process

    Returns:
        knn     The knn graph as a sparse matrix
        mknn    Mutual knn subgraph
        pos     2D projection (gt-SNE) as ndarray with shape (n_cells, 2)
    """
    n_cells = ds.shape[1]
    logging.info("Processing all %d cells", n_cells)

    logging.info("Validating genes")
    nnz = ds.map([np.count_nonzero], axis=0)[0]
    # A gene is valid when it is nonzero in more than 5 cells but in fewer than half of all cells
    valid_genes = np.logical_and(nnz > 5, nnz < ds.shape[1] * 0.5).astype("int")
    ds.ra._Valid = valid_genes
    logging.info("%d of %d genes were valid", np.sum(ds.ra._Valid == 1), ds.shape[0])

    logging.info("Normalization")
    normalizer = cg.Normalizer(False)
    normalizer.fit(ds)

    logging.info("Selecting up to %d genes", self.n_genes)
    genes = cg.FeatureSelection(self.n_genes).fit(ds, mu=normalizer.mu, sd=normalizer.sd)

    logging.info("Loading data for selected genes")
    data = np.zeros((n_cells, genes.shape[0]))
    for (ix, selection, view) in ds.scan(axis=1):
        data[selection - ix, :] = view[genes, :].T

    logging.info("Computing initial subspace KNN")
    # Start with every selected gene active for every cell
    subspaces = np.ones(data.shape)
    knn = subspace_knn_graph(data, subspaces)
    mknn = knn.minimum(knn.transpose()).tocoo()

    for t in range(5):
        logging.info(f"Refining subspace KNN (iteration {t + 1})")

        logging.info("Louvain clustering")
        graph = nx.from_scipy_sparse_matrix(mknn)
        partitions = community.best_partition(graph)
        labels = np.array([partitions[key] for key in range(mknn.shape[0])])
        ds.ca.Clusters = labels
        n_labels = np.max(labels) + 1
        logging.info(f"Found {n_labels} clusters")

        logging.info("Marker selection")
        (_, enrichment, _) = cg.MarkerSelection(n_markers=10, findq=False).fit(ds)
        subspaces = np.zeros(data.shape)
        n_top = self.n_genes // n_labels
        for ix in range(enrichment.shape[1]):
            # The top-gene ranking depends only on the cluster, so compute it once and
            # set the columns for all cells at once (same result as a per-cell loop).
            # NOTE(review): every cell receives every cluster's top genes, and the
            # indices come from enrichment's first axis rather than positions within
            # `genes` (the column axis of `data`) -- confirm both are intended.
            top = np.argsort(-enrichment[:, ix])[:n_top]
            subspaces[:, top] = 1
        knn = subspace_knn_graph(data, subspaces)
        mknn = knn.minimum(knn.transpose()).tocoo()

    perplexity = min(self.k, (n_cells - 1) / 3 - 1)
    logging.info("gt-SNE layout")
    # Note that perplexity argument is ignored in this case, but must still be given
    # because bhtsne will check that it has a valid value
    tsne_pos = cg.TSNE(perplexity=perplexity).layout(data, knn=knn.tocsr())

    return (knn, mknn, tsne_pos)
def fit(self, ds: loompy.LoomConnection) -> Tuple[sparse.coo_matrix, sparse.coo_matrix, np.ndarray]:
    """
    Discover the manifold

    Only cells with ``_Valid == 1`` are used; the returned graphs and layout are
    mapped back onto the full set of cells.

    Reads these attributes of self:
        n_genes             Number of genes to use for manifold learning (ignored if genes is not None)
        gtsne               Use graph t-SNE for layout (otherwise standard t-SNE)
        alpha               The scale parameter for multiscale KNN
        genes               List of genes to use for manifold learning
        filter_cellcycle    Path to a whitespace-separated list of cell cycle gene names, or None
        layer               Passed to PCAProjection in the clustering passes

    Overwrites the ``_Selected`` row attribute and the ``Clusters`` column attribute of ds.

    Returns:
        knn     The multiscale knn graph as a sparse matrix, with k = min(100, n_valid - 1)
        mknn    Mutual knn subgraph of the edges with weight > 0.05
        pos     2D projection (t-SNE or gt-SNE) as ndarray with shape (n_cells, 2)
    """
    n_valid = np.sum(ds.col_attrs["_Valid"] == 1)
    n_total = ds.shape[1]
    logging.info("%d of %d cells were valid", n_valid, n_total)
    logging.info("%d of %d genes were valid", np.sum(ds.row_attrs["_Valid"] == 1), ds.shape[0])
    cells = np.where(ds.col_attrs["_Valid"] == 1)[0]

    logging.info("Normalization")
    normalizer = cg.Normalizer(False)
    normalizer.fit(ds)

    if self.filter_cellcycle is not None:
        # Close the gene list file deterministically instead of leaking the handle
        with open(self.filter_cellcycle) as f:
            cell_cycle_genes = np.array(f.read().split())
        mask = np.in1d(ds.ra.Gene, cell_cycle_genes)
        if np.sum(mask) == 0:
            logging.warning("None cell cycle genes where filtered, check your gene list")
    else:
        mask = None

    # NOTE(review): the first pass below runs only when no gene list was given; the
    # second pass is reconstructed as running in both cases -- confirm against the
    # intended structure.
    if self.genes is None:
        logging.info("Selecting up to %d genes", self.n_genes)
        genes = cg.FeatureSelection(self.n_genes).fit(ds, mu=normalizer.mu, sd=normalizer.sd, mask=mask)
        temp = np.zeros(ds.shape[0])
        temp[genes] = 1
        ds.set_attr("_Selected", temp, axis=0)
        logging.info("%d genes selected", temp.sum())

        n_components = min(50, n_valid)
        logging.info("PCA projection to %d components", n_components)
        pca = cg.PCAProjection(genes, max_n_components=n_components, layer=self.layer)
        pca_transformed = pca.fit_transform(ds, normalizer, cells=cells)
        transformed = pca_transformed

        logging.info("Generating KNN graph")
        k = min(10, n_valid - 1)
        nn = NearestNeighbors(n_neighbors=k, algorithm="ball_tree", n_jobs=4)
        nn.fit(transformed)
        knn = nn.kneighbors_graph(mode='connectivity')
        knn = knn.tocoo()
        mknn = knn.minimum(knn.transpose()).tocoo()

        logging.info("Louvain-Jaccard clustering")
        lj = cg.LouvainJaccard(resolution=1)
        labels = lj.fit_predict(knn)
        # Make labels for excluded cells == -1
        labels_all = np.zeros(ds.shape[1], dtype='int') + -1
        labels_all[cells] = labels
        ds.set_attr("Clusters", labels_all, axis=1)
        n_labels = np.max(labels) + 1
        logging.info("Found " + str(n_labels) + " LJ clusters")

        logging.info("Marker selection")
        (genes, _, _) = cg.MarkerSelection(n_markers=int(500 / n_labels), mask=mask).fit(ds)
    else:
        genes = self.genes

    temp = np.zeros(ds.shape[0])
    temp[genes] = 1
    ds.set_attr("_Selected", temp, axis=0)
    logging.info("%d genes selected", temp.sum())

    n_components = min(50, n_valid)
    logging.info("PCA projection to %d components", n_components)
    pca = cg.PCAProjection(genes, max_n_components=n_components, layer=self.layer)
    pca_transformed = pca.fit_transform(ds, normalizer, cells=cells)
    transformed = pca_transformed

    logging.info("Generating KNN graph")
    k = min(10, n_valid - 1)
    nn = NearestNeighbors(n_neighbors=k, algorithm="ball_tree", n_jobs=4)
    nn.fit(transformed)
    knn = nn.kneighbors_graph(mode='connectivity')
    knn = knn.tocoo()
    mknn = knn.minimum(knn.transpose()).tocoo()

    logging.info("Louvain-Jaccard clustering")
    lj = cg.LouvainJaccard(resolution=1)
    labels = lj.fit_predict(knn)
    # Make labels for excluded cells == -1
    labels_all = np.zeros(ds.shape[1], dtype='int') + -1
    labels_all[cells] = labels
    ds.set_attr("Clusters", labels_all, axis=1)
    n_labels = np.max(labels) + 1
    logging.info("Found " + str(n_labels) + " LJ clusters")

    logging.info("Marker selection")
    (genes, _, _) = cg.MarkerSelection(n_markers=int(500 / n_labels)).fit(ds)

    # Select cells across clusters more uniformly, preventing a single cluster from dominating the PCA
    cells_adjusted = cg.cap_select(labels, cells, int(n_valid * 0.2))
    n_components = min(50, cells_adjusted.shape[0])
    logging.info("PCA projection to %d components", n_components)
    pca = cg.PCAProjection(genes, max_n_components=n_components)
    pca.fit(ds, normalizer, cells=cells_adjusted)
    # Note that here we're transforming all cells; we just did the fit on the selection
    transformed = pca.transform(ds, normalizer, cells=cells)

    k = min(100, n_valid - 1)
    logging.info("Generating multiscale KNN graph (k = %d)", k)
    nn = NearestNeighbors(n_neighbors=k, algorithm="ball_tree", n_jobs=4)
    nn.fit(transformed)
    knn = nn.kneighbors(return_distance=False)  # shape: (n_cells, k)
    n_cells = knn.shape[0]
    # Build COO triplets: the r-th nearest neighbour gets weight 1 / r ** alpha
    a = np.tile(np.arange(n_cells), k)
    b = np.reshape(knn.T, (n_cells * k,))
    w = np.repeat(1 / np.power(np.arange(1, k + 1), self.alpha), n_cells)
    knn = sparse.coo_matrix((w, (a, b)), shape=(n_cells, n_cells))
    # Keep only the strongest edges for the mutual graph
    threshold = w > 0.05
    mknn = sparse.coo_matrix((w[threshold], (a[threshold], b[threshold])), shape=(n_cells, n_cells))
    mknn = mknn.minimum(mknn.transpose()).tocoo()

    perplexity = min(k, (n_valid - 1) / 3 - 1)
    if self.gtsne:
        logging.info("gt-SNE layout")
        # Note that perplexity argument is ignored in this case, but must still be given
        # because bhtsne will check that it has a valid value
        tsne_pos = cg.TSNE(perplexity=perplexity).layout(transformed, knn=knn.tocsr())
    else:
        logging.info("t-SNE layout")
        tsne_pos = cg.TSNE(perplexity=perplexity).layout(transformed)
    # Excluded cells are placed at the minimum corner of the layout
    tsne_all = np.zeros((ds.shape[1], 2), dtype='int') + np.min(tsne_pos, axis=0)
    tsne_all[cells] = tsne_pos

    # Transform back to the full set of cells
    knn = sparse.coo_matrix((knn.data, (cells[knn.row], cells[knn.col])), shape=(n_total, n_total))
    mknn = sparse.coo_matrix((mknn.data, (cells[mknn.row], cells[mknn.col])), shape=(n_total, n_total))

    return (knn, mknn, tsne_all)
def compute_tsne(ds, tsne_input, perplexity=100, namespace="", seed=0):
    """
    Compute a t-SNE layout and store it as a column attribute of ds.

    Args:
        ds              Dataset whose ``ca`` attribute manager receives the layout
        tsne_input      Input passed to ``cg.TSNE(...).layout``
        perplexity      Perplexity passed to ``cg.TSNE``
        namespace       Prefix for the attribute name; the layout is stored as "<namespace>TSNE"
        seed            Seed for numpy's global random number generator
    """
    np.random.seed(seed)
    # Must be an f-string: a raw string would store the literal name "{namespace}TSNE"
    ds.ca[f"{namespace}TSNE"] = cg.TSNE(perplexity=perplexity).layout(tsne_input)
def fit(self, ds: loompy.LoomConnection, initial_pos: np.ndarray = None, nng: np.ndarray = None, blocked_genes: np.ndarray = None) -> Tuple[sparse.coo_matrix, sparse.coo_matrix, np.ndarray]:
    """
    Discover the manifold

    Reads these attributes of self:
        n_genes             Number of genes to use for manifold learning (ignored if genes is not None)
        gtsne               Use graph t-SNE for layout (otherwise standard t-SNE)
        alpha               The scale parameter for multiscale KNN
        genes               List of genes to use for manifold learning
        k                   Number of neighbours (capped at n_cells - 1)
        max_iter            Passed to cg.TSNE
        filter_cellcycle    Path to a whitespace-separated list of cell cycle gene names, or None

    Args:
        initial_pos     Use this initial layout, shape (ds.shape[1], 2)
        nng             Non-neuronal genes, set these to zero in neurons (mask array);
                        currently not used by this method
        blocked_genes   Don't use these genes (mask array)

    Overwrites ``ds.ra._Selected``; ``ds.ca.Clusters`` is replaced temporarily during
    marker selection and restored afterwards if it existed before.

    Returns:
        knn     The multiscale knn graph as a sparse matrix, with k = min(self.k, n_cells - 1)
        mknn    Mutual knn subgraph of the edges with weight > 0.025
        pos     2D projection (t-SNE or gt-SNE) as ndarray with shape (n_cells, 2)
    """
    n_cells = ds.shape[1]
    logging.info("Processing all %d cells", n_cells)
    logging.info("%d of %d genes were valid", np.sum(ds.row_attrs["_Valid"] == 1), ds.shape[0])

    logging.info("Normalization")
    normalizer = cg.Normalizer(False)
    normalizer.fit(ds)

    if self.filter_cellcycle is not None:
        # Close the gene list file deterministically instead of leaking the handle
        with open(self.filter_cellcycle) as f:
            cell_cycle_genes = np.array(f.read().split())
        mask = np.in1d(ds.ra.Gene, cell_cycle_genes)
        if np.sum(mask) == 0:
            logging.warning("None cell cycle genes where filtered, check your gene list")
    else:
        mask = None

    if blocked_genes is not None:
        if mask is None:
            mask = blocked_genes
        else:
            # Both masks mark genes to exclude, so a gene is excluded if either one
            # flags it (union); an intersection would let most of them through
            mask = mask | blocked_genes

    if self.genes is None:
        logging.info("Selecting up to %d genes", self.n_genes)
        genes = cg.FeatureSelection(self.n_genes).fit(ds, mu=normalizer.mu, sd=normalizer.sd, mask=mask)

        n_components = min(50, n_cells)
        logging.info("PCA projection to %d components", n_components)
        pca = cg.PCAProjection(genes, max_n_components=n_components)
        pca_transformed = pca.fit_transform(ds, normalizer)
        transformed = pca_transformed

        logging.info("Generating balanced KNN graph")
        np.random.seed(0)
        k = min(self.k, n_cells - 1)
        bnn = cg.BalancedKNN(k=k, maxl=2 * k, sight_k=2 * k)
        bnn.fit(transformed)
        knn = bnn.kneighbors_graph(mode='connectivity')
        knn = knn.tocoo()
        mknn = knn.minimum(knn.transpose()).tocoo()

        logging.info("MKNN-Louvain clustering with outliers")
        random.seed(13)
        lj = cg.LouvainJaccard(resolution=1, jaccard=False)
        labels = lj.fit_predict(knn)
        # Renumber clusters with at least 10 cells consecutively; all other cells get -1
        bigs = np.where(np.bincount(labels) >= 10)[0]
        mapping = {k: v for v, k in enumerate(bigs)}
        labels = np.array([mapping.get(x, -1) for x in labels])
        n_labels = np.max(labels) + 1
        logging.info("Found " + str(n_labels) + " clusters")

        logging.info("Marker selection")
        # Marker selection reads ds.ca.Clusters, so swap in the temporary labels
        # (shifted so the smallest label is 0) and restore the previous ones afterwards
        saved_clusters = None
        if "Clusters" in ds.ca:
            saved_clusters = ds.ca.Clusters
        ds.ca.Clusters = labels - labels.min()
        (genes, _, _) = cg.MarkerSelection(n_markers=int(500 / n_labels), mask=mask, findq=False).fit(ds)
        if saved_clusters is not None:
            ds.ca.Clusters = saved_clusters
    else:
        genes = self.genes

    temp = np.zeros(ds.shape[0], dtype='bool')
    temp[genes] = True
    ds.ra._Selected = temp.astype('int')
    logging.info("%d genes selected", temp.sum())

    if self.genes is None:
        # Select cells across clusters more uniformly, preventing a single cluster from dominating the PCA
        cells_adjusted = cg.cap_select(labels - labels.min(), np.arange(n_cells), int(n_cells * 0.2))
        n_components = min(50, cells_adjusted.shape[0])
        logging.info("PCA projection to %d components", n_components)
        pca = cg.PCAProjection(genes, max_n_components=n_components)
        pca.fit(ds, normalizer, cells=cells_adjusted)
    else:
        n_components = min(50, n_cells)
        logging.info("PCA projection to %d components", n_components)
        pca = cg.PCAProjection(genes, max_n_components=n_components)
        pca.fit(ds, normalizer)

    # Note that here we're transforming all cells; we just did the fit on the selection
    transformed = pca.transform(ds, normalizer)

    k = min(self.k, n_cells - 1)
    logging.info("Generating multiscale KNN graph (k = %d)", k)
    bnn = cg.BalancedKNN(k=k, maxl=2 * k, sight_k=2 * k)
    bnn.fit(transformed)
    # Take the neighbour index array and drop its first column
    # (presumably each cell itself -- verify against BalancedKNN)
    knn = bnn.kneighbors(mode='connectivity')[1][:, 1:]
    n_cells = knn.shape[0]
    # Build COO triplets: the r-th nearest neighbour gets weight 1 / r ** alpha
    a = np.tile(np.arange(n_cells), k)
    b = np.reshape(knn.T, (n_cells * k,))
    w = np.repeat(1 / np.power(np.arange(1, k + 1), self.alpha), n_cells)
    knn = sparse.coo_matrix((w, (a, b)), shape=(n_cells, n_cells))
    # Keep only the strongest edges for the mutual graph
    threshold = w > 0.025
    mknn = sparse.coo_matrix((w[threshold], (a[threshold], b[threshold])), shape=(n_cells, n_cells))
    mknn = mknn.minimum(mknn.transpose()).tocoo()

    perplexity = min(k, (n_cells - 1) / 3 - 1)
    if self.gtsne:
        logging.info("gt-SNE layout")
        # Note that perplexity argument is ignored in this case, but must still be given
        # because bhtsne will check that it has a valid value
        tsne_pos = cg.TSNE(perplexity=perplexity, max_iter=self.max_iter).layout(transformed, knn=knn.tocsr(), initial_pos=initial_pos)
    else:
        logging.info("t-SNE layout")
        tsne_pos = cg.TSNE(perplexity=perplexity, max_iter=self.max_iter).layout(transformed, initial_pos=initial_pos)

    return (knn, mknn, tsne_pos)
def fit(self, ds: loompy.LoomConnection) -> Tuple[sparse.coo_matrix, sparse.coo_matrix, np.ndarray]:
    """
    Discover the manifold

    Reads these attributes of self:
        n_genes             Number of genes to use for manifold learning (ignored if genes is not None)
        gtsne               Use graph t-SNE for layout (otherwise standard t-SNE)
        alpha               The scale parameter for multiscale KNN
        genes               List of genes to use for manifold learning
        k                   Number of neighbours (capped at n_cells - 1)
        filter_cellcycle    Path to a whitespace-separated list of cell cycle gene names, or None

    Overwrites the ``_Selected`` row attribute and, when no gene list was given,
    the ``Clusters`` column attribute of ds.

    Returns:
        knn     The multiscale knn graph as a sparse matrix, with k = min(self.k, n_cells - 1)
        mknn    Mutual knn subgraph of the edges with weight > 0.05
        pos     2D projection (t-SNE or gt-SNE) as ndarray with shape (n_cells, 2)
    """
    n_cells = ds.shape[1]
    logging.info("Processing all %d cells", n_cells)
    logging.info("%d of %d genes were valid", np.sum(ds.row_attrs["_Valid"] == 1), ds.shape[0])

    logging.info("Normalization")
    normalizer = cg.Normalizer(False)
    normalizer.fit(ds)

    if self.filter_cellcycle is not None:
        # Close the gene list file deterministically instead of leaking the handle
        with open(self.filter_cellcycle) as f:
            cell_cycle_genes = np.array(f.read().split())
        mask = np.in1d(ds.row_attrs["Gene"], cell_cycle_genes)
        if np.sum(mask) == 0:
            logging.warning("None cell cycle genes where filtered, check your gene list")
    else:
        mask = None

    if self.genes is None:
        logging.info("Selecting up to %d genes", self.n_genes)
        genes = cg.FeatureSelection(self.n_genes).fit(ds, mu=normalizer.mu, sd=normalizer.sd, mask=mask)

        n_components = min(50, n_cells)
        logging.info("PCA projection to %d components", n_components)
        pca = cg.PCAProjection(genes, max_n_components=n_components)
        pca_transformed = pca.fit_transform(ds, normalizer)
        transformed = pca_transformed

        logging.info("Generating balanced KNN graph")
        k = min(self.k, n_cells - 1)
        bnn = cg.BalancedKNN(k=k, maxl=2 * k)
        bnn.fit(transformed)
        knn = bnn.kneighbors_graph(mode='connectivity')
        knn = knn.tocoo()
        mknn = knn.minimum(knn.transpose()).tocoo()

        logging.info("MKNN-Louvain clustering with outliers")
        (a, b, w) = (mknn.row, mknn.col, mknn.data)
        G = igraph.Graph(list(zip(a, b)), directed=False, edge_attrs={'weight': w})
        VxCl = G.community_multilevel(return_levels=False, weights="weight")
        labels = np.array(VxCl.membership)
        # Renumber clusters with at least 10 cells consecutively; all other cells get -1
        bigs = np.where(np.bincount(labels) >= 10)[0]
        mapping = {k: v for v, k in enumerate(bigs)}
        labels = np.array([mapping.get(x, -1) for x in labels])
        ds.set_attr("Clusters", labels, axis=1)
        n_labels = np.max(labels) + 1
        logging.info("Found " + str(n_labels) + " clusters")

        logging.info("Marker selection")
        (genes, _, _) = cg.MarkerSelection(n_markers=int(500 / n_labels)).fit(ds)
    else:
        genes = self.genes

    temp = np.zeros(ds.shape[0])
    temp[genes] = 1
    ds.set_attr("_Selected", temp, axis=0)
    logging.info("%d genes selected", temp.sum())

    if self.genes is None:
        # Select cells across clusters more uniformly, preventing a single cluster from dominating the PCA
        cells_adjusted = cg.cap_select(labels - labels.min(), np.arange(n_cells), int(n_cells * 0.2))
        n_components = min(50, cells_adjusted.shape[0])
        logging.info("PCA projection to %d components", n_components)
        pca = cg.PCAProjection(genes, max_n_components=n_components)
        pca.fit(ds, normalizer, cells=cells_adjusted)
    else:
        n_components = min(50, n_cells)
        logging.info("PCA projection to %d components", n_components)
        pca = cg.PCAProjection(genes, max_n_components=n_components)
        pca.fit(ds, normalizer)

    # Note that here we're transforming all cells; we just did the fit on the selection
    transformed = pca.transform(ds, normalizer)

    k = min(self.k, n_cells - 1)
    logging.info("Generating multiscale KNN graph (k = %d)", k)
    bnn = cg.BalancedKNN(k=k, maxl=2 * k)
    bnn.fit(transformed)
    # Take the neighbour index array and drop its first column
    # (presumably each cell itself -- verify against BalancedKNN)
    knn = bnn.kneighbors(mode='connectivity')[1][:, 1:]
    n_cells = knn.shape[0]
    # Build COO triplets: the r-th nearest neighbour gets weight 1 / r ** alpha
    a = np.tile(np.arange(n_cells), k)
    b = np.reshape(knn.T, (n_cells * k,))
    w = np.repeat(1 / np.power(np.arange(1, k + 1), self.alpha), n_cells)
    knn = sparse.coo_matrix((w, (a, b)), shape=(n_cells, n_cells))
    # Keep only the strongest edges for the mutual graph
    threshold = w > 0.05
    mknn = sparse.coo_matrix((w[threshold], (a[threshold], b[threshold])), shape=(n_cells, n_cells))
    mknn = mknn.minimum(mknn.transpose()).tocoo()

    perplexity = min(k, (n_cells - 1) / 3 - 1)
    if self.gtsne:
        logging.info("gt-SNE layout")
        # Note that perplexity argument is ignored in this case, but must still be given
        # because bhtsne will check that it has a valid value
        tsne_pos = cg.TSNE(perplexity=perplexity).layout(transformed, knn=knn.tocsr())
    else:
        logging.info("t-SNE layout")
        tsne_pos = cg.TSNE(perplexity=perplexity).layout(transformed)

    return (knn, mknn, tsne_pos)
def run(self) -> None:
    """
    Aggregate the input loom file by cluster, then compute manifolds and annotations.

    Steps, all performed while the output's temporary path is open:
        1. Aggregate the input file into the output file using ``cg.Aggregator``
        2. Compute KNN/MKNN graphs and a t-SNE layout on the aggregate file
        3. Compute a gt-SNE layout on all cells, initialised from the aggregate layout
        4. Store auto-annotation and marker gene attributes on the aggregate file
    """
    # Task-specific logger; intentionally shadows the logging module inside this method
    logging = cg.logging(self)
    with self.output().temporary_path() as out_file:
        logging.info("Aggregating loom file")
        # Open the input inside a context manager so the connection is always closed
        with loompy.connect(self.input().fn) as ds:
            spec = {
                "Age": "tally",
                "Clusters": "first",
                "Class": "mode",
                "_Total": "mean",
                "Sex": "tally",
                "Tissue": "tally",
                "SampleID": "tally",
                "TissuePool": "first",
                "Outliers": "mean",
                "Bucket": "mode",
                "Region": "first",
                "OriginalClusters": "first",
                "LeafOrder": "first",
                "Probable_location": "first",
                "Developmental_compartment": "first",
                "Description": "first",
                "Location_based_on": "first",
                "Neurotransmitter": "first",
                "Comment": "first",
                "ClusterName": "first",
                "TaxonomyRank1": "first",
                "TaxonomyRank2": "first",
                "TaxonomyRank3": "first",
                "TaxonomyRank4": "first",
                "TaxonomySymbol": "first",
            }
            cg.Aggregator(f=[0.2, 0.05]).aggregate(ds, out_file, agg_spec=spec)

            with loompy.connect(out_file) as dsagg:
                logging.info("Finding non-neuronal, housekeeping, and troublemaking genes")
                (nng, blocked) = _gene_selection_L5(dsagg)

                logging.info("Manifold learning on the aggregate file")
                normalizer = cg.Normalizer(False)
                normalizer.fit(dsagg)
                pca = cg.PCAProjection(np.arange(dsagg.shape[1] * 10), max_n_components=50)
                pca.fit(dsagg, normalizer)
                transformed = pca.transform(dsagg, normalizer)
                k = 40
                bnn = cg.BalancedKNN(k=k, maxl=2 * k)
                bnn.fit(transformed)
                # Take the neighbour index array and drop its first column
                # (presumably each column itself -- verify against BalancedKNN)
                knn = bnn.kneighbors(mode='connectivity')[1][:, 1:]
                n_cells = knn.shape[0]
                # Build COO triplets: the r-th nearest neighbour gets weight 1 / r ** 1.8
                a = np.tile(np.arange(n_cells), k)
                b = np.reshape(knn.T, (n_cells * k,))
                w = np.repeat(1 / np.power(np.arange(1, k + 1), 1.8), n_cells)
                knn = sparse.coo_matrix((w, (a, b)), shape=(n_cells, n_cells))
                # Keep only the strongest edges for the mutual graph
                threshold = w > 0.025
                mknn = sparse.coo_matrix((w[threshold], (a[threshold], b[threshold])), shape=(n_cells, n_cells))
                mknn = mknn.minimum(mknn.transpose()).tocoo()
                tsne = cg.TSNE(perplexity=5).layout(transformed)
                dsagg.col_graphs.KNN = knn
                dsagg.col_graphs.MKNN = mknn
                dsagg.ca._X = tsne[:, 0]
                dsagg.ca._Y = tsne[:, 1]

                logging.info("Manifold learning on all cells")
                # Start every cell at its cluster's aggregate position plus unit Gaussian noise
                clusters = ds.ca.Clusters  # read once instead of on every loop iteration
                init = np.zeros((ds.shape[1], 2))
                for lbl in np.unique(clusters):
                    members = clusters == lbl
                    init[members, :] = tsne[lbl, :] + np.random.normal(size=(members.sum(), 2))
                ml = cg.ManifoldLearning2(gtsne=True, alpha=1, max_iter=3000)
                (knn, mknn, tsne) = ml.fit(ds, initial_pos=init, nng=nng, blocked_genes=blocked)
                ds.col_graphs.KNN = knn
                ds.col_graphs.MKNN = mknn
                ds.ca._X = tsne[:, 0]
                ds.ca._Y = tsne[:, 1]

                logging.info("Computing auto-annotation")
                aa = cg.AutoAnnotator(root="../auto-annotation/Adolescent/")
                aa.annotate_loom(dsagg)
                aa.save_in_loom(dsagg)

                logging.info("Computing auto-auto-annotation")
                n_clusters = dsagg.shape[1]
                (selected, selectivity, specificity, robustness) = cg.AutoAutoAnnotator(n_genes=6).fit(dsagg)
                gene_names = ds.ra.Gene  # read once instead of once per cluster
                dsagg.set_attr("MarkerGenes", np.array([" ".join(gene_names[selected[:, ix]]) for ix in np.arange(n_clusters)]), axis=1)
                # Affects how the score arrays below are rendered by str()
                np.set_printoptions(precision=1, suppress=True)
                dsagg.set_attr("MarkerSelectivity", np.array([str(selectivity[:, ix]) for ix in np.arange(n_clusters)]), axis=1)
                dsagg.set_attr("MarkerSpecificity", np.array([str(specificity[:, ix]) for ix in np.arange(n_clusters)]), axis=1)
                dsagg.set_attr("MarkerRobustness", np.array([str(robustness[:, ix]) for ix in np.arange(n_clusters)]), axis=1)