def fit(self, ds: loompy.LoomConnection, mu: np.ndarray = None, sd: np.ndarray = None, totals: np.ndarray = None) -> None:
    """
    Store normalization statistics, computing from the dataset any that were not supplied

    Args:
        ds      Dataset used to compute whatever statistics are missing
        mu      Precomputed means (optional)
        sd      Precomputed standard deviations (optional)
        totals  Precomputed totals (optional)

    Remarks:
        If either mu or sd is missing, both are recomputed (np.std and np.mean
        mapped over the dataset along axis 0). Missing totals are computed as
        np.sum mapped along axis 1. Results are stored on self.mu, self.sd
        and self.totals.
    """
    stats_missing = mu is None or sd is None

    # Record what the caller supplied first; missing pieces are filled in below
    self.sd, self.mu, self.totals = sd, mu, totals

    if stats_missing:
        # A partial (mu-only or sd-only) input is discarded and both are recomputed
        self.sd, self.mu = ds.map([np.std, np.mean], axis=0)

    if totals is None:
        sums = ds.map([np.sum], chunksize=100, axis=1)
        self.totals = sums[0]
def _fit(self, ds: loompy.LoomConnection, labels: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Compute a per-cluster enrichment score for every gene and pick marker genes

    Args:
        ds      Dataset
        labels  Integer cluster label for each cell

    Returns:
        included    Boolean mask over genes (length ds.shape[0]), True for selected markers
        enrichment  Enrichment scores as computed from the per-cluster aggregates
        means       Per-cluster mean values as returned by ds.aggregate

    Remarks:
        Up to self.n_markers_per_cluster genes are selected per cluster, skipping genes
        that are invalid, masked (self.mask) or already taken by a previous cluster.
        If a cluster runs out of eligible genes, fewer markers are selected for it.
        self.valid_genes is computed and cached here if it was None.
    """
    logging.info("Computing enrichment statistic")
    # NOTE(review): n_labels counts distinct labels; the code below presumably expects
    # labels to be contiguous 0..n_labels-1 -- verify against callers
    n_labels = len(np.unique(labels))
    n_genes, n_cells = ds.shape

    # Number of cells per cluster
    sizes = np.bincount(labels, minlength=n_labels)
    # Number of nonzero values per cluster
    nnz = ds.aggregate(None, None, labels, np.count_nonzero, None)
    # Mean value per cluster
    means = ds.aggregate(None, None, labels, "mean", None)
    # Non-zeros and means over all cells
    (nnz_overall, means_overall) = ds.map([np.count_nonzero, np.mean], axis=0)

    # Scale by number of cells
    f_nnz = nnz / sizes
    f_nnz_overall = nnz_overall / n_cells

    # Means and fraction non-zero values in other clusters (per cluster)
    means_other = ((means_overall * n_cells)[None].T - (means * sizes)) / (n_cells - sizes)
    f_nnz_other = ((f_nnz_overall * n_cells)[None].T - (f_nnz * sizes)) / (n_cells - sizes)

    # Each cluster is compared against all *other* cells (not against the overall values);
    # the small constants keep the ratios finite when a denominator is zero
    enrichment = (f_nnz + 0.1) / (f_nnz_other + 0.1) * (means + 0.01) / (means_other + 0.01)

    # Select best markers
    if self.valid_genes is None:
        logging.info("Identifying valid genes")
        # nnz_overall already holds the non-zero counts, so no second pass over the file is needed
        self.valid_genes = np.logical_and(nnz_overall > 10, nnz_overall < n_cells * 0.6)

    if self.mask is None:
        excluded = set(np.where(~self.valid_genes)[0])
    else:
        excluded = set(np.where((~self.valid_genes) | self.mask)[0])

    included = np.zeros(n_genes, dtype=bool)
    for ix in range(n_labels):
        # Genes ordered from most to least enriched in this cluster
        enriched = np.argsort(enrichment[:, ix])[::-1]
        count = 0
        # Iterating over the ranking (rather than indexing with an unbounded counter)
        # stops cleanly when there are fewer eligible genes than requested
        for gene in enriched:
            if count >= self.n_markers_per_cluster:
                break
            if gene in excluded:
                continue
            included[gene] = True
            # A gene can be a marker for one cluster only
            excluded.add(gene)
            count += 1
    return (included, enrichment, means)
def fit(self, ds: loompy.LoomConnection, plot: str = None) -> np.ndarray:
    """
    Fit a classifier and use it to determine cluster predictive power

    Args:
        ds      Dataset
        plot    Filename for optional plot

    Returns:
        Matrix of classification probabilities, shape (n_cells, n_labels)

    Remarks:
        Writes the row attribute _Valid on ds, and stores the label encoder (self.le),
        the classification report for the held-out 20% (self.report) and the
        probabilities (self.proba) on the instance. The train/test split is not seeded,
        so results vary between runs.
    """
    logging.info("Feature selection")
    nnz = ds.map([np.count_nonzero], axis=0)[0]
    valid_genes = np.logical_and(nnz > 5, nnz < ds.shape[1] * 0.5).astype("int")
    ds.ra._Valid = valid_genes

    logging.info("Normalization")
    normalizer = cg.Normalizer(False)
    normalizer.fit(ds)

    logging.info("Feature selection")
    (_, enrichment, _) = cg.MarkerSelection(findq=False, labels_attr="Clusters").fit(ds)
    # Union of the 25 most enriched genes of every cluster
    genes = np.zeros_like(ds.ra.Gene, dtype=bool)
    for ix in range(enrichment.shape[1]):
        genes[np.argsort(-enrichment[:, ix])[:25]] = True

    logging.info("PCA projection")
    pca = cg.PCAProjection(genes, max_n_components=50)
    transformed = pca.fit_transform(ds, normalizer)

    le = LabelEncoder().fit(ds.ca.ClusterName)
    self.le = le
    labels = le.transform(ds.ca.ClusterName)

    train_X, test_X, train_Y, test_Y = train_test_split(transformed, labels, test_size=0.2)
    classifier = RandomForestClassifier(max_depth=30)
    classifier.fit(train_X, train_Y)
    self.report = classification_report(test_Y, classifier.predict(test_X), target_names=le.classes_)
    # Probabilities are computed for all cells, including those used for training
    self.proba = classifier.predict_proba(transformed)

    if plot:
        # Average predicted probability per observed label
        agg = npg.aggregate(labels, self.proba, axis=0, func="mean")
        # Draw on a dedicated figure and always close it, so repeated calls neither
        # overdraw a previous plot nor accumulate open figures
        fig = plt.figure()
        try:
            plt.imshow(agg, cmap="viridis")
            plt.xticks(np.arange(le.classes_.shape[0]), le.classes_, rotation="vertical", fontsize=7)
            plt.yticks(np.arange(le.classes_.shape[0]), le.classes_, rotation="horizontal", fontsize=7)
            plt.xlabel("Predicted cell type")
            plt.ylabel("Observed cell type")
            plt.title("Predictive power of cluster identities")
            cbar = plt.colorbar()
            cbar.set_label('Average classification probability', rotation=90)
            plt.savefig(plot, bbox_inches="tight")
        finally:
            plt.close(fig)
    return self.proba
def fit(self, ds: loompy.LoomConnection, cells: np.ndarray = None, mu: np.ndarray = None, sd: np.ndarray = None, mask: np.ndarray = None) -> np.ndarray:
    """
    Fits a noise model (CV vs mean)

    Args:
        ds (LoomConnection):    Dataset
        cells (ndarray):        cells to include when computing mean and CV (or None)
        mu, sd:                 Precomputed mean and standard deviations (optional)
        mask (ndarray):         Genes to exclude from selection (optional)

    Returns:
        ndarray of selected genes (list of ints)

    Remarks:
        At most self.n_genes genes are returned (also stored as self.genes), ordered by
        increasing score. Genes flagged invalid in ds.ra._Valid (when present), masked
        genes, and Xist/Tsix are never selected, so fewer than self.n_genes genes may be
        returned when there are not enough eligible genes.
    """
    if mu is None or sd is None:
        (mu, sd) = ds.map((np.mean, np.std), axis=0, selection=cells)

    if "_Valid" in ds.ra:
        valid = ds.ra._Valid == 1
    else:
        valid = np.ones(ds.shape[0], dtype='bool')
    if mask is not None:
        valid = np.logical_and(valid, np.logical_not(mask))
    valid = np.logical_and(valid, ds.row_attrs["Gene"] != "Xist")
    valid = np.logical_and(valid, ds.row_attrs["Gene"] != "Tsix")

    # Only genes with positive mean and sd have a defined log2 mean and log2 CV
    ok = np.logical_and(mu > 0, sd > 0)
    cv = sd[ok] / mu[ok]
    log2_m = np.log2(mu[ok])
    log2_cv = np.log2(cv)

    svr_gamma = 1000. / len(mu[ok])
    clf = SVR(gamma=svr_gamma)
    clf.fit(log2_m[:, np.newaxis], log2_cv)

    # Score is the relative position with respect of the fitted curve
    score = log2_cv - clf.predict(log2_m[:, np.newaxis])

    # Drop ineligible genes before ranking. Zeroing their score (as was done before)
    # ranks them above every valid gene with a negative score, which lets invalid
    # genes be selected
    eligible = valid[ok]
    candidates = np.where(ok)[0][eligible]
    self.genes = candidates[np.argsort(score[eligible])][-self.n_genes:]
    return self.genes
def fit(self, ds: loompy.LoomConnection) -> None:
    """
    Train the classifier on the dataset and write its diagnostics to self.classified_dir

    Args:
        ds      Dataset; the column attribute "SubclassAssigned" supplies the class labels

    Remarks:
        Writes the row attribute _Valid on ds, the file "genes.txt" (one accession and
        validity flag per line, tab-separated) and "performance.txt" (classification
        report on the held-out 20%). Stores mu, sd, pca, classes, le, labels and
        classifier on the instance.
    """
    # Validating genes
    logging.info("Marking invalid genes")
    nonzero_counts = ds.map([np.count_nonzero], axis=0)[0]
    is_valid = np.logical_and(nonzero_counts > 5, nonzero_counts < ds.shape[1] * 0.5)
    valid_genes = is_valid.astype("int")
    ds.ra._Valid = valid_genes

    genes_path = os.path.join(self.classified_dir, "genes.txt")
    with open(genes_path, "w") as f:
        for ix in range(valid_genes.shape[0]):
            f.writelines((ds.Accession[ix], "\t", str(valid_genes[ix]), "\n"))

    logging.info("Normalization")
    normalizer = cg.Normalizer(True)
    normalizer.fit(ds)
    self.mu, self.sd = normalizer.mu, normalizer.sd

    logging.info("Feature selection")
    selected = cg.FeatureSelection(2000).fit(ds)

    logging.info("PCA projection")
    self.pca = cg.PCAProjection(selected, max_n_components=50)
    transformed = self.pca.fit_transform(ds, normalizer)

    self.classes = ds.col_attrs["SubclassAssigned"]
    self.le = LabelEncoder().fit(self.classes)
    self.labels = self.le.transform(self.classes)

    # Fixed seed keeps the train/test partition reproducible
    split = train_test_split(transformed, self.labels, test_size=0.2, random_state=0)
    train_X, test_X, train_Y, test_Y = split
    self.classifier = SVC(probability=True)
    self.classifier.fit(train_X, train_Y)

    report_path = os.path.join(self.classified_dir, "performance.txt")
    with open(report_path, "w") as f:
        predicted = self.classifier.predict(test_X)
        f.write(classification_report(test_Y, predicted, target_names=self.le.classes_))
def fit(self, ds: loompy.LoomConnection) -> Tuple[sparse.coo_matrix, sparse.coo_matrix, np.ndarray]:
    """
    Discover the manifold

    Args:
        ds      Dataset

    Returns:
        knn     The knn graph as a sparse matrix
        mknn    Mutual knn subgraph
        pos     2D projection (gt-SNE) as ndarray with shape (n_cells, 2)

    Remarks:
        Writes the row attribute _Valid and the column attribute Clusters on ds.
        The KNN graph is refined over five rounds of Louvain clustering followed by
        marker selection, where the markers define the subspace used for the next KNN.
    """
    n_cells = ds.shape[1]
    logging.info("Processing all %d cells", n_cells)

    logging.info("Validating genes")
    nnz = ds.map([np.count_nonzero], axis=0)[0]
    valid_genes = np.logical_and(nnz > 5, nnz < ds.shape[1] * 0.5).astype("int")
    ds.ra._Valid = valid_genes
    logging.info("%d of %d genes were valid", np.sum(ds.ra._Valid == 1), ds.shape[0])

    logging.info("Normalization")
    normalizer = cg.Normalizer(False)
    normalizer.fit(ds)

    logging.info("Selecting up to %d genes", self.n_genes)
    genes = cg.FeatureSelection(self.n_genes).fit(ds, mu=normalizer.mu, sd=normalizer.sd)

    logging.info("Loading data for selected genes")
    data = np.zeros((n_cells, genes.shape[0]))
    for (ix, selection, view) in ds.scan(axis=1):
        # NOTE(review): rows of `data` are addressed with batch-local indices (selection - ix);
        # if `selection` holds global cell indices, later batches overwrite the first rows.
        # Confirm against the loompy scan() contract before changing.
        data[selection - ix, :] = view[genes, :].T

    logging.info("Computing initial subspace KNN")
    subspaces = np.ones(data.shape)
    knn = subspace_knn_graph(data, subspaces)
    mknn = knn.minimum(knn.transpose()).tocoo()

    for t in range(5):
        logging.info(f"Refining subspace KNN (iteration {t + 1})")

        logging.info("Louvain clustering")
        graph = nx.from_scipy_sparse_matrix(mknn)
        partitions = community.best_partition(graph)
        labels = np.array([partitions[key] for key in range(mknn.shape[0])])
        ds.ca.Clusters = labels
        n_labels = np.max(labels) + 1
        logging.info(f"Found {n_labels} clusters")

        logging.info("Marker selection")
        (_, enrichment, _) = cg.MarkerSelection(n_markers=10, findq=False).fit(ds)

        # `data` (and therefore `subspaces`) has one column per *selected* gene, whereas
        # enrichment is assumed to have one row per gene in the dataset. Rank within the
        # selected genes so that the resulting positions are valid column indices.
        selected_enrichment = enrichment[genes, :]
        n_top = self.n_genes // n_labels
        subspaces = np.zeros(data.shape)
        for ix in range(selected_enrichment.shape[1]):
            top = np.argsort(-selected_enrichment[:, ix])[:n_top]
            # Every cell gets the markers of every cluster (same as the former per-cell loop)
            # NOTE(review): possibly only the cells of cluster ix were meant to get these -- confirm
            subspaces[:, top] = 1
        knn = subspace_knn_graph(data, subspaces)
        mknn = knn.minimum(knn.transpose()).tocoo()

    perplexity = min(self.k, (n_cells - 1) / 3 - 1)
    logging.info("gt-SNE layout")
    # Note that perplexity argument is ignored in this case, but must still be given
    # because bhtsne will check that it has a valid value
    tsne_pos = cg.TSNE(perplexity=perplexity).layout(data, knn=knn.tocsr())

    return (knn, mknn, tsne_pos)
def _fit(self, ds: loompy.LoomConnection) -> Tuple[np.ndarray, np.ndarray]:
    """
    Finds n_markers genes per cluster using enrichment score

    Args:
        ds (LoomConnection):    Dataset

    Returns:
        ndarray of selected genes (list of ints)
        ndarray of enrichment scores

    Remarks:
        Cluster labels are read from the column attribute named by self.labels_attr.
        Genes that are invalid (ds.ra._Valid == 0), masked (self.mask) or already
        selected for a previous cluster are skipped. If a cluster runs out of eligible
        genes, fewer than self.n_markers are selected for it. The row attribute _Valid
        is created on ds if it does not exist.
    """
    labels = ds.ca[self.labels_attr]
    n_labels = np.max(labels) + 1
    n_cells = ds.shape[1]

    # Number of cells per cluster
    sizes = np.bincount(labels, minlength=n_labels)
    # Number of nonzero values per cluster
    nnz = cg.aggregate_loom(ds, None, None, self.labels_attr, np.count_nonzero, None, return_matrix=True)
    # Mean value per cluster
    means = cg.aggregate_loom(ds, None, None, self.labels_attr, "mean", None, return_matrix=True)
    # Non-zeros and means over all cells
    (nnz_overall, means_overall) = ds.map([np.count_nonzero, np.mean], axis=0)

    # Scale by number of cells
    f_nnz = nnz / sizes
    f_nnz_overall = nnz_overall / n_cells

    # Means and fraction non-zero values in other clusters (per cluster)
    means_other = ((means_overall * n_cells)[None].T - (means * sizes)) / (n_cells - sizes)
    f_nnz_other = ((f_nnz_overall * n_cells)[None].T - (f_nnz * sizes)) / (n_cells - sizes)

    # Each cluster is compared against all *other* cells (not against the overall values);
    # the small constants keep the ratios finite when a denominator is zero
    enrichment = (f_nnz + 0.1) / (f_nnz_other + 0.1) * (means + 0.01) / (means_other + 0.01)

    # Select best markers
    if "_Valid" not in ds.ra:
        logging.info("Recomputing the list of valid genes")
        # nnz_overall already holds the non-zero counts, so no second pass over the file is needed
        valid_genes = np.logical_and(nnz_overall > 10, nnz_overall < n_cells * 0.6)
        ds.ra._Valid = valid_genes.astype('int')

    # A gene is off-limits if it is invalid OR masked (the previous AND only excluded
    # genes that were both, letting invalid or masked genes through)
    off_limits = ds.row_attrs["_Valid"] == 0
    if self.mask is not None:
        off_limits = np.logical_or(off_limits, self.mask)
    excluded = set(np.where(off_limits)[0])

    included: List[int] = []
    for ix in range(n_labels):
        # Genes ordered from most to least enriched in this cluster
        enriched = np.argsort(enrichment[:, ix])[::-1]
        count = 0
        # Iterating over the ranking (rather than indexing with an unbounded counter)
        # stops cleanly when there are fewer eligible genes than requested
        for gene in enriched:
            if count >= self.n_markers:
                break
            if gene in excluded:
                continue
            included.append(gene)
            # A gene can be a marker for one cluster only
            excluded.add(gene)
            count += 1
    return (np.array(included), enrichment)
def fit(self, ds: loompy.LoomConnection) -> None:
    """
    Compute the KNN similarity graph used for pooling and store it as self.knn

    Args:
        ds      Dataset

    Remarks:
        Genes are selected by variance (stored as self.genes) and the cells are projected
        to a latent space: PCA when self.factorization is 'PCA' or 'both' or when batch
        keys are given, HPF otherwise. For HPF, counts are first subsampled (binomially,
        hence non-deterministically) to the smallest total among cells. Neighbours are
        found with NNDescent, and distances are rescaled to similarities with the
        diagonal set to 1.
    """
    logging.info(f"Normalizing and selecting {self.n_genes} genes")
    normalizer = Normalizer(False)
    normalizer.fit(ds)
    genes = FeatureSelectionByVariance(self.n_genes, mask=self.mask).fit(ds)
    self.genes = genes

    if self.factorization == 'PCA' or self.factorization == 'both' or self.batch_keys is not None:
        factorization = "PCA"
    else:
        factorization = "HPF"

    if factorization == "PCA":
        # Report the number of components that is actually requested from PCA below
        logging.info("PCA projection to %d components", self.n_factors)
        pca = PCA(genes, max_n_components=self.n_factors, test_significance=False, batch_keys=self.batch_keys)
        transformed = pca.fit_transform(ds, normalizer)
    else:
        data = ds.sparse(rows=genes).T
        # Subsample to lowest number of UMIs
        if "TotalUMI" in ds.ca:
            totals = ds.ca.TotalUMI
        else:
            totals = ds.map([np.sum], axis=1)[0]
        min_umis = int(np.min(totals))
        logging.debug(f"Subsampling to {min_umis} UMIs")
        temp = data.toarray()
        for c in range(temp.shape[0]):
            temp[c, :] = np.random.binomial(temp[c, :].astype('int32'), min_umis / totals[c])
        data = sparse.coo_matrix(temp)

        # HPF factorization
        hpf = HPF(k=self.n_factors, validation_fraction=0.05, min_iter=10, max_iter=200, compute_X_ppv=False, n_threads=self.n_threads)
        hpf.fit(data)
        # Normalize so the sums are one because JSD requires it
        transformed = (hpf.theta.T / hpf.theta.sum(axis=1)).T

    # KNN in latent space
    logging.info(f"Computing KNN (k={self.k_pooling}) in latent space")
    with warnings.catch_warnings():
        # Suppress warnings about numba not being able to parallelize code
        warnings.simplefilter("ignore", category=NumbaPerformanceWarning)
        # Suppress warnings about future deprecations
        warnings.simplefilter("ignore", category=NumbaPendingDeprecationWarning)
        # Suppress warnings about setting the diagonal to 1
        warnings.simplefilter("ignore", category=SparseEfficiencyWarning)

        nn = NNDescent(data=transformed, metric=(jensen_shannon_distance if factorization == "HPF" else "euclidean"))
        indices, distances = nn.query(transformed, k=self.k_pooling)

        # Note: we convert distances to similarities here, to support Poisson smoothing below
        # Every row has exactly k_pooling entries, so the CSR row pointer is a regular stride
        knn = sparse.csr_matrix(
            (np.ravel(distances), np.ravel(indices), np.arange(0, distances.shape[0] * distances.shape[1] + 1, distances.shape[1])),
            (transformed.shape[0], transformed.shape[0]))
        max_d = knn.data.max()
        knn.data = (max_d - knn.data) / max_d
        # This causes a sparse efficiency warning, but it's not a slow step relative to everything else
        knn.setdiag(1)
        self.knn = knn
def fit(self, ds: loompy.LoomConnection) -> None:
    """
    Run the cytograph pipeline on a dataset, writing all results back to it

    Args:
        ds      Dataset (modified in place)

    Raises:
        ValueError  If params.factorization, params.features or params.nn_space has an
                    unsupported value, if nn_space is incompatible with factorization,
                    or if no latent space is available for the requested nn_space

    Remarks:
        Depending on self.config.steps and self.config.params, this performs Poisson
        pooling, feature selection, PCA and/or HPF factorization, balanced KNN graph
        construction (KNN, MKNN and RNN column graphs), tSNE/UMAP embeddings, clustering
        (polished Louvain and polished Surprise) and, for human and mouse, cell cycle
        annotation.
    """
    params = self.config.params
    steps = self.config.steps

    logging.info(f"Running cytograph on {ds.shape[1]} cells")
    if params.factorization not in ["PCA", "HPF", "both"]:
        raise ValueError("params.factorization must be either 'PCA' or 'HPF' or 'both'")
    if params.features not in ["enrichment", "variance"]:
        raise ValueError("params.features must be either 'enrichment' or 'variance'")
    if params.nn_space not in ["PCA", "HPF", "auto"]:
        raise ValueError("params.nn_space must be either 'PCA' or 'HPF' or 'auto'")
    pca_compatible = params.nn_space in ["PCA", "auto"] and params.factorization in ["PCA", "both"]
    hpf_compatible = params.nn_space in ["HPF", "auto"] and params.factorization in ["HPF", "both"]
    if not (pca_compatible or hpf_compatible):
        raise ValueError(f"config.params.nn_space = '{params.nn_space}' is incompatible with config.params.factorization = '{params.factorization}'")

    species = Species.detect(ds)
    logging.info(f"Species is '{species.name}'")

    logging.info("Recomputing the list of valid genes")
    nnz = ds.map([np.count_nonzero], axis=0)[0]
    valid_genes = (nnz > 10) & (nnz < ds.shape[1] * 0.6)
    ds.ra.Valid = valid_genes.astype('int')

    # Perform Poisson pooling if requested
    main_layer = ""  # if pooling is not in config.steps, use the main layer
    if "poisson_pooling" in steps:
        logging.info(f"Poisson pooling with k_pooling == {params.k_pooling}")
        main_layer = "pooled"
        pp = PoissonPooling(params.k_pooling, params.n_genes, compute_velocity=False, n_threads=self.config.execution.n_cpus, factorization=params.factorization, batch_keys=params.batch_keys)
        pp.fit_transform(ds)

    # Select features
    if params.features == "enrichment":
        logging.info(f"Feature selection by enrichment on preliminary clusters")
        with warnings.catch_warnings():
            # Suppress warnings about numba not being able to parallelize code
            warnings.simplefilter("ignore", category=NumbaPerformanceWarning)
            # Suppress warnings about future deprecations
            warnings.simplefilter("ignore", category=NumbaPendingDeprecationWarning)
            # Suppress warnings about setting the diagonal to 1
            warnings.simplefilter("ignore", category=SparseEfficiencyWarning)

            logging.info(f" Gene selection for PCA")
            genes = FeatureSelectionByVariance(params.n_genes, mask=Species.mask(ds, params.mask)).fit(ds)

            logging.info(f" Factorization by PCA")
            normalizer = Normalizer(False)
            normalizer.fit(ds)
            logging.info(" PCA projection to %d components", params.n_factors)
            pca = PCA(genes, max_n_components=params.n_factors, layer=main_layer, test_significance=False, batch_keys=params.batch_keys)
            transformed = pca.fit_transform(ds, normalizer)

            logging.info(f" Computing KNN (k={params.k}) in PCA space")
            nn = NNDescent(data=transformed, metric="euclidean")
            indices, distances = nn.query(transformed, k=params.k)
            # Drop the first neighbour column (presumably each cell itself)
            indices = indices[:, 1:]
            distances = distances[:, 1:]
            # Every row has the same number of entries, so the CSR row pointer is a regular stride
            knn = sparse.csr_matrix(
                (np.ravel(distances), np.ravel(indices), np.arange(0, distances.shape[0] * distances.shape[1] + 1, distances.shape[1])),
                (transformed.shape[0], transformed.shape[0]))

            # Preliminary Louvain clusters, used only to compute enrichment
            g = nx.from_scipy_sparse_matrix(knn)
            partitions = community.best_partition(g, resolution=1, randomize=False)
            ds.ca.Clusters = np.array([partitions[key] for key in range(knn.shape[0])])
            n_labels = ds.ca.Clusters.max() + 1
            genes = FeatureSelectionByEnrichment(int(params.n_genes // n_labels), Species.mask(ds, params.mask), findq=False).select(ds)
    elif params.features == "variance":
        logging.info(f"Feature selection by variance")
        genes = FeatureSelectionByVariance(params.n_genes, main_layer, Species.mask(ds, params.mask)).select(ds)
    logging.info(f"Selected {genes.sum()} genes")

    if params.factorization in ['PCA', 'both']:
        logging.info(f"Factorization by PCA")
        normalizer = Normalizer(False)
        normalizer.fit(ds)
        n_components = min(params.n_factors, ds.shape[1])
        logging.info(" PCA projection to %d components", n_components)
        pca = PCA(genes, max_n_components=n_components, layer=main_layer, test_significance=False, batch_keys=params.batch_keys)
        ds.ca.PCA = pca.fit_transform(ds, normalizer)

    if params.factorization in ['HPF', 'both']:
        logging.info(f"Factorization by HPF")
        # Load the data for the selected genes
        data = ds[main_layer].sparse(rows=genes).T
        logging.debug(f" Data shape is {data.shape}")

        # HPF factorization
        hpf = HPF(k=params.n_factors, validation_fraction=0.05, min_iter=10, max_iter=200, compute_X_ppv=False, n_threads=self.config.execution.n_cpus)
        hpf.fit(data)
        beta_all = np.zeros((ds.shape[0], hpf.beta.shape[1]))
        beta_all[genes] = hpf.beta
        # Save the unnormalized factors
        ds.ra.HPF_beta = beta_all
        ds.ca.HPF_theta = hpf.theta
        # Here we normalize so the sums over components are one, because JSD requires it
        # and because otherwise the components will be exactly proportional to cell size
        theta = (hpf.theta.T / hpf.theta.sum(axis=1)).T
        beta = (hpf.beta.T / hpf.beta.sum(axis=1)).T
        beta_all[genes] = beta
        # Save the normalized factors
        ds.ra.HPF = beta_all
        ds.ca.HPF = theta

    # Pick the latent space once for every step that needs it. Previously this was done
    # only inside the KNN step, so an embeddings-only run failed with an undefined `metric`
    if "nn" in steps or "embeddings" in steps or "clustering" in steps:
        if params.nn_space in ["PCA", "auto"] and "PCA" in ds.ca:
            transformed = ds.ca.PCA
            metric = "euclidean"
        elif params.nn_space in ["HPF", "auto"] and "HPF" in ds.ca:
            transformed = ds.ca.HPF
            metric = "js"
        else:
            raise ValueError(f"No latent space available in the dataset for params.nn_space = '{params.nn_space}'")

    if "nn" in steps or "clustering" in steps:
        logging.info(f"Computing balanced KNN (k = {params.k}) in {params.nn_space} space using the '{metric}' metric")
        bnn = BalancedKNN(k=params.k, metric=metric, maxl=2 * params.k, sight_k=2 * params.k, n_jobs=-1)
        bnn.fit(transformed)
        knn = bnn.kneighbors_graph(mode='distance')
        knn.eliminate_zeros()
        mknn = knn.minimum(knn.transpose())
        # Convert distances to similarities
        max_d = knn.data.max()
        knn.data = (max_d - knn.data) / max_d
        mknn.data = (max_d - mknn.data) / max_d
        ds.col_graphs.KNN = knn
        ds.col_graphs.MKNN = mknn
        mknn = mknn.tocoo()
        mknn.setdiag(0)
        # Compute the effective resolution
        d = 1 - knn.data
        radius = np.percentile(d, 90)
        logging.info(f" 90th percentile radius: {radius:.02}")
        ds.attrs.radius = radius
        # Keep only mutual neighbours whose similarity exceeds 1 - radius
        inside = mknn.data > 1 - radius
        rnn = sparse.coo_matrix((mknn.data[inside], (mknn.row[inside], mknn.col[inside])), shape=mknn.shape)
        ds.col_graphs.RNN = rnn

    if "embeddings" in steps or "clustering" in steps:
        logging.info(f"Computing 2D and 3D embeddings from latent space")
        # Replace js with the actual function, since OpenTSNE doesn't understand js
        metric_f = (jensen_shannon_distance if metric == "js" else metric)
        logging.info(f" Art of tSNE with {metric} distance metric")
        # art_of_tsne returns a TSNEEmbedding, which can be cast to an ndarray (its actually just a subclass)
        ds.ca.TSNE = np.array(art_of_tsne(transformed, metric=metric_f))
        logging.info(f" UMAP with {metric} distance metric")
        ds.ca.UMAP = UMAP(n_components=2, metric=metric_f, n_neighbors=params.k // 2, learning_rate=0.3, min_dist=0.25).fit_transform(transformed)
        ds.ca.UMAP3D = UMAP(n_components=3, metric=metric_f, n_neighbors=params.k // 2, learning_rate=0.3, min_dist=0.25).fit_transform(transformed)

    if "clustering" in steps:
        logging.info("Clustering by polished Louvain")
        pl = PolishedLouvain(outliers=False, graph="RNN", embedding="TSNE")
        labels = pl.fit_predict(ds)
        # NOTE(review): adding min(labels) is a no-op when the minimum is 0 but shifts labels
        # further down when outliers are marked -1; possibly `labels - min(labels)` was meant -- confirm
        ds.ca.ClustersModularity = labels + min(labels)
        ds.ca.OutliersModularity = (labels == -1).astype('int')

        logging.info("Clustering by polished Surprise")
        ps = PolishedSurprise(graph="RNN", embedding="TSNE")
        labels = ps.fit_predict(ds)
        ds.ca.ClustersSurprise = labels + min(labels)
        ds.ca.OutliersSurprise = (labels == -1).astype('int')

        if params.clusterer == "louvain":
            ds.ca.Clusters = ds.ca.ClustersModularity
            ds.ca.Outliers = ds.ca.OutliersModularity
        else:
            ds.ca.Clusters = ds.ca.ClustersSurprise
            ds.ca.Outliers = ds.ca.OutliersSurprise
        logging.info(f"Found {ds.ca.Clusters.max() + 1} clusters")

    # The human species name was previously a mangled literal that could never match,
    # so human datasets silently skipped this step
    if species.name in ["Homo sapiens", "Mus musculus"]:
        logging.info(f"Inferring cell cycle")
        CellCycleAnnotator(species).annotate(ds)