def fit_transform(self, ds: loompy.LoomConnection) -> None:
    # Poisson pooling
    self.fit(ds)
    knn = self.knn.astype("bool")
    logging.debug("Poisson pooling")
    ds["pooled"] = 'int32'
    if self.compute_velocity and "spliced" in ds.layers:
        ds["spliced_pooled"] = 'int32'
        ds["unspliced_pooled"] = 'int32'
        for (_, indexes, view) in ds.scan(axis=0, layers=["spliced", "unspliced"], what=["layers"]):
            batch = slice(indexes.min(), indexes.max() + 1)
            ds["spliced_pooled"][batch, :] = view.layers["spliced"][:, :] @ knn.T
            ds["unspliced_pooled"][batch, :] = view.layers["unspliced"][:, :] @ knn.T
            ds["pooled"][batch, :] = ds["spliced_pooled"][batch, :] + ds["unspliced_pooled"][batch, :]
    else:
        for (_, indexes, view) in ds.scan(axis=0, layers=[""], what=["layers"]):
            ds["pooled"][indexes.min():indexes.max() + 1, :] = view[:, :] @ knn.T
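# A minimal usage sketch for the method above, assuming it lives on a class
# named PoissonPooling (a hypothetical name) whose fit() builds the self.knn
# neighbor matrix:
import loompy

with loompy.connect("cells.loom") as ds:  # placeholder file path
    pooling = PoissonPooling()  # hypothetical constructor
    pooling.fit_transform(ds)   # writes "pooled" (and the spliced/unspliced variants) in place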
def fit(self, ds: loompy.LoomConnection, normalizer: cg.Normalizer, cells: np.ndarray = None) -> None:
    if cells is None:
        cells = np.fromiter(range(ds.shape[1]), dtype='int')
    # Support out-of-order datasets
    if "Accession" in ds.row_attrs:
        self.accessions = ds.row_attrs["Accession"]
    self.pca = IncrementalPCA(n_components=self.n_components)
    if self.layer is not None:
        # NOTE: the try/except works around a bug with layers on unpickled objects
        try:
            for (ix, selection, view) in ds.scan(items=cells, axis=1, layers=[self.layer]):
                vals = normalizer.transform(view.layers[self.layer][:, :], selection)
                if self.nng is not None:
                    # Presumably: mask non-neuronal genes (nng) in cells labeled "Neurons"
                    vals[np.where(self.nng)[0][:, None], np.where(ds.TaxonomyRank1 == "Neurons")[0]] = 0
                self.pca.partial_fit(vals[self.genes, :].transpose())  # PCA on the selected genes
        except AttributeError:
            self.layer = None
    if self.layer is None:
        for (ix, selection, view) in ds.scan(items=cells, axis=1):
            vals = normalizer.transform(view[:, :], selection)
            if self.nng is not None:
                vals[np.where(self.nng)[0][:, None], np.where(ds.TaxonomyRank1 == "Neurons")[0]] = 0
            self.pca.partial_fit(vals[self.genes, :].transpose())  # PCA on the selected genes
def fit(self, ds: loompy.LoomConnection) -> None:
    self.sd = np.zeros(ds.shape[0])
    self.mu = np.zeros(ds.shape[0])
    self.totals = np.zeros(ds.shape[1])
    # First pass: total UMI count per cell
    for _, selection, view in ds.scan(axis=0):
        vals = view[self.layer][:, :].astype("float")
        self.totals += np.sum(vals, axis=0)
    self.level = np.median(self.totals)
    # Second pass: per-gene mean and standard deviation of the normalized values
    for _, selection, view in ds.scan(axis=0):
        vals = view[self.layer][:, :].astype("float")
        # Rescale to the median total UMI count, plus 1 (to avoid log of zero), then log transform
        vals = np.log2(div0(vals, self.totals) * self.level + 1)
        self.mu[selection] = np.mean(vals, axis=1)
        self.sd[selection] = np.std(vals, axis=1)
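# div0 (used above) is assumed to be a safe elementwise division helper that
# returns 0 wherever the denominator is 0; a minimal sketch consistent with
# how it is called here:
import numpy as np

def div0(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Divide a by b elementwise, mapping x/0 and 0/0 to 0 instead of inf/NaN."""
    with np.errstate(divide='ignore', invalid='ignore'):
        c = np.true_divide(a, b)
    c[~np.isfinite(c)] = 0  # replace -inf, inf and NaN
    return c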
def transform(self, ds: loompy.LoomConnection, normalizer: cg.Normalizer, cells: np.ndarray = None) -> np.ndarray:
    if cells is None:
        cells = np.fromiter(range(ds.shape[1]), dtype='int')
    # Support out-of-order datasets
    if self.accessions is not None:
        # Compute the ordering that makes one list match the other (currently not applied)
        ordering = np.where(ds.row_attrs["Accession"][None, :] == self.accessions[:, None])[1]
    transformed = np.zeros((cells.shape[0], self.pca.n_components_))
    j = 0
    if self.layer is not None:
        # NOTE: the try/except works around a bug with layers on unpickled objects
        try:
            for (ix, selection, view) in ds.scan(items=cells, axis=1, layers=[self.layer]):
                vals = normalizer.transform(view.layers[self.layer][:, :], selection)
                if self.nng is not None:
                    vals[np.where(self.nng)[0][:, None], np.where(ds.TaxonomyRank1 == "Neurons")[0]] = 0
                n_cells_in_batch = selection.shape[0]
                transformed[j:j + n_cells_in_batch, :] = self.pca.transform(vals[self.genes, :].transpose())
                j += n_cells_in_batch
        except AttributeError:
            self.layer = None
    if self.layer is None:
        for (ix, selection, view) in ds.scan(items=cells, axis=1):
            vals = normalizer.transform(view[:, :], selection)
            if self.nng is not None:
                vals[np.where(self.nng)[0][:, None], np.where(ds.TaxonomyRank1 == "Neurons")[0]] = 0
            n_cells_in_batch = selection.shape[0]
            transformed[j:j + n_cells_in_batch, :] = self.pca.transform(vals[self.genes, :].transpose())
            j += n_cells_in_batch
    # Select significant components only once, and reuse them for future transformations
    if self.sigs is None:
        pvalue_KS = np.zeros(transformed.shape[1])  # p-value of each component
        for i in range(1, transformed.shape[1]):
            (_, pvalue_KS[i]) = ks_2samp(transformed[:, i - 1], transformed[:, i])
        self.sigs = np.where(pvalue_KS < 0.1)[0]
        if len(self.sigs) == 0:
            self.sigs = (0, 1)
    transformed = transformed[:, self.sigs]
    return transformed
def fit_loom(self, ds: loompy.LoomConnection, *, tolayer: str = "enrichment", knn: Union[str, sparse.csr_matrix] = "KNN") -> None:
    if tolayer not in ds.layers:
        ds[tolayer] = "float32"
    if isinstance(knn, str):
        knn_matrix = ds.col_graphs[knn].tocsr()
    else:
        knn_matrix = knn
    k = knn_matrix.count_nonzero() / knn_matrix.shape[0]  # average number of neighbors per cell
    with tqdm(total=ds.shape[0], desc="Neighborhood enrichment") as pbar:
        for ix, selection, view in ds.scan(axis=0, what=["layers"]):
            for j in range(view.shape[0]):
                ds[tolayer][j + ix, :] = self.fit(view[j, :], knn_matrix, k)
            pbar.update(view.shape[0])
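# A usage sketch, assuming the enclosing class is a neighborhood-enrichment
# scorer ("NeighborhoodEnrichment" is a hypothetical name) and that the Loom
# file already carries a "KNN" column graph:
import loompy

with loompy.connect("cells.loom") as ds:  # placeholder file path
    ne = NeighborhoodEnrichment()  # hypothetical constructor
    ne.fit_loom(ds, tolayer="enrichment", knn="KNN")  # fills the "enrichment" layer row by row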
def transform(self, ds: loompy.LoomConnection, normalizer: Normalizer, cells: np.ndarray = None) -> np.ndarray:
    if cells is None:
        cells = np.arange(ds.shape[1])
    transformed = np.zeros((cells.shape[0], self.pca.n_components_))
    j = 0
    # Support out-of-order datasets
    key = "Accession" if "Accession" in ds.row_attrs else None
    layer = self.layer if self.layer is not None else ""
    for (_, selection, view) in ds.scan(items=cells, axis=1, layers=[layer], key=key):
        vals = normalizer.transform(view.layers[layer][:, :], selection)
        n_cells_in_batch = selection.shape[0]
        transformed[j:j + n_cells_in_batch, :] = self.pca.transform(vals[self.genes, :].transpose())
        j += n_cells_in_batch
    if self.test_significance:
        # Select significant components only once, and reuse them for future transformations
        if self.sigs is None:
            pvalue_KS = np.zeros(transformed.shape[1])  # p-value of each component
            for i in range(1, transformed.shape[1]):
                (_, pvalue_KS[i]) = ks_2samp(transformed[:, i - 1], transformed[:, i])
            self.sigs = np.where(pvalue_KS < 0.1)[0]
            if len(self.sigs) == 0:
                self.sigs = (0, 1)
        transformed = transformed[:, self.sigs]
    if self.batch_keys is not None and len(self.batch_keys) > 0:
        keys_df = pd.DataFrame.from_dict({k: ds.ca[k] for k in self.batch_keys})
        transformed = harmonize(transformed, keys_df, batch_key=self.batch_keys)
    return transformed
def fit(self, ds: loompy.LoomConnection) -> np.ndarray:
    cells = np.where(ds.col_attrs["Clusters"] >= 0)[0]
    labels = ds.col_attrs["Clusters"][cells]
    n_labels = np.max(labels) + 1
    logging.info("n_labels %d", n_labels)
    self.trinary_prob = np.empty((ds.shape[0], n_labels))
    self.genes = ds.ra.Gene
    for (ix, selection, view) in ds.scan(axis=0, what=["layers"]):
        vals = view[:, cells]
        for j, row in enumerate(selection):
            data = np.round(vals[j, :])
            self.trinary_prob[row, :] = self._betabinomial_trinarize_array(data, labels, self.f, n_labels)
    return self.trinary_prob
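# A usage sketch, assuming the enclosing class is a trinarization scorer
# ("Trinarizer" is a hypothetical name) and that self.f is the expression
# fraction threshold used by _betabinomial_trinarize_array:
import loompy

with loompy.connect("cells.loom") as ds:  # placeholder file path
    probs = Trinarizer(f=0.2).fit(ds)  # hypothetical constructor
    # probs[g, c] is the posterior probability that gene g is expressed
    # in at least a fraction f of the cells of cluster c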
def fit(self, ds: loompy.LoomConnection, normalizer: Normalizer, cells: np.ndarray = None) -> None:
    if cells is None:
        cells = np.fromiter(range(ds.shape[1]), dtype='int')
    # Support out-of-order datasets
    key = "Accession" if "Accession" in ds.row_attrs else None
    self.pca = IncrementalPCA(n_components=self.n_components)
    layer = self.layer if self.layer is not None else ""
    for (_, selection, view) in ds.scan(items=cells, axis=1, layers=[layer], key=key):
        if len(selection) < self.n_components:
            continue  # IncrementalPCA requires at least n_components samples per batch
        vals = normalizer.transform(view.layers[layer][:, :], selection)
        self.pca.partial_fit(vals[self.genes, :].transpose())  # PCA on the selected genes
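# A sketch combining the incremental fit above with the transform shown
# earlier; the class name "PCAProjection" and its constructor signature are
# hypothetical, and Normalizer is assumed to expose the fit()/transform()
# pair used above:
import loompy

with loompy.connect("cells.loom") as ds:  # placeholder file path
    normalizer = Normalizer()
    normalizer.fit(ds)
    # selected_genes: a precomputed index array of genes to use (assumed to exist)
    pca = PCAProjection(genes=selected_genes, n_components=50)  # hypothetical signature
    pca.fit(ds, normalizer)
    factors = pca.transform(ds, normalizer)  # (n_cells, n_components), batch-corrected if batch_keys is set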
def fit(self, ds: loompy.LoomConnection) -> Tuple[sparse.coo_matrix, sparse.coo_matrix, np.ndarray]:
    """
    Discover the manifold

    Returns:
        knn     The knn graph as a sparse matrix
        mknn    Mutual knn subgraph
        pos     2D projection (gt-SNE) as ndarray with shape (n_cells, 2)
    """
    n_cells = ds.shape[1]
    logging.info("Processing all %d cells", n_cells)
    logging.info("Validating genes")
    nnz = ds.map([np.count_nonzero], axis=0)[0]
    valid_genes = np.logical_and(nnz > 5, nnz < ds.shape[1] * 0.5).astype("int")
    ds.ra._Valid = valid_genes
    logging.info("%d of %d genes were valid", np.sum(ds.ra._Valid == 1), ds.shape[0])

    logging.info("Normalization")
    normalizer = cg.Normalizer(False)
    normalizer.fit(ds)

    logging.info("Selecting up to %d genes", self.n_genes)
    genes = cg.FeatureSelection(self.n_genes).fit(ds, mu=normalizer.mu, sd=normalizer.sd)

    logging.info("Loading data for selected genes")
    data = np.zeros((n_cells, genes.shape[0]))
    for (ix, selection, view) in ds.scan(axis=1):
        data[selection, :] = view[genes, :].T  # selection holds global column indices

    logging.info("Computing initial subspace KNN")
    subspaces = np.ones(data.shape)
    knn = subspace_knn_graph(data, subspaces)
    mknn = knn.minimum(knn.transpose()).tocoo()

    for t in range(5):
        logging.info(f"Refining subspace KNN (iteration {t + 1})")

        logging.info("Louvain clustering")
        graph = nx.from_scipy_sparse_matrix(mknn)
        partitions = community.best_partition(graph)
        labels = np.array([partitions[key] for key in range(mknn.shape[0])])
        ds.ca.Clusters = labels
        n_labels = np.max(labels) + 1
        logging.info(f"Found {n_labels} clusters")

        logging.info("Marker selection")
        (_, enrichment, _) = cg.MarkerSelection(n_markers=10, findq=False).fit(ds)
        subspaces = np.zeros(data.shape)
        for ix in range(enrichment.shape[1]):
            for j in range(n_cells):
                subspaces[j, np.argsort(-enrichment[:, ix])[:self.n_genes // n_labels]] = 1
        knn = subspace_knn_graph(data, subspaces)
        mknn = knn.minimum(knn.transpose()).tocoo()

    perplexity = min(self.k, (n_cells - 1) / 3 - 1)
    logging.info("gt-SNE layout")
    # The perplexity argument is ignored in this case, but must still be given
    # because bhtsne checks that it has a valid value
    tsne_pos = cg.TSNE(perplexity=perplexity).layout(data, knn=knn.tocsr())
    return (knn, mknn, tsne_pos)
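# A usage sketch for the manifold pipeline above ("ManifoldLearner" and its
# constructor arguments are hypothetical names):
import loompy

with loompy.connect("cells.loom") as ds:  # placeholder file path
    knn, mknn, pos = ManifoldLearner(n_genes=1000, k=50).fit(ds)
    ds.col_graphs.KNN = knn    # persist the graphs alongside the data
    ds.col_graphs.MKNN = mknn
    ds.ca.X, ds.ca.Y = pos[:, 0], pos[:, 1]  # store the 2D layout as column attributes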
def fit(self, ds: loompy.LoomConnection) -> None:
    logging.info("Computing pseudoage")
    ages = np.array([age_to_num(x) for x in ds.ca.Age])
    knn = ds.col_graphs.KNN
    k = knn.nnz / knn.shape[0]
    ds.ca.PseudoAge = (knn.astype("bool") @ ages) / k  # average age of each cell's neighborhood

    logging.info("Slicing pseudoage")
    slice_names: List[str] = []
    with TemporaryDirectory() as tempfolder:
        slices = np.percentile(ds.ca.PseudoAge, np.arange(0, 101, 5))
        logging.info("Collecting cells")
        for (ix, _, view) in ds.scan(axis=1):
            # Each slice spans two adjacent 5-percentile bins, so consecutive slices overlap by half
            for i in range(len(slices) - 2):
                s1 = slices[i]
                s2 = slices[i + 2]
                slice_name = f"Age{s1:05.2f}to{s2:05.2f}".replace(".", "") + ".loom"
                if slice_name not in slice_names:
                    slice_names.append(slice_name)
                cells = (view.ca.PseudoAge >= s1) & (view.ca.PseudoAge < s2)
                if cells.sum() == 0:
                    continue
                fname = os.path.join(tempfolder, slice_name)
                if not os.path.exists(fname):
                    with loompy.new(fname) as dsout:
                        dsout.add_columns(view.layers[:, cells], col_attrs=view.ca[cells], row_attrs=view.ra)
                else:
                    with loompy.connect(fname) as dsout:
                        dsout.add_columns(view.layers[:, cells], col_attrs=view.ca[cells], row_attrs=view.ra)
        for slice_name in slice_names:
            fname = os.path.join(tempfolder, slice_name)
            logging.info("Cytograph on " + slice_name)
            with loompy.connect(fname) as dsslice:  # renamed to avoid shadowing the outer ds
                Cytograph(config=load_config()).fit(dsslice)

        # Use dynamic programming to find the deepest tree (forest), as given by the total number of cells along each branch
        logging.info("Computing pseudolineage")
        clusters = "Clusters"
        min_pct = 0.1

        # List of matrices giving the bipartite graph between each pair of layers, weighted by the number of shared cells
        overlaps = []
        n_nodes = []  # List of the number of nodes (clusters) in each layer
        n_cells = []  # List of arrays giving the number of cells in each cluster
        n_layers = len(slice_names)

        # Compute the bipartite graphs between layers
        for t in range(n_layers):
            # Link clusters from layer t to clusters from layer t + 1
            logging.info(slice_names[t])
            with loompy.connect(os.path.join(tempfolder, slice_names[t])) as ds1:
                n_nodes.append(ds1.ca[clusters].max() + 1)
                n_cells.append(np.zeros(n_nodes[t]))
                for c in range(n_nodes[t]):
                    n_cells[t][c] = (ds1.ca[clusters] == c).sum()
                if t >= n_layers - 1:
                    break
                with loompy.connect(os.path.join(tempfolder, slice_names[t + 1])) as ds2:
                    overlap = np.zeros((np.unique(ds1.ca[clusters]).shape[0], np.unique(ds2.ca[clusters]).shape[0]), dtype="int")
                    for i in np.unique(ds1.ca[clusters]):
                        cells1 = ds1.ca.CellID[ds1.ca[clusters] == i]
                        for j in np.unique(ds2.ca[clusters]):
                            cells2 = ds2.ca.CellID[ds2.ca[clusters] == j]
                            overlap[i, j] = np.intersect1d(cells1, cells2).shape[0]
                    overlaps.append(overlap)

        # List of arrays keeping track of the depth of the deepest tree starting at each node in the layer
        # Depth is defined as the sum of the number of shared cells along the branch
        depths = [np.zeros(n, dtype="int") for n in n_nodes]
        # List of arrays giving the predecessor of each cluster (or -1 if there is no predecessor)
        edges = [np.zeros(n, dtype="int") for n in n_nodes[1:]]
        for t in range(0, n_layers - 1):
            for i in range(n_nodes[t + 1]):
                # Find the widest, deepest branch from any node j in layer t to node i in layer t + 1
                # Widest, deepest meaning: greatest sum of depth up to node j in layer t plus number of shared cells,
                # but disallowing any branch with less than min_pct % shared cells
                best_j = -1
                best_depth = 0
                for j in range(n_nodes[t]):
                    pct_overlapping = 100 * overlaps[t][j, i] / (n_cells[t][j] + n_cells[t + 1][i])
                    if pct_overlapping > min_pct:
                        depth = depths[t][j] + overlaps[t][j, i]
                        if depth > best_depth:
                            best_depth = depth
                            best_j = j
                depths[t + 1][i] = best_depth  # propagate the depth so the recursion is a true DP
                edges[t][i] = best_j

        # Now we have:
        #   edges: List of arrays giving the index of the predecessor of each cluster (or -1 if no predecessor exists)
        #   overlaps: List of matrices giving the number of cells shared between clusters in layer t and t + 1
        #   n_nodes: List of the number of nodes (clusters) in each layer
        #   n_cells: List of arrays of the number of cells in each node (cluster)

        # Position the nodes of each layer such that no edges cross
        ypositions = [np.arange(n_nodes[0])]
        for t in range(len(edges)):
            pos = np.full(n_nodes[t + 1], -1)
            for i in range(pos.shape[0]):
                prev = edges[t][i]
                if prev >= 0:
                    pos[i] = ypositions[t][prev]
            ordering = np.argsort(pos)
            mapping = dict(zip(ordering, range(len(ordering))))
            ypositions.append(np.array([mapping[i] for i in range(len(ordering))]))

        # Make the positions proportional to the number of cells (cumulative)
        max_pos = 0
        for i, pos in enumerate(ypositions):
            with loompy.connect(os.path.join(tempfolder, slice_names[i])) as ds0:
                n_clusters = ds0.ca[clusters].max() + 1
                ncells = np.array([(ds0.ca[clusters] == c).sum() for c in range(n_clusters)])
            total = 0
            new_pos = np.zeros_like(pos)
            for j in range(len(pos)):
                cluster = np.where(pos == j)[0]
                new_pos[cluster] = total + ncells[cluster] / 2
                total += ncells[cluster]
            ypositions[i] = new_pos / 1000
            max_pos = max(max_pos, max(ypositions[i]))

        # Center the layers vertically
        for i, pos in enumerate(ypositions):
            ypositions[i] += (max_pos - np.max(pos)) / 2

        # Then position the layers properly in time
        xpositions = []
        for i in range(n_layers):
            with loompy.connect(os.path.join(tempfolder, slice_names[i])) as ds0:
                xpositions.append(np.mean(ds0.ca.PseudoAge))

        # Now project each individual cell to the pseudolineage
        logging.info("Projecting cells to pseudolineage")
        cell_to_xy = {}
        for t in range(len(n_nodes) - 1):
            with loompy.connect(os.path.join(tempfolder, slice_names[t])) as ds0:
                with loompy.connect(os.path.join(tempfolder, slice_names[t + 1])) as ds1:
                    for i in range(n_nodes[t + 1]):
                        if edges[t][i] != -1:
                            y1 = ypositions[t][edges[t][i]]
                            y2 = ypositions[t + 1][i]
                            offset = (xpositions[t + 1] - xpositions[t]) / 4
                            overlapping_cells = (ds1.ca[clusters] == i) & (ds1.ca.PseudoAge < slices[t + 2])
                            crs = np.array(CatmullRomSpline(n_points=100).fit_transform(np.array([
                                [slices[t + 1] - offset, y1],
                                [slices[t + 1], y1],
                                [slices[t + 2], y2],
                                [slices[t + 2] + offset, y2]])))
                            widths = np.linspace(n_cells[t][edges[t][i]], n_cells[t + 1][i], num=100) / 1500
                            f = interp1d(crs[:, 0], crs[:, 1], fill_value="extrapolate")
                            fw = interp1d(crs[:, 0], widths, fill_value="extrapolate")
                            y = f(ds1.ca.PseudoAge[overlapping_cells]) + np.random.normal(scale=fw(ds1.ca.PseudoAge[overlapping_cells]) / 6, size=overlapping_cells.sum())
                            for n, ix in enumerate(np.where(overlapping_cells)[0]):  # n, not i, to avoid clobbering the cluster index
                                cell_to_xy[ds1.ca.CellID[ix]] = [ds1.ca.PseudoAge[ix], y[n]]
                    # Draw the leftmost pseudoage slice
                    if t == 0:
                        for i in range(n_nodes[0]):
                            y1 = ypositions[0][i]
                            widths = np.full(100, n_cells[0][i]) / 1500
                            overlapping_cells = (ds0.ca[clusters] == i) & (ds0.ca.PseudoAge < slices[1])
                            y = y1 + np.random.normal(scale=widths[0] / 6, size=overlapping_cells.sum())
                            for n, ix in enumerate(np.where(overlapping_cells)[0]):
                                cell_to_xy[ds0.ca.CellID[ix]] = [ds0.ca.PseudoAge[ix], y[n]]  # these cells live in ds0, not ds1
                    # Draw the rightmost pseudoage slice
                    if t == len(n_nodes) - 2:
                        for i in range(n_nodes[-1]):
                            y2 = ypositions[t + 1][i]
                            widths = np.linspace(n_cells[t][edges[t][i]], n_cells[t + 1][i], num=100) / 1500
                            overlapping_cells = (ds1.ca[clusters] == i) & (ds1.ca.PseudoAge > slices[-2])
                            y = y2 + np.random.normal(scale=widths[-1] / 6, size=overlapping_cells.sum())
                            for n, ix in enumerate(np.where(overlapping_cells)[0]):
                                cell_to_xy[ds1.ca.CellID[ix]] = [ds1.ca.PseudoAge[ix], y[n]]

    logging.info("Saving pseudolineage projection back in original file")
    xy = np.zeros((ds.shape[1], 2))
    for cellid, pos in cell_to_xy.items():
        j = np.where(ds.ca.CellID == cellid)[0]
        xy[j] = pos
    ds.ca.PseudoLineage = xy
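# age_to_num (used at the top of fit above) is assumed to map age labels such
# as "e11.5" (embryonic day) or "p7" (postnatal day) to a single float; a
# hedged sketch of one possible convention (the postnatal offset of 19 days
# is an illustrative assumption, not taken from the source):
def age_to_num(age: str) -> float:
    age = age.lower().strip()
    if age.startswith("e"):
        return float(age[1:])          # embryonic day, as-is
    if age.startswith("p"):
        return 19.0 + float(age[1:])   # postnatal day, offset by an assumed gestation length
    return float(age)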
def aggregate_loom(ds: loompy.LoomConnection, out_file: str, select: np.ndarray, group_by: str, aggr_by: str, aggr_ca_by: Dict[str, str], return_matrix: bool = False) -> np.ndarray:
    """
    Aggregate a Loom file by applying aggregation functions to the main matrix as well as the column attributes

    Args:
        ds             The Loom file
        out_file       The name of the output Loom file (will be appended to if it exists)
        select         Bool array giving the columns to include (or None, to include all)
        group_by       The column attribute to group by
        aggr_by        The aggregation function for the main matrix
        aggr_ca_by     The aggregation functions for the column attributes (or None to skip)

    Remarks:
        aggr_by gives the aggregation function for the main matrix
        aggr_ca_by is a dictionary with column attributes as keys and aggregation functions as values
        Aggregation functions can be any valid aggregation function from https://github.com/ml31415/numpy-groupies

        In addition, you can specify "tally" to count the number of occurrences of each value of a categorical attribute
    """
    ca = {}  # type: Dict[str, np.ndarray]
    if select is not None:
        raise ValueError("The 'select' argument is deprecated")
    labels = (ds.ca[group_by]).astype('int')
    # Renumber the labels so they are zero-based, sorted and without holes
    _, zero_strt_sort_noholes_lbls = np.unique(labels, return_inverse=True)
    n_groups = len(set(labels))
    if aggr_ca_by is not None:
        for key in ds.col_attrs.keys():
            if key not in aggr_ca_by:
                continue
            func = aggr_ca_by[key]
            if func == "tally":
                for val in set(ds.col_attrs[key]):
                    ca[key + "_" + val] = npg.aggregate(zero_strt_sort_noholes_lbls, (ds.col_attrs[key] == val).astype('int'), func="sum", fill_value=0)
            elif func == "mode":
                def mode(x):
                    return scipy.stats.mode(x)[0][0]
                ca[key] = npg.aggregate(zero_strt_sort_noholes_lbls, ds.col_attrs[key], func=mode, fill_value=0).astype('str')
            elif func == "mean":
                ca[key] = npg.aggregate(zero_strt_sort_noholes_lbls, ds.col_attrs[key], func=func, fill_value=0)
            elif func == "first":
                ca[key] = npg.aggregate(zero_strt_sort_noholes_lbls, ds.col_attrs[key], func=func, fill_value=ds.col_attrs[key][0])
    m = np.empty((ds.shape[0], n_groups))
    for (_, selection, view) in ds.scan(axis=0):
        vals_aggr = npg.aggregate(zero_strt_sort_noholes_lbls, view[:, :], func=aggr_by, axis=1, fill_value=0)
        m[selection, :] = vals_aggr
    if return_matrix:
        return m
    loompy.create_append(out_file, m, ds.ra, ca, fill_values="auto")
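# A usage sketch for aggregate_loom: mean-aggregate the main matrix by cluster,
# tally sample labels, and keep the first/modal value of other attributes (the
# attribute names "Clusters", "SampleID" and "Age" are illustrative):
import loompy

with loompy.connect("cells.loom") as ds:  # placeholder file path
    aggregate_loom(ds, "clusters.agg.loom", None, "Clusters", "mean",
                   {"SampleID": "tally", "Age": "mode", "Clusters": "first"})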