示例#1
0
    def fit_transform(self, ds: loompy.LoomConnection) -> None:
        """
        Fit on ``ds`` and write neighbour-pooled layers back into it.

        After ``self.fit(ds)``, ``self.knn`` is cast to boolean and every batch
        of rows is multiplied by its transpose. The result is stored in the
        ``"pooled"`` layer. When ``self.compute_velocity`` is set and ``ds`` has
        a ``"spliced"`` layer, ``"spliced_pooled"`` and ``"unspliced_pooled"``
        are written as well, and ``"pooled"`` is filled with their sum.
        """
        self.fit(ds)
        pooling = self.knn.astype("bool").T

        logging.debug("Poisson pooling")
        ds["pooled"] = 'int32'

        if not (self.compute_velocity and "spliced" in ds.layers):
            # Only the main matrix (layer "") is pooled
            for (_, indexes, view) in ds.scan(axis=0,
                                              layers=[""],
                                              what=["layers"]):
                rows = slice(indexes.min(), indexes.max() + 1)
                ds["pooled"][rows, :] = view[:, :] @ pooling
            return

        ds["spliced_pooled"] = 'int32'
        ds["unspliced_pooled"] = 'int32'
        sources_and_targets = (("spliced", "spliced_pooled"),
                               ("unspliced", "unspliced_pooled"))
        for (_, indexes, view) in ds.scan(axis=0,
                                          layers=["spliced", "unspliced"],
                                          what=["layers"]):
            rows = slice(indexes.min(), indexes.max() + 1)
            for source, target in sources_and_targets:
                ds[target][rows, :] = view.layers[source][:, :] @ pooling
            # The total is the sum of the two pooled layers as stored in the file
            spliced_rows = ds["spliced_pooled"][rows, :]
            unspliced_rows = ds["unspliced_pooled"][rows, :]
            ds["pooled"][rows, :] = spliced_rows + unspliced_rows
示例#2
0
	def fit(self, ds: loompy.LoomConnection, normalizer: cg.Normalizer, cells: np.ndarray = None) -> None:
		"""
		Fit an incremental PCA on the selected genes, one batch of columns at a time

		Args:
			ds			The Loom file to scan
			normalizer	Object whose transform(values, selection) is applied to every batch
			cells		Indices of the columns to use (or None, to use all columns)

		Remarks:
			The fitted model is stored in self.pca. If "Accession" is a row attribute,
			it is remembered in self.accessions.

			When self.layer is set, that layer is scanned; if an AttributeError is raised
			while doing so, self.layer is reset to None and the fit is restarted from
			scratch on the default scan.
		"""
		if cells is None:
			cells = np.arange(ds.shape[1])

		# Support out-of-order datasets
		if "Accession" in ds.row_attrs:
			self.accessions = ds.row_attrs["Accession"]

		self.pca = IncrementalPCA(n_components=self.n_components)

		def fit_batch(matrix: np.ndarray, selection: np.ndarray) -> None:
			# Normalize one batch, optionally blank the nng genes, and update the PCA
			vals = normalizer.transform(matrix, selection)
			if self.nng is not None:
				# NOTE(review): the column index comes from the whole dataset while vals only
				# holds the current batch - confirm this is intended
				vals[np.where(self.nng)[0][:, None], np.where(ds.TaxonomyRank1 == "Neurons")[0]] = 0
			self.pca.partial_fit(vals[self.genes, :].transpose())  # PCA on the selected genes

		if self.layer is not None:
			# NOTE TO AVOID a BUG with layer of pickled objects
			try:
				for (_, selection, view) in ds.scan(items=cells, axis=1, layers=[self.layer]):
					fit_batch(view.layers[self.layer][:, :], selection)
			except AttributeError:
				self.layer = None
				# Discard any batches already fitted so the fallback below does not count them twice
				self.pca = IncrementalPCA(n_components=self.n_components)
		if self.layer is None:
			for (_, selection, view) in ds.scan(items=cells, axis=1):
				fit_batch(view[:, :], selection)
示例#3
0
    def fit(self, ds: loompy.LoomConnection) -> None:
        """
        Compute per-row mean and standard deviation of the log-normalized values.

        Two passes over ``ds`` are made, both scanning along axis 0 and reading
        ``self.layer``:

        1. accumulate the per-column totals (``self.totals``) and take their
           median (``self.level``);
        2. rescale every column to that median total, add 1, take ``log2`` and
           store the per-row mean (``self.mu``) and standard deviation
           (``self.sd``).
        """
        n_rows, n_cols = ds.shape[0], ds.shape[1]
        self.sd = np.zeros(n_rows)
        self.mu = np.zeros(n_rows)
        self.totals = np.zeros(n_cols)

        # Pass 1: column totals over all rows
        for _, _, batch in ds.scan(axis=0):
            counts = batch[self.layer][:, :].astype("float")
            self.totals += np.sum(counts, axis=0)
        self.level = np.median(self.totals)

        # Pass 2: statistics of the rescaled, log-transformed values
        for _, rows, batch in ds.scan(axis=0):
            counts = batch[self.layer][:, :].astype("float")
            # The +1 avoids taking the log of zero
            rescaled = div0(counts, self.totals) * self.level + 1
            logged = np.log2(rescaled)
            self.mu[rows] = np.mean(logged, axis=1)
            self.sd[rows] = np.std(logged, axis=1)
示例#4
0
	def transform(self, ds: loompy.LoomConnection, normalizer: cg.Normalizer, cells: np.ndarray = None) -> np.ndarray:
		"""
		Project the selected columns onto the fitted principal components

		Args:
			ds			The Loom file to scan
			normalizer	Object whose transform(values, selection) is applied to every batch
			cells		Indices of the columns to project (or None, to project all columns)

		Returns:
			Array with one row per entry of cells and one column per retained component

		Remarks:
			On the first call (self.sigs is None) the components to keep are chosen with a
			two-sample Kolmogorov-Smirnov test between consecutive components (p < 0.1) and
			cached in self.sigs; later calls reuse that selection.

			When self.layer is set, that layer is scanned; if an AttributeError is raised
			while doing so, self.layer is reset to None and the projection is redone from
			the first row on the default scan.
		"""
		if cells is None:
			cells = np.arange(ds.shape[1])

		# NOTE(review): a gene ordering derived from self.accessions used to be computed here
		# but was never applied, so out-of-order datasets are not actually re-ordered.

		transformed = np.zeros((cells.shape[0], self.pca.n_components_))

		def project_batch(matrix: np.ndarray, selection: np.ndarray, offset: int) -> int:
			# Normalize and project one batch into transformed; return the next free row
			vals = normalizer.transform(matrix, selection)
			if self.nng is not None:
				# NOTE(review): the column index comes from the whole dataset while vals only
				# holds the current batch - confirm this is intended
				vals[np.where(self.nng)[0][:, None], np.where(ds.TaxonomyRank1 == "Neurons")[0]] = 0
			n_cells_in_batch = selection.shape[0]
			transformed[offset:offset + n_cells_in_batch, :] = self.pca.transform(vals[self.genes, :].transpose())
			return offset + n_cells_in_batch

		j = 0
		if self.layer is not None:
			# NOTE TO AVOID a BUG with layer of pickled objects
			try:
				for (_, selection, view) in ds.scan(items=cells, axis=1, layers=[self.layer]):
					j = project_batch(view.layers[self.layer][:, :], selection, j)
			except AttributeError:
				self.layer = None
		if self.layer is None:
			# Start again from the first row, in case the layer scan failed part-way through
			j = 0
			for (_, selection, view) in ds.scan(items=cells, axis=1):
				j = project_batch(view[:, :], selection, j)

		# Must select significant components only once, and reuse for future transformations
		if self.sigs is None:
			# pvalue_KS[0] is left at zero, so the first component always passes the threshold
			pvalue_KS = np.zeros(transformed.shape[1])  # pvalue of each component
			for i in range(1, transformed.shape[1]):
				(_, pvalue_KS[i]) = ks_2samp(transformed[:, i - 1], transformed[:, i])
			self.sigs = np.where(pvalue_KS < 0.1)[0]
			if len(self.sigs) == 0:
				self.sigs = (0, 1)

		return transformed[:, self.sigs]
示例#5
0
	def fit_loom(self, ds: loompy.LoomConnection, *, tolayer: str = "enrichment", knn: Union[str, sparse.csr_matrix] = "KNN") -> None:
		"""
		Compute self.fit() for every row of the Loom file and store the results in a layer

		Args:
			ds			The Loom file (the layer is written into it)
			tolayer		Name of the destination layer; created as "float32" if it does not exist
			knn			Either the name of a column graph in ds, or the graph itself as a sparse matrix

		Remarks:
			k, the third argument passed to self.fit(), is the number of non-zero entries
			of the graph divided by its number of rows.
		"""
		if tolayer not in ds.layers:
			ds[tolayer] = "float32"
		# isinstance() also accepts str subclasses (e.g. numpy strings) as graph names
		knn_matrix = ds.col_graphs[knn].tocsr() if isinstance(knn, str) else knn
		k = knn_matrix.count_nonzero() / knn_matrix.shape[0]
		with tqdm(total=ds.shape[0], desc="Neighborhood enrichment") as pbar:
			for ix, _, view in ds.scan(axis=0, what=["layers"]):
				n_rows = view.shape[0]
				for j in range(n_rows):
					# ix is the offset of this batch, so j + ix is the row in the full file
					ds[tolayer][j + ix, :] = self.fit(view[j, :], knn_matrix, k)
				pbar.update(n_rows)
示例#6
0
    def transform(self,
                  ds: loompy.LoomConnection,
                  normalizer: Normalizer,
                  cells: np.ndarray = None) -> np.ndarray:
        """
        Project the selected columns of ``ds`` onto the fitted principal components.

        Args:
            ds          The Loom file to scan
            normalizer  Object whose ``transform(values, selection)`` is applied to every batch
            cells       Indices of the columns to project (or None, to project all columns)

        Returns:
            Array with one row per entry of ``cells``.

        Remarks:
            If ``self.test_significance`` is set, the components to keep are chosen on
            the first call with a two-sample Kolmogorov-Smirnov test between consecutive
            components (p < 0.1), cached in ``self.sigs`` and reused afterwards.

            If ``self.batch_keys`` is non-empty, the result is passed through
            ``harmonize`` together with those column attributes, restricted to ``cells``
            so that both inputs describe the same columns.
        """
        if cells is None:
            cells = np.arange(ds.shape[1])

        transformed = np.zeros((cells.shape[0], self.pca.n_components_))

        # Support out-of-order datasets
        key = "Accession" if "Accession" in ds.row_attrs else None

        layer = self.layer if self.layer is not None else ""
        j = 0
        for (_, selection, view) in ds.scan(items=cells,
                                            axis=1,
                                            layers=[layer],
                                            key=key):
            vals = normalizer.transform(view.layers[layer][:, :], selection)
            n_cells_in_batch = selection.shape[0]
            transformed[j:j + n_cells_in_batch, :] = self.pca.transform(
                vals[self.genes, :].transpose())
            j += n_cells_in_batch

        if self.test_significance:
            # Must select significant components only once, and reuse for future transformations
            if self.sigs is None:
                # pvalue_KS[0] is left at zero, so the first component always passes the threshold
                pvalue_KS = np.zeros(
                    transformed.shape[1])  # pvalue of each component
                for i in range(1, transformed.shape[1]):
                    (_, pvalue_KS[i]) = ks_2samp(transformed[:, i - 1],
                                                 transformed[:, i])
                self.sigs = np.where(pvalue_KS < 0.1)[0]
                if len(self.sigs) == 0:
                    self.sigs = (0, 1)

            transformed = transformed[:, self.sigs]

        if self.batch_keys is not None and len(self.batch_keys) > 0:
            # Take the batch attributes for the projected columns only; using the full
            # attribute would give a frame whose length differs from `transformed`
            # whenever `cells` is a subset.
            keys_df = pd.DataFrame.from_dict(
                {k: ds.ca[k][cells]
                 for k in self.batch_keys})
            transformed = harmonize(transformed,
                                    keys_df,
                                    batch_key=self.batch_keys)
        return transformed
示例#7
0
    def fit(self, ds: loompy.LoomConnection) -> np.ndarray:
        """
        Fill ``self.trinary_prob`` with one row per row of ``ds`` and one column per cluster.

        Only columns whose ``"Clusters"`` attribute is non-negative are used. Each
        row's values are rounded and handed, together with the cluster labels,
        ``self.f`` and the number of labels, to
        ``self._betabinomial_trinarize_array``.

        Returns:
            ``self.trinary_prob``, with shape ``(ds.shape[0], max label + 1)``
        """
        clustered = ds.col_attrs["Clusters"] >= 0
        cells = np.nonzero(clustered)[0]
        labels = ds.col_attrs["Clusters"][cells]
        n_labels = np.max(labels) + 1
        logging.info("n_labels %d", n_labels)
        self.trinary_prob = np.empty((ds.shape[0], n_labels))
        self.genes = ds.ra.Gene

        for (_, selection, view) in ds.scan(axis=0, what=["layers"]):
            batch = view[:, cells]
            # selection holds the row indices in the full file for this batch
            for row, values in zip(selection, batch):
                rounded = np.round(values)
                probs = self._betabinomial_trinarize_array(
                    rounded, labels, self.f, n_labels)
                self.trinary_prob[row, :] = probs

        return self.trinary_prob
示例#8
0
    def fit(self,
            ds: loompy.LoomConnection,
            normalizer: Normalizer,
            cells: np.ndarray = None) -> None:
        """
        Fit an incremental PCA on the selected genes, one batch of columns at a time.

        Args:
            ds          The Loom file to scan
            normalizer  Object whose ``transform(values, selection)`` is applied to every batch
            cells       Indices of the columns to use (or None, to use all columns)

        Remarks:
            The fitted model is stored in ``self.pca``. Batches with fewer columns
            than ``self.n_components`` are skipped.
        """
        if cells is None:
            cells = np.arange(ds.shape[1])

        # Support out-of-order datasets
        key = "Accession" if "Accession" in ds.row_attrs else None

        self.pca = IncrementalPCA(n_components=self.n_components)
        layer = "" if self.layer is None else self.layer
        batches = ds.scan(items=cells, axis=1, layers=[layer], key=key)
        for (_, selection, view) in batches:
            if len(selection) >= self.n_components:
                vals = normalizer.transform(view.layers[layer][:, :],
                                            selection)
                selected = vals[self.genes, :]  # PCA on the selected genes
                self.pca.partial_fit(selected.transpose())
示例#9
0
    def fit(
        self, ds: loompy.LoomConnection
    ) -> Tuple[sparse.coo_matrix, sparse.coo_matrix, np.ndarray]:
        """
        Discover the manifold

        Args:
            ds      The Loom file; ``ra._Valid`` and ``ca.Clusters`` are overwritten

        Returns:
            knn     The knn graph as a sparse matrix
            mknn    Mutual knn subgraph
            pos     2D projection (gt-SNE) as ndarray with shape (n_cells, 2)

        Remarks:
            After an initial subspace KNN on all selected genes, five refinement
            rounds are run: Louvain clustering on the mutual KNN graph, marker
            selection on the resulting clusters, and a new subspace KNN restricted
            to the top markers.
        """
        n_cells = ds.shape[1]
        logging.info("Processing all %d cells", n_cells)
        logging.info("Validating genes")
        nnz = ds.map([np.count_nonzero], axis=0)[0]
        # Valid genes are non-zero in more than 5 cells but fewer than half of them
        valid_genes = np.logical_and(nnz > 5,
                                     nnz < ds.shape[1] * 0.5).astype("int")
        ds.ra._Valid = valid_genes
        logging.info("%d of %d genes were valid", np.sum(ds.ra._Valid == 1),
                     ds.shape[0])

        logging.info("Normalization")
        normalizer = cg.Normalizer(False)
        normalizer.fit(ds)
        logging.info("Selecting up to %d genes", self.n_genes)
        genes = cg.FeatureSelection(self.n_genes).fit(ds,
                                                      mu=normalizer.mu,
                                                      sd=normalizer.sd)

        logging.info("Loading data for selected genes")
        data = np.zeros((n_cells, genes.shape[0]))
        for (_, selection, view) in ds.scan(axis=1):
            # selection holds column indices in the full file, which are exactly the
            # rows of `data`; subtracting the batch offset would make every batch
            # overwrite the first rows.
            data[selection, :] = view[genes, :].T

        logging.info("Computing initial subspace KNN")
        subspaces = np.ones(data.shape)
        knn = subspace_knn_graph(data, subspaces)
        mknn = knn.minimum(knn.transpose()).tocoo()

        for t in range(5):
            logging.info(f"Refining subspace KNN (iteration {t + 1})")

            logging.info("Louvain clustering")
            graph = nx.from_scipy_sparse_matrix(mknn)
            partitions = community.best_partition(graph)
            labels = np.array(
                [partitions[key] for key in range(mknn.shape[0])])
            ds.ca.Clusters = labels
            n_labels = np.max(labels) + 1
            logging.info(f"Found {n_labels} clusters")

            logging.info("Marker selection")
            (_, enrichment, _) = cg.MarkerSelection(n_markers=10,
                                                    findq=False).fit(ds)
            # NOTE(review): every cell receives the union of all clusters' top markers,
            # and the indices come from the rows of `enrichment` while the columns of
            # `data` are the selected genes only - confirm both are intended.
            n_top = self.n_genes // n_labels
            subspaces = np.zeros(data.shape)
            for ix in range(enrichment.shape[1]):
                top_markers = np.argsort(-enrichment[:, ix])[:n_top]
                subspaces[:, top_markers] = 1
            knn = subspace_knn_graph(data, subspaces)
            mknn = knn.minimum(knn.transpose()).tocoo()

        perplexity = min(self.k, (n_cells - 1) / 3 - 1)
        logging.info("gt-SNE layout")
        # Note that perplexity argument is ignored in this case, but must still be given
        # because bhtsne will check that it has a valid value
        tsne_pos = cg.TSNE(perplexity=perplexity).layout(data, knn=knn.tocsr())

        return (knn, mknn, tsne_pos)
示例#10
0
    def fit(self, ds: loompy.LoomConnection) -> None:
        """
        Build a "pseudolineage" from overlapping pseudoage slices of ``ds``.

        Steps, in order:

        1. Write ``ds.ca.PseudoAge``: for every cell, the sum of the ages of its
           KNN neighbours divided by the average number of neighbours.
        2. Cut the cells into overlapping pseudoage windows (5-percentile edges,
           each window spanning two consecutive edges) and copy each window into
           its own temporary Loom file.
        3. Run ``Cytograph`` on every slice file.
        4. Count the cells shared between clusters of consecutive slices and
           pick, for each cluster, the predecessor with the largest overlap.
        5. Lay out the clusters of every slice and scatter the individual cells
           around the connecting curves.

        NOTE(review): the function is annotated ``-> None`` but returns the
        ``cell_to_xy`` dict (CellID -> [pseudoage, y]); the code after that
        ``return`` is never executed.
        """
        logging.info("Computing pseudoage")
        ages = np.array([age_to_num(x) for x in ds.ca.Age])
        knn = ds.col_graphs.KNN
        # Average number of stored entries per row of the graph
        k = knn.nnz / knn.shape[0]
        ds.ca.PseudoAge = (knn.astype("bool") @ ages) / k

        logging.info("Slicing pseudoage")
        slice_names: List[str] = []
        with TemporaryDirectory() as tempfolder:
            # 21 edges at the 0th, 5th, ..., 100th percentile
            slices = np.percentile(ds.ca.PseudoAge, np.arange(0, 101, 5))
            logging.info("Collecting cells")
            for (ix, _, view) in ds.scan(axis=1):
                # Window i covers [slices[i], slices[i + 2]), so neighbouring windows overlap
                for i in range(len(slices) - 2):
                    s1 = slices[i]
                    s2 = slices[i + 2]
                    slice_name = f"Age{s1:05.2f}to{s2:05.2f}".replace(
                        ".", "") + ".loom"
                    # NOTE(review): the name is recorded before checking for cells, so a window
                    # that never receives any cell would be listed without a file on disk
                    if slice_name not in slice_names:
                        slice_names.append(slice_name)
                    cells = ((view.ca.PseudoAge >= s1) &
                             (view.ca.PseudoAge < s2))
                    if cells.sum() == 0:
                        continue
                    fname = os.path.join(tempfolder, slice_name)
                    # Create the slice file on first use, append to it afterwards
                    if not os.path.exists(fname):
                        with loompy.new(fname) as dsout:
                            dsout.add_columns(view.layers[:, cells],
                                              col_attrs=view.ca[cells],
                                              row_attrs=view.ra)
                    else:
                        with loompy.connect(fname) as dsout:
                            dsout.add_columns(view.layers[:, cells],
                                              col_attrs=view.ca[cells],
                                              row_attrs=view.ra)

            for slice_name in slice_names:
                fname = os.path.join(tempfolder, slice_name)
                logging.info("Cytograph on " + slice_name)
                # NOTE(review): this rebinds the `ds` parameter; from here on `ds` refers to
                # the last slice file, not to the dataset passed in
                with loompy.connect(fname) as ds:
                    Cytograph(config=load_config()).fit(ds)

            # Use dynamic programming to find the deepest tree (forest), as given by total number of cells along each branch
            logging.info("Computing pseudolineage")
            clusters = "Clusters"
            min_pct = 0.1

            # List of matrices giving the bipartite graph between each pair of layers, weighted by number of shared cells
            overlaps = []
            n_nodes = []  # List of number of nodes (clusters) in each layer
            n_cells = [
            ]  # List of arrays giving the number of cells in each cluster
            n_layers = len(slice_names)

            # Compute the bipartite graphs between layers
            for t in range(n_layers):
                # Link clusters from layer t to clusters from layer t + 1
                # NOTE(review): slice_names already end in ".loom", so this logs ".loom.loom"
                logging.info(f"{slice_names[t]}.loom")
                with loompy.connect(os.path.join(tempfolder,
                                                 slice_names[t])) as ds1:
                    n_nodes.append(ds1.ca[clusters].max() + 1)
                    n_cells.append(np.zeros(n_nodes[t]))
                    for c in range(n_nodes[t]):
                        n_cells[t][c] = (ds1.ca[clusters] == c).sum()
                    # The last layer has no successor to link to
                    if t >= n_layers - 1:
                        break
                    with loompy.connect(
                            os.path.join(tempfolder,
                                         slice_names[t + 1])) as ds2:
                        # NOTE(review): sized by the number of distinct labels but indexed by
                        # label value - assumes labels are exactly 0..n-1
                        overlap = np.zeros(
                            (np.unique(ds1.ca[clusters]).shape[0],
                             np.unique(ds2.ca[clusters]).shape[0]),
                            dtype="int")
                        for i in np.unique(ds1.ca[clusters]):
                            cells1 = ds1.ca.CellID[ds1.ca[clusters] == i]
                            for j in np.unique(ds2.ca[clusters]):
                                cells2 = ds2.ca.CellID[ds2.ca[clusters] == j]
                                overlap[i, j] = np.intersect1d(cells1,
                                                               cells2).shape[0]
                        overlaps.append(overlap)

            # List of arrays keeping track of the depth of the deepest tree starting at each node in the layer
            # Depth defined as sum of the number of shared cells along the branch
            # NOTE(review): depths is never written to below, so it stays all zeros and the
            # choice of predecessor depends on the pairwise overlap only
            depths = [np.zeros(n, dtype="int") for n in n_nodes]
            edges = [
                np.zeros(n, dtype="int") for n in n_nodes[1:]
            ]  # List of arrays giving the predecessor of each cluster (or -1 if no predecessor)
            for t in range(0, n_layers - 1):
                for i in range(n_nodes[t + 1]):
                    # Now find the widest deepest branch from any node j in layer t to node i in layer t + 1
                    # Widest, deepest meaning: greatest sum of depth up to node j in layer t plus number of shared cells
                    # But disallowing any branch with less than min_pct % shared cells
                    best_j = -1
                    best_depth = 0
                    for j in range(n_nodes[t]):
                        # Shared cells as a percentage of the combined size of both clusters
                        pct_overlapping = 100 * overlaps[t][j, i] / (
                            n_cells[t][j] + n_cells[t + 1][i])
                        if pct_overlapping > min_pct:
                            depth = depths[t][j] + overlaps[t][j, i]
                            if depth > best_depth:
                                best_depth = depth
                                best_j = j
                    edges[t][i] = best_j

            # Now we have
            #
            # edges:    List of arrays giving the index of the predecessor of each cluster (or -1 if no predecessor exists)
            # overlaps: List of matrices giving the number of cells shared between clusters in layer t and t + 1
            # n_nodes:  List of number of nodes (clusters) in each layer
            # n_cells:  List of arrays of number of cells in each node (cluster)

            # Now position the nodes of each layer such that no edges cross
            ypositions = [np.arange(n_nodes[0])]
            for t in range(len(edges)):
                # Each node inherits the position of its predecessor; nodes without one keep -1
                pos = np.full(n_nodes[t + 1], -1)
                for i in range(pos.shape[0]):
                    prev = edges[t][i]
                    if (prev) >= 0:
                        pos[i] = ypositions[t][prev]
                # Replace the inherited positions by their rank within the layer
                ordering = np.argsort(pos)
                mapping = dict(zip(ordering, range(len(ordering))))
                ypositions.append(
                    np.array([mapping[i] for i in range(len(ordering))]))
            # Make the positions proportional to the number of cells (cumulative)
            max_pos = 0
            for i, pos in enumerate(ypositions):
                with loompy.connect(os.path.join(tempfolder,
                                                 slice_names[i])) as ds0:
                    n_clusters = ds0.ca[clusters].max() + 1
                    ncells = np.array([(ds0.ca[clusters] == i).sum()
                                       for i in range(n_clusters)])
                    total = 0
                    # NOTE(review): pos is an integer array, so new_pos is too and the
                    # half-cluster offset is truncated on assignment - confirm this is acceptable
                    new_pos = np.zeros_like(pos)
                    for j in range(len(pos)):
                        # Index array of the node(s) holding rank j; total becomes an array from here on
                        cluster = np.where(pos == j)[0]
                        new_pos[cluster] = total + ncells[cluster] / 2
                        total += ncells[cluster]
                ypositions[i] = new_pos / 1000
                max_pos = max(max_pos, max(ypositions[i]))

            # Centre every layer vertically with respect to the tallest one
            for i, pos in enumerate(ypositions):
                ypositions[i] += (max_pos - np.max(pos)) / 2

            # Then position the layers properly in time
            xpositions = []
            for i in range(n_layers):
                with loompy.connect(os.path.join(tempfolder,
                                                 slice_names[i])) as ds0:
                    xpositions.append(np.mean(ds0.ca.PseudoAge))

            # Now project each individual cell to the pseudolineage
            logging.info("Projecting cells to pseudolineage")
            cell_to_xy = {}
            for t in range(len(n_nodes) - 1):
                with loompy.connect(os.path.join(tempfolder,
                                                 slice_names[t])) as ds0:
                    with loompy.connect(
                            os.path.join(tempfolder,
                                         slice_names[t + 1])) as ds1:
                        for i in range(n_nodes[t + 1]):
                            if edges[t][i] != -1:
                                y1 = ypositions[t][edges[t][i]]
                                y2 = ypositions[t + 1][i]
                                offset = (xpositions[t + 1] -
                                          xpositions[t]) / 4
                                overlapping_cells = (ds1.ca[clusters] == i) & (
                                    ds1.ca.PseudoAge < slices[t + 2])
                                # Spline from (slices[t + 1], y1) to (slices[t + 2], y2), with one
                                # extra control point on either side
                                crs = np.array(
                                    CatmullRomSpline(
                                        n_points=100).fit_transform(
                                            np.array(
                                                [[slices[t + 1] - offset, y1],
                                                 [slices[t + 1], y1],
                                                 [slices[t + 2], y2],
                                                 [slices[t + 2] + offset,
                                                  y2]])))
                                # Branch width goes linearly from the predecessor's size to this cluster's size
                                widths = np.linspace(n_cells[t][edges[t][i]],
                                                     n_cells[t + 1][i],
                                                     num=100) / 1500
                                f = interp1d(crs[:, 0],
                                             crs[:, 1],
                                             fill_value="extrapolate")
                                fw = interp1d(crs[:, 0],
                                              widths,
                                              fill_value="extrapolate")
                                # Curve height at each cell's pseudoage plus noise scaled by the local width
                                y = f(
                                    ds1.ca.PseudoAge[overlapping_cells]
                                ) + np.random.normal(
                                    scale=fw(
                                        ds1.ca.PseudoAge[overlapping_cells]) /
                                    6,
                                    size=overlapping_cells.sum())
                                # The inner loop reuses the name i; the enclosing loop reassigns it on its next iteration
                                for i, ix in enumerate(
                                        np.where(overlapping_cells)[0]):
                                    cell_to_xy[ds1.ca.CellID[ix]] = [
                                        ds1.ca.PseudoAge[ix], y[i]
                                    ]
                        # Draw the leftmost pseudoage slice
                        if t == 0:
                            for i in range(n_nodes[0]):
                                y1 = ypositions[0][i]
                                y2 = ypositions[0][i]
                                widths = np.linspace(n_cells[t][i],
                                                     n_cells[t][i],
                                                     num=100) / 1500
                                overlapping_cells = (ds0.ca[clusters] == i) & (
                                    ds0.ca.PseudoAge < slices[1])
                                y = y1 + np.random.normal(
                                    scale=widths[0] / 6,
                                    size=overlapping_cells.sum())
                                for i, ix in enumerate(
                                        np.where(overlapping_cells)[0]):
                                    # NOTE(review): ix indexes ds0 but the CellID is read from ds1 -
                                    # this looks like it should be ds0.ca.CellID; confirm
                                    cell_to_xy[ds1.ca.CellID[ix]] = [
                                        ds0.ca.PseudoAge[ix], y[i]
                                    ]
                        # Draw the rightmost pseudoage slice
                        if t == len(n_nodes) - 2:
                            for i in range(n_nodes[-1]):
                                # NOTE(review): edges[t][i] may be -1 here, which indexes the last
                                # element; y1 is not used afterwards but widths is affected
                                y1 = ypositions[t][edges[t][i]]
                                y2 = ypositions[t + 1][i]
                                widths = np.linspace(n_cells[t][edges[t][i]],
                                                     n_cells[t + 1][i],
                                                     num=100) / 1500
                                overlapping_cells = (ds1.ca[clusters] == i) & (
                                    ds1.ca.PseudoAge > slices[-2])
                                y = y2 + np.random.normal(
                                    scale=widths[-1] / 6,
                                    size=overlapping_cells.sum())
                                for i, ix in enumerate(
                                        np.where(overlapping_cells)[0]):
                                    cell_to_xy[ds1.ca.CellID[ix]] = [
                                        ds1.ca.PseudoAge[ix], y[i]
                                    ]

            logging.info(
                "Saving pseudolineage projection back in original file")
            # NOTE(review): `ds` was rebound by the Cytograph loop above, so this no longer
            # refers to the original file
            logging.info(ds.ca)
            # NOTE(review): everything after this return is unreachable, so the
            # PseudoLineage attribute is never written
            return cell_to_xy
            xy = np.zeros((ds.shape[1], 2))
            for i, cellid in enumerate(cell_to_xy.keys()):
                j = np.where(ds.ca.CellID == cellid)[0]
                xy[j] = cell_to_xy[cellid]
            ds.ca.PseudoLineage = xy
示例#11
0
def aggregate_loom(ds: loompy.LoomConnection,
                   out_file: str,
                   select: np.ndarray,
                   group_by: str,
                   aggr_by: str,
                   aggr_ca_by: Dict[str, str],
                   return_matrix: bool = False) -> np.ndarray:
    """
    Aggregate a Loom file by applying aggregation functions to the main matrix as well as to the column attributes

    Args:
        ds              The Loom file
        out_file        The name of the output Loom file (will be appended to if it exists)
        select          Deprecated; must be None
        group_by        The column attribute to group by (cast to int)
        aggr_by         The aggregation function for the main matrix
        aggr_ca_by      The aggregation functions for the column attributes (or None to skip)
        return_matrix   If True, return the aggregated matrix instead of writing out_file

    Returns:
        The aggregated matrix, with one row per row of ds and one column per group,
        when return_matrix is True; otherwise None.

    Raises:
        ValueError      If select is not None

    Remarks:
        aggr_by gives the aggregation function for the main matrix
        aggr_ca_by is a dictionary with column attributes as keys and aggregation functions as values

        Aggregation functions can be any valid aggregation function from here: https://github.com/ml31415/numpy-groupies

        For column attributes, only the following are applied (anything else is skipped):
            "tally" to count the number of occurrences of each value of a categorical attribute
            "mode" to take the most common value (the smallest one on ties), stored as a string
            "mean" to take the mean
            "first" to take the first value
    """
    if select is not None:
        raise ValueError("The 'select' argument is deprecated")

    ca = {}  # type: Dict[str, np.ndarray]
    labels = (ds.ca[group_by]).astype('int')
    # Remap the labels to 0..n_groups-1 (sorted, no holes), as the aggregation expects group indices
    unique_labels, zero_strt_sort_noholes_lbls = np.unique(labels, return_inverse=True)
    n_groups = len(unique_labels)

    def mode(x):
        # np.unique sorts the values, so argmax picks the smallest of the most common ones.
        # This avoids scipy.stats.mode, whose result shape changed between SciPy releases
        # and which no longer accepts non-numeric input.
        values, counts = np.unique(x, return_counts=True)
        return values[np.argmax(counts)]

    if aggr_ca_by is not None:
        for key in ds.col_attrs.keys():
            if key not in aggr_ca_by:
                continue
            func = aggr_ca_by[key]
            attr = ds.col_attrs[key]
            if func == "tally":
                for val in set(attr):
                    # str() keeps string values unchanged and lets non-string categories be tallied too
                    ca[key + "_" + str(val)] = npg.aggregate(
                        zero_strt_sort_noholes_lbls,
                        (attr == val).astype('int'),
                        func="sum",
                        fill_value=0)
            elif func == "mode":
                ca[key] = npg.aggregate(zero_strt_sort_noholes_lbls,
                                        attr,
                                        func=mode,
                                        fill_value=0).astype('str')
            elif func == "mean":
                ca[key] = npg.aggregate(zero_strt_sort_noholes_lbls,
                                        attr,
                                        func=func,
                                        fill_value=0)
            elif func == "first":
                ca[key] = npg.aggregate(zero_strt_sort_noholes_lbls,
                                        attr,
                                        func=func,
                                        fill_value=attr[0])

    m = np.empty((ds.shape[0], n_groups))
    for (_, selection, view) in ds.scan(axis=0):
        # Aggregate the columns of this batch of rows into one column per group
        m[selection, :] = npg.aggregate(zero_strt_sort_noholes_lbls,
                                        view[:, :],
                                        func=aggr_by,
                                        axis=1,
                                        fill_value=0)

    if return_matrix:
        return m

    loompy.create_append(out_file, m, ds.ra, ca, fill_values="auto")