示例#1
0
    def run(self) -> None:
        """Copy the cells flagged ``_Valid == 1`` into the output file and fit Cytograph2 on it.

        The output is written through the target's temporary path; the model
        parameters are taken from attributes of this task.
        """
        logging = cg.logging(self)
        with self.output().temporary_path() as out_file:
            with loompy.connect(self.input().fn, mode="r") as ds:
                logging.info("Collecting valid cells")
                # Column indices of the cells whose "_Valid" attribute is 1
                valid_cells = np.where(ds.col_attrs["_Valid"] == 1)[0]
                for (_, _, view) in ds.scan(items=valid_cells,
                                            axis=1,
                                            key="Accession"):
                    loompy.create_append(out_file, view.layers, view.ra,
                                         view.ca)

            with loompy.connect(out_file) as ds:
                logging.info(f"Found {ds.shape[1]} valid cells")
                logging.info("Learning the manifold")
                settings = dict(
                    accel=self.accel,
                    log=self.log,
                    normalize=self.normalize,
                    a=self.a,
                    b=self.b,
                    c=self.c,
                    d=self.d,
                    k=self.k,
                    k_smoothing=self.k_smoothing,
                    n_factors=self.n_factors,
                    max_iter=200,
                )
                model = cg.Cytograph2(**settings)
                model.fit(ds)
示例#2
0
    def run(self) -> None:
        """Collect the valid cells, learn the manifold and cluster the cells on it.

        Writes the KNN/MKNN column graphs, the two embedding coordinates
        (``_X``, ``_Y``) and the ``Clusters``/``Outliers`` column attributes
        to the output file.
        """
        logging = cg.logging(self)
        with self.output().temporary_path() as out_file:
            with loompy.connect(self.input().fn) as ds:
                # Column indices of the cells whose "_Valid" attribute is 1
                valid_cells = np.where(ds.col_attrs["_Valid"] == 1)[0]
                for (_, _, view) in ds.scan(items=valid_cells,
                                            axis=1,
                                            key="Accession"):
                    loompy.create_append(out_file, view.layers, view.ra,
                                         view.ca)

            with loompy.connect(out_file) as ds:
                logging.info("Learning the manifold")
                learner = cg.ManifoldLearning2(
                    n_genes=self.n_genes,
                    gtsne=self.gtsne,
                    alpha=self.alpha,
                    filter_cellcycle=self.filter_cellcycle,
                    layer=self.layer)
                (knn, mknn, embedding) = learner.fit(ds)
                ds.col_graphs.KNN = knn
                ds.col_graphs.MKNN = mknn
                ds.ca._X = embedding[:, 0]
                ds.ca._Y = embedding[:, 1]

                logging.info("Clustering on the manifold L1")
                labels = cg.PolishedLouvain().fit_predict(ds)
                # Labels are shifted by one for storage; label -1 is flagged as an outlier
                ds.ca.Clusters = labels + 1
                is_outlier = labels == -1
                ds.ca.Outliers = is_outlier.astype('int')
                n_found = labels.max() + 1
                logging.info(f"Found {n_found} clusters")
示例#3
0
    def run(self) -> None:
        """Extract the cells of one taxon into a new file and relearn its manifold.

        Cells whose ``TaxonomyRank{self.rank}`` column attribute equals
        ``self.taxon`` are copied to the output file. The clusters are then
        renumbered, the ``_Valid`` gene flags recomputed, and the KNN/MKNN
        graphs and embedding coordinates stored.

        Raises:
            ValueError: if no cell belongs to the requested taxon.
        """
        logging = cg.logging(self)
        with self.output().temporary_path() as out_file:
            with loompy.connect(self.input().fn) as ds:
                cells = np.where(
                    ds.ca[f"TaxonomyRank{self.rank}"] == self.taxon)[0]
                # `cells` holds column indices, so test how many there are.
                # Summing them would be zero both for an empty selection and
                # for a selection consisting only of the cell at index 0.
                if cells.size == 0:
                    raise ValueError(f"No cells found in taxon {self.taxon}!")
                for (_, _, view) in ds.scan(items=cells,
                                            axis=1,
                                            key="Accession"):
                    loompy.create_append(out_file, view.layers, view.ra,
                                         view.ca)
            logging.info("Renumbering the clusters")
            with loompy.connect(out_file) as dsout:
                # Renumber the clusters
                dsout.ca.Clusters = LabelEncoder().fit_transform(
                    dsout.ca.Clusters)

                logging.info("Recomputing the list of valid genes")
                # Per-gene count of nonzero entries across the selected cells
                nnz = dsout.map([np.count_nonzero], axis=0)[0]
                # Keep genes detected in more than 10 cells but fewer than 60% of cells
                valid_genes = np.logical_and(nnz > 10,
                                             nnz < dsout.shape[1] * 0.6)
                dsout.ra._Valid = valid_genes.astype('int')

                logging.info("Learning the manifold")
                ml = cg.ManifoldLearning2(gtsne=True, alpha=1)
                (knn, mknn, tsne) = ml.fit(dsout)
                dsout.col_graphs.KNN = knn
                dsout.col_graphs.MKNN = mknn
                dsout.ca._X = tsne[:, 0]
                dsout.ca._Y = tsne[:, 1]
示例#4
0
 def run(self) -> None:
     """Copy the input file and set ``Clusters`` to the label-encoded taxonomy rank.

     The ``TaxonomyRank{self.rank}`` column attribute of the copy is passed
     through ``LabelEncoder.fit_transform`` and stored as ``Clusters``.
     """
     logging = cg.logging(self)  # result is not used below; the call is kept as in the original
     with self.output().temporary_path() as out_file:
         copyfile(self.input().fn, out_file)
         with loompy.connect(out_file) as ds:
             taxon_labels = ds.ca[f"TaxonomyRank{self.rank}"]
             ds.ca.Clusters = LabelEncoder().fit_transform(taxon_labels)
示例#5
0
    def run(self) -> None:
        """Export tab files, manifold plots and a marker heatmap for one taxonomy rank.

        ``self.input()[0]`` is the aggregated file and ``self.input()[1]`` the
        per-cell file. All outputs are written to the temporary output
        directory with the prefix ``L6_R{self.rank}_``.
        """
        logging = cg.logging(self, True)
        with self.output().temporary_path() as out_dir:
            logging.info("Exporting cluster data")
            if not os.path.exists(out_dir):
                os.mkdir(out_dir)

            def out_path(suffix: str) -> str:
                # Full path of one output file inside the temporary directory
                return os.path.join(out_dir, f"L6_R{self.rank}_{suffix}")

            with loompy.connect(self.input()[0].fn) as dsagg:
                # Main matrix first, then one tab file per named layer
                dsagg.export(out_path("expression.tab"))
                for layer_name in ("enrichment", "enrichment_q", "trinaries"):
                    dsagg.export(out_path(f"{layer_name}.tab"),
                                 layer=layer_name)

                logging.info("Plotting manifold graph with auto-annotation")
                with loompy.connect(self.input()[1].fn) as ds:
                    aa_png = out_path("manifold.aa.png")
                    cg.plot_graph(ds, aa_png, list(dsagg.ca.AutoAnnotation))

                    logging.info(
                        "Plotting manifold graph with auto-auto-annotation")
                    aaa_png = out_path("manifold.aaa.png")
                    cg.plot_graph(ds, aaa_png, list(dsagg.ca.MarkerGenes))

                    logging.info("Plotting manifold graph with taxon names")
                    names_png = out_path("manifold.names.png")
                    cg.plot_graph(ds, names_png,
                                  list(dsagg.ca[f"TaxonomyRank{self.rank}"]))

                    logging.info("Plotting marker heatmap")
                    cg.plot_markerheatmap(
                        ds,
                        dsagg,
                        n_markers_per_cluster=self.n_markers,
                        out_file=out_path("heatmap.pdf"))
示例#6
0
    def run(self) -> None:
        """Aggregate the input file by cluster and annotate the aggregated file.

        After aggregation the cluster scores are logged, auto-annotation is
        computed and saved, and the auto-auto-annotation results are stored as
        the column attributes ``MarkerGenes``, ``MarkerSelectivity``,
        ``MarkerSpecificity`` and ``MarkerRobustness`` of the aggregated file.
        """
        logging = cg.logging(self)
        with self.output().temporary_path() as out_file:
            logging.info("Aggregating loom file")
            with loompy.connect(self.input().fn) as ds:
                cg.Aggregator(self.n_markers).aggregate(ds, out_file)
                # Read the gene names while the input file is still open; they
                # are needed below, after this connection has been closed.
                gene_names = ds.ra.Gene
            with loompy.connect(out_file) as dsagg:
                for ix, score in enumerate(dsagg.col_attrs["ClusterScore"]):
                    logging.info(f"Cluster {ix} score {score:.1f}")

                logging.info("Computing auto-annotation")
                aa = cg.AutoAnnotator(root=am.paths().autoannotation)
                aa.annotate_loom(dsagg)
                aa.save_in_loom(dsagg)

                logging.info("Computing auto-auto-annotation")
                n_clusters = dsagg.shape[1]
                (selected, selectivity, specificity,
                 robustness) = cg.AutoAutoAnnotator(
                     n_genes=self.n_auto_genes).fit(dsagg)
                # One space-separated string of selected gene names per cluster
                dsagg.set_attr("MarkerGenes",
                               np.array([
                                   " ".join(gene_names[selected[:, ix]])
                                   for ix in range(n_clusters)
                               ]),
                               axis=1)
                # NOTE: this changes numpy's print options process-wide; it
                # controls how the score vectors below are rendered by str().
                np.set_printoptions(precision=1, suppress=True)
                for attr_name, scores in (("MarkerSelectivity", selectivity),
                                          ("MarkerSpecificity", specificity),
                                          ("MarkerRobustness", robustness)):
                    dsagg.set_attr(attr_name,
                                   np.array([
                                       str(scores[:, ix])
                                       for ix in range(n_clusters)
                                   ]),
                                   axis=1)
示例#7
0
    def run(self) -> None:
        """Pool the cells of ``self.major_class`` from all inputs and fit Cytograph2.

        For the "Oligos" class the cells are additionally filtered (gene count,
        marker-based doublet exclusion, Pdgfra/Meg3 rule) and randomly capped
        at 5000 cells per input file.
        """
        logging = cg.logging(self)
        # A cell expressing any of these genes is treated as a doublet
        doublet_markers = [
            'Stmn2', 'Aqp4', 'Gja1', 'C1qc', 'Aif1', 'Cldn5', 'Fn1', 'Hbb-bt',
            'Hbb-bh1', 'Hbb-bh2', 'Hbb-y', 'Hbb-bs', 'Hba-a1', 'Hba-a2',
            'Hba-x'
        ]
        with self.output().temporary_path() as out_file:
            for clustered in self.input():
                with loompy.connect(clustered.fn, "r") as ds:
                    logging.info("Split/pool from " + clustered.fn)
                    cells = np.where(ds.ca.Class == self.major_class)[0]
                    if self.major_class == "Oligos":
                        # Special selection of cells for the oligo class, to balance between tissues
                        detected = ds.map((np.count_nonzero, ), axis=1)[0]
                        enough_genes = detected > 1000
                        pdgfra_positive = ds[ds.ra.Gene == "Pdgfra", :][0] > 0
                        meg3_positive = ds[ds.ra.Gene == "Meg3", :][0] > 0
                        doublet = np.zeros(ds.shape[1], dtype='bool')
                        for marker in doublet_markers:
                            marker_positive = ds[ds.ra.Gene == marker, :][0] > 0
                            doublet = np.logical_or(doublet, marker_positive)
                        keep = enough_genes & (~doublet) & (pdgfra_positive
                                                            | ~meg3_positive)
                        cells = np.intersect1d(cells, np.where(keep)[0])
                        if cells.shape[0] > 5000:
                            # Random subsample without replacement
                            cells = np.random.choice(cells,
                                                     size=5000,
                                                     replace=False)

                    # Append the selected cells (filtered or not) to the pooled output
                    for (_, _, view) in ds.scan(items=cells,
                                                axis=1,
                                                key="Accession"):
                        loompy.create_append(out_file, view.layers, view.ra,
                                             view.ca)

            with loompy.connect(out_file) as ds:
                logging.info(f"Found {ds.shape[1]} valid cells")
                logging.info("Learning the manifold")
                cg.Cytograph2(max_iter=100).fit(ds)
示例#8
0
	def run(self) -> None:
		"""Annotate the aggregated L1 file and export its tab files and plots.

		``self.input()[0]`` is the aggregated file and ``self.input()[1]`` the
		per-cell file. All outputs go to the temporary output directory with
		the prefix ``L1_<tissue>_``. The class plot is best-effort: a failure
		there is logged and the remaining plots are still produced.
		"""
		logging = cg.logging(self, True)
		logging.info("Exporting cluster data")
		with self.output().temporary_path() as out_dir:
			if not os.path.exists(out_dir):
				os.mkdir(out_dir)
			# Common path prefix of every exported file
			prefix = os.path.join(out_dir, "L1_" + self.tissue)
			with loompy.connect(self.input()[0].fn) as dsagg:
				logging.info("Computing auto-annotation")
				aa = cg.AutoAnnotator(root=am.paths().autoannotation)
				aa.annotate_loom(dsagg)
				aa.save_in_loom(dsagg)

				dsagg.export(prefix + "_expression.tab")
				dsagg.export(prefix + "_enrichment.tab", layer="enrichment")
				dsagg.export(prefix + "_enrichment_q.tab", layer="enrichment_q")
				dsagg.export(prefix + "_trinaries.tab", layer="trinaries")

				# Open the per-cell file in a context manager so it is closed
				# even if one of the plotting calls raises.
				with loompy.connect(self.input()[1].fn) as ds:
					logging.info("Plotting MKNN graph")
					cg.plot_knn(ds, prefix + "_manifold.mknn.png")

					# logging.info("Plotting Louvain resolution")
					# cg.plot_louvain(ds, os.path.join(out_dir, "L1_" + self.tissue + "_manifold.louvain.png"))

					logging.info("Plotting manifold graph with classes")
					try:
						cg.plot_classes(ds, prefix + "_manifold.classes.png")
					except Exception as e:
						# Keep going without this plot, but leave a trace of why it is missing
						logging.info(f"Skipping class plot: {e!r}")

					logging.info("Plotting manifold graph with auto-annotation")
					tags = list(dsagg.col_attrs["AutoAnnotation"])
					cg.plot_graph(ds, prefix + "_manifold.aa.png", tags)

					logging.info("Plotting manifold graph with auto-auto-annotation")
					tags = list(dsagg.col_attrs["MarkerGenes"])
					cg.plot_graph(ds, prefix + "_manifold.aaa.png", tags)

					logging.info("Plotting marker heatmap")
					cg.plot_markerheatmap(ds, dsagg, n_markers_per_cluster=self.n_markers, out_file=prefix + "_heatmap.pdf")
示例#9
0
    def run(self) -> None:
        """Export tab files and plots for one L2 major class / tissue combination.

        ``self.input()[0]`` is the aggregated file and ``self.input()[1]`` the
        per-cell file. All outputs go to the temporary output directory with
        the prefix ``L2_<major_class>_<tissue>_``.
        """
        logging = cg.logging(self, True)
        with self.output().temporary_path() as out_dir:
            logging.info("Exporting cluster data")
            if not os.path.exists(out_dir):
                os.mkdir(out_dir)

            stem = "L2_" + self.major_class + "_" + self.tissue

            def out_path(suffix):
                # Path of one output file inside the temporary directory
                return os.path.join(out_dir, stem + suffix)

            with loompy.connect(self.input()[0].fn) as dsagg:
                logging.info("Exporting tab files")
                dsagg.export(out_path("_expression.tab"))
                for layer_name in ("enrichment", "enrichment_q", "trinaries"):
                    dsagg.export(out_path("_" + layer_name + ".tab"),
                                 layer=layer_name)

                logging.info("Plotting manifold graph with auto-annotation")
                tags = list(dsagg.col_attrs["AutoAnnotation"])
                with loompy.connect(self.input()[1].fn) as ds:
                    cg.plot_graph(ds, out_path("_manifold.aa.png"), tags)

                    logging.info(
                        "Plotting manifold graph with auto-auto-annotation")
                    # Marker gene strings reordered by the argsort of the cluster ids
                    cluster_order = np.argsort(dsagg.col_attrs["Clusters"])
                    tags = list(dsagg.col_attrs["MarkerGenes"][cluster_order])
                    cg.plot_graph(ds, out_path("_manifold.aaa.png"), tags)

                    logging.info("Plotting manifold graph with classes")
                    cg.plot_classes(ds, out_path("_manifold.classes.png"))

                    logging.info("Plotting marker heatmap")
                    cg.plot_markerheatmap(ds,
                                          dsagg,
                                          n_markers_per_cluster=10,
                                          out_file=out_path("_heatmap.pdf"))

                    logging.info("Plotting latent factors")
                    cg.plot_factors(ds, base_name=out_path("_factors"))
示例#10
0
    def run(self) -> None:
        """Pool the curated L4 buckets into one file and attach curated annotations.

        Steps:
          1. Read the curated cell-type summary spreadsheet.
          2. From every input file, append the cells whose cluster is listed
             for that bucket in the spreadsheet.
          3. Store pooled cluster ids, original cluster ids and source file.
          4. Map every (bucket, original cluster) pair to its curated cluster
             name, leaf order and descriptive attributes.
          5. Attach the taxonomy ranks read from the taxonomy spreadsheet.
          6. Recompute the valid genes and learn the manifold.

        Cells that match no spreadsheet row keep cluster ``-1`` and empty
        annotation strings.
        """
        logging = cg.logging(self)
        samples = [x.fn for x in self.input()]
        max_cluster_id = 0
        cluster_ids: List[int] = []
        original_ids: List[int] = []
        samples_per_cell: List[str] = []
        # Bucket name (as used in the spreadsheet) of every appended cell
        buckets_per_cell: List[str] = []

        celltypes_summary_file = os.path.join(
            am.paths().build, "curated_L4",
            "celltypes_summary_leaforder16-Dec-2017.xlsx")
        celltypes_summary = pd.read_excel(celltypes_summary_file)
        # Column name -> array of that column's values
        celltypes_dict = {
            celltypes_summary.columns.values[i]: celltypes_summary.values[:, i]
            for i in range(celltypes_summary.shape[1])
        }

        with self.output().temporary_path() as out_file:
            for sample in samples:
                with loompy.connect(sample) as ds:
                    logging.info(f"Adding {ds.shape[1]} cells from {sample}")
                    # Bucket name: file name without its first 3 and last 5 characters
                    target = os.path.basename(sample)[3:-5]
                    not_excluded = celltypes_dict["OriginalCluster"][
                        celltypes_dict["Bucket"] == target]
                    cells = np.where(np.isin(ds.ca.Clusters, not_excluded))[0]
                    for (_, selection, view) in ds.scan(items=cells,
                                                        axis=1,
                                                        key="Accession"):
                        n_selected = selection.shape[0]
                        cluster_ids += list(view.ca.Clusters + max_cluster_id)
                        original_ids += list(view.ca.Clusters)
                        samples_per_cell += [sample] * n_selected
                        buckets_per_cell += [target] * n_selected
                        loompy.create_append(out_file,
                                             view.layers,
                                             view.ra,
                                             view.ca,
                                             fill_values="auto")
                    # Guard against max() of an empty list when no cell has
                    # been selected so far.
                    if cluster_ids:
                        max_cluster_id = max(cluster_ids) + 1
                    logging.info(f"Found {max_cluster_id} clusters total")
            with loompy.connect(out_file) as ds:
                ds.ca.Clusters = np.array(cluster_ids)
                ds.ca.OriginalClusters = np.array(original_ids)
                ds.ca.Bucket = np.array(samples_per_cell)

                # Object dtype keeps the comparison below elementwise
                cell_buckets = np.array(buckets_per_cell, dtype=object)

                leaf_order = np.zeros(ds.shape[1], dtype='int') - 1
                le = LabelEncoder()
                le.fit(celltypes_dict["ClusterName"])
                new_clusters = np.zeros(ds.shape[1], dtype='int') - 1
                d = {}
                for attr in [
                        "LeafOrder", "Probable_location",
                        "Developmental_compartment", "Region", "Description",
                        "Location_based_on", "Neurotransmitter", "ClusterName",
                        "Taxonomy_group", "Comment"
                ]:
                    d[attr] = np.array([""] * ds.shape[1], dtype=object)

                for ix in range(len(celltypes_dict["Bucket"])):
                    bucket = celltypes_dict["Bucket"][ix]
                    original_cluster = celltypes_dict["OriginalCluster"][ix]
                    # Match cells by bucket name instead of by a machine-specific
                    # absolute path, so the lookup works wherever the inputs live.
                    cells = np.logical_and(
                        cell_buckets == bucket,
                        ds.ca.OriginalClusters == original_cluster)
                    leaf_order[cells] = celltypes_dict["LeafOrder"][ix]
                    new_clusters[cells] = le.transform(
                        [celltypes_dict["ClusterName"][ix]])
                    for attr in d:
                        d[attr][cells] = celltypes_dict[attr][ix]

                logging.info(f"Found {new_clusters.max() + 1} clusters total")
                ds.ca.Clusters = new_clusters
                ds.ca.LeafOrder = leaf_order
                # NOTE(review): `d` also contains "LeafOrder", so the integer
                # LeafOrder written just above is overwritten here by its
                # string form. Kept as-is; confirm whether this is intended.
                for key, vals in d.items():
                    ds.ca[key] = vals.astype("unicode")

                taxonomy_file = os.path.join(am.paths().build, "curated_L4",
                                             "Taxonomy.xlsx")
                taxonomy_table = pd.read_excel(taxonomy_file)
                # Value in the table's fourth column -> the full table row
                taxonomy = {
                    taxonomy_table.values[i, 3]: taxonomy_table.values[i, :]
                    for i in range(taxonomy_table.shape[0])
                }

                tax1 = np.array([""] * ds.shape[1], dtype=object)
                tax2 = np.array([""] * ds.shape[1], dtype=object)
                tax3 = np.array([""] * ds.shape[1], dtype=object)
                tax4 = np.array([""] * ds.shape[1], dtype=object)
                taxs = np.array([""] * ds.shape[1], dtype=object)

                for i in range(ds.shape[1]):
                    # Cells that matched no curated cluster keep empty taxonomy strings
                    if new_clusters[i] == -1:
                        continue
                    row = taxonomy[d["Taxonomy_group"][i]]
                    tax1[i] = row[0]
                    tax2[i] = row[1]
                    tax3[i] = row[2]
                    tax4[i] = row[3]
                    taxs[i] = row[4]
                ds.ca.TaxonomyRank1 = tax1
                ds.ca.TaxonomyRank2 = tax2
                ds.ca.TaxonomyRank3 = tax3
                ds.ca.TaxonomyRank4 = tax4
                ds.ca.TaxonomySymbol = taxs

                logging.info("Recomputing the list of valid genes")
                # Per-gene count of nonzero entries across all pooled cells
                nnz = ds.map([np.count_nonzero], axis=0)[0]
                valid_genes = np.logical_and(nnz > 10, nnz < ds.shape[1] * 0.6)
                ds.ra._Valid = valid_genes.astype('int')

                logging.info("Learning the manifold")
                ml = cg.ManifoldLearning2(gtsne=True, alpha=1, max_iter=3000)
                (knn, mknn, tsne) = ml.fit(ds)
                ds.col_graphs.KNN = knn
                ds.col_graphs.MKNN = mknn
                ds.ca._X = tsne[:, 0]
                ds.ca._Y = tsne[:, 1]
示例#11
0
    def run(self) -> None:
        """Aggregate the pooled file by cluster and annotate the aggregated file.

        Aggregation uses an explicit per-attribute spec. Auto-annotation is
        then computed and saved, and the auto-auto-annotation results are
        stored as the column attributes ``MarkerGenes``, ``MarkerSelectivity``,
        ``MarkerSpecificity`` and ``MarkerRobustness``.

        Both files are opened with context managers so they are closed even
        when a step raises.
        """
        logging = cg.logging(self)
        # How each column attribute is summarised per cluster
        spec = {
            "Age": "tally",
            "Clusters": "first",
            "Class": "mode",
            "_Total": "mean",
            "Sex": "tally",
            "Tissue": "tally",
            "SampleID": "tally",
            "TissuePool": "first",
            "Outliers": "mean",
            "Bucket": "mode",
            "Region": "first",
            "OriginalClusters": "first",
            "Probable_location": "first",
            "Developmental_compartment": "first",
            "Description": "first",
            "Location_based_on": "first",
            "Neurotransmitter": "first",
            "LeafOrder": "first",
            "Comment": "first",
            "ClusterName": "first",
            "TaxonomyRank1": "first",
            "TaxonomyRank2": "first",
            "TaxonomyRank3": "first",
            "TaxonomyRank4": "first",
            "TaxonomySymbol": "first"
        }
        with self.output().temporary_path() as out_file:
            logging.info("Aggregating loom file")
            with loompy.connect(self.input().fn) as ds:
                cg.Aggregator(f=[0.2, 0.05]).aggregate(ds,
                                                       out_file,
                                                       agg_spec=spec)
                with loompy.connect(out_file) as dsagg:
                    logging.info("Computing auto-annotation")
                    aa = cg.AutoAnnotator(root=am.paths().autoannotation)
                    aa.annotate_loom(dsagg)
                    aa.save_in_loom(dsagg)

                    logging.info("Computing auto-auto-annotation")
                    n_clusters = dsagg.shape[1]
                    (selected, selectivity, specificity,
                     robustness) = cg.AutoAutoAnnotator(
                         n_genes=self.n_auto_genes).fit(dsagg)
                    # One space-separated string of selected gene names per cluster
                    gene_names = ds.ra.Gene
                    dsagg.set_attr("MarkerGenes",
                                   np.array([
                                       " ".join(gene_names[selected[:, ix]])
                                       for ix in range(n_clusters)
                                   ]),
                                   axis=1)
                    # NOTE: this changes numpy's print options process-wide; it
                    # controls how the score vectors below are rendered by str().
                    np.set_printoptions(precision=1, suppress=True)
                    for attr_name, scores in (("MarkerSelectivity",
                                               selectivity),
                                              ("MarkerSpecificity",
                                               specificity),
                                              ("MarkerRobustness",
                                               robustness)):
                        dsagg.set_attr(attr_name,
                                       np.array([
                                           str(scores[:, ix])
                                           for ix in range(n_clusters)
                                       ]),
                                       axis=1)
示例#12
0
    def run(self) -> None:
        """Export L5 taxon enrichment lists, tab files and an all-cells t-SNE plot.

        ``self.input()[0]`` is the aggregated file and ``self.input()[1]`` the
        per-cell file. For each of the two trinarization layers, and for each
        taxon at ranks 1-4, up to 20 genes are written per line: genes whose
        trinary score exceeds 0.999 in every cluster of the taxon, ordered by
        increasing mean score outside the taxon.
        """
        logging = cg.logging(self, True)
        with self.output().temporary_path() as out_dir:
            logging.info("Exporting cluster data")
            if not os.path.exists(out_dir):
                os.mkdir(out_dir)
            with loompy.connect(self.input()[0].fn) as dsagg:
                # (layer name, output file name); same computation for both layers
                enrichment_outputs = (
                    ("trinaries", "L5_All_taxon_enrichment_0.2.txt"),
                    ("trinaries_0.05", "L5_All_taxon_enrichment_0.05.txt"),
                )
                for layer_name, file_name in enrichment_outputs:
                    with open(os.path.join(out_dir, file_name), 'w') as f:
                        logging.info("Computing taxon enrichment")
                        for rank in [1, 2, 3, 4]:
                            rank_labels = dsagg.ca[f"TaxonomyRank{rank}"]
                            for taxon in list(set(rank_labels)):
                                # Genes scoring > 0.999 in every cluster of this taxon
                                gix = np.where(
                                    np.all(dsagg[layer_name]
                                           [:, rank_labels == taxon] > 0.999,
                                           axis=1))[0]
                                # Mean score of those genes in all other clusters
                                non_group_mean = np.mean(
                                    dsagg[layer_name][gix, :]
                                    [:, rank_labels != taxon],
                                    axis=1)
                                genes = dsagg.ra.Gene[gix[np.argsort(
                                    non_group_mean)]][0:20]
                                f.write(
                                    str(rank) + " " + taxon + "\t" +
                                    "\t".join(genes) + "\n")

                dsagg.export(os.path.join(out_dir, "L5_All_expression.tab"))
                dsagg.export(os.path.join(out_dir, "L5_All_enrichment.tab"),
                             layer="enrichment")
                dsagg.export(os.path.join(out_dir, "L5_All_enrichment_q.tab"),
                             layer="enrichment_q")
                dsagg.export(os.path.join(out_dir, "L5_All_trinaries.tab"),
                             layer="trinaries")

            logging.info("Plotting all cells t-SNE")
            # Open the input file by its own path; joining it onto the output
            # directory only works when that path is absolute.
            with loompy.connect(self.input()[1].fn) as ds:
                fig = plt.figure(figsize=(3, 3))
                ax = fig.add_axes([0, 0, 1, 1])
                ax.axis('off')
                colors = cg.colorize(np.arange(52))
                taxon_per_cell = ds.ca.TaxonomyRank3
                for ix, taxon in enumerate(np.unique(taxon_per_cell)):
                    cells = taxon_per_cell == taxon
                    plt.scatter(x=ds.ca._X[cells],
                                y=ds.ca._Y[cells],
                                s=10,
                                c=colors[ix, :],
                                marker='.',
                                label=taxon,
                                alpha=0.3,
                                lw=0)
                # Build the legend once, after all taxa are plotted, so that
                # `lgnd` is always defined when the figure is saved.
                lgnd = ax.legend(fontsize=10,
                                 labelspacing=0.2,
                                 loc="upper left",
                                 bbox_to_anchor=(1, 1),
                                 frameon=False)
                # Newer matplotlib exposes `legend_handles`; older releases
                # only have `legendHandles`.
                handles = getattr(lgnd, "legend_handles", None)
                if handles is None:
                    handles = lgnd.legendHandles
                # Legend markers are drawn larger and fully opaque for readability
                for handle in handles:
                    handle.set_sizes([250])
                    handle.set_alpha(1)
                plt.savefig(os.path.join(out_dir, "L5_All.png"),
                            dpi=600,
                            transparent=True,
                            bbox_extra_artists=(lgnd, ),
                            bbox_inches='tight')
                plt.close()
示例#13
0
    def run(self) -> None:
        """
        Pool the cells that the curated L2 schedules assign to ``self.target``,
        then learn a manifold on the pooled cells and cluster them.

        For every ``(in_file, agg_file)`` input pair, the schedule files in
        ``<build>/curated_L2`` that match the tissue of ``in_file`` are read.
        Each schedule row has five tab-separated columns: cluster, number of
        cells, automatic target, curated target, comment. Cells belonging to
        clusters whose curated target equals ``self.target`` are appended to
        the output loom file, and the enrichment ranking of each such cluster
        is kept for selecting the genes used in manifold learning.

        Raises:
            ValueError: If no cell matched any schedule for ``self.target``.
        """
        logging = cg.logging(self, True)
        # Gene accessions of the first file that contributes cells; the
        # enrichment vectors of all later files are reordered to match it.
        accessions: np.ndarray = None
        with self.output().temporary_path() as out_file:
            logging.info("Gathering cells for " + self.target)
            curated_dir = os.path.join(am.paths().build, "curated_L2")
            # The enrichment ranking (best gene first) for each selected cluster
            enriched_markers: List[np.ndarray] = []
            cells_found = False
            for in_file, agg_file in self.input():
                # The tissue is the third "_"-separated field of the file name
                tissue = os.path.basename(
                    in_file.fn).split("_")[2].split(".")[0]
                # Context managers make sure both files are closed again,
                # also when an exception is raised half-way through
                with loompy.connect(in_file.fn) as ds:
                    with loompy.connect(agg_file.fn) as dsagg:
                        enrichment = dsagg.layer["enrichment"][:, :]
                        labels = ds.col_attrs["Clusters"]
                        ordering: np.ndarray = None
                        logging.info(tissue)

                        # Figure out which cells should be collected
                        cells: List[int] = []
                        for fname in os.listdir(curated_dir):
                            if not fname.startswith("L2"):
                                continue
                            if fname.split("_")[2] != tissue:
                                continue
                            # For the "All" tissue the second field of the
                            # schedule name must match the target
                            if tissue == "All" and fname.split(
                                    "_")[1] != self.target:
                                continue
                            logging.info("Gathering cells from " + in_file.fn)
                            logging.info("Gathering cells based on " + fname)
                            with open(os.path.join(curated_dir, fname)) as f:
                                # rstrip (rather than dropping the last
                                # character) keeps a final line without a
                                # trailing newline intact
                                schedule = [
                                    line.rstrip("\n").split("\t") for line in f
                                ]
                            for (cluster_str, _n_cells, _auto_target,
                                 curated_target, _comment) in schedule:
                                cluster = int(cluster_str)
                                if curated_target != self.target:
                                    continue
                                if accessions is None:
                                    accessions = ds.row_attrs["Accession"]
                                if ordering is None:
                                    # Row indices of this file listed in the
                                    # order of the reference accessions
                                    # NOTE(review): assumes each accession
                                    # occurs exactly once in this file
                                    ordering = np.where(
                                        ds.row_attrs["Accession"][None, :] ==
                                        accessions[:, None])[1]
                                cells += list(np.where(labels == cluster)[0])
                                enriched_markers.append(
                                    np.argsort(
                                        -enrichment[:, cluster][ordering]))

                        if len(cells) > 0:
                            cells_found = True
                            selected_cells = np.sort(np.array(cells))
                            for (ix, selection,
                                 view) in ds.scan(items=selected_cells,
                                                  axis=1,
                                                  key="Accession"):
                                loompy.create_append(out_file, view.layers,
                                                     view.ra, view.ca)

            if not cells_found:
                raise ValueError(
                    f"No cells matched any schedule for {self.target}")

            # Figure out which enriched markers to use: take the best gene of
            # every cluster, then the second best, and so on, until at least
            # n_enriched distinct genes are collected or the rankings run out
            n_ranked = min((len(markers) for markers in enriched_markers),
                           default=0)
            chosen: List[int] = []
            already_chosen = set()
            rank = 0
            while len(chosen) < self.n_enriched and rank < n_ranked:
                for markers in enriched_markers:
                    gene = markers[rank]
                    if gene not in already_chosen:
                        already_chosen.add(gene)
                        chosen.append(gene)
                rank += 1
            genes = np.sort(np.array(chosen))

            logging.info("Learning the manifold")
            with loompy.connect(out_file) as dsout:
                ml = cg.ManifoldLearning2(gtsne=True, alpha=1, genes=genes)
                (knn, mknn, tsne) = ml.fit(dsout)
                dsout.col_graphs.KNN = knn
                dsout.col_graphs.MKNN = mknn
                dsout.ca._X = tsne[:, 0]
                dsout.ca._Y = tsne[:, 1]

                logging.info("Clustering on the manifold")

                # Targets that are clustered at a non-default resolution
                special_res = {
                    "Astrocytes": 0.6,
                    "Sensory_Neurons": 0.35,
                    "Brain_Granule": 0.6
                }
                r = special_res.get(self.target, 1.0)

                pl = cg.PolishedLouvain(resolution=r)
                labels = pl.fit_predict(dsout)
                # A label of -1 is recorded as an outlier; stored cluster
                # numbers are the labels shifted up by one
                dsout.ca.Clusters = labels + 1
                dsout.ca.Outliers = (labels == -1).astype('int')
                logging.info(f"Found {labels.max() + 1} clusters")
示例#14
0
    def run(self) -> None:
        """
        Aggregate the input loom file by cluster, annotate the aggregate and
        assign every cluster to a bucket.

        The aggregate receives auto-annotation, auto-auto-annotation marker
        attributes (``MarkerGenes``, ``MarkerSelectivity``,
        ``MarkerSpecificity``, ``MarkerRobustness``) and a ``Bucket`` column
        attribute. For the tissue ``"All"`` every cluster goes to
        ``self.major_class``. Otherwise the first rule of
        ``pooling_schedule_L3[self.tissue]`` whose tag occurs in the cluster's
        auto-annotation decides the bucket. The ``"*"`` rule, if present, is
        the fallback, and clusters without any match are sent to
        ``"Excluded"``.
        """
        logging = cg.logging(self)
        with self.output().temporary_path() as out_file:
            logging.info("Aggregating loom file")
            with loompy.connect(self.input().fn) as ds:
                cg.Aggregator().aggregate(ds, out_file)
                with loompy.connect(out_file) as dsagg:
                    for ix, score in enumerate(
                            dsagg.col_attrs["ClusterScore"]):
                        logging.info(f"Cluster {ix} score {score:.1f}")

                    logging.info("Computing auto-annotation")
                    aa = cg.AutoAnnotator(root=am.paths().autoannotation)
                    aa.annotate_loom(dsagg)
                    aa.save_in_loom(dsagg)

                    logging.info("Computing auto-auto-annotation")
                    n_clusters = dsagg.shape[1]
                    (selected, selectivity, specificity,
                     robustness) = cg.AutoAutoAnnotator(
                         n_genes=self.n_auto_genes).fit(dsagg)
                    # One space-separated string of gene names per cluster
                    dsagg.set_attr("MarkerGenes",
                                   np.array([
                                       " ".join(ds.ra.Gene[selected[:, ix]])
                                       for ix in range(n_clusters)
                                   ]),
                                   axis=1)
                    # Controls how the statistics below are rendered as text
                    np.set_printoptions(precision=1, suppress=True)
                    for attr_name, stat in (("MarkerSelectivity",
                                             selectivity),
                                            ("MarkerSpecificity",
                                             specificity),
                                            ("MarkerRobustness", robustness)):
                        dsagg.set_attr(attr_name,
                                       np.array([
                                           str(stat[:, ix])
                                           for ix in range(n_clusters)
                                       ]),
                                       axis=1)

                    tissue = self.tissue

                    # Compare by value: identity comparison with a string
                    # literal is not guaranteed to be true for equal strings
                    if self.tissue == "All":
                        dsagg.ca.Bucket = np.array([self.major_class] *
                                                   dsagg.shape[1])
                    else:
                        schedule = pooling_schedule_L3[self.tissue]

                        # Where to send clusters when no rules match
                        default_bucket: str = None
                        for aa_tag, sendto in schedule:
                            if aa_tag == "*":
                                default_bucket = sendto

                        # For each cluster in the tissue
                        bucket_list = []
                        for ix, agg_aa in enumerate(dsagg.ca.AutoAnnotation):
                            cluster_tags = agg_aa.split(",")
                            # Description of the first rule that matched
                            winning_rule: str = None
                            # For each rule in the schedule
                            for aa_tag, sendto in schedule:
                                if aa_tag not in cluster_tags:
                                    continue
                                if winning_rule is not None:
                                    # Only the first matching rule counts
                                    logging.info(
                                        f"{tissue}/{ix}/{agg_aa}: {aa_tag} -> {sendto} (overruled by '{winning_rule}')"
                                    )
                                else:
                                    winning_rule = f"{aa_tag} -> {sendto}"
                                    logging.info(
                                        f"{tissue}/{ix}/{agg_aa}: {aa_tag} -> {sendto}"
                                    )
                                    bucket_list.append(sendto)
                            if winning_rule is None:
                                if default_bucket is None:
                                    logging.info(
                                        f"{tissue}/{ix}/{agg_aa}: No matching rule"
                                    )
                                    bucket_list.append("Excluded")
                                else:
                                    # Report the "*" rule itself rather than
                                    # whichever tag the rule loop ended on
                                    logging.info(
                                        f"{tissue}/{ix}/{agg_aa}: * -> {default_bucket}"
                                    )
                                    bucket_list.append(default_bucket)
                        dsagg.ca.Bucket = np.array(bucket_list)
示例#15
0
    def run(self) -> None:
        """
        Export tables and plots for the L3 target ``self.target``.

        ``self.input()[0]`` is the aggregated (per-cluster) loom file and
        ``self.input()[1]`` the loom file passed to the plotting functions.
        The output directory receives the expression, enrichment,
        enrichment_q and trinaries tables, two annotated manifold plots, a
        marker heatmap and a text file of discordance distances. The
        auto-annotation is recomputed and saved in the aggregate first.
        """
        logging = cg.logging(self)
        with self.output().temporary_path() as out_dir:
            logging.info("Exporting cluster data")
            if not os.path.exists(out_dir):
                os.mkdir(out_dir)
            # Common prefix of every exported file
            prefix = os.path.join(out_dir, "L3_" + self.target)
            # Context managers make sure both loom files are closed again
            with loompy.connect(self.input()[0].fn) as dsagg:
                logging.info("Computing auto-annotation")
                aa = cg.AutoAnnotator(root=am.paths().autoannotation)
                aa.annotate_loom(dsagg)
                aa.save_in_loom(dsagg)

                dsagg.export(prefix + "_expression.tab")
                for layer_name in ("enrichment", "enrichment_q", "trinaries"):
                    dsagg.export(prefix + "_" + layer_name + ".tab",
                                 layer=layer_name)

                logging.info("Plotting manifold graph with auto-annotation")
                # Tags are listed in order of increasing cluster number
                tags = list(dsagg.col_attrs["AutoAnnotation"][np.argsort(
                    dsagg.col_attrs["Clusters"])])
                with loompy.connect(self.input()[1].fn) as ds:
                    cg.plot_graph(ds, prefix + "_manifold.aa.png", tags)

                    logging.info(
                        "Plotting manifold graph with auto-auto-annotation")
                    tags = list(dsagg.col_attrs["MarkerGenes"][np.argsort(
                        dsagg.col_attrs["Clusters"])])
                    cg.plot_graph(ds, prefix + "_manifold.aaa.png", tags)

                    logging.info("Plotting marker heatmap")
                    cg.plot_markerheatmap(ds,
                                          dsagg,
                                          n_markers_per_cluster=self.n_markers,
                                          out_file=prefix + "_heatmap.pdf")

                logging.info("Computing discordance distances")
                pep = 0.05
                n_labels = dsagg.shape[1]

                def discordance_distance(a: np.ndarray,
                                         b: np.ndarray) -> float:
                    """
                    Number of genes that are discordant with given PEP,
                    divided by number of clusters
                    """
                    return np.sum(
                        (1 - a) * b + a * (1 - b) > 1 - pep) / n_labels

                # Only the first n_labels * 10 rows of the trinaries layer
                # are compared; one row of `data` per cluster
                data = dsagg.layer["trinaries"][:n_labels * 10, :].T
                D = squareform(pdist(data, discordance_distance))
                with open(prefix + "_distances.txt", "w") as f:
                    # First superdiagonal: distance between each cluster and
                    # the next one
                    f.write(str(np.diag(D, k=1)))
示例#16
0
    def run(self) -> None:
        """
        Export tables and plots for taxon ``self.taxon`` at rank ``self.rank``.

        ``self.input()[0]`` is the aggregated loom file and ``self.input()[1]``
        the loom file handed to the plotting functions. All files are written
        to the output directory with the prefix ``L6_R<rank>_(<taxon>)_``: four
        tab files, three labelled manifold plots, a marker heatmap and a
        "pretty" scatter plot coloured by ``ClusterName`` (rank 3) or by
        ``TaxonomyRank4`` (any other rank).
        """
        logging = cg.logging(self, True)
        with self.output().temporary_path() as out_dir:
            logging.info("Exporting cluster data")
            if not os.path.exists(out_dir):
                os.mkdir(out_dir)

            def target_path(suffix: str) -> str:
                # Full path of an output file with the shared name prefix
                return os.path.join(
                    out_dir, f"L6_R{self.rank}_({self.taxon})_{suffix}")

            with loompy.connect(self.input()[0].fn) as dsagg:
                # The main matrix first, then one table per named layer
                dsagg.export(target_path("expression.tab"))
                for layer_name in ("enrichment", "enrichment_q", "trinaries"):
                    dsagg.export(target_path(layer_name + ".tab"),
                                 layer=layer_name)

                logging.info("Plotting manifold graph with auto-annotation")
                with loompy.connect(self.input()[1].fn) as ds:
                    aa_tags = list(dsagg.ca.AutoAnnotation)
                    cg.plot_graph(ds, target_path("manifold.aa.png"), aa_tags)

                    logging.info(
                        "Plotting manifold graph with auto-auto-annotation")
                    marker_tags = list(dsagg.ca.MarkerGenes)
                    cg.plot_graph(ds, target_path("manifold.aaa.png"),
                                  marker_tags)

                    logging.info("Plotting manifold graph with cluster names")
                    name_tags = list(dsagg.ca.ClusterName)
                    cg.plot_graph(ds, target_path("manifold.names.png"),
                                  name_tags)

                    logging.info("Plotting marker heatmap")
                    heatmap_file = target_path("heatmap.pdf")
                    cg.plot_markerheatmap(ds,
                                          dsagg,
                                          n_markers_per_cluster=self.n_markers,
                                          out_file=heatmap_file)

                    # Marker area is inversely proportional to the cell count
                    point_size = 200000 / ds.shape[1]
                    fig = plt.figure(figsize=(3, 3))
                    ax = fig.add_axes([0, 0, 1, 1])
                    ax.axis('off')

                    # The attribute that defines the coloured groups
                    if self.rank == 3:
                        grouping = ds.ca.ClusterName
                    else:
                        grouping = ds.ca.TaxonomyRank4
                    groups = np.unique(grouping)
                    colors = cg.colorize(groups)
                    for color_ix, group in enumerate(groups):
                        members = grouping == group
                        plt.scatter(x=ds.ca._X[members],
                                    y=ds.ca._Y[members],
                                    s=point_size,
                                    c=colors[color_ix, :],
                                    marker='.',
                                    label=group,
                                    alpha=0.5,
                                    lw=0)

                    # Legend outside the axes, with large opaque markers
                    lgnd = ax.legend(fontsize=10,
                                     labelspacing=0.2,
                                     loc="upper left",
                                     bbox_to_anchor=(1, 1),
                                     frameon=False)
                    for legend_marker in lgnd.legendHandles:
                        legend_marker.set_sizes([250])
                        legend_marker.set_alpha(1)
                    plt.savefig(target_path("manifold.pretty.png"),
                                dpi=600,
                                transparent=True,
                                bbox_extra_artists=(lgnd, ),
                                bbox_inches='tight')
                    plt.close()
示例#17
0
    def run(self) -> None:
        """
        Aggregate the input loom file by cluster and lay out both the
        aggregate and the individual cells.

        Steps:
            1. Aggregate the input into the output file using ``spec``.
            2. Select gene sets with ``_gene_selection_L5``.
            3. PCA, KNN/MKNN graphs and t-SNE on the aggregate.
            4. Manifold learning on all cells, initialised from the position
               of each cell's cluster plus random noise. The graphs and the
               embedding are written back to the *input* file.
            5. Auto-annotation and auto-auto-annotation of the aggregate.
        """
        logging = cg.logging(self)
        with self.output().temporary_path() as out_file:
            logging.info("Aggregating loom file")
            # How each column attribute is summarised per cluster
            spec = {
                "Age": "tally",
                "Clusters": "first",
                "Class": "mode",
                "_Total": "mean",
                "Sex": "tally",
                "Tissue": "tally",
                "SampleID": "tally",
                "TissuePool": "first",
                "Outliers": "mean",
                "Bucket": "mode",
                "Region": "first",
                "OriginalClusters": "first",
                "LeafOrder": "first",
                "Probable_location": "first",
                "Developmental_compartment": "first",
                "Description": "first",
                "Location_based_on": "first",
                "Neurotransmitter": "first",
                "Comment": "first",
                "ClusterName": "first",
                "TaxonomyRank1": "first",
                "TaxonomyRank2": "first",
                "TaxonomyRank3": "first",
                "TaxonomyRank4": "first",
                "TaxonomySymbol": "first"
            }
            # The context manager makes sure the input file is closed again,
            # also when one of the steps below fails
            with loompy.connect(self.input().fn) as ds:
                cg.Aggregator(f=[0.2, 0.05]).aggregate(ds,
                                                       out_file,
                                                       agg_spec=spec)

                with loompy.connect(out_file) as dsagg:
                    logging.info(
                        "Finding non-neuronal, housekeeping, and troublemaking genes"
                    )
                    (nng, blocked) = _gene_selection_L5(dsagg)

                    logging.info("Manifold learning on the aggregate file")
                    normalizer = cg.Normalizer(False)
                    normalizer.fit(dsagg)
                    # Use the first 10 genes per cluster for the PCA
                    pca = cg.PCAProjection(np.arange(dsagg.shape[1] * 10),
                                           max_n_components=50)
                    pca.fit(dsagg, normalizer)
                    transformed = pca.transform(dsagg, normalizer)
                    k = 40
                    bnn = cg.BalancedKNN(k=k, maxl=2 * k)
                    bnn.fit(transformed)
                    # Drop the first neighbour column
                    # NOTE(review): presumably each point itself - confirm
                    neighbors = bnn.kneighbors(mode='connectivity')[1][:, 1:]
                    n_cells = neighbors.shape[0]
                    # Edge list (a -> b), weighted by neighbour rank ** -1.8
                    a = np.tile(np.arange(n_cells), k)
                    b = np.reshape(neighbors.T, (n_cells * k, ))
                    w = np.repeat(1 / np.power(np.arange(1, k + 1), 1.8),
                                  n_cells)
                    knn = sparse.coo_matrix((w, (a, b)),
                                            shape=(n_cells, n_cells))
                    # Keep only the strong edges, and of those only the
                    # mutual ones
                    threshold = w > 0.025
                    mknn = sparse.coo_matrix(
                        (w[threshold], (a[threshold], b[threshold])),
                        shape=(n_cells, n_cells))
                    mknn = mknn.minimum(mknn.transpose()).tocoo()
                    agg_tsne = cg.TSNE(perplexity=5).layout(transformed)
                    dsagg.col_graphs.KNN = knn
                    dsagg.col_graphs.MKNN = mknn
                    dsagg.ca._X = agg_tsne[:, 0]
                    dsagg.ca._Y = agg_tsne[:, 1]

                    logging.info("Manifold learning on all cells")
                    # Start every cell at the position of its cluster in the
                    # aggregate embedding, plus unit-variance Gaussian noise
                    # NOTE(review): indexes the embedding by cluster label,
                    # so labels must be valid column indices of the aggregate
                    clusters = ds.ca.Clusters
                    init = np.zeros((ds.shape[1], 2))
                    for lbl in np.unique(clusters):
                        members = clusters == lbl
                        init[members, :] = agg_tsne[
                            lbl, :] + np.random.normal(size=(members.sum(),
                                                             2))
                    ml = cg.ManifoldLearning2(gtsne=True,
                                              alpha=1,
                                              max_iter=3000)
                    (knn, mknn, tsne) = ml.fit(ds,
                                               initial_pos=init,
                                               nng=nng,
                                               blocked_genes=blocked)
                    ds.col_graphs.KNN = knn
                    ds.col_graphs.MKNN = mknn
                    ds.ca._X = tsne[:, 0]
                    ds.ca._Y = tsne[:, 1]

                    logging.info("Computing auto-annotation")
                    # NOTE(review): relative path, resolved against the
                    # current working directory
                    aa = cg.AutoAnnotator(
                        root="../auto-annotation/Adolescent/")
                    aa.annotate_loom(dsagg)
                    aa.save_in_loom(dsagg)

                    logging.info("Computing auto-auto-annotation")
                    n_clusters = dsagg.shape[1]
                    (selected, selectivity, specificity,
                     robustness) = cg.AutoAutoAnnotator(n_genes=6).fit(dsagg)
                    # One space-separated string of gene names per cluster
                    dsagg.set_attr("MarkerGenes",
                                   np.array([
                                       " ".join(ds.ra.Gene[selected[:, ix]])
                                       for ix in range(n_clusters)
                                   ]),
                                   axis=1)
                    # Controls how the statistics below are rendered as text
                    np.set_printoptions(precision=1, suppress=True)
                    for attr_name, stat in (("MarkerSelectivity",
                                             selectivity),
                                            ("MarkerSpecificity",
                                             specificity),
                                            ("MarkerRobustness", robustness)):
                        dsagg.set_attr(attr_name,
                                       np.array([
                                           str(stat[:, ix])
                                           for ix in range(n_clusters)
                                       ]),
                                       axis=1)
示例#18
0
    def run(self) -> None:
        """
        Export a loom file and two figures for a set of oligodendrocyte
        lineage cell types.

        At most 820 cells (sampled without replacement) are taken for each of
        the cluster names COP1, COP2, NFOL2, NFOL1 and OPC and written to
        ``F_Oligos.loom`` in the output directory. A manifold is learned on
        that file, and two scatter plots are saved: one coloured by cluster
        name (``Fig_Oligos_Types.png``) and one coloured by whether Cdk1 is
        detected (``Fig_Oligos_Cdk1.png``).
        """
        logging = cg.logging(self, True)
        with self.output().temporary_path() as out_dir:
            logging.info("Exporting oligo cell types")
            if not os.path.exists(out_dir):
                os.mkdir(out_dir)
            max_cells_per_type = 820
            with loompy.connect(self.input().fn) as ds:
                celltypes = ["COP1", "COP2", "NFOL2", "NFOL1", "OPC"]
                selected = np.array([], dtype='int')
                for ct in celltypes:
                    logging.info("Selecting cells of type " + ct)
                    cells = np.where(ds.ca.ClusterName == ct)[0]
                    if cells.shape[0] > max_cells_per_type:
                        cells = np.random.choice(cells,
                                                 size=max_cells_per_type,
                                                 replace=False)
                    selected = np.union1d(selected, cells)

                ngfile = os.path.join(out_dir, "F_Oligos.loom")
                for (_, _, view) in ds.scan(items=selected, axis=1):
                    loompy.create_append(ngfile, view.layers, view.ra, view.ca)

            with loompy.connect(ngfile) as ds:
                logging.info("Learning the manifold")
                ml = cg.ManifoldLearning2(gtsne=False, alpha=1)
                (knn, mknn, tsne) = ml.fit(ds)
                ds.col_graphs.KNN = knn
                ds.col_graphs.MKNN = mknn
                ds.ca._X = tsne[:, 0]
                ds.ca._Y = tsne[:, 1]

                # Figure 1: cells coloured by cluster name, MKNN edges behind
                fig = plt.figure(figsize=(3, 3))
                ax = fig.add_axes([0, 0, 1, 1])
                # Materialise the segments; a bare zip object is a one-shot
                # iterator, not a sequence
                edges = list(zip(tsne[mknn.row], tsne[mknn.col]))
                lc = LineCollection(edges,
                                    linewidths=0.25,
                                    zorder=0,
                                    color='grey',
                                    alpha=0.1)
                ax.add_collection(lc)
                ax.axis('off')
                cluster_names = ds.ca.ClusterName
                groups = np.unique(cluster_names)
                colors = cg.colorize(groups)
                for ix, ct in enumerate(groups):
                    cells = (cluster_names == ct)
                    plt.scatter(x=ds.ca._X[cells],
                                y=ds.ca._Y[cells],
                                s=40,
                                c=colors[ix, :],
                                marker='.',
                                label=ct,
                                alpha=0.5,
                                lw=0)
                # Build the legend once, after all groups have been drawn
                lgnd = ax.legend(fontsize=10,
                                 labelspacing=0.2,
                                 loc="upper left",
                                 bbox_to_anchor=(1, 1),
                                 frameon=False)
                for handle in lgnd.legendHandles:
                    handle.set_sizes([250])
                    handle.set_alpha(1)
                plt.savefig(os.path.join(out_dir, "Fig_Oligos_Types.png"),
                            dpi=600,
                            transparent=True,
                            bbox_extra_artists=(lgnd, ),
                            bbox_inches='tight')
                plt.close()

                # Figure 2: cells coloured by Cdk1 detection (0 = no, 1 = yes,
                # used as index into the colour table)
                fig = plt.figure(figsize=(3, 3))
                ax = fig.add_axes([0, 0, 1, 1])
                ax.axis('off')
                cdk1_detected = (ds[ds.ra.Gene == "Cdk1", :][0] !=
                                 0).astype('int')
                plt.scatter(x=ds.ca._X,
                            y=ds.ca._Y,
                            s=40,
                            c=cg.colors75[cdk1_detected],
                            marker='.',
                            alpha=0.5,
                            lw=0)
                plt.savefig(os.path.join(out_dir, "Fig_Oligos_Cdk1.png"),
                            dpi=600,
                            transparent=True,
                            bbox_inches='tight')
                # Release the figure, as is done for the first one
                plt.close()
示例#19
0
    def run(self) -> None:
        """
        Pool the cells of ``self.major_class`` from all clustered input files,
        then learn a manifold on the pooled cells and cluster them.

        A cell is collected from an input file when all of these hold:
            * DBSCAN on its (_X, _Y) position does not flag it as noise;
            * its ``Class`` attribute equals ``self.major_class``;
            * more than 20% of its KNN neighbours have that class too.
        For the class ``"Oligos"`` additional gene-based filters apply, and at
        most 5000 cells are taken per input file.

        Raises:
            ValueError: If no cell was selected from any input file.
        """
        logging = cg.logging(self)
        cells_found = False
        with self.output().temporary_path() as out_file:
            for clustered in self.input():
                with loompy.connect(clustered.fn, "r") as ds:
                    logging.info("Split/pool from " + clustered.fn)

                    logging.info("Masking outliers")
                    min_pts = 10
                    eps_pct = 80
                    tsne_pos = np.vstack(
                        (ds.col_attrs["_X"], ds.col_attrs["_Y"])).transpose()

                    # DBSCAN to find outliers; epsilon is the eps_pct'th
                    # percentile of the distance to the min_pts'th neighbour
                    nn = NearestNeighbors(n_neighbors=min_pts,
                                          algorithm="ball_tree",
                                          n_jobs=4)
                    nn.fit(tsne_pos)
                    knn = nn.kneighbors_graph(mode='distance')
                    k_radius = knn.max(axis=1).toarray()
                    epsilon = np.percentile(k_radius, eps_pct)

                    clusterer = DBSCAN(eps=epsilon, min_samples=min_pts)
                    labels = clusterer.fit_predict(tsne_pos)

                    # Mask out cells that don't match the class of their local neighbors
                    logging.info("Masking cells in bad neighborhoods")
                    n_cells = ds.shape[1]
                    knn_graph = ds.col_graphs.KNN
                    is_major = ds.ca.Class == self.major_class
                    # For every cell, count its neighbours and how many of
                    # them are of the major class, in one pass over the edge
                    # list instead of one scan of all edges per cell
                    n_neighbors = np.bincount(knn_graph.row,
                                              minlength=n_cells)
                    n_major_neighbors = np.bincount(
                        knn_graph.row,
                        weights=is_major[knn_graph.col].astype(float),
                        minlength=n_cells)
                    # Cells without neighbours keep a fraction of 0 and are
                    # therefore never selected
                    major_fraction = np.zeros(n_cells)
                    has_neighbors = n_neighbors > 0
                    major_fraction[has_neighbors] = (
                        n_major_neighbors[has_neighbors] /
                        n_neighbors[has_neighbors])
                    keep = (labels != -1) & is_major & (major_fraction > 0.2)
                    # Integer indices in increasing order, also when empty
                    cells = np.where(keep)[0]

                    if self.major_class == "Oligos":
                        # Special selection of cells for the oligo class, to balance between tissues
                        gene_names = ds.ra.Gene
                        enough_genes = ds.map(
                            (np.count_nonzero, ), axis=1)[0] > 1000
                        has_pdgfra = ds[gene_names == "Pdgfra", :][0] > 0
                        has_meg3 = ds[gene_names == "Meg3", :][0] > 0
                        # A cell expressing any of these genes is treated as
                        # a doublet
                        is_doublet = np.zeros(ds.shape[1], dtype='bool')
                        for g in [
                                'Stmn2', 'Aqp4', 'Gja1', 'C1qc', 'Aif1',
                                'Cldn5', 'Fn1', 'Hbb-bt', 'Hbb-bh1', 'Hbb-bh2',
                                'Hbb-y', 'Hbb-bs', 'Hba-a1', 'Hba-a2', 'Hba-x'
                        ]:
                            is_doublet = np.logical_or(
                                is_doublet, ds[gene_names == g, :][0] > 0)
                        ok_cells = enough_genes & (~is_doublet) & (has_pdgfra
                                                                   | ~has_meg3)
                        cells = np.intersect1d(cells, np.where(ok_cells)[0])
                        if cells.shape[0] > 5000:
                            cells = np.random.choice(cells, 5000, False)

                    if cells.shape[0] == 0:
                        logging.info("No cells selected from " + clustered.fn)
                        continue
                    cells_found = True
                    for (_, _, view) in ds.scan(items=cells,
                                                axis=1,
                                                key="Accession"):
                        loompy.create_append(out_file, view.layers, view.ra,
                                             view.ca)

            if not cells_found:
                # Without any cells the output file was never created
                raise ValueError(
                    f"No cells selected for {self.major_class}")

            with loompy.connect(out_file) as dsout:
                logging.info("Learning the manifold")
                # For the oligo class gtsne is left at its default value
                ml_args = {"n_genes": self.n_genes, "alpha": self.alpha}
                if self.major_class != "Oligos":
                    ml_args["gtsne"] = self.gtsne
                ml = cg.ManifoldLearning2(**ml_args)
                (knn, mknn, tsne) = ml.fit(dsout)
                dsout.col_graphs.KNN = knn
                dsout.col_graphs.MKNN = mknn
                dsout.ca._X = tsne[:, 0]
                dsout.ca._Y = tsne[:, 1]

                logging.info("Clustering on the manifold")
                pl = cg.PolishedLouvain()
                labels = pl.fit_predict(dsout)
                # A label of -1 is recorded as an outlier; stored cluster
                # numbers are the labels shifted up by one
                dsout.ca.Clusters = labels + 1
                dsout.ca.Outliers = (labels == -1).astype('int')
                logging.info(f"Found {labels.max() + 1} clusters")