Example #1
 def requires(self) -> Iterator[luigi.Task]:
     tissues: List[str] = []
     for fname in os.listdir(os.path.join(am.paths().build, "curated_L2")):
         if not fname.startswith("L2"):
             continue
         tissue = fname.split("_")[2]
         with open(os.path.join(am.paths().build, "curated_L2",
                                fname)) as f:
             schedule = [x[:-1].split("\t") for x in f.readlines()]
             for (cluster, n_cells, auto_target, curated_target,
                  comment) in schedule:
                 if curated_target == self.target:
                     if tissue not in tissues:
                         if tissue == "All":
                             yield [
                                 am.ClusterL2(tissue="All",
                                              major_class=self.target),
                                 am.AggregateL2(tissue="All",
                                                major_class=self.target)
                             ]
                         else:
                             yield [
                                 am.ClusterL2(tissue=tissue,
                                              major_class="Neurons"),
                                 am.AggregateL2(tissue=tissue,
                                                major_class="Neurons")
                             ]
                         tissues.append(tissue)
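A minimal sketch (hypothetical file name and contents) of the tab-separated curated_L2 schedule format implied by the tuple unpacking above: each row carries cluster, n_cells, auto_target, curated_target and comment.

import csv

with open("L2_Neurons_Cortex1.txt") as f:  # hypothetical curated_L2 file
    for cluster, n_cells, auto_target, curated_target, comment in csv.reader(f, delimiter="\t"):
        print(cluster, curated_target)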
Example #2
 def output(self) -> luigi.LocalTarget:
     if am.paths().use_velocyto:
         fname = os.path.join(am.paths().samples, self.sample, "velocyto",
                              self.sample + ".loom")
         return luigi.LocalTarget(fname)
     else:
         fname = os.path.join(am.paths().samples, self.sample,
                              self.sample + ".loom")
         if os.path.exists(fname):
             return luigi.LocalTarget(fname)
         else:
             fname = os.path.join(am.paths().samples, self.sample + ".loom")
             return luigi.LocalTarget(fname)
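A toy sketch (not part of the pipeline) of how Luigi uses output(): a task counts as complete once every target returned by output() exists on disk, which is why the method above probes the alternative file locations.

import luigi

class ToySample(luigi.Task):  # hypothetical task, for illustration only
    sample = luigi.Parameter()

    def output(self) -> luigi.LocalTarget:
        return luigi.LocalTarget(f"/tmp/{self.sample}.txt")  # hypothetical path

    def run(self) -> None:
        with self.output().open("w") as f:
            f.write("done\n")

# ToySample(sample="10X43_1").complete() becomes True once run() has written the target.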
Example #3
	def requires(self) -> Iterator[luigi.Task]:
		taxonomy_file = os.path.join(am.paths().build, "curated_L4", "Taxonomy.xlsx")
		taxonomy_table = pd.read_excel(taxonomy_file)
		taxonomy = {taxonomy_table.columns.values[i]: taxonomy_table.values[:, i] for i in range(taxonomy_table.shape[1])}
		for taxon in list(set(taxonomy["TaxonomyRank1"])):
			yield am.ExportL6(rank=1, taxon=taxon)
		for taxon in list(set(taxonomy["TaxonomyRank2"])):
			yield am.ExportL6(rank=2, taxon=taxon)
		for taxon in list(set(taxonomy["TaxonomyRank3"])):
			yield am.ExportL6(rank=3, taxon=taxon)
#		for taxon in list(set(taxonomy["TaxonomyRank4"])):
#			yield am.ExportL6(rank=4, taxon=taxon)
		for rank in [1, 2, 3, 4]:
			yield am.ExportByTaxonL6(rank=rank)
		yield am.ExportL5()
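For reference, the column dictionary built above from column positions can be expressed directly in terms of the DataFrame; a tiny equivalent sketch (file name shortened):

import pandas as pd

taxonomy_table = pd.read_excel("Taxonomy.xlsx")  # same spreadsheet as above
taxonomy = {col: taxonomy_table[col].values for col in taxonomy_table.columns}
# taxonomy["TaxonomyRank1"] is then a numpy array of that column's values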
Example #4
    def run(self) -> None:
        with self.output().temporary_path() as fname:
            logging.info("Retraining classifier")
            pathname = os.path.join(am.paths().samples, "classified")
            clf = cg.Classifier(pathname, n_per_cluster=100)
            clf.generate()
            with loompy.connect(
                    os.path.join(pathname, "classified.loom")) as ds_training:
                clf.fit(ds_training)
            with open(fname, "wb") as f:
                pickle.dump(clf, f)

            # Verify that it works (to catch some obscure intermittent UnicodeDecodeError)
            with open(fname, "rb") as f:
                clf = pickle.load(f)
                clf.aggregate_export()
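Several tasks in this collection write through LocalTarget.temporary_path(), as above; a brief sketch of the pattern (path and payload are hypothetical): the output is written to a temporary file that Luigi moves into place only if the block exits cleanly, so partial files never mark the task as done.

import pickle
import luigi

target = luigi.LocalTarget("/tmp/classifier.pickle")  # hypothetical path
with target.temporary_path() as tmp_fname:
    with open(tmp_fname, "wb") as f:
        pickle.dump({"model": "placeholder"}, f)  # stand-in for the trained classifier
# The temporary file is renamed to /tmp/classifier.pickle only after the with-block succeeds.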
Example #5
    def run(self) -> None:
        logging = cg.logging(self)
        with self.output().temporary_path() as out_file:
            logging.info("Aggregating loom file")
            with loompy.connect(self.input().fn) as ds:
                cg.Aggregator(self.n_markers).aggregate(ds, out_file)
                genes = ds.ra.Gene  # capture gene names before the connection closes
            with loompy.connect(out_file) as dsagg:
                for ix, score in enumerate(dsagg.col_attrs["ClusterScore"]):
                    logging.info(f"Cluster {ix} score {score:.1f}")

                logging.info("Computing auto-annotation")
                aa = cg.AutoAnnotator(root=am.paths().autoannotation)
                aa.annotate_loom(dsagg)
                aa.save_in_loom(dsagg)

                logging.info("Computing auto-auto-annotation")
                n_clusters = dsagg.shape[1]
                (selected, selectivity, specificity,
                 robustness) = cg.AutoAutoAnnotator(
                     n_genes=self.n_auto_genes).fit(dsagg)
                dsagg.set_attr("MarkerGenes",
                               np.array([
                                   " ".join(genes[selected[:, ix]])
                                   for ix in np.arange(n_clusters)
                               ]),
                               axis=1)
                np.set_printoptions(precision=1, suppress=True)
                dsagg.set_attr("MarkerSelectivity",
                               np.array([
                                   str(selectivity[:, ix])
                                   for ix in np.arange(n_clusters)
                               ]),
                               axis=1)
                dsagg.set_attr("MarkerSpecificity",
                               np.array([
                                   str(specificity[:, ix])
                                   for ix in np.arange(n_clusters)
                               ]),
                               axis=1)
                dsagg.set_attr("MarkerRobustness",
                               np.array([
                                   str(robustness[:, ix])
                                   for ix in np.arange(n_clusters)
                               ]),
                               axis=1)
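A self-contained numpy sketch (toy gene names and indices) of how the MarkerGenes strings above are assembled: selected holds gene indices per cluster, and fancy indexing into the gene-name array joins them into one space-separated string per cluster.

import numpy as np

genes = np.array(["Sox10", "Aqp4", "Snap25", "Cldn5", "Cx3cr1"])  # toy gene names
selected = np.array([[0, 1, 4],
                     [2, 3, 0]])  # shape (n_genes_per_cluster, n_clusters), toy indices
marker_strings = np.array([" ".join(genes[selected[:, ix]])
                           for ix in np.arange(selected.shape[1])])
print(marker_strings)  # ['Sox10 Snap25' 'Aqp4 Cldn5' 'Cx3cr1 Sox10']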
Example #6
	def run(self) -> None:
		logging = cg.logging(self, True)
		logging.info("Exporting cluster data")
		with self.output().temporary_path() as out_dir:
			if not os.path.exists(out_dir):
				os.mkdir(out_dir)
			with loompy.connect(self.input()[0].fn) as dsagg:
				logging.info("Computing auto-annotation")
				aa = cg.AutoAnnotator(root=am.paths().autoannotation)
				aa.annotate_loom(dsagg)
				aa.save_in_loom(dsagg)

				dsagg.export(os.path.join(out_dir, "L1_" + self.tissue + "_expression.tab"))
				dsagg.export(os.path.join(out_dir, "L1_" + self.tissue + "_enrichment.tab"), layer="enrichment")
				dsagg.export(os.path.join(out_dir, "L1_" + self.tissue + "_enrichment_q.tab"), layer="enrichment_q")
				dsagg.export(os.path.join(out_dir, "L1_" + self.tissue + "_trinaries.tab"), layer="trinaries")

				ds = loompy.connect(self.input()[1].fn)

				logging.info("Plotting MKNN graph")
				cg.plot_knn(ds, os.path.join(out_dir, "L1_" + self.tissue + "_manifold.mknn.png"))

				# logging.info("Plotting Louvain resolution")
				# cg.plot_louvain(ds, os.path.join(out_dir, "L1_" + self.tissue + "_manifold.louvain.png"))

				try:
					logging.info("Plotting manifold graph with classes")
					cg.plot_classes(ds, os.path.join(out_dir, "L1_" + self.tissue + "_manifold.classes.png"))
				except Exception:
					pass

				logging.info("Plotting manifold graph with auto-annotation")
				tags = list(dsagg.col_attrs["AutoAnnotation"])
				cg.plot_graph(ds, os.path.join(out_dir, "L1_" + self.tissue + "_manifold.aa.png"), tags)

				logging.info("Plotting manifold graph with auto-auto-annotation")
				tags = list(dsagg.col_attrs["MarkerGenes"])
				cg.plot_graph(ds, os.path.join(out_dir, "L1_" + self.tissue + "_manifold.aaa.png"), tags)

				logging.info("Plotting marker heatmap")
				cg.plot_markerheatmap(ds, dsagg, n_markers_per_cluster=self.n_markers, out_file=os.path.join(out_dir, "L1_" + self.tissue + "_heatmap.pdf"))
Example #7
 def output(self) -> luigi.Target:
     return luigi.LocalTarget(
         os.path.join(am.paths().build,
                      f"L6_R{self.rank}_({self.taxon}).loom"))
Example #8
 def output(self) -> luigi.Target:
     return luigi.LocalTarget(
         os.path.join(
             am.paths().build,
             f"L1_{self.tissue}_nfactors={self.n_factors}_k={self.k}_ksmoothing={self.k_smoothing}_a={self.a}_b={self.b}_c={self.c}_d={self.d}_log={self.log}_normalize={self.normalize}_accel={self.accel}"
         ))
Example #9
 def output(self) -> luigi.Target:
     return luigi.LocalTarget(
         os.path.join(am.paths().build, "L3_" + self.target + "_exported"))
Example #10
    def run(self) -> None:
        logging = cg.logging(self)
        with self.output().temporary_path() as out_dir:
            logging.info("Exporting cluster data")
            if not os.path.exists(out_dir):
                os.mkdir(out_dir)
            dsagg = loompy.connect(self.input()[0].fn)
            logging.info("Computing auto-annotation")
            aa = cg.AutoAnnotator(root=am.paths().autoannotation)
            aa.annotate_loom(dsagg)
            aa.save_in_loom(dsagg)

            dsagg.export(
                os.path.join(out_dir, "L3_" + self.target + "_expression.tab"))
            dsagg.export(os.path.join(out_dir,
                                      "L3_" + self.target + "_enrichment.tab"),
                         layer="enrichment")
            dsagg.export(os.path.join(
                out_dir, "L3_" + self.target + "_enrichment_q.tab"),
                         layer="enrichment_q")
            dsagg.export(os.path.join(out_dir,
                                      "L3_" + self.target + "_trinaries.tab"),
                         layer="trinaries")

            logging.info("Plotting manifold graph with auto-annotation")
            tags = list(dsagg.col_attrs["AutoAnnotation"][np.argsort(
                dsagg.col_attrs["Clusters"])])
            ds = loompy.connect(self.input()[1].fn)
            cg.plot_graph(
                ds,
                os.path.join(out_dir,
                             "L3_" + self.target + "_manifold.aa.png"), tags)

            logging.info("Plotting manifold graph with auto-auto-annotation")
            tags = list(dsagg.col_attrs["MarkerGenes"][np.argsort(
                dsagg.col_attrs["Clusters"])])
            cg.plot_graph(
                ds,
                os.path.join(out_dir,
                             "L3_" + self.target + "_manifold.aaa.png"), tags)

            logging.info("Plotting marker heatmap")
            cg.plot_markerheatmap(ds,
                                  dsagg,
                                  n_markers_per_cluster=self.n_markers,
                                  out_file=os.path.join(
                                      out_dir,
                                      "L3_" + self.target + "_heatmap.pdf"))

            logging.info("Computing discordance distances")
            pep = 0.05
            n_labels = dsagg.shape[1]

            def discordance_distance(a: np.ndarray, b: np.ndarray) -> float:
                """
                Number of genes that are discordant with given PEP, divided by number of clusters
                """
                return np.sum((1 - a) * b + a * (1 - b) > 1 - pep) / n_labels

            data = dsagg.layer["trinaries"][:n_labels * 10, :].T
            D = squareform(pdist(data, discordance_distance))
            with open(
                    os.path.join(out_dir,
                                 "L3_" + self.target + "_distances.txt"),
                    "w") as f:
                f.write(str(np.diag(D, k=1)))
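A minimal sketch (toy trinarization scores) of the discordance distance defined above: the number of genes where one cluster is confidently positive and the other confidently negative at the chosen posterior error probability, divided by the number of clusters.

import numpy as np
from scipy.spatial.distance import pdist, squareform

pep = 0.05
trinaries = np.array([          # rows = clusters, columns = genes (toy probabilities)
    [0.99, 0.01, 0.98, 0.02],
    [0.99, 0.97, 0.02, 0.01],
    [0.01, 0.02, 0.99, 0.98]])
n_labels = trinaries.shape[0]

def discordance_distance(a: np.ndarray, b: np.ndarray) -> float:
    # A gene counts as discordant when one score is near 1 and the other near 0
    return np.sum((1 - a) * b + a * (1 - b) > 1 - pep) / n_labels

D = squareform(pdist(trinaries, discordance_distance))
print(D)  # symmetric matrix of pairwise discordance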
Example #11
    def run(self) -> None:
        logging = cg.logging(self)
        samples = [x.fn for x in self.input()]
        max_cluster_id = 0
        cluster_ids: List[int] = []
        original_ids: List[int] = []
        samples_per_cell: List[str] = []

        celltypes_summary_file = os.path.join(
            am.paths().build, "curated_L4",
            "celltypes_summary_leaforder16-Dec-2017.xlsx")
        celltypes_summary = pd.read_excel(celltypes_summary_file)
        celltypes_dict = {
            celltypes_summary.columns.values[i]: celltypes_summary.values[:, i]
            for i in range(celltypes_summary.shape[1])
        }

        with self.output().temporary_path() as out_file:
            accessions = None  # type: np.ndarray
            for sample in samples:
                with loompy.connect(sample) as ds:
                    logging.info(f"Adding {ds.shape[1]} cells from {sample}")
                    target = os.path.basename(sample)[3:-5]
                    not_excluded = celltypes_dict["OriginalCluster"][
                        celltypes_dict["Bucket"] == target]
                    cells = np.where(np.isin(ds.ca.Clusters, not_excluded))[0]
                    for (ix, selection, view) in ds.scan(items=cells,
                                                         axis=1,
                                                         key="Accession"):
                        cluster_ids += list(view.ca.Clusters + max_cluster_id)
                        original_ids += list(view.ca.Clusters)
                        samples_per_cell += [sample] * selection.shape[0]
                        loompy.create_append(out_file,
                                             view.layers,
                                             view.ra,
                                             view.ca,
                                             fill_values="auto")
                    max_cluster_id = max(cluster_ids) + 1
                    logging.info(f"Found {max_cluster_id} clusters total")
            with loompy.connect(out_file) as ds:
                ds.ca.Clusters = np.array(cluster_ids)
                ds.ca.OriginalClusters = np.array(original_ids)
                ds.ca.Bucket = np.array(samples_per_cell)

                leaf_order = np.zeros(ds.shape[1], dtype='int') - 1
                le = LabelEncoder()
                le.fit(celltypes_dict["ClusterName"])
                new_clusters = np.zeros(ds.shape[1], dtype='int') - 1
                d = {}
                for attr in [
                        "LeafOrder", "Probable_location",
                        "Developmental_compartment", "Region", "Description",
                        "Location_based_on", "Neurotransmitter", "ClusterName",
                        "Taxonomy_group", "Comment"
                ]:
                    d[attr] = np.array([""] * ds.shape[1], dtype=object)

                for ix in range(len(celltypes_dict["Bucket"])):
                    bucket = celltypes_dict["Bucket"][ix]
                    bucket_name = f"/Users/sten/build_20171205/L4_{bucket}.loom"
                    original_cluster = celltypes_dict["OriginalCluster"][ix]
                    cells = np.logical_and(
                        ds.ca.Bucket == bucket_name,
                        ds.ca.OriginalClusters == original_cluster)
                    leaf_order[cells] = celltypes_dict["LeafOrder"][ix]
                    new_clusters[cells] = le.transform(
                        [celltypes_dict["ClusterName"][ix]])
                    for attr in d.keys():
                        d[attr][cells] = celltypes_dict[attr][ix]

                logging.info(f"Found {new_clusters.max() + 1} clusters total")
                ds.ca.Clusters = new_clusters
                ds.ca.LeafOrder = leaf_order
                for key, vals in d.items():
                    ds.ca[key] = vals.astype("unicode")

                taxonomy_file = os.path.join(am.paths().build, "curated_L4",
                                             "Taxonomy.xlsx")
                taxonomy_table = pd.read_excel(taxonomy_file)
                taxonomy = {
                    taxonomy_table.values[i, 3]: taxonomy_table.values[i, :]
                    for i in range(taxonomy_table.shape[0])
                }

                tax1 = np.array([""] * ds.shape[1], dtype=object)
                tax2 = np.array([""] * ds.shape[1], dtype=object)
                tax3 = np.array([""] * ds.shape[1], dtype=object)
                tax4 = np.array([""] * ds.shape[1], dtype=object)
                taxs = np.array([""] * ds.shape[1], dtype=object)

                for i in range(ds.shape[1]):
                    if ds.ca.Clusters[i] == -1:
                        continue
                    tax1[i] = taxonomy[d["Taxonomy_group"][i]][0]
                    tax2[i] = taxonomy[d["Taxonomy_group"][i]][1]
                    tax3[i] = taxonomy[d["Taxonomy_group"][i]][2]
                    tax4[i] = taxonomy[d["Taxonomy_group"][i]][3]
                    taxs[i] = taxonomy[d["Taxonomy_group"][i]][4]
                ds.ca.TaxonomyRank1 = tax1
                ds.ca.TaxonomyRank2 = tax2
                ds.ca.TaxonomyRank3 = tax3
                ds.ca.TaxonomyRank4 = tax4
                ds.ca.TaxonomySymbol = taxs

                logging.info("Recomputing the list of valid genes")
                nnz = ds.map([np.count_nonzero], axis=0)[0]
                valid_genes = np.logical_and(nnz > 10, nnz < ds.shape[1] * 0.6)
                ds.ra._Valid = valid_genes.astype('int')

                logging.info("Learning the manifold")
                ml = cg.ManifoldLearning2(gtsne=True, alpha=1, max_iter=3000)
                (knn, mknn, tsne) = ml.fit(ds)
                ds.col_graphs.KNN = knn
                ds.col_graphs.MKNN = mknn
                ds.ca._X = tsne[:, 0]
                ds.ca._Y = tsne[:, 1]
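A short sketch (hypothetical cluster names) of the LabelEncoder step above, which maps curated cluster names to stable integer IDs so the final Clusters attribute is consistent across all buckets.

import numpy as np
from sklearn.preprocessing import LabelEncoder

cluster_names = np.array(["TEGLU3", "ASTRO2", "MGL1", "ASTRO2"])  # hypothetical ClusterName column
le = LabelEncoder()
le.fit(cluster_names)
print(le.classes_)               # ['ASTRO2' 'MGL1' 'TEGLU3'] -- sorted unique names define the IDs
print(le.transform(["TEGLU3"]))  # [2]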
Example #12
 def output(self) -> luigi.Target:
     return luigi.LocalTarget(
         os.path.join(
             am.paths().build,
             "L2_" + self.major_class + "_" + self.tissue + "_exported"))
Example #13
 def output(self) -> luigi.Target:
     return luigi.LocalTarget(
         os.path.join(am.paths().samples, "classified",
                      "classifier.pickle"))
Example #14
 def output(self) -> luigi.Target:
     return luigi.LocalTarget(os.path.join(am.paths().build, "L5_All.loom"))
Example #15
    def run(self) -> None:
        logging = cg.logging(self)
        with self.output().temporary_path() as out_file:
            logging.info("Aggregating loom file")
            with loompy.connect(self.input().fn) as ds:
                cg.Aggregator().aggregate(ds, out_file)
                with loompy.connect(out_file) as dsagg:
                    for ix, score in enumerate(
                            dsagg.col_attrs["ClusterScore"]):
                        logging.info(f"Cluster {ix} score {score:.1f}")

                    logging.info("Computing auto-annotation")
                    aa = cg.AutoAnnotator(root=am.paths().autoannotation)
                    aa.annotate_loom(dsagg)
                    aa.save_in_loom(dsagg)

                    logging.info("Computing auto-auto-annotation")
                    n_clusters = dsagg.shape[1]
                    (selected, selectivity, specificity,
                     robustness) = cg.AutoAutoAnnotator(
                         n_genes=self.n_auto_genes).fit(dsagg)
                    dsagg.set_attr("MarkerGenes",
                                   np.array([
                                       " ".join(ds.ra.Gene[selected[:, ix]])
                                       for ix in np.arange(n_clusters)
                                   ]),
                                   axis=1)
                    np.set_printoptions(precision=1, suppress=True)
                    dsagg.set_attr("MarkerSelectivity",
                                   np.array([
                                       str(selectivity[:, ix])
                                       for ix in np.arange(n_clusters)
                                   ]),
                                   axis=1)
                    dsagg.set_attr("MarkerSpecificity",
                                   np.array([
                                       str(specificity[:, ix])
                                       for ix in np.arange(n_clusters)
                                   ]),
                                   axis=1)
                    dsagg.set_attr("MarkerRobustness",
                                   np.array([
                                       str(robustness[:, ix])
                                       for ix in np.arange(n_clusters)
                                   ]),
                                   axis=1)

                    tissue = self.tissue
                    labels = ds.col_attrs["Clusters"]

                    if self.tissue == "All":
                        dsagg.ca.Bucket = np.array([self.major_class] *
                                                   dsagg.shape[1])
                    else:
                        # Figure out which cells should be collected
                        cells: List[int] = []
                        # clusters_seen: List[int] = []  # Clusters for which there was some schedule
                        clusters_seen: Dict[int, str] = {}
                        schedule = pooling_schedule_L3[self.tissue]

                        # Where to send clusters when no rules match
                        _default_schedule: Optional[str] = None
                        for aa_tag, sendto in schedule:
                            if aa_tag == "*":
                                _default_schedule = sendto

                        # For each cluster in the tissue
                        bucket_list = []
                        for ix, agg_aa in enumerate(dsagg.ca.AutoAnnotation):
                            # For each rule in the schedule
                            for aa_tag, sendto in schedule:
                                if aa_tag in agg_aa.split(","):
                                    if ix in clusters_seen:
                                        logging.info(
                                            f"{tissue}/{ix}/{agg_aa}: {aa_tag} -> {sendto} (overruled by '{clusters_seen[ix]}')"
                                        )
                                    else:
                                        clusters_seen[
                                            ix] = f"{aa_tag} -> {sendto}"
                                        logging.info(
                                            f"{tissue}/{ix}/{agg_aa}: {aa_tag} -> {sendto}"
                                        )
                                        bucket_list.append(sendto)
                            if ix not in clusters_seen:
                                if _default_schedule is None:
                                    logging.info(
                                        f"{tissue}/{ix}/{agg_aa}: No matching rule"
                                    )
                                    bucket_list.append("Excluded")
                                else:
                                    clusters_seen[ix] = f"* -> {_default_schedule}"
                                    logging.info(
                                        f"{tissue}/{ix}/{agg_aa}: * -> {_default_schedule}"
                                    )
                                    bucket_list.append(_default_schedule)
                        dsagg.ca.Bucket = np.array(bucket_list)
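A simplified, first-match sketch (hypothetical schedule and annotations) of the bucket assignment above: each cluster's auto-annotation tags are tested against (tag, bucket) rules, and a "*" rule supplies the fallback bucket.

from typing import List, Tuple

schedule: List[Tuple[str, str]] = [("@OLIG", "Oligos"), ("@ASTRO", "Astrocytes"), ("*", "Excluded")]  # toy rules
auto_annotations = ["@OLIG,@CYCLING", "@ASTRO", "@VASC"]  # toy per-cluster tags

default_bucket = next((sendto for tag, sendto in schedule if tag == "*"), None)
for ix, agg_aa in enumerate(auto_annotations):
    bucket = next((sendto for tag, sendto in schedule if tag in agg_aa.split(",")), default_bucket)
    print(f"cluster {ix}: {agg_aa} -> {bucket}")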
Example #16
    def run(self) -> None:
        logging = cg.logging(self)
        with self.output().temporary_path() as out_file:
            logging.info("Aggregating loom file")
            ds = loompy.connect(self.input().fn)
            spec = {
                "Age": "tally",
                "Clusters": "first",
                "Class": "mode",
                "_Total": "mean",
                "Sex": "tally",
                "Tissue": "tally",
                "SampleID": "tally",
                "TissuePool": "first",
                "Outliers": "mean",
                "Bucket": "mode",
                "Region": "first",
                "OriginalClusters": "first",
                "Probable_location": "first",
                "Developmental_compartment": "first",
                "Description": "first",
                "Location_based_on": "first",
                "Neurotransmitter": "first",
                "LeafOrder": "first",
                "Comment": "first",
                "ClusterName": "first",
                "TaxonomyRank1": "first",
                "TaxonomyRank2": "first",
                "TaxonomyRank3": "first",
                "TaxonomyRank4": "first",
                "TaxonomySymbol": "first"
            }
            cg.Aggregator(f=[0.2, 0.05]).aggregate(ds, out_file, agg_spec=spec)
            dsagg = loompy.connect(out_file)

            logging.info("Computing auto-annotation")
            aa = cg.AutoAnnotator(root=am.paths().autoannotation)
            aa.annotate_loom(dsagg)
            aa.save_in_loom(dsagg)

            logging.info("Computing auto-auto-annotation")
            n_clusters = dsagg.shape[1]
            (selected, selectivity, specificity,
             robustness) = cg.AutoAutoAnnotator(
                 n_genes=self.n_auto_genes).fit(dsagg)
            dsagg.set_attr("MarkerGenes",
                           np.array([
                               " ".join(ds.ra.Gene[selected[:, ix]])
                               for ix in np.arange(n_clusters)
                           ]),
                           axis=1)
            np.set_printoptions(precision=1, suppress=True)
            dsagg.set_attr("MarkerSelectivity",
                           np.array([
                               str(selectivity[:, ix])
                               for ix in np.arange(n_clusters)
                           ]),
                           axis=1)
            dsagg.set_attr("MarkerSpecificity",
                           np.array([
                               str(specificity[:, ix])
                               for ix in np.arange(n_clusters)
                           ]),
                           axis=1)
            dsagg.set_attr("MarkerRobustness",
                           np.array([
                               str(robustness[:, ix])
                               for ix in np.arange(n_clusters)
                           ]),
                           axis=1)
            dsagg.close()
            ds.close()
Example #17
 def output(self) -> luigi.Target:
     return luigi.LocalTarget(
         os.path.join(am.paths().build, "L0_" + self.tissue + ".loom"))
Example #18
    def run(self) -> None:
        logging = cg.logging(self, True)
        dsout: loompy.LoomConnection = None
        accessions: np.ndarray = None  # Accession vector of the first input, used to align rows
        with self.output().temporary_path() as out_file:
            logging.info("Gathering cells for " + self.target)
            enriched_markers: List[np.ndarray] = [
            ]  # The enrichment vector for each selected cluster
            cells_found = False
            for in_file, agg_file in self.input():
                tissue = os.path.basename(
                    in_file.fn).split("_")[2].split(".")[0]
                ds = loompy.connect(in_file.fn)
                dsagg = loompy.connect(agg_file.fn)
                enrichment = dsagg.layer["enrichment"][:, :]
                labels = ds.col_attrs["Clusters"]
                ordering: np.ndarray = None
                logging.info(tissue)

                # Figure out which cells should be collected
                cells: List[int] = []
                for fname in os.listdir(
                        os.path.join(am.paths().build, "curated_L2")):
                    if not fname.startswith("L2"):
                        continue
                    from_tissue = fname.split("_")[2]
                    if from_tissue != tissue:
                        continue
                    if tissue == "All":
                        major_class = fname.split("_")[1]
                        if major_class != self.target:
                            continue
                    logging.info("Gathering cells from " + in_file.fn)
                    logging.info("Gathering cells based on " + fname)
                    with open(
                            os.path.join(am.paths().build, "curated_L2",
                                         fname)) as f:
                        schedule = [x[:-1].split("\t") for x in f.readlines()]
                        for (cluster_str, n_cells, auto_target, curated_target,
                             comment) in schedule:
                            cluster = int(cluster_str)
                            if curated_target == self.target:
                                if accessions is None:
                                    accessions = ds.row_attrs["Accession"]
                                if ordering is None:
                                    ordering = np.where(
                                        ds.row_attrs["Accession"][
                                            None, :] == accessions[:, None])[1]
                                cells += list(np.where(labels == cluster)[0])
                                enriched_markers.append(
                                    np.argsort(-enrichment[:,
                                                           cluster][ordering]))

                if len(cells) > 0:
                    cells = np.sort(np.array(cells))
                    cells_found = True
                    for (ix, selection, view) in ds.scan(items=cells,
                                                         axis=1,
                                                         key="Accession"):
                        loompy.create_append(out_file, view.layers, view.ra,
                                             view.ca)

            if not cells_found:
                raise ValueError(
                    f"No cells matched any schedule for {self.target}")

            # Figure out which enriched markers to use
            ix = 0
            temp: List[int] = []
            while len(temp) < self.n_enriched:
                for j in range(len(enriched_markers)):
                    if enriched_markers[j][ix] not in temp:
                        temp.append(enriched_markers[j][ix])
                ix += 1
            genes = np.sort(np.array(temp))

            logging.info("Learning the manifold")
            with loompy.connect(out_file) as dsout:
                ml = cg.ManifoldLearning2(gtsne=True, alpha=1, genes=genes)
                (knn, mknn, tsne) = ml.fit(dsout)
                dsout.col_graphs.KNN = knn
                dsout.col_graphs.MKNN = mknn
                dsout.ca._X = tsne[:, 0]
                dsout.ca._Y = tsne[:, 1]

                logging.info("Clustering on the manifold")

                special_res = {
                    "Astrocytes": 0.6,
                    "Sensory_Neurons": 0.35,
                    "Brain_Granule": 0.6
                }
                r = 1.0
                if self.target in special_res:
                    r = special_res[self.target]

                pl = cg.PolishedLouvain(resolution=r)
                labels = pl.fit_predict(dsout)
                dsout.ca.Clusters = labels + 1
                dsout.ca.Outliers = (labels == -1).astype('int')
                logging.info(f"Found {labels.max() + 1} clusters")
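A compact sketch (toy rankings) of the marker-pooling loop above: enrichment-ranked gene indices from the selected clusters are interleaved round-robin, skipping duplicates, until at least n_enriched unique genes have been collected.

from typing import List

n_enriched = 4
enriched_markers: List[List[int]] = [[7, 3, 9, 1], [3, 5, 7, 2], [9, 8, 3, 0]]  # toy per-cluster rankings

temp: List[int] = []
ix = 0
while len(temp) < n_enriched:
    for ranking in enriched_markers:
        if ranking[ix] not in temp:
            temp.append(ranking[ix])
    ix += 1
print(sorted(temp))  # [3, 5, 7, 8, 9]; a round may overshoot n_enriched, as in the task above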
Example #19
 def output(self) -> luigi.Target:
     return luigi.LocalTarget(
         os.path.join(am.paths().build, "L3_" + self.target + ".agg.loom"))
Example #20
 def output(self) -> luigi.Target:
     return luigi.LocalTarget(os.path.join(am.paths().build, "F_Oligos"))
Example #21
 def output(self) -> luigi.Target:
     return luigi.LocalTarget(
         os.path.join(am.paths().build, "F_Neurogenesis"))
Example #22
    def run(self) -> None:
        # Load metadata
        metadata: np.ndarray = None
        meta_attrs: np.ndarray = None
        metadata_file = os.path.join(am.paths().samples, "metadata",
                                     "metadata.xlsx")
        if os.path.exists(metadata_file):
            temp = pd.read_excel(metadata_file)
            meta_attrs = temp.columns.values
            metadata = temp.values

        with self.output().temporary_path() as out_file:
            attrs = {"title": self.tissue}
            valid_cells = []
            sample_files = [s.fn for s in self.input()]
            for sample in sorted(sample_files):
                # Connect and perform file-specific cell validation
                with loompy.connect(sample) as ds:
                    logging.info("Marking invalid cells")
                    (mols, genes) = ds.map([np.sum, np.count_nonzero], axis=1)
                    valid_cells.append(
                        np.logical_and(mols >= 600,
                                       (mols / genes) >= 1.2).astype('int'))
                    ds.ca.Total = mols
                    ds.ca.NGenes = genes

                    logging.info("Computing mito/ribo ratio for " + sample)
                    mito = np.where(npstr.startswith(ds.ra.Gene, "mt-"))[0]
                    ribo = np.where(npstr.startswith(ds.ra.Gene, "Rpl"))[0]
                    ribo = np.union1d(
                        ribo,
                        np.where(npstr.startswith(ds.ra.Gene, "Rps"))[0])
                    if len(ribo) > 0 and len(mito) > 0:
                        mitox = ds[mito, :]
                        ribox = ds[ribo, :]
                        ratio = (mitox.sum(axis=0) + 1) / (ribox.sum(axis=0) +
                                                           1)
                        ds.ca.MitoRiboRatio = ratio

            logging.info("Creating combined loom file")
            loompy.combine(sample_files,
                           out_file,
                           key="Accession",
                           file_attrs=attrs)

            # Validating genes
            logging.info("Marking invalid genes")
            with loompy.connect(out_file) as ds:
                vgpath = os.path.join(am.paths().build, "genes.txt")
                if os.path.exists(vgpath):
                    valids = np.zeros(ds.shape[0])
                    with open(vgpath, "r") as f:
                        for line in f:
                            items = line[:-1].split("\t")
                            valids[np.where(
                                ds.ra.Accession == items[0])] = int(items[1])
                    ds.set_attr("_Valid", valids, axis=0)
                else:
                    nnz = ds.map([np.count_nonzero], axis=0)[0]
                    valid_genes = np.logical_and(nnz > 10,
                                                 nnz < ds.shape[1] * 0.6)
                    ds.set_attr("_Valid", valid_genes, axis=0)

                logging.info("Marking invalid cells")
                ds.set_attr("_Valid", np.concatenate(valid_cells), axis=1)
                n_valid = np.sum(ds.col_attrs["_Valid"] == 1)
                n_total = ds.shape[1]
                logging.info("%d of %d cells were valid", n_valid, n_total)

                classifier_path = os.path.join(am.paths().samples,
                                               "classified",
                                               "classifier.pickle")
                if os.path.exists(classifier_path):
                    logging.info("Classifying cells by major class")
                    with open(classifier_path, "rb") as f:
                        clf = pickle.load(f)  # type: cg.Classifier
                    np.random.seed(13)
                    (classes, probs,
                     class_labels) = clf.predict(ds, probability=True)

                    mapping = {
                        "Astrocyte": "Astrocytes",
                        "Astrocyte,Cycling": "Astrocytes",
                        "Astrocyte,Immune": None,
                        "Astrocyte,Neurons": None,
                        "Astrocyte,Oligos": None,
                        "Astrocyte,Vascular": None,
                        "Bergmann-glia": "Astrocytes",
                        "Blood": "Blood",
                        "Blood,Cycling": "Blood",
                        "Blood,Vascular": None,
                        "Enteric-glia": "PeripheralGlia",
                        "Enteric-glia,Cycling": "PeripheralGlia",
                        "Ependymal": "Ependymal",
                        "Ex-Astrocyte": None,
                        "Ex-Blood": None,
                        "Ex-Immune": None,
                        "Ex-Neurons": None,
                        "Ex-Oligos": None,
                        "Ex-Vascular": None,
                        "Immune": "Immune",
                        "Immune,Neurons": None,
                        "Immune,Oligos": None,
                        "Neurons": "Neurons",
                        "Neurons,Cycling": "Neurons",
                        "Neurons,Immune": None,
                        "Neurons,Oligos": None,
                        "Neurons,Satellite-glia": None,
                        "OEC": "Astrocytes",
                        "Oligos": "Oligos",
                        "Oligos,Cycling": "Oligos",
                        "Oligos,Immune": None,
                        "Oligos,Vascular": None,
                        "Satellite-glia": "PeripheralGlia",
                        "Satellite-glia,Cycling": "PeripheralGlia",
                        "Schwann": "PeripheralGlia",
                        "Schwann,Cycling": "PeripheralGlia",
                        "Satellite-glia,Schwann": None,
                        "Ttr": "Ependymal",
                        "Vascular": "Vascular",
                        "Vascular,Cycling": "Vascular",
                        "Neurons,Vascular": None,
                        "Vascular,Oligos": None,
                        "Satellite-glia,Vascular": None,
                        "Unknown": None,
                        "Outliers": None
                    }

                    classes_pooled = np.array(
                        [str(mapping[c]) for c in classes], dtype=np.object_)
                    # mask invalid cells
                    classes[ds.col_attrs["_Valid"] == 0] = "Excluded"
                    classes_pooled[ds.col_attrs["_Valid"] == 0] = "Excluded"
                    classes_pooled[classes_pooled == "None"] = "Excluded"
                    ds.set_attr("Class", classes_pooled.astype('str'), axis=1)
                    ds.set_attr("Subclass", classes.astype('str'), axis=1)
                    for ix, cls in enumerate(class_labels):
                        ds.set_attr("ClassProbability_" + str(cls),
                                    probs[:, ix],
                                    axis=1)
                else:
                    logging.info(
                        "No classifier found in this build directory - skipping."
                    )
                    ds.set_attr("Class",
                                np.array(["Unknown"] * ds.shape[1]),
                                axis=1)
                    ds.set_attr("Subclass",
                                np.array(["Unknown"] * ds.shape[1]),
                                axis=1)
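A small numpy sketch (toy genes and counts) of the mito/ribo ratio computed above: mitochondrial ("mt-") and ribosomal ("Rpl"/"Rps") genes are located by prefix and their per-cell sums are compared with a +1 pseudocount.

import numpy as np

genes = np.array(["mt-Nd1", "Rpl3", "Rps6", "Actb", "mt-Co1"])  # toy gene names
counts = np.array([[5, 0, 2],
                   [10, 8, 1],
                   [4, 6, 0],
                   [7, 3, 2],
                   [1, 0, 3]])  # genes x cells, toy values
mito = np.where(np.char.startswith(genes, "mt-"))[0]
ribo = np.union1d(np.where(np.char.startswith(genes, "Rpl"))[0],
                  np.where(np.char.startswith(genes, "Rps"))[0])
ratio = (counts[mito, :].sum(axis=0) + 1) / (counts[ribo, :].sum(axis=0) + 1)
print(ratio)  # one ratio per cell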
Example #23
 def output(self) -> luigi.Target:
     return luigi.LocalTarget(
         os.path.join(am.paths().build, f"L6_R{self.rank}_exported"))