def requires(self) -> Iterator[luigi.Task]:
    """
    Yield the L2 cluster/aggregate task pairs that this target depends on.

    Every file in ``<build>/curated_L2`` whose name starts with ``L2`` is read
    as a tab-separated schedule with five columns (cluster, n_cells,
    auto_target, curated_target, comment). The tissue is the third
    underscore-separated token of the file name. For each tissue that has at
    least one row whose curated target equals ``self.target``, a single list
    ``[ClusterL2, AggregateL2]`` is yielded (at most once per tissue).

    Raises:
        ValueError: if a non-blank schedule row does not have exactly five fields.
    """
    curated_dir = os.path.join(am.paths().build, "curated_L2")
    tissues_done = set()
    for fname in os.listdir(curated_dir):
        if not fname.startswith("L2"):
            continue
        tissue = fname.split("_")[2]
        with open(os.path.join(curated_dir, fname)) as f:
            # Strip only the line terminator: slicing off the last character
            # would corrupt a final line that has no trailing newline.
            # Blank lines are skipped instead of failing the unpack below.
            schedule = [line.rstrip("\n").split("\t") for line in f if line.strip()]
        for (_cluster, _n_cells, _auto_target, curated_target, _comment) in schedule:
            if curated_target != self.target or tissue in tissues_done:
                continue
            if tissue == "All":
                yield [
                    am.ClusterL2(tissue="All", major_class=self.target),
                    am.AggregateL2(tissue="All", major_class=self.target)
                ]
            else:
                # NOTE(review): per-tissue files are always requested with
                # major_class="Neurons" - confirm this is intended.
                yield [
                    am.ClusterL2(tissue=tissue, major_class="Neurons"),
                    am.AggregateL2(tissue=tissue, major_class="Neurons")
                ]
            tissues_done.add(tissue)
def output(self) -> luigi.LocalTarget:
    """
    Return the local target of this sample's loom file.

    With ``am.paths().use_velocyto`` set, the file is
    ``<samples>/<sample>/velocyto/<sample>.loom``. Otherwise
    ``<samples>/<sample>/<sample>.loom`` is used when it exists on disk,
    falling back to ``<samples>/<sample>.loom``.
    """
    loom_name = self.sample + ".loom"
    if am.paths().use_velocyto:
        velocyto_path = os.path.join(am.paths().samples, self.sample, "velocyto", loom_name)
        return luigi.LocalTarget(velocyto_path)

    nested_path = os.path.join(am.paths().samples, self.sample, loom_name)
    if os.path.exists(nested_path):
        return luigi.LocalTarget(nested_path)

    flat_path = os.path.join(am.paths().samples, loom_name)
    return luigi.LocalTarget(flat_path)
def requires(self) -> Iterator[luigi.Task]:
    """
    Yield all L6 export tasks followed by the L5 export task.

    Reads ``<build>/curated_L4/Taxonomy.xlsx`` and yields one ``ExportL6``
    per distinct value in the TaxonomyRank1..3 columns, then one
    ``ExportByTaxonL6`` for each of ranks 1-4, and finally ``ExportL5``.
    """
    xlsx_path = os.path.join(am.paths().build, "curated_L4", "Taxonomy.xlsx")
    table = pd.read_excel(xlsx_path)
    # Column name -> array of that column's values
    columns = {
        name: table.values[:, col]
        for col, name in enumerate(table.columns.values)
    }

    # Per-taxon exports are requested for ranks 1-3 only (not for rank 4)
    ranked_columns = (
        (1, "TaxonomyRank1"),
        (2, "TaxonomyRank2"),
        (3, "TaxonomyRank3"),
    )
    for rank, column in ranked_columns:
        for taxon in set(columns[column]):
            yield am.ExportL6(rank=rank, taxon=taxon)

    for rank in (1, 2, 3, 4):
        yield am.ExportByTaxonL6(rank=rank)

    yield am.ExportL5()
def run(self) -> None:
    """
    Retrain the classifier and pickle it to this task's output.

    Builds a ``cg.Classifier`` over ``<samples>/classified``, generates its
    training data, fits it on ``classified.loom`` from that folder, pickles
    it to the temporary output path, reloads the pickle as a sanity check
    and finally calls ``aggregate_export()`` on the reloaded classifier.
    """
    with self.output().temporary_path() as fname:
        logging.info("Retraining classifier")
        pathname = os.path.join(am.paths().samples, "classified")
        clf = cg.Classifier(pathname, n_per_cluster=100)
        clf.generate()
        # Close the training file as soon as fitting is done; the original
        # connection was never closed.
        with loompy.connect(os.path.join(pathname, "classified.loom")) as ds_training:
            clf.fit(ds_training)
        with open(fname, "wb") as f:
            pickle.dump(clf, f)
        # Verify that it works (to catch some obscure intermittent UnicodeDecodeError)
        with open(fname, "rb") as f:
            clf = pickle.load(f)
        clf.aggregate_export()
def run(self) -> None:
    """
    Aggregate the input loom file by cluster and annotate the result.

    The aggregated file is written to the temporary output path, then
    auto-annotation is computed and stored, and the auto-auto-annotation
    results are saved as the column attributes ``MarkerGenes``,
    ``MarkerSelectivity``, ``MarkerSpecificity`` and ``MarkerRobustness``.
    """
    logging = cg.logging(self)
    with self.output().temporary_path() as out_file:
        logging.info("Aggregating loom file")
        with loompy.connect(self.input().fn) as ds:
            cg.Aggregator(self.n_markers).aggregate(ds, out_file)
            with loompy.connect(out_file) as dsagg:
                for ix, score in enumerate(dsagg.col_attrs["ClusterScore"]):
                    logging.info(f"Cluster {ix} score {score:.1f}")

                logging.info("Computing auto-annotation")
                annotator = cg.AutoAnnotator(root=am.paths().autoannotation)
                annotator.annotate_loom(dsagg)
                annotator.save_in_loom(dsagg)

                logging.info("Computing auto-auto-annotation")
                n_clusters = dsagg.shape[1]
                auto_auto = cg.AutoAutoAnnotator(n_genes=self.n_auto_genes)
                (selected, selectivity, specificity, robustness) = auto_auto.fit(dsagg)

                # One space-separated list of marker gene names per cluster
                marker_names = [
                    " ".join(ds.ra.Gene[selected[:, ix]])
                    for ix in np.arange(n_clusters)
                ]
                dsagg.set_attr("MarkerGenes", np.array(marker_names), axis=1)

                # The statistics are stored as the printed form of each column,
                # so the print options determine what ends up in the file
                np.set_printoptions(precision=1, suppress=True)
                per_cluster_stats = (
                    ("MarkerSelectivity", selectivity),
                    ("MarkerSpecificity", specificity),
                    ("MarkerRobustness", robustness),
                )
                for attr_name, stats in per_cluster_stats:
                    printed = [str(stats[:, ix]) for ix in np.arange(n_clusters)]
                    dsagg.set_attr(attr_name, np.array(printed), axis=1)
def run(self) -> None:
    """
    Export tables and plots for the L1 clustering of ``self.tissue``.

    Input 0 is the aggregated loom file and input 1 the per-cell loom file.
    Auto-annotation is recomputed and saved in the aggregated file, its main
    matrix and the ``enrichment``, ``enrichment_q`` and ``trinaries`` layers
    are exported as ``.tab`` files, and the KNN, class, annotation and marker
    heatmap plots are written, all into the temporary output directory with
    the file name prefix ``L1_<tissue>``.
    """
    logging = cg.logging(self, True)
    logging.info("Exporting cluster data")
    with self.output().temporary_path() as out_dir:
        os.makedirs(out_dir, exist_ok=True)
        prefix = os.path.join(out_dir, "L1_" + self.tissue)
        with loompy.connect(self.input()[0].fn) as dsagg:
            logging.info("Computing auto-annotation")
            aa = cg.AutoAnnotator(root=am.paths().autoannotation)
            aa.annotate_loom(dsagg)
            aa.save_in_loom(dsagg)

            dsagg.export(prefix + "_expression.tab")
            for layer in ("enrichment", "enrichment_q", "trinaries"):
                dsagg.export(prefix + "_" + layer + ".tab", layer=layer)

            # Use a context manager so the per-cell file is closed even when
            # a plot fails; the original connection was never closed.
            with loompy.connect(self.input()[1].fn) as ds:
                logging.info("Plotting MKNN graph")
                cg.plot_knn(ds, prefix + "_manifold.mknn.png")

                logging.info("Plotting manifold graph with classes")
                try:
                    cg.plot_classes(ds, prefix + "_manifold.classes.png")
                except Exception as e:
                    # Still best-effort (the export continues), but the
                    # failure is now recorded instead of silently dropped.
                    logging.info(f"Skipping manifold graph with classes: {e!r}")

                logging.info("Plotting manifold graph with auto-annotation")
                tags = list(dsagg.col_attrs["AutoAnnotation"])
                cg.plot_graph(ds, prefix + "_manifold.aa.png", tags)

                logging.info("Plotting manifold graph with auto-auto-annotation")
                tags = list(dsagg.col_attrs["MarkerGenes"])
                cg.plot_graph(ds, prefix + "_manifold.aaa.png", tags)

                logging.info("Plotting marker heatmap")
                cg.plot_markerheatmap(ds, dsagg, n_markers_per_cluster=self.n_markers, out_file=prefix + "_heatmap.pdf")
def output(self) -> luigi.Target:
    """Return the local target ``L6_R<rank>_(<taxon>).loom`` in the build directory."""
    build_dir = am.paths().build
    loom_name = f"L6_R{self.rank}_({self.taxon}).loom"
    target_path = os.path.join(build_dir, loom_name)
    return luigi.LocalTarget(target_path)
def output(self) -> luigi.Target:
    """
    Return the local target for this parameter combination.

    The name, placed in the build directory, starts with ``L1_<tissue>`` and
    encodes every task parameter as ``_<name>=<value>``.
    """
    build_dir = am.paths().build
    # Adjacent f-strings are concatenated into the single target name
    name = (
        f"L1_{self.tissue}"
        f"_nfactors={self.n_factors}"
        f"_k={self.k}"
        f"_ksmoothing={self.k_smoothing}"
        f"_a={self.a}"
        f"_b={self.b}"
        f"_c={self.c}"
        f"_d={self.d}"
        f"_log={self.log}"
        f"_normalize={self.normalize}"
        f"_accel={self.accel}"
    )
    return luigi.LocalTarget(os.path.join(build_dir, name))
def output(self) -> luigi.Target:
    """Return the local target ``L3_<target>_exported`` in the build directory."""
    build_dir = am.paths().build
    export_name = "L3_" + self.target + "_exported"
    export_path = os.path.join(build_dir, export_name)
    return luigi.LocalTarget(export_path)
def run(self) -> None:
    """
    Export tables, plots and cluster distances for the L3 target.

    Input 0 is the aggregated loom file and input 1 the per-cell loom file.
    Auto-annotation is recomputed and saved in the aggregated file, its main
    matrix and the ``enrichment``, ``enrichment_q`` and ``trinaries`` layers
    are exported as ``.tab`` files, the annotation plots and marker heatmap
    are drawn, and the discordance distance between consecutive clusters is
    written to ``L3_<target>_distances.txt``. Everything goes into the
    temporary output directory.
    """
    logging = cg.logging(self)
    with self.output().temporary_path() as out_dir:
        logging.info("Exporting cluster data")
        os.makedirs(out_dir, exist_ok=True)
        prefix = os.path.join(out_dir, "L3_" + self.target)
        # Context managers guarantee both files are closed; the original
        # connections were never closed.
        with loompy.connect(self.input()[0].fn) as dsagg:
            logging.info("Computing auto-annotation")
            aa = cg.AutoAnnotator(root=am.paths().autoannotation)
            aa.annotate_loom(dsagg)
            aa.save_in_loom(dsagg)

            dsagg.export(prefix + "_expression.tab")
            for layer in ("enrichment", "enrichment_q", "trinaries"):
                dsagg.export(prefix + "_" + layer + ".tab", layer=layer)

            # Tags are passed to the plots ordered by cluster ID
            cluster_order = np.argsort(dsagg.col_attrs["Clusters"])
            with loompy.connect(self.input()[1].fn) as ds:
                logging.info("Plotting manifold graph with auto-annotation")
                tags = list(dsagg.col_attrs["AutoAnnotation"][cluster_order])
                cg.plot_graph(ds, prefix + "_manifold.aa.png", tags)

                logging.info("Plotting manifold graph with auto-auto-annotation")
                tags = list(dsagg.col_attrs["MarkerGenes"][cluster_order])
                cg.plot_graph(ds, prefix + "_manifold.aaa.png", tags)

                logging.info("Plotting marker heatmap")
                cg.plot_markerheatmap(ds, dsagg, n_markers_per_cluster=self.n_markers, out_file=prefix + "_heatmap.pdf")

            logging.info("Computing discordance distances")
            pep = 0.05
            n_labels = dsagg.shape[1]

            def discordance_distance(a: np.ndarray, b: np.ndarray) -> float:
                """
                Number of genes that are discordant with given PEP, divided by number of clusters
                """
                return np.sum((1 - a) * b + a * (1 - b) > 1 - pep) / n_labels

            # Only the first 10 * n_labels rows of the trinaries layer are compared
            data = dsagg.layer["trinaries"][:n_labels * 10, :].T
            D = squareform(pdist(data, discordance_distance))
            with open(prefix + "_distances.txt", "w") as f:
                # First off-diagonal: distance between each cluster and the next
                f.write(str(np.diag(D, k=1)))
def run(self) -> None:
    """
    Pool the curated L4 files into one loom file and attach curated metadata.

    For every input file the bucket name is taken from its base name
    (``L4_<bucket>.loom``). Cells whose cluster is listed for that bucket in
    ``curated_L4/celltypes_summary_leaforder16-Dec-2017.xlsx`` are appended
    to the output file. The pooled file then receives:

    * ``Clusters`` (label-encoded ``ClusterName``), ``OriginalClusters``,
      ``Bucket`` (path of the source file) and ``LeafOrder``;
    * the descriptive columns of the cell type summary;
    * ``TaxonomyRank1..4`` and ``TaxonomySymbol`` from
      ``curated_L4/Taxonomy.xlsx``, looked up by ``Taxonomy_group``;
    * a recomputed ``_Valid`` gene flag, the KNN/MKNN graphs and the
      embedding coordinates ``_X``/``_Y``.

    Cells that match no row of the summary keep ``Clusters == -1``.
    """
    logging = cg.logging(self)
    samples = [x.fn for x in self.input()]
    max_cluster_id = 0
    cluster_ids: List[int] = []
    original_ids: List[int] = []
    samples_per_cell: List[str] = []

    celltypes_summary_file = os.path.join(am.paths().build, "curated_L4", "celltypes_summary_leaforder16-Dec-2017.xlsx")
    celltypes_summary = pd.read_excel(celltypes_summary_file)
    # Column name -> array of that column's values
    celltypes_dict = {
        name: celltypes_summary.values[:, i]
        for i, name in enumerate(celltypes_summary.columns.values)
    }

    # Bucket name -> path of the input file it came from. This replaces a
    # hard-coded, machine-specific path ("/Users/sten/build_20171205/...")
    # that only matched the inputs on the original author's machine.
    sample_by_bucket = {os.path.basename(sample)[3:-5]: sample for sample in samples}

    with self.output().temporary_path() as out_file:
        for sample in samples:
            with loompy.connect(sample) as ds:
                logging.info(f"Adding {ds.shape[1]} cells from {sample}")
                target = os.path.basename(sample)[3:-5]
                not_excluded = celltypes_dict["OriginalCluster"][celltypes_dict["Bucket"] == target]
                cells = np.where(np.isin(ds.ca.Clusters, not_excluded))[0]
                for (ix, selection, view) in ds.scan(items=cells, axis=1, key="Accession"):
                    cluster_ids += list(view.ca.Clusters + max_cluster_id)
                    original_ids += list(view.ca.Clusters)
                    samples_per_cell += [sample] * selection.shape[0]
                    loompy.create_append(out_file, view.layers, view.ra, view.ca, fill_values="auto")
                # Guard against max() of an empty list when nothing has been
                # selected so far
                if cluster_ids:
                    max_cluster_id = max(cluster_ids) + 1
                logging.info(f"Found {max_cluster_id} clusters total")

        with loompy.connect(out_file) as ds:
            bucket_per_cell = np.array(samples_per_cell)
            original_per_cell = np.array(original_ids)
            ds.ca.Clusters = np.array(cluster_ids)
            ds.ca.OriginalClusters = original_per_cell
            ds.ca.Bucket = bucket_per_cell

            n_cells = ds.shape[1]
            leaf_order = np.zeros(n_cells, dtype='int') - 1
            le = LabelEncoder()
            le.fit(celltypes_dict["ClusterName"])
            new_clusters = np.zeros(n_cells, dtype='int') - 1
            attr_names = [
                "LeafOrder", "Probable_location", "Developmental_compartment",
                "Region", "Description", "Location_based_on", "Neurotransmitter",
                "ClusterName", "Taxonomy_group", "Comment"
            ]
            d = {attr: np.array([""] * n_cells, dtype=object) for attr in attr_names}

            for ix, bucket in enumerate(celltypes_dict["Bucket"]):
                bucket_name = sample_by_bucket.get(str(bucket))
                if bucket_name is None:
                    # No input file for this bucket, hence no cells to label
                    continue
                original_cluster = celltypes_dict["OriginalCluster"][ix]
                cells = np.logical_and(bucket_per_cell == bucket_name, original_per_cell == original_cluster)
                leaf_order[cells] = celltypes_dict["LeafOrder"][ix]
                new_clusters[cells] = le.transform([celltypes_dict["ClusterName"][ix]])
                for attr in d:
                    d[attr][cells] = celltypes_dict[attr][ix]

            logging.info(f"Found {new_clusters.max() + 1} clusters total")
            ds.ca.Clusters = new_clusters
            ds.ca.LeafOrder = leaf_order
            for key, vals in d.items():
                ds.ca[key] = vals.astype("unicode")

            taxonomy_file = os.path.join(am.paths().build, "curated_L4", "Taxonomy.xlsx")
            taxonomy_table = pd.read_excel(taxonomy_file)
            # Fourth column of the table -> full table row
            taxonomy = {
                taxonomy_table.values[i, 3]: taxonomy_table.values[i, :]
                for i in range(taxonomy_table.shape[0])
            }
            # Row positions 0-3 hold ranks 1-4, position 4 the symbol
            tax_columns = [np.array([""] * n_cells, dtype=object) for _ in range(5)]
            # Only labelled cells get a taxonomy; new_clusters is exactly what
            # was stored in ds.ca.Clusters above, so the attribute need not be
            # read back once per cell.
            for i in np.where(new_clusters != -1)[0]:
                row = taxonomy[d["Taxonomy_group"][i]]
                for position, column in enumerate(tax_columns):
                    column[i] = row[position]
            ds.ca.TaxonomyRank1 = tax_columns[0]
            ds.ca.TaxonomyRank2 = tax_columns[1]
            ds.ca.TaxonomyRank3 = tax_columns[2]
            ds.ca.TaxonomyRank4 = tax_columns[3]
            ds.ca.TaxonomySymbol = tax_columns[4]

            logging.info("Recomputing the list of valid genes")
            nnz = ds.map([np.count_nonzero], axis=0)[0]
            # Valid genes are nonzero in more than 10 cells but fewer than 60% of cells
            valid_genes = np.logical_and(nnz > 10, nnz < ds.shape[1] * 0.6)
            ds.ra._Valid = valid_genes.astype('int')

            logging.info("Learning the manifold")
            ml = cg.ManifoldLearning2(gtsne=True, alpha=1, max_iter=3000)
            (knn, mknn, tsne) = ml.fit(ds)
            ds.col_graphs.KNN = knn
            ds.col_graphs.MKNN = mknn
            ds.ca._X = tsne[:, 0]
            ds.ca._Y = tsne[:, 1]
def output(self) -> luigi.Target:
    """Return the local target ``L2_<major_class>_<tissue>_exported`` in the build directory."""
    build_dir = am.paths().build
    export_name = "L2_" + self.major_class + "_" + self.tissue + "_exported"
    export_path = os.path.join(build_dir, export_name)
    return luigi.LocalTarget(export_path)
def output(self) -> luigi.Target:
    """Return the local target ``classified/classifier.pickle`` under the samples directory."""
    samples_dir = am.paths().samples
    pickle_path = os.path.join(samples_dir, "classified", "classifier.pickle")
    return luigi.LocalTarget(pickle_path)
def output(self) -> luigi.Target:
    """Return the local target ``L5_All.loom`` in the build directory."""
    build_dir = am.paths().build
    loom_path = os.path.join(build_dir, "L5_All.loom")
    return luigi.LocalTarget(loom_path)
def run(self) -> None:
    """
    Aggregate the input loom file by cluster, annotate it and assign buckets.

    After aggregation the auto-annotation and auto-auto-annotation results
    are stored in the aggregated file. Each cluster then gets a ``Bucket``
    column attribute:

    * for tissue ``"All"`` every cluster goes to ``self.major_class``;
    * otherwise the rules in ``pooling_schedule_L3[self.tissue]`` (pairs of
      auto-annotation tag and destination) are tried in order. The first
      rule whose tag is among the cluster's comma-separated annotation tags
      wins, and later matches are only logged as overruled. Clusters without
      any match go to the destination of the ``"*"`` rule if there is one,
      else to ``"Excluded"``.
    """
    logging = cg.logging(self)
    with self.output().temporary_path() as out_file:
        logging.info("Aggregating loom file")
        with loompy.connect(self.input().fn) as ds:
            cg.Aggregator().aggregate(ds, out_file)
            with loompy.connect(out_file) as dsagg:
                for ix, score in enumerate(dsagg.col_attrs["ClusterScore"]):
                    logging.info(f"Cluster {ix} score {score:.1f}")

                logging.info("Computing auto-annotation")
                aa = cg.AutoAnnotator(root=am.paths().autoannotation)
                aa.annotate_loom(dsagg)
                aa.save_in_loom(dsagg)

                logging.info("Computing auto-auto-annotation")
                n_clusters = dsagg.shape[1]
                (selected, selectivity, specificity, robustness) = cg.AutoAutoAnnotator(n_genes=self.n_auto_genes).fit(dsagg)
                dsagg.set_attr("MarkerGenes", np.array([" ".join(ds.ra.Gene[selected[:, ix]]) for ix in np.arange(n_clusters)]), axis=1)
                # The statistics are stored as the printed form of each column
                np.set_printoptions(precision=1, suppress=True)
                dsagg.set_attr("MarkerSelectivity", np.array([str(selectivity[:, ix]) for ix in np.arange(n_clusters)]), axis=1)
                dsagg.set_attr("MarkerSpecificity", np.array([str(specificity[:, ix]) for ix in np.arange(n_clusters)]), axis=1)
                dsagg.set_attr("MarkerRobustness", np.array([str(robustness[:, ix]) for ix in np.arange(n_clusters)]), axis=1)

                tissue = self.tissue
                # Compare by value: `is "All"` tests object identity, which is
                # not guaranteed to hold for equal strings.
                if tissue == "All":
                    dsagg.ca.Bucket = np.array([self.major_class] * dsagg.shape[1])
                else:
                    schedule = pooling_schedule_L3[tissue]

                    # Where to send clusters when no rules match
                    default_target = None
                    for aa_tag, sendto in schedule:
                        if aa_tag == "*":
                            default_target = sendto

                    # Clusters for which a rule has fired -> description of that rule
                    clusters_seen: Dict[int, str] = {}
                    bucket_list = []
                    # For each cluster in the tissue
                    for ix, agg_aa in enumerate(dsagg.ca.AutoAnnotation):
                        cluster_tags = agg_aa.split(",")
                        # For each rule in the schedule
                        for aa_tag, sendto in schedule:
                            if aa_tag not in cluster_tags:
                                continue
                            if ix in clusters_seen:
                                logging.info(f"{tissue}/{ix}/{agg_aa}: {aa_tag} -> {sendto} (overruled by '{clusters_seen[ix]}')")
                            else:
                                clusters_seen[ix] = f"{aa_tag} -> {sendto}"
                                logging.info(f"{tissue}/{ix}/{agg_aa}: {aa_tag} -> {sendto}")
                                bucket_list.append(sendto)
                        if ix not in clusters_seen:
                            if default_target is None:
                                logging.info(f"{tissue}/{ix}/{agg_aa}: No matching rule")
                                bucket_list.append("Excluded")
                            else:
                                # Report the wildcard rule itself; the loop
                                # variable left over from the rule loop would
                                # name whichever rule happened to come last.
                                clusters_seen[ix] = f"* -> {default_target}"
                                logging.info(f"{tissue}/{ix}/{agg_aa}: * -> {default_target}")
                                bucket_list.append(default_target)
                    dsagg.ca.Bucket = np.array(bucket_list)
def run(self) -> None:
    """
    Aggregate the input loom file by cluster using an explicit attribute spec.

    ``spec`` maps each column attribute to the aggregation mode passed to
    ``cg.Aggregator.aggregate`` (``tally``, ``first``, ``mode`` or ``mean``).
    The aggregated file then receives the auto-annotation and the
    auto-auto-annotation attributes ``MarkerGenes``, ``MarkerSelectivity``,
    ``MarkerSpecificity`` and ``MarkerRobustness``.
    """
    logging = cg.logging(self)
    spec = {
        "Age": "tally",
        "Clusters": "first",
        "Class": "mode",
        "_Total": "mean",
        "Sex": "tally",
        "Tissue": "tally",
        "SampleID": "tally",
        "TissuePool": "first",
        "Outliers": "mean",
        "Bucket": "mode",
        "Region": "first",
        "OriginalClusters": "first",
        "Probable_location": "first",
        "Developmental_compartment": "first",
        "Description": "first",
        "Location_based_on": "first",
        "Neurotransmitter": "first",
        "LeafOrder": "first",
        "Comment": "first",
        "ClusterName": "first",
        "TaxonomyRank1": "first",
        "TaxonomyRank2": "first",
        "TaxonomyRank3": "first",
        "TaxonomyRank4": "first",
        "TaxonomySymbol": "first"
    }
    with self.output().temporary_path() as out_file:
        logging.info("Aggregating loom file")
        # Context managers close both files even if a step raises. The
        # original never closed the input and closed the output only on success.
        with loompy.connect(self.input().fn) as ds:
            cg.Aggregator(f=[0.2, 0.05]).aggregate(ds, out_file, agg_spec=spec)
            with loompy.connect(out_file) as dsagg:
                logging.info("Computing auto-annotation")
                aa = cg.AutoAnnotator(root=am.paths().autoannotation)
                aa.annotate_loom(dsagg)
                aa.save_in_loom(dsagg)

                logging.info("Computing auto-auto-annotation")
                n_clusters = dsagg.shape[1]
                (selected, selectivity, specificity, robustness) = cg.AutoAutoAnnotator(n_genes=self.n_auto_genes).fit(dsagg)
                dsagg.set_attr("MarkerGenes", np.array([" ".join(ds.ra.Gene[selected[:, ix]]) for ix in np.arange(n_clusters)]), axis=1)
                # The statistics are stored as the printed form of each column
                np.set_printoptions(precision=1, suppress=True)
                dsagg.set_attr("MarkerSelectivity", np.array([str(selectivity[:, ix]) for ix in np.arange(n_clusters)]), axis=1)
                dsagg.set_attr("MarkerSpecificity", np.array([str(specificity[:, ix]) for ix in np.arange(n_clusters)]), axis=1)
                dsagg.set_attr("MarkerRobustness", np.array([str(robustness[:, ix]) for ix in np.arange(n_clusters)]), axis=1)
def output(self) -> luigi.Target:
    """Return the local target ``L0_<tissue>.loom`` in the build directory."""
    build_dir = am.paths().build
    loom_name = "L0_" + self.tissue + ".loom"
    loom_path = os.path.join(build_dir, loom_name)
    return luigi.LocalTarget(loom_path)
def run(self) -> None:
    """
    Pool the cells assigned to ``self.target`` into one file, then embed and cluster them.

    Each input is a pair (per-cell loom file, aggregated loom file). For
    every pair, the schedule files in ``<build>/curated_L2`` belonging to
    the same tissue are read as tab-separated rows (cluster, n_cells,
    auto_target, curated_target, comment). For tissue ``"All"`` only the
    file whose major class equals the target is used. Cells of every cluster
    whose curated target equals ``self.target`` are appended to the output.

    Genes for manifold learning are picked round-robin from the
    enrichment-ranked gene lists of the selected clusters until
    ``self.n_enriched`` genes have been collected. The pooled file gets the
    KNN/MKNN graphs, embedding coordinates ``_X``/``_Y``, ``Clusters`` and
    ``Outliers``.

    Raises:
        ValueError: if no cells match any schedule for the target, or a
            non-blank schedule row does not have exactly five fields.
    """
    logging = cg.logging(self, True)
    # Gene accessions of the first contributing file; rows of later files
    # are reordered to match
    accessions = None  # type: np.ndarray
    curated_dir = os.path.join(am.paths().build, "curated_L2")
    with self.output().temporary_path() as out_file:
        logging.info("Gathering cells for " + self.target)
        enriched_markers: List[np.ndarray] = []  # The enrichment vector for each selected cluster
        cells_found = False
        for in_file, agg_file in self.input():
            tissue = os.path.basename(in_file.fn).split("_")[2].split(".")[0]
            # Close each pair of files before moving on; the original left
            # every connection open.
            with loompy.connect(in_file.fn) as ds, loompy.connect(agg_file.fn) as dsagg:
                enrichment = dsagg.layer["enrichment"][:, :]
                labels = ds.col_attrs["Clusters"]
                ordering: np.ndarray = None
                logging.info(tissue)

                # Figure out which cells should be collected
                cells: List[int] = []
                for fname in os.listdir(curated_dir):
                    if not fname.startswith("L2"):
                        continue
                    from_tissue = fname.split("_")[2]
                    if from_tissue != tissue:
                        continue
                    if tissue == "All":
                        major_class = fname.split("_")[1]
                        if major_class != self.target:
                            continue
                    logging.info("Gathering cells from " + in_file.fn)
                    logging.info("Gathering cells based on " + fname)
                    with open(os.path.join(curated_dir, fname)) as f:
                        # Strip only the newline (a final line without one
                        # would otherwise lose a character); skip blank lines
                        schedule = [line.rstrip("\n").split("\t") for line in f if line.strip()]
                    for (cluster_str, _n_cells, _auto_target, curated_target, _comment) in schedule:
                        cluster = int(cluster_str)
                        if curated_target != self.target:
                            continue
                        if accessions is None:
                            accessions = ds.row_attrs["Accession"]
                        if ordering is None:
                            # Row index in this file of each reference accession
                            ordering = np.where(ds.row_attrs["Accession"][None, :] == accessions[:, None])[1]
                        cells += list(np.where(labels == cluster)[0])
                        # Gene indices sorted by decreasing enrichment in this cluster
                        enriched_markers.append(np.argsort(-enrichment[:, cluster][ordering]))

                if len(cells) > 0:
                    cells = np.sort(np.array(cells))
                    cells_found = True
                    for (ix, selection, view) in ds.scan(items=cells, axis=1, key="Accession"):
                        loompy.create_append(out_file, view.layers, view.ra, view.ca)

        if not cells_found:
            raise ValueError(f"No cells matched any schedule for {self.target}")

        # Figure out which enriched markers to use: take the best gene of
        # every cluster, then the second best, and so on. The rank is bounded
        # by the shortest list so the loop cannot index past the end when
        # fewer than n_enriched distinct genes are available.
        n_ranked = min(len(markers) for markers in enriched_markers)
        ix = 0
        temp: List[int] = []
        chosen = set()
        while len(temp) < self.n_enriched and ix < n_ranked:
            for markers in enriched_markers:
                gene = markers[ix]
                if gene not in chosen:
                    chosen.add(gene)
                    temp.append(gene)
            ix += 1
        genes = np.sort(np.array(temp))

        logging.info("Learning the manifold")
        with loompy.connect(out_file) as dsout:
            ml = cg.ManifoldLearning2(gtsne=True, alpha=1, genes=genes)
            (knn, mknn, tsne) = ml.fit(dsout)
            dsout.col_graphs.KNN = knn
            dsout.col_graphs.MKNN = mknn
            dsout.ca._X = tsne[:, 0]
            dsout.ca._Y = tsne[:, 1]

            logging.info("Clustering on the manifold")
            # Targets clustered at a non-default resolution
            special_res = {
                "Astrocytes": 0.6,
                "Sensory_Neurons": 0.35,
                "Brain_Granule": 0.6
            }
            r = special_res.get(self.target, 1.0)
            pl = cg.PolishedLouvain(resolution=r)
            labels = pl.fit_predict(dsout)
            # Label -1 marks outliers; they are stored as cluster 0 and
            # flagged in the Outliers attribute
            dsout.ca.Clusters = labels + 1
            dsout.ca.Outliers = (labels == -1).astype('int')
            logging.info(f"Found {labels.max() + 1} clusters")
def output(self) -> luigi.Target:
    """Return the local target ``L3_<target>.agg.loom`` in the build directory."""
    build_dir = am.paths().build
    loom_name = "L3_" + self.target + ".agg.loom"
    loom_path = os.path.join(build_dir, loom_name)
    return luigi.LocalTarget(loom_path)
def output(self) -> luigi.Target:
    """Return the local target ``F_Oligos`` in the build directory."""
    build_dir = am.paths().build
    figure_path = os.path.join(build_dir, "F_Oligos")
    return luigi.LocalTarget(figure_path)
def output(self) -> luigi.Target:
    """Return the local target ``F_Neurogenesis`` in the build directory."""
    build_dir = am.paths().build
    figure_path = os.path.join(build_dir, "F_Neurogenesis")
    return luigi.LocalTarget(figure_path)
def run(self) -> None:
    """
    Combine the sample files of ``self.tissue`` into one loom file and flag valid cells and genes.

    Per sample, the column sums and nonzero counts are stored as ``Total``
    and ``NGenes``, a validity flag is computed (sum >= 600 and
    sum / nonzero count >= 1.2), and ``MitoRiboRatio`` is stored when genes
    starting with ``mt-`` and with ``Rpl``/``Rps`` are both present. The
    samples are then combined on ``Accession``.

    In the combined file the row attribute ``_Valid`` is taken from
    ``<build>/genes.txt`` when that file exists (tab-separated accession and
    integer flag per line), otherwise genes nonzero in more than 10 cells
    and fewer than 60% of cells are valid. The column attribute ``_Valid``
    holds the per-sample cell flags. If ``classified/classifier.pickle``
    exists under the samples directory, cells are classified and ``Class``,
    ``Subclass`` and ``ClassProbability_*`` are stored, with invalid cells
    and unmapped classes set to ``"Excluded"``. Otherwise ``Class`` and
    ``Subclass`` are ``"Unknown"``.

    Raises:
        KeyError: if the classifier predicts a class missing from the mapping.
    """
    # Sort once and use the same order for validation and for combining, so
    # the concatenated per-sample flags line up with the combined columns.
    # The original validated in sorted order but combined in input order.
    sample_files = sorted(s.fn for s in self.input())
    with self.output().temporary_path() as out_file:
        attrs = {"title": self.tissue}
        valid_cells = []
        for sample in sample_files:
            # Connect and perform file-specific cell validation
            with loompy.connect(sample) as ds:
                logging.info("Marking invalid cells")
                (mols, genes) = ds.map([np.sum, np.count_nonzero], axis=1)
                valid_cells.append(np.logical_and(mols >= 600, (mols / genes) >= 1.2).astype('int'))
                ds.ca.Total = mols
                ds.ca.NGenes = genes

                logging.info("Computing mito/ribo ratio for " + sample)
                mito = np.where(npstr.startswith(ds.ra.Gene, "mt-"))[0]
                ribo = np.where(npstr.startswith(ds.ra.Gene, "Rpl"))[0]
                ribo = np.union1d(ribo, np.where(npstr.startswith(ds.ra.Gene, "Rps"))[0])
                if len(ribo) > 0 and len(mito) > 0:
                    mitox = ds[mito, :]
                    ribox = ds[ribo, :]
                    # +1 on both sides avoids division by zero
                    ratio = (mitox.sum(axis=0) + 1) / (ribox.sum(axis=0) + 1)
                    ds.ca.MitoRiboRatio = ratio

        logging.info("Creating combined loom file")
        loompy.combine(sample_files, out_file, key="Accession", file_attrs=attrs)

        # Validating genes
        logging.info("Marking invalid genes")
        with loompy.connect(out_file) as ds:
            vgpath = os.path.join(am.paths().build, "genes.txt")
            if os.path.exists(vgpath):
                valids = np.zeros(ds.shape[0])
                accession = ds.Accession
                with open(vgpath, "r") as f:
                    # Read every line; the original read only the first one
                    for line in f:
                        if not line.strip():
                            continue
                        items = line.rstrip("\n").split("\t")
                        valids[np.where(accession == items[0])] = int(items[1])
                # Same attribute name as the computed branch below; the
                # original wrote "_Valids" here. NOTE(review): confirm no
                # consumer outside this file reads "_Valids".
                ds.set_attr("_Valid", valids, axis=0)
            else:
                nnz = ds.map([np.count_nonzero], axis=0)[0]
                valid_genes = np.logical_and(nnz > 10, nnz < ds.shape[1] * 0.6)
                ds.set_attr("_Valid", valid_genes, axis=0)

            logging.info("Marking invalid cells")
            ds.set_attr("_Valid", np.concatenate(valid_cells), axis=1)
            n_valid = np.sum(ds.col_attrs["_Valid"] == 1)
            n_total = ds.shape[1]
            logging.info("%d of %d cells were valid", n_valid, n_total)

            classifier_path = os.path.join(am.paths().samples, "classified", "classifier.pickle")
            if os.path.exists(classifier_path):
                logging.info("Classifying cells by major class")
                # NOTE(review): unpickling executes arbitrary code; this is
                # only safe while the pickle is a trusted local build artifact.
                with open(classifier_path, "rb") as f:
                    clf = pickle.load(f)  # type: cg.Classifier
                np.random.seed(13)
                (classes, probs, class_labels) = clf.predict(ds, probability=True)

                # Predicted subclass -> pooled major class (None = excluded)
                mapping = {
                    "Astrocyte": "Astrocytes",
                    "Astrocyte,Cycling": "Astrocytes",
                    "Astrocyte,Immune": None,
                    "Astrocyte,Neurons": None,
                    "Astrocyte,Oligos": None,
                    "Astrocyte,Vascular": None,
                    "Bergmann-glia": "Astrocytes",
                    "Blood": "Blood",
                    "Blood,Cycling": "Blood",
                    "Blood,Vascular": None,
                    "Enteric-glia": "PeripheralGlia",
                    "Enteric-glia,Cycling": "PeripheralGlia",
                    "Ependymal": "Ependymal",
                    "Ex-Astrocyte": None,
                    "Ex-Blood": None,
                    "Ex-Immune": None,
                    "Ex-Neurons": None,
                    "Ex-Oligos": None,
                    "Ex-Vascular": None,
                    "Immune": "Immune",
                    "Immune,Neurons": None,
                    "Immune,Oligos": None,
                    "Neurons": "Neurons",
                    "Neurons,Cycling": "Neurons",
                    "Neurons,Immune": None,
                    "Neurons,Oligos": None,
                    "Neurons,Satellite-glia": None,
                    "OEC": "Astrocytes",
                    "Oligos": "Oligos",
                    "Oligos,Cycling": "Oligos",
                    "Oligos,Immune": None,
                    "Oligos,Vascular": None,
                    "Satellite-glia": "PeripheralGlia",
                    "Satellite-glia,Cycling": "PeripheralGlia",
                    "Schwann": "PeripheralGlia",
                    "Schwann,Cycling": "PeripheralGlia",
                    "Satellite-glia,Schwann": None,
                    "Ttr": "Ependymal",
                    "Vascular": "Vascular",
                    "Vascular,Cycling": "Vascular",
                    "Neurons,Vascular": None,
                    "Vascular,Oligos": None,
                    "Satellite-glia,Vascular": None,
                    "Unknown": None,
                    "Outliers": None
                }

                classes_pooled = np.array([str(mapping[c]) for c in classes], dtype=np.object_)
                # mask invalid cells
                invalid = ds.col_attrs["_Valid"] == 0
                classes[invalid] = "Excluded"
                classes_pooled[invalid] = "Excluded"
                # str(None) above turns unmapped classes into "None"
                classes_pooled[classes_pooled == "None"] = "Excluded"
                ds.set_attr("Class", classes_pooled.astype('str'), axis=1)
                ds.set_attr("Subclass", classes.astype('str'), axis=1)
                for ix, cls in enumerate(class_labels):
                    ds.set_attr("ClassProbability_" + str(cls), probs[:, ix], axis=1)
            else:
                logging.info("No classifier found in this build directory - skipping.")
                ds.set_attr("Class", np.array(["Unknown"] * ds.shape[1]), axis=1)
                ds.set_attr("Subclass", np.array(["Unknown"] * ds.shape[1]), axis=1)
def output(self) -> luigi.Target:
    """Return the local target ``L6_R<rank>_exported`` in the build directory."""
    build_dir = am.paths().build
    export_name = f"L6_R{self.rank}_exported"
    export_path = os.path.join(build_dir, export_name)
    return luigi.LocalTarget(export_path)