def save_enriched_motifs(df, fname: str) -> None:
    """
    Save enriched motifs.

    Supported file formats are CSV, TSV, GMT, DAT (pickle), JSON or YAML.
    The format is selected from the suffixes of ``fname``.

    :param df: The enriched motifs. Written with ``to_csv`` for the tabular
        formats, converted with ``df2regulons`` for every other format.
    :param fname: The name of the file to write.
    :raises ValueError: When the suffixes match none of the supported formats.
    """
    suffixes = PurePath(fname).suffixes

    # Tabular output: the dataframe is written as is.
    if is_valid_suffix(suffixes, 'ctx'):
        df.to_csv(fname, sep=suffixes_to_separator(suffixes))
        return

    # All remaining formats store regulons instead of the raw table.
    regulons = df2regulons(df)
    if '.json' in suffixes:
        # JSON only keeps the target gene names of each regulon.
        targets_by_name = {}
        for regulon in regulons:
            targets_by_name[regulon.name] = list(regulon.gene2weight.keys())
        with openfile(fname, 'w') as handle:
            handle.write(json.dumps(targets_by_name))
    elif '.dat' in suffixes:
        with openfile(fname, 'wb') as handle:
            pickle.dump(regulons, handle)
    elif '.gmt' in suffixes:
        GeneSignature.to_gmt(fname, regulons)
    elif is_valid_suffix(suffixes, 'ctx_yaml'):
        save_to_yaml(regulons, fname)
    else:
        raise ValueError("Unknown file format \"{}\".".format(fname))
def test_intersection3():
    """Intersecting two weighted signatures keeps the shared gene with the weight of the first one."""
    left = GeneSignature(name="test1",
                         gene2weight={'TP53': 0.8, 'SOX4': 0.75})
    right = GeneSignature(name="test1",
                          gene2weight={'TP53': 0.3, 'SOX2': 0.60})

    shared = left.intersection(right)

    assert len(shared) == 1
    assert 'TP53' in shared
    assert shared.gene2weight['TP53'] == 0.8
def save_enriched_motifs(df, fname: str) -> None:
    """
    Save enriched motifs.

    Supported file formats are CSV, TSV, GMT, DAT (pickle), JSON or YAML.
    The format is selected from the lower-cased last extension of ``fname``.

    :param df: The enriched motifs. Written with ``to_csv`` for the tabular
        formats, converted with ``df2regulons`` for every other format.
    :param fname: The name of the file to write.
    :raises ValueError: When the extension matches none of the supported formats.
    """
    _, suffix = os.path.splitext(fname)
    suffix = suffix.lower()

    # Tabular output: the dataframe is written as is.
    if suffix in FILE_EXTENSION2SEPARATOR:
        df.to_csv(fname, sep=FILE_EXTENSION2SEPARATOR[suffix])
        return

    # All remaining formats store regulons instead of the raw table.
    regulons = df2regulons(df)
    if suffix == '.json':
        # JSON only keeps the target gene names of each regulon.
        targets_by_name = {}
        for regulon in regulons:
            targets_by_name[regulon.name] = list(regulon.gene2weight.keys())
        with open(fname, 'w') as handle:
            handle.write(json.dumps(targets_by_name))
    elif suffix == '.dat':
        with open(fname, 'wb') as handle:
            pickle.dump(regulons, handle)
    elif suffix == '.gmt':
        GeneSignature.to_gmt(fname, regulons)
    elif suffix in ('.yaml', '.yml'):
        save_to_yaml(regulons, fname)
    else:
        raise ValueError("Unknown file format \"{}\".".format(fname))
def test_diff3():
    """The difference of two weighted signatures keeps the gene unique to the first one, weight included."""
    left = GeneSignature(name="test1",
                         gene2weight={'TP53': 0.8, 'SOX4': 0.75})
    right = GeneSignature(name="test1",
                          gene2weight={'TP53': 0.3, 'SOX2': 0.60})

    remainder = left.difference(right)

    assert 'SOX4' in remainder
    assert remainder.gene2weight['SOX4'] == 0.75
    assert len(remainder) == 1
def test_union1():
    """The union of two unweighted signatures contains every gene of both."""
    left = GeneSignature(name="test1", gene2weight=['TP53', 'SOX4'])
    right = GeneSignature(name="test1", gene2weight=['TP53', 'SOX2'])

    merged = left.union(right)

    for gene in ('TP53', 'SOX4', 'SOX2'):
        assert gene in merged
    assert len(merged) == 3
def test_rename():
    """Renaming a signature changes the name and leaves genes and weights untouched."""
    original = GeneSignature(name="test1",
                             gene2weight={'TP53': 0.5, 'SOX4': 0.75})

    renamed = original.rename('test2')

    for gene in ('TP53', 'SOX4'):
        assert gene in renamed
    assert renamed.name == 'test2'
    assert len(renamed) == 2
    assert renamed.gene2weight['TP53'] == 0.5
    assert renamed.gene2weight['SOX4'] == 0.75
def test_head():
    """``head(n)`` returns a signature of ``n`` genes; checked for n=1 and n=2."""
    signature = GeneSignature(name="test1",
                              gene2weight={'TP53': 0.8, 'SOX4': 0.75})

    top_one = signature.head(1)
    assert top_one['TP53'] == 0.8
    assert len(top_one) == 1

    top_two = signature.head(2)
    assert top_two['TP53'] == 0.8
    assert top_two['SOX4'] == 0.75
    assert len(top_two) == 2
def test_union3():
    """The union of weighted signatures keeps all genes; the shared gene takes the weight of the first signature."""
    left = GeneSignature(name="test1",
                         gene2weight={'TP53': 0.8, 'SOX4': 0.75})
    right = GeneSignature(name="test1",
                          gene2weight={'TP53': 0.3, 'SOX2': 0.60})

    merged = left.union(right)

    expected_weights = (('TP53', 0.8), ('SOX4', 0.75), ('SOX2', 0.6))
    for gene, weight in expected_weights:
        assert gene in merged
        assert merged.gene2weight[gene] == weight
    assert len(merged) == 3
def test_noweights():
    """``noweights()`` yields a copy with weight 1.0, leaves the source untouched and keeps the Regulon type."""
    weights = {'TP53': 0.8, 'SOX4': 0.75}

    # Plain gene signature.
    weighted_signature = GeneSignature(name="test1", gene2weight=dict(weights))
    flat_signature = weighted_signature.noweights()
    assert weighted_signature['TP53'] == 0.8
    assert flat_signature['TP53'] == 1.0

    # Regulon: the result must still be a Regulon.
    weighted_regulon = Regulon(name='TP53 regulon',
                               gene2weight=dict(weights),
                               transcription_factor="TP53",
                               gene2occurrence={"TP53": 1})
    flat_regulon = weighted_regulon.noweights()
    assert weighted_regulon['TP53'] == 0.8
    assert flat_regulon['TP53'] == 1.0
    assert isinstance(flat_regulon, Regulon)
def load_signatures(fname: str) -> Sequence[Type[GeneSignature]]:
    """
    Load genes signatures from disk.

    Supported file formats are GMT, DAT (pickled), YAML or CSV (enriched motifs).
    The format is selected from the lower-cased last extension of ``fname``.

    :param fname: The name of the file that contains the signatures.
    :return: A list of gene signatures.
    :raises ValueError: When the extension matches none of the supported formats.
    """
    _, suffix = os.path.splitext(fname)
    suffix = suffix.lower()

    # Enriched motifs table: load it and derive the regulons.
    if suffix in FILE_EXTENSION2SEPARATOR:
        motifs = load_motifs(fname, sep=FILE_EXTENSION2SEPARATOR[suffix])
        return df2regulons(motifs)

    if suffix in ('.yaml', '.yml'):
        return load_from_yaml(fname)

    if suffix.endswith('.gmt'):
        # The same guessed separator is used between fields and between genes.
        separator = guess_separator(fname)
        return GeneSignature.from_gmt(fname,
                                      field_separator=separator,
                                      gene_separator=separator)

    if suffix == '.dat':
        with open(fname, 'rb') as handle:
            return pickle.load(handle)

    raise ValueError("Unknown file format \"{}\".".format(fname))
def load_signatures(fname: str) -> Sequence[Type[GeneSignature]]:
    """
    Load genes signatures from disk.

    Supported file formats are GMT, DAT (pickled), YAML or CSV (enriched motifs).
    The format is selected from the suffixes of ``fname``.

    :param fname: The name of the file that contains the signatures.
    :return: A list of gene signatures.
    :raises ValueError: When the suffixes match none of the supported formats.
    """
    extension = PurePath(fname).suffixes
    if is_valid_suffix(extension, 'ctx'):
        # csv/tsv
        return df2regulons(
            load_motifs(fname, sep=suffixes_to_separator(extension)))
    elif is_valid_suffix(extension, 'ctx_yaml'):
        return load_from_yaml(fname)
    elif '.gmt' in extension:
        # The same guessed separator is used between fields and between genes.
        sep = guess_separator(fname)
        return GeneSignature.from_gmt(fname,
                                      field_separator=sep,
                                      gene_separator=sep)
    elif '.dat' in extension:
        # `extension` is a list of suffixes, so membership (not equality with
        # a string) is the test that can actually match a ".dat" file.
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only load DAT files that come from a trusted source.
        with openfile(fname, 'rb') as f:
            return pickle.load(f)
    else:
        raise ValueError("Unknown file format \"{}\".".format(fname))
def gs():
    """Return the first gene signature read from the test GMT file (tab separated)."""
    loaded = GeneSignature.from_gmt(TEST_SIGNATURE_FNAME,
                                    NOMENCLATURE,
                                    gene_separator="\t",
                                    field_separator="\t")
    return loaded[0]
def test_load_gmt():
    """Loading the test GMT file yields the expected number of signatures and first signature."""
    signatures = GeneSignature.from_gmt(fname=TEST_SIGNATURE_FNAME,
                                        field_separator='\t',
                                        gene_separator='\t')

    # http://software.broadinstitute.org/gsea/msigdb/collections.jsp#C6
    assert len(signatures) == 189

    first = signatures[0]
    assert first.name == "GLI1_UP.V1_DN"
    assert "COPZ1" in first
    assert len(first) == 29
def aucell_command(args):
    """
    Calculate regulon enrichment (as AUC values) for cells.

    The regulons are obtained from ``args.regulons_fname`` in a way that
    depends on the file name, the enrichment is computed with ``aucell`` and
    the resulting matrix is written as CSV to ``args.output``.
    """
    LOGGER.info("Loading expression matrix.")
    ex_mtx = _load_expression_matrix(args)

    regulons_path = args.regulons_fname.name
    if regulons_path.endswith(tuple(FILE_EXTENSION2SEPARATOR)):
        # Tabular file: the regulons still have to be derived from it.
        LOGGER.info("Creating regulons.")
        regulons = _df2regulons(regulons_path, args.nomenclature)
    elif regulons_path.endswith('.gmt'):
        LOGGER.info("Loading regulons.")
        regulons = GeneSignature.from_gmt(regulons_path,
                                          args.nomenclature,
                                          field_separator='\t',
                                          gene_separator='\t')
    else:
        LOGGER.info("Loading regulons.")
        regulons = _load_modules(regulons_path)

    LOGGER.info("Calculating enrichment.")
    # Weights are only used when explicitly requested with 'yes'.
    ignore_weights = args.weights != 'yes'
    auc_heatmap = aucell(ex_mtx,
                         regulons,
                         auc_threshold=args.auc_threshold,
                         noweights=ignore_weights,
                         num_cores=args.num_workers)

    LOGGER.info("Writing results to file.")
    auc_heatmap.to_csv(args.output)
def test_init2():
    """A signature can be built from a list of (gene, weight) tuples."""
    signature = GeneSignature(name="test1",
                              gene2weight=[('TP53', 0.5), ('SOX4', 0.75)])

    for gene in ('TP53', 'SOX4'):
        assert gene in signature
    assert signature.name == 'test1'
    assert len(signature) == 2
    assert signature.gene2weight['TP53'] == 0.5
    assert signature.gene2weight['SOX4'] == 0.75
def test_init3():
    """A signature can be built from a gene-to-weight dictionary."""
    signature = GeneSignature(name="test1",
                              gene2weight={'TP53': 0.5, 'SOX4': 0.75})

    for gene in ('TP53', 'SOX4'):
        assert gene in signature
    assert signature.name == 'test1'
    assert len(signature) == 2
    assert signature.gene2weight['TP53'] == 0.5
    assert signature.gene2weight['SOX4'] == 0.75
def gmt2regions(gmt_fname, db_fname, delineation_code, fraction):
    """
    Convert the gene signatures of a GMT file and print them to standard output.

    One line is written per signature: the signature name, a comma, and the
    comma-separated ``genes`` of the converted signature.

    :param gmt_fname: The name of the GMT file with the signatures.
    :param db_fname: The name of the region ranking database file; its base
        name is used as the name of the database.
    :param delineation_code: The key into ``CODE2DELINEATION``.
    :param fraction: Passed through to ``convert``.
    :raises KeyError: When ``delineation_code`` is not in ``CODE2DELINEATION``.
    """
    db = RegionRankingDatabase(fname=db_fname, name=os.path.basename(db_fname))
    signatures = GeneSignature.from_gmt(gmt_fname)
    delineation = CODE2DELINEATION[delineation_code]
    for signature in signatures:
        regions = convert(signature, db, delineation, fraction).genes
        # sys.stdout is a file object and cannot be called; print() writes
        # the newline-terminated line to standard output instead.
        print(signature.name + ',' + ','.join(regions))
def test_init1():
    """A signature built from a plain list of genes gives every gene weight 1.0."""
    signature = GeneSignature(name="test1", gene2weight=['TP53', 'SOX4'])

    for gene in ('TP53', 'SOX4'):
        assert gene in signature
    assert signature.name == 'test1'
    assert len(signature) == 2
    assert signature.gene2weight['TP53'] == 1.0
    assert signature.gene2weight['SOX4'] == 1.0
def test_aucell_mismatch(exp_matrix, gs):
    """Run AUCell with a signature made of 100 made-up gene names prepended to ``gs``."""
    percentiles = derive_auc_threshold(exp_matrix)

    fake_genes = ["FAKE{}".format(number) for number in range(100)]
    fake_signature = GeneSignature(name="test", gene2weight=fake_genes)
    signatures = [fake_signature] + gs

    aucs_mtx = aucell(exp_matrix,
                      signatures,
                      auc_threshold=percentiles[0.01],
                      num_workers=1)
    print(aucs_mtx.head())
def load_modules(fname: str) -> Sequence[Type[GeneSignature]]:
    """
    Load modules from a YAML, DAT (pickled) or GMT file.

    The format is selected from the suffixes of ``fname``.

    :param fname: The name of the file that contains the modules.
    :return: The loaded gene signatures.
    :raises ValueError: When the suffixes match none of the supported formats.
    """
    # Loading from YAML is extremely slow. Therefore this is a potential performance improvement.
    # Potential improvements are switching to JSON or to use a CLoader:
    # https://stackoverflow.com/questions/27743711/can-i-speedup-yaml
    # The alternative for which was opted in the end is binary pickling.
    suffixes = PurePath(fname).suffixes

    if is_valid_suffix(suffixes, 'ctx_yaml'):
        return load_from_yaml(fname)

    if '.dat' in suffixes:
        with openfile(fname, 'rb') as handle:
            return pickle.load(handle)

    if '.gmt' in suffixes:
        return GeneSignature.from_gmt(fname)

    raise ValueError("Unknown file format for \"{}\".".format(fname))
def load_modules(fname: str) -> Sequence[Type[GeneSignature]]:
    """
    Load modules from a YAML, DAT (pickled) or GMT file.

    The format is selected from the ending of ``fname``.

    :param fname: The name of the file that contains the modules.
    :return: The loaded gene signatures.
    :raises ValueError: When the file name has none of the supported endings.
    """
    # Loading from YAML is extremely slow. Therefore this is a potential performance improvement.
    # Potential improvements are switching to JSON or to use a CLoader:
    # https://stackoverflow.com/questions/27743711/can-i-speedup-yaml
    # The alternative for which was opted in the end is binary pickling.
    if fname.endswith(('.yaml', '.yml')):
        return load_from_yaml(fname)

    if fname.endswith('.dat'):
        with open(fname, 'rb') as handle:
            return pickle.load(handle)

    if fname.endswith('.gmt'):
        # The same guessed separator is used between fields and between genes.
        separator = guess_separator(fname)
        return GeneSignature.from_gmt(fname,
                                      field_separator=separator,
                                      gene_separator=separator)

    raise ValueError("Unknown file format for \"{}\".".format(fname))
def calculate_regulon_enrichment(self):
    """
    Calculate regulon enrichment per cell using AUCell.

    Each regulon is turned into a gene signature whose weights come from
    ``self.get_regulon_gene_data`` for the key ``self.auc_regulon_weights_key``.

    :return: The AUC matrix, with rows selected by the index of ``self.ex_mtx``.
    """
    print("Using {} to weight the genes when running AUCell.".format(
        self.auc_regulon_weights_key))

    # Create regulons with weight based on given key
    weighted_signatures = [
        GeneSignature(
            name=regulon.name,
            gene2weight=self.get_regulon_gene_data(
                regulon, self.auc_regulon_weights_key),
        )
        for regulon in self.regulons
    ]

    # (n_cells x n_regulons)
    enrichment_mtx = aucell(self.ex_mtx,
                            weighted_signatures,
                            num_workers=self.num_workers)
    return enrichment_mtx.loc[self.ex_mtx.index]
def signatures():
    """
    Yield one ``GeneSignature`` per file in ``gene_sig_file_paths``.

    Each file is read as a header-less TSV with 1 or 2 columns: genes, and
    optionally their weights. The file whose base name (without extension) is
    ``regulons`` is skipped. With 2 columns and ``noweights`` unset, only the
    rows with a weight above ``weight_threshold`` are kept; a signature left
    empty by that filter is skipped, with a warning when ``show_warnings`` is set.

    :raises Exception: When a file has 0 or more than 2 columns.
    """
    for gene_sig_file_path in gene_sig_file_paths:
        table = pd.read_csv(gene_sig_file_path,
                            sep='\t',
                            header=None,
                            index_col=None)
        signature_name = os.path.splitext(
            ntpath.basename(gene_sig_file_path))[0]

        # Check if the file is the regulon frequency file
        if signature_name == 'regulons':
            continue

        # Do some sanity checks
        n_columns = len(table.columns)
        if n_columns == 0:
            raise Exception(
                f"{gene_sig_file_path} has 0 columns. Requires .tsv with 1 or 2 columns. First column should be genes (required), second (optional) are weight for the given genes."
            )
        if n_columns > 2:
            raise Exception(
                f"{gene_sig_file_path} has more than 2 columns. Requires .tsv with 1 or 2 columns. First column should be genes, second (optional) are weight for the given genes."
            )

        if n_columns == 1 or noweights:
            # Genes only: the first column is used as is.
            gene2weight = table[0]
        else:
            # Filter the genes based on the given weight_threshold
            # 1st column: genes
            # 2nd column: weights
            table = table[table[1] > weight_threshold]
            if len(table.index) == 0:
                if show_warnings:
                    warnings.warn(
                        "{0} is empty after apply filter with weight_threshold > {1}"
                        .format(signature_name, weight_threshold))
                continue
            gene2weight = list(map(tuple, table.values))

        yield GeneSignature(name=signature_name, gene2weight=gene2weight)
def doGeneSetEnrichment(self, request, context):
    """
    Run an AUCell gene set enrichment and stream its progress.

    This is a generator: after each step it yields the result of
    ``gse.update_state(...)``. The last yielded state (step 4) carries the
    computed AUC values.

    :param request: Provides ``geneSetFilePath`` (resolved against the gene
        sets directory) and ``loomFilePath``.
    :param context: Not used by this method.
    """
    gene_set_file_path = os.path.join(self.dfh.get_gene_sets_dir(),
                                      request.geneSetFilePath)
    loom = self.lfh.get_loom(loom_file_path=request.loomFilePath)
    gse = _gse.GeneSetEnrichment(scope=self,
                                 method="AUCell",
                                 loom=loom,
                                 gene_set_file_path=gene_set_file_path,
                                 annotation='')

    # Running AUCell...
    yield gse.update_state(step=-1,
                           status_code=200,
                           status_message="Running AUCell...",
                           values=None)
    time.sleep(1)

    # Reading gene set...
    yield gse.update_state(step=0,
                           status_code=200,
                           status_message="Reading the gene set...",
                           values=None)
    with open(gse.gene_set_file_path, 'r') as f:
        # Skip first line because it contains the name of the signature
        gs = GeneSignature(name='Gene Signature #1',
                           gene2weight=[
                               line.strip() for idx, line in enumerate(f)
                               if idx > 0
                           ])
    time.sleep(1)

    if not gse.has_AUCell_rankings():
        # Creating the matrix as DataFrame...
        yield gse.update_state(step=1,
                               status_code=200,
                               status_message="Creating the matrix...",
                               values=None)
        loom = self.lfh.get_loom(loom_file_path=request.loomFilePath)
        dgem = np.transpose(loom.get_connection()[:, :])
        ex_mtx = pd.DataFrame(data=dgem,
                              index=loom.get_ca_attr_by_name("CellID"),
                              columns=loom.get_genes())

        # Creating the rankings...
        start_time = time.time()
        yield gse.update_state(step=2.1,
                               status_code=200,
                               status_message="Creating the rankings...",
                               values=None)
        rnk_mtx = create_rankings(ex_mtx=ex_mtx)

        # Saving the rankings...
        yield gse.update_state(step=2.2,
                               status_code=200,
                               status_message="Saving the rankings...",
                               values=None)
        # DataFrame.as_matrix() no longer exists in pandas >= 1.0; .values
        # returns the same array and is available in old and new versions.
        lp.create(gse.get_AUCell_ranking_filepath(), rnk_mtx.values,
                  {"CellID": loom.get_cell_ids()},
                  {"Gene": loom.get_genes()})
        print("Debug: %s seconds elapsed ---" % (time.time() - start_time))
    else:
        # Load the rankings...
        yield gse.update_state(
            step=2,
            status_code=200,
            status_message="Rankings exists: loading...",
            values=None)
        rnk_loom = self.lfh.get_loom_connection(
            gse.get_AUCell_ranking_filepath())
        rnk_mtx = pd.DataFrame(data=rnk_loom[:, :],
                               index=rnk_loom.ra.CellID,
                               columns=rnk_loom.ca.Gene)

    # Calculating AUCell enrichment...
    start_time = time.time()
    yield gse.update_state(
        step=3,
        status_code=200,
        status_message="Calculating AUCell enrichment...",
        values=None)
    aucs = enrichment(rnk_mtx, gs).loc[:, "AUC"].values
    print("Debug: %s seconds elapsed ---" % (time.time() - start_time))

    yield gse.update_state(step=4,
                           status_code=200,
                           status_message=gse.get_method() +
                           " enrichment done!",
                           values=aucs)
def test_diff1():
    """The difference of two unweighted signatures keeps only the gene unique to the first one."""
    left = GeneSignature(name="test1", gene2weight=['TP53', 'SOX4'])
    right = GeneSignature(name="test1", gene2weight=['TP53', 'SOX2'])

    remainder = left.difference(right)

    assert 'SOX4' in remainder
    assert len(remainder) == 1
def test_add():
    """Adding a gene to a loaded signature returns a signature that contains it."""
    signatures = GeneSignature.from_gmt(fname=TEST_SIGNATURE_FNAME,
                                        field_separator='\t',
                                        gene_separator='\t')

    extended = signatures[0].add("MEF2")

    assert "MEF2" in extended
def test_immut():
    """Neither the name nor the gene-to-weight mapping of a signature can be modified."""
    signature = GeneSignature(name="test1",
                              gene2weight={'TP53': 0.5, 'SOX4': 0.75})

    # Rebinding an attribute is rejected.
    with pytest.raises(attr.exceptions.FrozenInstanceError):
        signature.name = 'rename'

    # Item assignment on the mapping is rejected as well.
    with pytest.raises(TypeError):
        signature.gene2weight['TP53'] = 0.6
def test_dict():
    """Indexing a signature by gene returns the weight of that gene."""
    signature = GeneSignature(name="test1",
                              gene2weight={'TP53': 0.5, 'SOX4': 0.75})

    assert signature['TP53'] == 0.5
    assert signature['SOX4'] == 0.75
def test_genes():
    """``genes`` returns the expected tuple of gene names."""
    signature = GeneSignature(name="test1",
                              gene2weight={'TP53': 0.5, 'SOX4': 0.75})

    expected_genes = ('SOX4', 'TP53')
    assert signature.genes == expected_genes