def vec_enrich(vec, gene_ids, quantile, gene_sets):
    """Run an Enrichr analysis on the features scoring below a quantile of ``vec``.

    Features whose value in ``vec`` is strictly smaller than the requested
    quantile of ``vec`` are selected, their Ensembl ids are stripped of the
    version suffix, mapped to gene symbols through Biomart and submitted to
    Enrichr.

    Args:
        vec: Per-feature values; compared element-wise against their own
            quantile.
        gene_ids: Ensembl gene ids aligned with ``vec``. Must support
            boolean-mask indexing (e.g. a NumPy array or pandas Index).
        quantile: Quantile forwarded to ``np.quantile``.
        gene_sets: Gene-set libraries forwarded to ``gp.enrichr``.

    Returns:
        The object returned by ``gp.enrichr``.
    """
    threshold = np.quantile(vec, quantile)
    ind = threshold > vec
    print("... {} features selected...".format(sum(ind)))
    genes = gene_ids[ind]
    # remove ens id version (everything from the first dot onwards)
    genes = [re.sub(r"\..*$", "", g) for g in genes]

    print("Mapping to gene names...")
    # map ens ids to gene symbols
    bm = Biomart()
    bm_result = bm.query(
        dataset="hsapiens_gene_ensembl",
        attributes=[
            "ensembl_gene_id",
            "external_gene_name",
            "entrezgene_id",
            "go_id",
        ],
        filters={"ensembl_gene_id": genes},
    )
    # Ids without a symbol come back as NaN; drop them so that no
    # non-string placeholder is submitted to Enrichr as a gene.
    gene_symbols = bm_result["external_gene_name"].dropna().unique().tolist()

    print("Calculating enrichment...")
    enr = gp.enrichr(
        gene_list=gene_symbols,
        gene_sets=gene_sets,
        organism="Human",
        cutoff=0.05,
    )
    return enr
def get_background(self):
    """Return the set of background genes.

    ``self.background`` is resolved in this order:

    1. If it is the path of an existing file, the file is read with one
       gene identifier per line (surrounding whitespace stripped, blank
       lines ignored).
    2. Otherwise, if the gseapy package ships a tab-separated table named
       ``data/<background>.background.genes.txt``, that table is loaded.
    3. Otherwise it is used as a Biomart dataset name; the dataset is
       downloaded and rows without a ``go_id`` are dropped.

    For cases 2 and 3, rows without an ``entrezgene_id`` are dropped and the
    result holds Entrez ids (as ``int``) when ``self._isezid`` is truthy,
    else the values of ``external_gene_name``.

    Returns:
        set: The background gene identifiers.
    """
    # input is a file
    if os.path.isfile(self.background):
        with open(self.background) as handle:
            stripped = (line.strip() for line in handle)
            # Skip blank lines: an empty string is not a gene and would
            # otherwise inflate the background size by one.
            return {gene for gene in stripped if gene}

    # package included data
    DB_FILE = resource_filename(
        "gseapy", "data/{}.background.genes.txt".format(self.background)
    )
    if os.path.exists(DB_FILE):
        df = pd.read_csv(DB_FILE, sep="\t")
    else:
        # background is a biomart database name
        self._logger.warning(
            "Downloading %s for the first time. It might take a couple of minutes.",
            self.background,
        )
        bm = Biomart()
        df = bm.query(dataset=self.background)
        df.dropna(subset=['go_id'], inplace=True)
    self._logger.info(
        "Using all annotated genes with GO_ID as background: %s", self.background
    )
    df.dropna(subset=['entrezgene_id'], inplace=True)

    # input id type: entrez or gene_name
    if self._isezid:
        bg = df['entrezgene_id'].astype(int)
    else:
        bg = df['external_gene_name']
    return set(bg)
def get_background(self):
    """Load the background gene table as a DataFrame.

    The table ``<background>.background.genes.txt`` is looked up first in
    ``DEFAULT_CACHE_PATH`` and then in the ``data`` directory of the gseapy
    package. When neither file exists, ``self.background`` is queried from
    Biomart as a dataset name and rows lacking a ``go_id`` are removed.

    Returns:
        pandas.DataFrame: The table with rows lacking an ``entrezgene``
        value removed.
    """
    table_name = "{}.background.genes.txt".format(self.background)
    packaged_path = resource_filename("gseapy", "data/" + table_name)
    cached_path = os.path.join(DEFAULT_CACHE_PATH, table_name)

    # The cached copy takes precedence over the packaged one.
    for candidate in (cached_path, packaged_path):
        if os.path.exists(candidate):
            table = pd.read_table(candidate)
            break
    else:
        # No local copy: fetch the dataset from Biomart.
        self._logger.warning("Downloading %s for the first time. It might take a couple of miniutes."%self.background)
        mart = Biomart()
        table = mart.query(dataset=self.background)
        table.dropna(subset=['go_id'], inplace=True)

    self._logger.info("using all annotated genes with GO_ID as background genes")
    table.dropna(subset=['entrezgene'], inplace=True)
    return table
# Script section: rank marker genes per group, then run a Biomart lookup and
# an Enrichr KEGG analysis for every group. Outputs go under ./test.
adata = sc.AnnData(adata)
# ``np.str`` was a plain alias of the builtin ``str`` and was removed in
# NumPy 1.24; the builtin gives the same result on every NumPy version.
adata.obs["group"] = group.astype(str)

# rank gene by importance for clusters
sc.tl.rank_genes_groups(adata, "group", n_genes=500)
r = adata.uns['rank_genes_groups']['names']

# One column of ranked gene names per group; built once instead of on every
# loop iteration.
ranked = pd.DataFrame.from_records(r)

# Output directory for the query and enrichment results.
os.makedirs("test", exist_ok=True)

for x in ranked.columns:
    print("group :" + x, end="\r")
    glist = ranked[x].tolist()
    # NOTE(review): the client is re-created per group exactly as before;
    # sharing one instance is probably fine but has not been verified.
    bm = Biomart()
    results = bm.query(
        dataset='hsapiens_gene_ensembl',
        attributes=['external_gene_name', 'go_id'],
        filters={'hgnc_symbol': glist},
        # save output file
        filename="test/query_" + x + ".results.txt",
    )
    enr = gp.enrichr(
        gene_list=glist,
        description='test_name',
        gene_sets=['KEGG_2016'],
        outdir="test/enrichr_kegg_group" + x,
        cutoff=0.5,  # test dataset, use lower value from range(0,1)
    )
def _read_scores(path):
    """Read gene ids (first column) and scores (last column) from a TSV file, skipping its header line."""
    genes, scores = [], []
    with open(path, "r") as fp:
        next(fp, None)  # header line
        for line in fp:
            if not line.strip():
                continue  # tolerate blank / trailing lines
            words = line.split("\t")
            genes.append(words[0])
            scores.append(words[-1].strip())
    return genes, scores


def _top_genes(genes, scores, num):
    """Return the ``num`` gene ids with the highest numeric score, best first."""
    # Scores arrive as text; compare them as numbers, otherwise "10" sorts
    # below "9".
    values = np.array([float(s) for s in scores], dtype=float)
    # Stable ascending sort read backwards: descending order.
    order = np.argsort(values, kind="stable")[::-1]
    return [genes[i] for i in order[:num]]


def _merge_go_reports(libs, go_name, merged_path, p_threshold):
    """Concatenate the per-library Enrichr reports into ``merged_path`` and collect the significant terms.

    Returns three parallel lists: GO ids, p-values and gene counts of the
    rows whose p-value (column index 4) is below ``p_threshold``.
    """
    terms, p_values, genes_num = [], [], []
    header_written = False
    with open(merged_path, "w") as merged:
        for lib in libs:
            report = go_name + "/" + lib + "." + go_name + ".enrichr.reports.txt"
            with open(report, "r") as fp:
                header = fp.readline()
                if not header_written:
                    # Keep a single header line in the merged report.
                    merged.write(header)
                    header_written = True
                columns = header.rstrip("\r\n").split("\t")
                # The overlapping genes are a ';'-separated list. Locate the
                # column by name and fall back to the last column.
                genes_col = columns.index("Genes") if "Genes" in columns else -1
                for line in fp:
                    if not line.strip():
                        continue
                    merged.write(line)
                    words = line.rstrip("\r\n").split("\t")
                    # NOTE(review): positional layout kept from the original
                    # code - [1] is the term, [4] the adjusted p-value;
                    # confirm against the gseapy version in use.
                    p_value = float(words[4])
                    if p_value < p_threshold:
                        # Term looks like "name (GO:0000000)"; keep the id.
                        terms.append(words[1].split(" ")[-1].strip('(').strip(')'))
                        p_values.append(p_value)
                        overlap = [g for g in words[genes_col].split(";") if g.strip()]
                        genes_num.append(len(overlap))
    return terms, p_values, genes_num


def _plot_go_bar(terms, go_value, out_path):
    """Draw the horizontal bar chart of -log10(p) per term, save it and show it."""
    fig = plt.figure(figsize=(9, 6))
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    rows = range(len(go_value))
    plt.barh(rows, go_value, height=0.7, color='steelblue', alpha=0.8)
    plt.yticks(rows, terms)
    plt.xlim(0, 10)
    plt.xlabel("-LgP")
    plt.title("GO enrichment")
    for row, value in enumerate(go_value):
        plt.text(value + 0.2, row - 0.1, '%s' % value)
    # Save first so the file exists even if the interactive window is closed.
    fig.savefig(out_path)
    plt.show()


def _plot_go_scatter(terms, go_value, genes_num, out_path):
    """Draw the scatter plot (marker size proportional to gene count), save it and show it."""
    fig = plt.figure(figsize=(9, 6))
    # ``plt.cm.get_cmap`` was removed in Matplotlib 3.9; ``plt.get_cmap`` is
    # the equivalent lookup.
    cmap = plt.get_cmap('RdYlGn')
    sizes = [100 * count for count in genes_num]
    points = plt.scatter(go_value, terms, c=go_value, vmin=0, s=sizes, cmap=cmap)
    plt.colorbar(points)
    fig.savefig(out_path)
    plt.show()


def main():
    """Run GO enrichment on the top-scoring genes of a score table and plot the result.

    Steps:
      1. Parse the command line (``input``, ``threshold``, ``number``).
      2. Read gene ids and scores from the input TSV and keep the
         ``number`` genes with the highest numeric score.
      3. Convert the Ensembl ids to HGNC symbols through Biomart.
      4. Run Enrichr on three GO libraries (reports written under ``TEST/``).
      5. Merge the reports into ``GO_reports.txt`` and keep the terms whose
         p-value is below ``threshold``.
      6. Save and show a bar chart and a scatter plot of -log10(p).

    Raises:
        ValueError: If a score or p-value cannot be parsed as a number.
    """
    GO_name = "TEST"
    args = parser.parse_args()
    score_file = args.input  # "score_D.txt"
    p_threshold = float(args.threshold)  # 0.05
    num = int(args.number)

    # get gene ids and scores, then keep the top ``num`` genes
    gene_ids, scores = _read_scores(score_file)
    genes_top = _top_genes(gene_ids, scores, num)

    # use BioMart and convert ensembl_id to gene_symbol
    # (the previous mart / dataset / attribute / filter listings were never
    # used and only cost extra network round-trips, so they are gone)
    bm = Biomart(verbose=False, host="asia.ensembl.org")
    results = bm.query(dataset='hsapiens_gene_ensembl',
                       attributes=["ensembl_gene_id", "hgnc_symbol"],
                       filters={'ensembl_gene_id': genes_top},
                       filename="query.results.txt")
    # Unmapped ids come back as non-string placeholders; keep real symbols.
    gene_list = [g for g in results.hgnc_symbol.tolist() if isinstance(g, str)]

    libs = [
        "GO_Biological_Process_2018",
        "GO_Cellular_Component_2018",
        "GO_Molecular_Function_2018",
    ]
    gseapy.enrichr(gene_list=gene_list, description=GO_name, gene_sets=libs, outdir=GO_name)

    # collect the terms below the p-value threshold
    terms, p_values, genes_num = _merge_go_reports(
        libs, GO_name, "GO_reports.txt", p_threshold
    )

    # -lg(p); clamp to the smallest positive float so p == 0 does not raise
    tiny = np.finfo(float).tiny
    go_value = [-math.log10(max(p, tiny)) for p in p_values]

    _plot_go_bar(terms, go_value, "GO enrichment barh.png")
    _plot_go_scatter(terms, go_value, genes_num, "GO enrichment scatter.png")