def run_gprofiler(inputfile, theargs):
    """
    Run a g:Profiler enrichment on the genes read from ``inputfile`` and
    write the name of the best term to standard out.

    Terms are ranked by Jaccard index (highest first), derived from the
    ``precision`` and ``recall`` columns of the result, with ``p_value``
    (lowest first) used to break ties.

    :param inputfile: handed to ``read_inputfile``, whose result is treated
                      as one comma delimited string of genes
    :param theargs: object providing ``organism`` and ``maxpval`` attributes
    :return: 0, also when no term is found (a message is then written to
             standard error)
    """
    genes = read_inputfile(inputfile)
    gp = GProfiler(return_dataframe=True)
    # Strip every token and drop empty ones. Chaining strip(',').strip('\n')
    # left the trailing comma in place for input ending in ",\n", which put
    # an empty gene name into the query.
    genes = [gene.strip() for gene in genes.split(',') if gene.strip()]
    df_result = gp.profile(query=genes,
                           organism=theargs.organism,
                           user_threshold=theargs.maxpval)
    if df_result.shape[0] == 0:
        sys.stderr.write('No terms found\n')
        return 0

    # Jaccard = 1 / (1/precision + 1/recall - 1)
    df_result['Jaccard'] = 1.0 / (1.0 / df_result['precision'] +
                                  1.0 / df_result['recall'] - 1)
    df_result.sort_values(['Jaccard', 'p_value'],
                          ascending=[False, True], inplace=True)
    # Renumber rows so that label 0 is the best ranked term.
    df_result.reset_index(drop=True, inplace=True)
    sys.stdout.write(df_result.loc[0, 'name'])
    return 0
def run_gprofiler(inputfile, theargs):
    """
    Run a g:Profiler enrichment on the genes read from ``inputfile`` and
    write the best term to standard out as a JSON object.

    Terms are ranked by Jaccard index (highest first), derived from the
    ``precision`` and ``recall`` columns of the result, with ``p_value``
    (lowest first) used to break ties. The JSON object carries the
    ``name``, ``source``, ``p_value``, ``description`` and ``intersections``
    values of the top row.

    :param inputfile: handed to ``read_inputfile``, whose result is treated
                      as one comma delimited string of genes
    :param theargs: object providing ``organism`` and ``maxpval`` attributes
    :return: 0, also when no term is found (a message is then written to
             standard error and nothing is written to standard out)
    """
    genes = read_inputfile(inputfile)
    gp = GProfiler(return_dataframe=True)
    # Strip every token and drop empty ones. Chaining strip(',').strip('\n')
    # left the trailing comma in place for input ending in ",\n", which put
    # an empty gene name into the query.
    genes = [gene.strip() for gene in genes.split(',') if gene.strip()]
    df_result = gp.profile(query=genes,
                           organism=theargs.organism,
                           user_threshold=theargs.maxpval,
                           no_evidences=False)
    if df_result.shape[0] == 0:
        sys.stderr.write('No terms found\n')
        return 0

    # Jaccard = 1 / (1/precision + 1/recall - 1)
    df_result['Jaccard'] = 1.0 / (1.0 / df_result['precision'] +
                                  1.0 / df_result['recall'] - 1)
    df_result.sort_values(['Jaccard', 'p_value'],
                          ascending=[False, True], inplace=True)
    # Renumber rows so that label 0 is the best ranked term.
    df_result.reset_index(drop=True, inplace=True)

    reported = ('name', 'source', 'p_value', 'description', 'intersections')
    theres = {column: df_result[column][0] for column in reported}
    json.dump(theres, sys.stdout)
    sys.stdout.flush()
    return 0
def add_gene_name_gprofiler(data_df, col, organism):
    """Left-join g:Profiler gene names onto ``data_df``.

    The identifiers in ``data_df[col]`` are sent to ``GProfiler.convert``
    with target namespace ``UNIPROTSWISSPROT``. The ``name`` and
    ``namespaces`` columns of the answer are merged back on ``col`` as
    ``gene_name`` and ``gene_name_bank``. The merged frame is printed and
    returned; the type of the identifier list is printed first.
    """
    identifiers = data_df[col].tolist()
    print(type(identifiers))

    gp = GProfiler(return_dataframe=True)
    # details of what returns the following function : caleydo.org/tools/
    res = gp.convert(organism=organism,
                     query=identifiers,
                     target_namespace='UNIPROTSWISSPROT')

    # now add the relevant results to dataframe
    new_names = {"incoming": col,
                 "name": "gene_name",
                 "namespaces": "gene_name_bank"}
    bank_labels = {'UNIPROTSWISSPROT,UNIPROT_GN_ACC': 'Swiss-Prot',
                   'UNIPROTSPTREMBL,UNIPROT_GN_ACC': 'TrEMBL'}
    res_f = res[['incoming', 'name', 'namespaces']].rename(columns=new_names)
    res_f = res_f.replace(bank_labels)

    df = data_df.merge(res_f, how='left', on=col)
    # TODO check if concordant with description
    print(df)
    return df
def get_gene_list(self, samples_stat):
    """Profile the genes of every sample and write the results to disk.

    For each key of ``samples_stat`` whose ``'gene'`` entry is non-empty,
    the genes are profiled against ``hsapiens``. Rows whose ``native`` id
    contains ``GO`` or ``KEGG`` are written to CSV and passed to
    ``self.plot_go``. The genes are also converted to ``ENTREZGENE_ACC``
    and written to CSV, and the plain gene list is written to
    ``gene_list.txt``. All files go under ``<self.module>/<sample>/``.
    """
    csv_options = dict(header=True, index=False, sep=',')
    term_outputs = (
        ('GO', '{module}/{sample}/GO_FuncTerm.csv'),
        ('KEGG', '{module}/{sample}/KEGG_FuncTerm.csv'),
    )

    for sample in samples_stat:
        gene = samples_stat[sample]['gene']
        if len(gene) == 0:
            # nothing to profile for this sample
            continue

        gp = GProfiler(user_agent='ExampleTool', return_dataframe=True)
        enriched = gp.profile(organism='hsapiens', query=gene)

        for tag, template in term_outputs:
            terms = enriched[enriched['native'].str.contains(tag)]
            terms.to_csv(
                template.format(module=self.module, sample=sample),
                **csv_options)
            self.plot_go(terms, sample, tag)

        converted = gp.convert(organism='hsapiens',
                               query=gene,
                               target_namespace='ENTREZGENE_ACC')
        converted.to_csv(
            '{module}/{sample}/Entrez_Gene_converted.csv'.format(
                module=self.module, sample=sample),
            **csv_options)

        list_path = '{module}/{sample}/gene_list.txt'.format(
            module=self.module, sample=sample)
        with open(list_path, 'wt') as f:
            f.write('\n'.join(gene))
def get_gene_names(geneList):
    """Return name and shortened description for each gene in ``geneList``.

    The genes are converted with g:Profiler for organism ``athaliana``.
    Only the ``incoming``, ``name`` and ``description`` columns are kept,
    and each description is cut at the first ``[`` and then at the first
    ``;``.
    """
    def _shorten(row):
        before_bracket = row['description'].split('[')[0]
        return before_bracket.split(';')[0]

    wanted = ['incoming', 'name', 'description']
    gp = GProfiler(return_dataframe=True)
    df = gp.convert(organism='athaliana', query=geneList)[wanted]
    df['description'] = df.apply(_shorten, axis=1)
    return df
def add_gene_name_gprofiler(data_df: pd.DataFrame, col: str,
                            organism: str) -> pd.DataFrame:
    """Left-join g:Profiler gene names onto ``data_df``.

    The identifiers in ``data_df[col]`` are sent to ``GProfiler.convert``
    with target namespace ``UNIPROTSWISSPROT``. The ``name`` and
    ``namespaces`` columns of the answer are merged back on ``col`` as
    ``gene_name`` and ``gene_name_bank``.

    :param data_df: frame holding the identifiers to annotate
    :param col: name of the identifier column in ``data_df``; also the
                merge key and the key used to drop duplicated rows
    :param organism: g:Profiler organism id
    :return: ``data_df`` with the two extra columns, one row per distinct
             value of ``col`` (first occurrence kept)
    """
    gp = GProfiler(return_dataframe=True)
    protein_list = data_df[col].tolist()
    # details of what returns the following function :
    # https://pypi.org/project/gprofiler-official/
    # TODO : documentation
    res = gp.convert(organism=organism,
                     query=protein_list,
                     target_namespace='UNIPROTSWISSPROT')

    # now add the relevant results to dataframe
    res_f = res[['incoming', 'name', 'namespaces']].rename(columns={
        "incoming": col,
        "name": "gene_name",
        "namespaces": "gene_name_bank"
    })
    res_f = res_f.replace({
        'UNIPROTSWISSPROT,UNIPROT_GN_ACC': 'Swiss-Prot',
        'UNIPROTSPTREMBL,UNIPROT_GN_ACC': 'TrEMBL'
    })
    df = data_df.merge(res_f, how='left', on=col)

    # gProfiler returns one line for each alias of the gene (as in alias
    # section in Uniprot): keep only the first one. De-duplicate on the
    # merge key itself; the previous hard-coded 'Accession' column raised
    # KeyError (or de-duplicated on the wrong column) for any other `col`.
    df = df[~df[col].duplicated(keep='first')]
    return df
def gene_name_annotation_short(genes):
    """Return a compact g:Profiler annotation table for ``genes``.

    The genes are converted for organism ``mmusculus`` with target
    namespace ``ENTREZGENE_ACC``. A ``short_description`` column is added
    holding the description without its bracketed part. The columns
    ``description``, ``name``, ``converted``, ``n_incoming``,
    ``n_converted``, ``namespaces`` and ``query`` are then dropped.
    """
    gp = GProfiler(return_dataframe=True)
    gene_annot = gp.convert(organism='mmusculus',
                            query=genes,
                            target_namespace='ENTREZGENE_ACC')
    # Delete extra text between []. The pattern is a raw string: '\[' in a
    # normal literal is an invalid escape sequence that newer Python
    # versions warn about at compile time.
    gene_annot['short_description'] = gene_annot['description'].map(
        lambda x: re.sub(r'\[.+\]', '', x))
    gene_annot = gene_annot.drop(
        ['description', 'name', 'converted', 'n_incoming', 'n_converted',
         'namespaces', 'query'],
        axis=1)
    return gene_annot
def gene_name_annotation_long(genes):
    """Display the full g:Profiler annotation table for ``genes``.

    Builds the same table as the short variant (organism ``mmusculus``,
    target namespace ``ENTREZGENE_ACC``, ``short_description`` added and
    the verbose columns dropped), then passes it to ``display`` with the
    pandas row and column display limits lifted. Returns whatever
    ``display`` returns.
    """
    gp = GProfiler(return_dataframe=True)
    gene_annot = gp.convert(organism='mmusculus',
                            query=genes,
                            target_namespace='ENTREZGENE_ACC')
    # Delete extra text between []. The pattern is a raw string: '\[' in a
    # normal literal is an invalid escape sequence that newer Python
    # versions warn about at compile time.
    gene_annot['short_description'] = gene_annot['description'].map(
        lambda x: re.sub(r'\[.+\]', '', x))
    gene_annot = gene_annot.drop(
        ['description', 'name', 'converted', 'n_incoming', 'n_converted',
         'namespaces', 'query'],
        axis=1)
    # print all lines
    with pd.option_context('display.max_rows', None,
                           'display.max_columns', None):
        return display(gene_annot)
def run_gProfiler(comp, org):
    """Profile the distinct identifiers found in ``comp["composite"]``.

    Every distinct composite name is split on ``|`` and its second field
    is used as the identifier. The identifiers are profiled for organism
    ``org`` (example: hsapiens) with the ``annotated`` domain scope and
    the sources listed below. The result is returned as a DataFrame.
    """
    # return pandas dataframe or plain python structures
    gp = GProfiler(return_dataframe=True)

    list_id = [name.split('|')[1] for name in set(comp["composite"])]

    # NOTE(review): confirm "REACTOME" is an accepted source id;
    # g:Profiler appears to list Reactome as "REAC".
    return gp.profile(organism=org,
                      domain_scope="annotated",
                      sources=["GO", "KEGG", "REACTOME"],
                      query=list_id)
def pathway_enrich_genes(genes, databases):
    """Return the ten most significant enriched terms, with evidences.

    ``genes`` is profiled for ``mmusculus`` against ``databases`` with an
    FDR threshold of 0.05 and ``no_evidences=False``. The result is
    indexed by ``native``, sorted by ``p_value`` and reduced to the
    columns at positions 1, 2, 5, 10 and 13.

    Side effect: sets the global pandas option ``display.max_colwidth``
    to 800.
    """
    gp = GProfiler(return_dataframe=True, user_agent='g:GOSt')
    cluster_enrichment = gp.profile(
        organism='mmusculus',
        sources=databases,
        user_threshold=0.05,
        significance_threshold_method='fdr',
        query=genes,  # "contains the list of enriched genes"
        no_evidences=False)

    by_significance = cluster_enrichment.set_index('native').sort_values(
        'p_value')
    selected = by_significance.iloc[:, [1, 2, 5, 10, 13]]

    pd.set_option("display.max_colwidth", 800)
    return selected.head(10)
def command(self, gene_list, n_top):
    """Return the first ``n_top`` (name, p-value) pairs of the 'MF' rows.

    ``gene_list`` is sent to ``GProfiler.gprofile`` as an ordered query
    with FDR correction. Rows whose column 9 equals ``'MF'`` are kept.
    The third-from-last column is used as the name and column 2 as the
    p-value. The pairs are returned as a numpy array.
    """
    from gprofiler import GProfiler
    import numpy as np

    client = GProfiler("")
    table = np.array(
        client.gprofile(gene_list,
                        correction_method=GProfiler.THR_FDR,
                        ordered=True))
    mf_rows = table[table[:, 9] == 'MF']
    top = mf_rows[0:n_top]
    return np.array(list(zip(top[:, -3], top[:, 2])))
def pathway_enrich(genes, databases):
    """Return the ten most significant enriched terms for ``genes``.

    ``genes`` is profiled for ``mmusculus`` against ``databases`` with an
    FDR threshold of 0.05 and the ``annotated`` domain scope. The result
    is indexed by ``native``, sorted by ``p_value`` and reduced to the
    columns at positions 2, 5, 7, 10 and 1.

    Side effect: sets the global pandas option ``display.max_colwidth``
    to 800.
    """
    gp = GProfiler(return_dataframe=True, user_agent='g:GOSt')
    cluster_enrichment = gp.profile(
        organism='mmusculus',
        sources=databases,
        user_threshold=0.05,
        significance_threshold_method='fdr',
        domain_scope='annotated',
        query=genes)  # "contains the list of enriched genes"

    by_significance = cluster_enrichment.set_index('native').sort_values(
        'p_value')
    selected = by_significance.iloc[:, [2, 5, 7, 10, 1]]

    pd.set_option("display.max_colwidth", 800)
    return selected.head(10)
def command(self, gene_list, n_top):
    """Return the first ``n_top`` 'MF' rows as a Name / p-values frame.

    ``gene_list`` is sent to ``GProfiler.gprofile`` as an ordered query
    with FDR correction. Rows whose column 9 equals ``'MF'`` are kept.
    The third-from-last column is used as the name and column 2 as the
    p-value. The result is a DataFrame with columns ``Name`` and
    ``p-values``.
    """
    from gprofiler import GProfiler
    import numpy as np

    client = GProfiler("")
    table = np.array(
        client.gprofile(gene_list,
                        correction_method=GProfiler.THR_FDR,
                        ordered=True))
    mf_rows = table[table[:, 9] == 'MF']
    top = mf_rows[0:n_top]
    pairs = np.array(list(zip(top[:, -3], top[:, 2])))
    return pd.DataFrame(data=pairs, columns=["Name", "p-values"])
def Functional_profiling(gene_list,
                         organism='hsapiens',
                         sources=None,
                         user_threshold=0.05):
    """Run a g:Profiler enrichment that ignores electronic annotations.

    :param gene_list: genes to profile
    :param organism: g:Profiler organism id
    :param sources: data sources to query; when omitted (``None``) the
                    sources GO:MF, GO:CC, GO:BP, KEGG, REAC, WP, TF, MIRNA,
                    HPA, CORUM and HP are used
    :param user_threshold: significance threshold passed to g:Profiler
    :return: the enrichment result as a pandas DataFrame
    """
    # Build the default per call rather than in the signature: a list
    # default is a single object shared by all calls.
    if sources is None:
        sources = [
            "GO:MF", "GO:CC", "GO:BP", "KEGG", "REAC", "WP", "TF", "MIRNA",
            "HPA", "CORUM", "HP"
        ]
    gp = GProfiler(return_dataframe=True)
    gp_result_df = gp.profile(query=gene_list,
                              organism=organism,
                              user_threshold=user_threshold,
                              no_iea=True,
                              sources=sources)
    return gp_result_df
def profile_genes_with_active_sites(enriched_genes, background=None) -> DataFrame:
    """Profile ``enriched_genes`` with g:Profiler and return a DataFrame.

    An empty DataFrame is returned when ``enriched_genes`` is empty or
    when the service answers with nothing. Otherwise the first row of the
    response supplies the column names and the remaining rows the data.
    ``background`` is forwarded as the custom background.
    """
    nothing_to_profile = len(enriched_genes) == 0
    if nothing_to_profile:
        return DataFrame()

    client = GProfiler('ActiveDriverDB', want_header=True)
    response = client.gprofile(enriched_genes, custom_bg=background)
    if response:
        # want_header=True: the first row holds the column names
        header = response[0]
        rows = list(response[1:])
        return DataFrame(rows, columns=header)
    return DataFrame()
def execute(self):
    """Run one multi-query enrichment over all sheets of a marker workbook.

    Every sheet of ``<markers_spreadsheet>`` becomes one named query made
    of the first ``--max_genes`` entries of its ``feature`` column. The
    combined g:Profiler result for ``--organism`` at
    ``--enrichment_threshold`` is written to ``<output_spreadsheet>``
    without the index.
    """
    sheets = pd.read_excel(self.args["<markers_spreadsheet>"],
                           sheet_name=None)
    output_spreadsheet = self.args['<output_spreadsheet>']
    organism = self.args["--organism"]
    enrichment_threshold = float(self.args["--enrichment_threshold"])
    max_genes = int(self.args['--max_genes'])

    from gprofiler import GProfiler

    gp = GProfiler(return_dataframe=True)
    # one query per sheet, capped at max_genes features
    query = {
        sheet_name: sheet['feature'].values.tolist()[0:max_genes]
        for sheet_name, sheet in sheets.items()
    }
    result = gp.profile(organism=organism,
                        query=query,
                        user_threshold=enrichment_threshold)
    result.to_excel(output_spreadsheet, index=False)
def pathway_enrich_plot(genes, databases, title, background_genes,
                        name_output, save: bool = False):
    """Plot the signature enrichment of ``genes`` as a horizontal bar graph.

    Inputs:
        genes            - list of genes to be probed
        databases        - which databases to query, more information can
                           be found here:
                           https://biit.cs.ut.ee/gprofiler/page/apis
        title            - title for the figure
        background_genes - passed to g:Profiler as ``background``
        name_output      - file the figure is saved to when ``save`` is set
        save             - whether to save the figure (as PDF)

    The ten terms with the lowest p-value are drawn, most significant on
    top, with bar length -log10(p-value). The row labelled
    ``REAC:0000000`` is left out when present. Returns the result of
    ``plt.show()``.
    """
    # Interpretation of differentially expressed genes - g:profiler
    gp = GProfiler(return_dataframe=True, user_agent='g:GOSt')
    enrichment = gp.profile(
        organism='mmusculus',
        sources=databases,
        user_threshold=0.05,
        significance_threshold_method='fdr',
        background=background_genes,
        query=genes)  # "contains the list of enriched genes"
    results = enrichment.set_index('native').sort_values(
        'p_value').iloc[:, [2, 5, 7, 10, 1]]

    # new column with the negative log10 p-value
    results['-log10_p_value'] = results['p_value'].map(
        lambda x: -math.log(x, 10))
    if 'REAC:0000000' in results.index.tolist():
        results = results.drop(labels='REAC:0000000', axis=0)

    plt.rcdefaults()
    fig, ax = plt.subplots()
    top_names = results['name'].head(10)
    top_scores = results['-log10_p_value'].head(10)
    positions = np.arange(len(top_names))

    ax.barh(positions, top_scores, align='center', color='black')
    ax.set_yticks(positions)
    ax.set_yticklabels(top_names)
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_xlabel('-log10 p value')
    ax.set_title(title)

    if save:
        plt.savefig(name_output, format='pdf', bbox_inches="tight")
    return plt.show()
def find_orth_gene(gene, organism, target):
    """Find orthologous gene(s) of ``gene`` via g:Profiler.

    :param gene: gene to look up
    :param organism: organism ``gene`` belongs to
    :param target: organism to find orthologs in
    :return: list of gene name(s); ``[gene]`` when ``organism`` equals
             ``target``, and an empty list when nothing is found
    """
    if organism == target:
        # do not search if original gene is known
        return [gene]

    # Imported only when a lookup is really needed, so the same-organism
    # shortcut works without the client library.
    import pandas as pd
    from gprofiler import GProfiler

    gp = GProfiler()
    results = pd.DataFrame(
        gp.orth(query=gene, organism=organism, target=target))
    if results.empty or "name" not in results.columns:
        return []

    # dropna returns a new frame; its result used to be thrown away, which
    # let missing names through into the returned list.
    results = results.dropna(subset=["name"], axis=0)
    return [name for name in results["name"] if name != "N/A"]
def enrichment_analysis(markers: Dict[str, Dict[str, pd.DataFrame]],
                        max_genes: int = 100,
                        organism: str = 'hsapiens',
                        enrichment_threshold: float = 0.05) -> pd.DataFrame:
    """Perform enrichment analysis using gprofiler
    (https://biit.cs.ut.ee/gprofiler/gost).

    Parameters
    ----------
    markers: ``Dict[str, Dict[str, pd.DataFrame]``
        Output from markers. For every cluster the index of its ``up`` and
        ``down`` frames supplies the genes.
    max_genes: ``int``, optional, default: 100
        Maximum number of genes to use in each enrichment query.
    organism: ``str``, optional, default: ``hsapiens``
        Organism. See https://biit.cs.ut.ee/gprofiler/page/organism-list
        for full list.
    enrichment_threshold: ``float``, optional, default: ``0.05``
        Include enrichment results with corrected p-value less than this
        threshold.

    Returns
    -------
    ``pd.DataFrame``
        Result of one multi-query call, with queries named
        ``<cluster>-up`` and ``<cluster>-down``. Directions without genes
        are left out.
    """
    start = time.perf_counter()

    from gprofiler import GProfiler

    gp = GProfiler(return_dataframe=True)

    query = {}
    directions = (('up', '-up'), ('down', '-down'))
    for cluster, frames in markers.items():
        for direction, suffix in directions:
            gene_list = frames[direction].index.values.tolist()
            if len(gene_list) > 0:
                query[cluster + suffix] = gene_list[0:max_genes]

    result = gp.profile(organism=organism,
                        query=query,
                        user_threshold=enrichment_threshold)

    elapsed = time.perf_counter() - start
    logger.info(
        "Enrichment analysis is finished. Time spent = {:.2f}s.".format(
            elapsed))
    return result
def enrich_and_simplify(sets,
                        intersections=True,
                        sources=('GO:BP', ),
                        organism='hsapiens',
                        reduce_limit=0,
                        **revigo_kwds):
    """Run a g:Profiler enrichment and reduce each query's terms with revigo.

    :param sets: genes to profile; a dict is passed on as is (one query
                 per key), anything else is turned into a list
    :param intersections: when true, evidences are requested from
                          g:Profiler (``no_evidences`` is set to false)
    :param sources: data sources to query
    :param organism: g:Profiler organism id
    :param reduce_limit: a query is sent to ``revigo`` only when it has
                         more terms than this; ``None`` skips the
                         reduction step altogether
    :param revigo_kwds: extra keyword arguments for ``revigo``
    :return: tuple ``(df, revs)``; ``df`` is the enrichment table (merged
             with the revigo output for reduced queries, and with
             ``eliminated`` set to 0 for the others), ``revs`` maps each
             reduced query to its raw revigo result
    """
    from gprofiler import GProfiler

    if not isinstance(sets, dict):
        sets = list(sets)
    gprofiler = GProfiler(user_agent="scanpy", return_dataframe=True)
    gprofiler_kwargs = {'no_evidences': not intersections, 'sources': sources}
    df = gprofiler.profile(sets, organism=organism, **gprofiler_kwargs)

    revs = {}
    if reduce_limit is not None:
        dfs = []
        for q in df['query'].unique():
            df_sub = df[df['query'] == q].copy()
            go = df_sub.native.tolist()
            pvals = df_sub.p_value.tolist()
            if len(go) > reduce_limit:
                r = revigo(go, pvals, **revigo_kwds)
                revs[q] = r
                r = r.rename(columns={
                    'term_ID': 'native'
                }).drop(columns='description').assign(query=q)
                dfs.append(df_sub.merge(r))
            else:
                # Keep only this query's rows. Appending the complete
                # table here duplicated every query in the output.
                dfs.append(df_sub.assign(eliminated=0))
        # pd.concat raises on an empty list; with no enriched terms the
        # (empty) table is returned unchanged.
        if dfs:
            df = pd.concat(dfs, axis=0).reset_index(drop=True)
    return df, revs
def main(args):
    """Profile the genes in ``args.filename`` against a custom GMT data source.

    The data source is either uploaded from ``args.gmt`` (the token is
    taken from the upload response) or referenced through an existing
    ``args.token``. The token is passed to g:Profiler as the organism and
    the result is written to ``args.output`` as CSV.

    :param args: object with ``filename``, ``gmt``, ``token`` and
                 ``output`` attributes
    :raises ValueError: when neither ``args.gmt`` nor ``args.token`` is set
    """
    # Fail before doing any work. The name used to be misspelled
    # ("ValuError"), so this case surfaced as a NameError instead.
    if args.gmt is None and args.token is None:
        raise ValueError("Please supply either a token or a gmt file")

    gp = GProfiler(
        user_agent='gprofiler_custom_gmt',  # optional user agent
        return_dataframe=True,  # pandas dataframe rather than plain structures
    )

    # Close the gene file deterministically and skip blank lines, which
    # would otherwise be sent as empty gene names.
    with open(args.filename) as gene_file:
        genes = [line.strip() for line in gene_file if line.strip()]

    if args.gmt is not None:
        with open(args.gmt) as f:
            response = requests.post(
                'https://biit.cs.ut.ee/gprofiler/api/gost/custom/',
                json={
                    'gmt': f.read(),
                    'name': args.gmt
                })
        token = get_token_form_response(response)
    else:
        token = args.token

    res = gp.profile(genes, organism=token)
    res.to_csv(args.output)
def make_tcga_gtex_id_mapping_file(tcga_gtex_id_df, tcga_gtex_id_addr):
    """Add Ensembl gene ids and gene symbols to the frame and save it.

    The part of ``sample`` before the first ``.`` is stored in a new
    ``ensembl_gene`` column. Those ids are converted with g:Profiler
    (``hsapiens``, target namespace ``ENSG``) and the returned ``name`` is
    stored in a new ``gene_symbol`` column. The frame is modified in place
    and written to ``tcga_gtex_id_addr`` as a tab separated file without
    the index.

    :param tcga_gtex_id_df: frame with a ``sample`` column
    :param tcga_gtex_id_addr: output path
    """
    ensembl_id = tcga_gtex_id_df['sample'].str.split(".", n=1, expand=True)
    tcga_gtex_id_df['ensembl_gene'] = ensembl_id[0]

    gp = GProfiler(return_dataframe=True)
    ensembl_2_symbol = gp.convert(
        organism='hsapiens',
        query=tcga_gtex_id_df['ensembl_gene'].tolist(),
        target_namespace='ENSG')

    # Look each id up in an id -> symbol table rather than merging and
    # assigning the merged column back. The merge output carries a fresh
    # 0..n-1 index and may hold more rows than the input (repeated ids, or
    # presumably several result rows per id - verify against the service),
    # so the assignment could attach symbols to the wrong rows.
    symbol_by_gene = ensembl_2_symbol.drop_duplicates(
        subset='incoming', keep='first').set_index('incoming')['name']
    tcga_gtex_id_df['gene_symbol'] = tcga_gtex_id_df['ensembl_gene'].map(
        symbol_by_gene)

    tcga_gtex_id_df.to_csv(tcga_gtex_id_addr, sep='\t', index=False)
def gsea_connected_components(G, outdir):
    """
    Perform Gene Set Enrichment Analysis on the connected components in G
    using GProfiler

    A directed graph is converted to an undirected one first. Component
    number ``i`` is written to ``<outdir>/enrich_<i>.tsv``; a component
    whose file already exists is not queried again.

    Returns
    -------
    rv : list of (set, str)
      tuples of gene set that was queried for enrichment and the
      enrichment output file
    """
    gp = GProfiler("FluPath/0.1")
    graph = G.to_undirected() if nx.is_directed(G) else G

    rv = []
    for comp_no, comp in enumerate(list(nx.connected_components(graph))):
        # TODO how are http errors handled?
        enrich_out_fp = os.path.join(outdir, "enrich_{}.tsv".format(comp_no))
        already_done = os.path.exists(enrich_out_fp)
        if not already_done:
            enrich = gp.gprofile(comp, src_filter=['GO:BP'])
            write_enrich(enrich, enrich_out_fp)
        rv.append((comp, enrich_out_fp))
    return rv
def add_ensembl_gene_into_string_info(string_info_addr):
    """Join g:Profiler ``orth`` results onto a STRING protein info table.

    The tab separated file at ``string_info_addr`` is read and the part of
    ``protein_external_id`` after the first ``.`` is stored as
    ``protein_ensembl.protein``. Those ids are sent to ``GProfiler.orth``
    (organism ``mmusculus``, target ``ENSG``). Both tables are indexed by
    that id, concatenated column-wise and written to a fixed output path.
    The id list and the head of both tables are printed along the way.
    """
    string_info_df = pd.read_csv(string_info_addr, sep='\t')

    id_parts = string_info_df['protein_external_id'].str.split(
        ".", n=1, expand=True)
    string_info_df['protein_ensembl.protein'] = id_parts[1]
    protein_ensembl = id_parts[1].tolist()
    print(protein_ensembl)

    from gprofiler import GProfiler

    gp = GProfiler(return_dataframe=True)
    orth_result = gp.orth(organism='mmusculus',
                          target='ENSG',
                          query=protein_ensembl)
    # reset_index keeps the old index as a column before re-indexing
    ensembl_protein_to_gene_df = orth_result.reset_index().set_index(
        'incoming')

    string_info_df = string_info_df.set_index('protein_ensembl.protein')
    print(string_info_df.head())
    print(ensembl_protein_to_gene_df.head())

    string_info_ensembl_df = pd.concat(
        [string_info_df, ensembl_protein_to_gene_df], axis=1, sort=False)
    string_info_ensembl_df.to_csv(
        "/Users/woochanghwang/PycharmProjects/CIMR/Data/STRING/10090.protein.info.v11.0.ensembl.txt",
        sep='\t')
for f in files: shutil.move(os.path.join(source, f), destination) # Let me now find the GO anaotation graphs for the proteins that have SaintExpress score>0.5 and BFDR<0.01 ''' This code asks you if you have one or more than one conditions. In case, you have one, it gives you only one conditional horizontal bar graphs. If not, it would compare the bar graphs. To have 2 conditions, you need to run this code 2 times with different outputs! ''' if ask_user == "YES": if number_of_conditions == 1: # Only one condition! getting GO annotation profiles of proteins that have >0.5 saint score and <0.01 BFDR score gp = GProfiler(return_dataframe=True) profiler = gp.profile(organism='hsapiens', query=gene_names) BP_profiler = profiler[profiler["source"] == "GO:BP"] CC_profiler = profiler[profiler["source"] == "GO:CC"] MF_profiler = profiler[profiler["source"] == "GO:MF"] BP_profiled = BP_profiler.sort_values(by=["p_value"]) CC_profiled = CC_profiler.sort_values(by=["p_value"]) MF_profiled = MF_profiler.sort_values(by=["p_value"]) location_BP = BP_profiled["name"].to_list()[0:10] p_BP = BP_profiled["p_value"].to_list()[0:10] logged_p_BP = [] for i in p_BP:
# Detect communities with Infomap and tabulate gene -> community id.
community = g.community_infomap()
print("Number of Communities:", len(community))
df = pd.DataFrame({'gene': g.vs['name'], 'community': community.membership})

# Order by size of communities
valuec = df['community'].value_counts()
biggest = valuec.unique()[0]  # first (highest) community size
# Keep the community/communities of that size plus every community with
# at least 10 genes.
values = valuec[(valuec >= biggest) | (valuec >= 10)]
order = values.index.tolist()
df = df[df['community'].isin(order)]
df = df.set_index('community')
df = df.loc[order].reset_index()

#! Enrichment by GO
gp = GProfiler(return_dataframe=True)
# Collect the per-community results and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0 and copied the accumulated
# frame on every iteration.
community_frames = []
print("Community id - Number of nodes")
for name, group in df.groupby('community', sort=False):
    print(name, group.shape[0])
    s = gp.profile(organism='hsapiens', query=group.gene.tolist())
    s['community'] = name
    community_frames.append(s)
# pd.concat raises on an empty list, so fall back to an empty frame
enrich_communities = (pd.concat(community_frames)
                      if community_frames else pd.DataFrame())

#! OUTPUT
enrich_communities.to_csv(oname1, sep="\t", index=False)
df.to_csv(oname2, sep="\t", index=False)
class GOEnrichmentTester():
    """Compare g:Profiler term enrichment of gene lists with COSMIC genes.

    ``loadCOSMIC`` fills ``self.result`` (tumour type -> gene symbols, for
    somatic and germline separately), ``self.diseaseList`` (every tumour
    type seen) and ``self.GOList`` (tumour type -> enriched terms sorted by
    p-value).
    """

    def __init__(self):
        self.gp = GProfiler("COSSY++/1.5")

    @staticmethod
    def _sort_by_pvalue(terms):
        """Return ``terms`` sorted by ascending ``pvalue`` (stable)."""
        # key= works on Python 2 and 3; sorted(cmp=...) is Python 2 only.
        return sorted(terms, key=lambda term: term["pvalue"])

    def getGoTerms(self, genelist):
        """Query g:Profiler and return one dict per enriched term.

        Each dict has the keys ``pvalue``, ``id``, ``category`` and
        ``term``, read from positions 2, 8, 9 and 11 of a result row. No
        filtering on category is applied.
        """
        res = self.gp.gprofile(query=genelist)
        return [{"pvalue": row[2], "id": row[8],
                 "category": row[9], "term": row[11]} for row in res]

    def readTSV(self, fname):
        """Read a tab separated file into a list of dicts.

        The line starting with ``Gene Symbol`` supplies the keys. Double
        quotes are removed from every field. Blank lines and lines before
        the header are skipped.
        """
        records = []
        with open(fname) as reader:
            headers = []
            for line in reader:
                # Drop the line terminator first: it used to stay glued to
                # the last field (and to the last header name).
                line = line.rstrip("\r\n")
                if not line:
                    continue
                values = [x.replace("\"", "") for x in line.split("\t")]
                if line.startswith("Gene Symbol"):
                    headers = values
                    continue
                if not headers:
                    continue
                records.append(dict(zip(headers, values)))
        return records

    def _register(self, kind, tumorTypes, geneSymbol):
        """File ``geneSymbol`` under each tumour type of ``self.result[kind]``."""
        for tumorType in tumorTypes:
            if tumorType == "":
                continue
            self.result[kind].setdefault(tumorType, []).append(geneSymbol)
            if tumorType not in self.diseaseList:
                self.diseaseList.append(tumorType)

    def loadCOSMIC(self, fname):
        """Load the COSMIC table ``fname`` and build the per-disease term lists.

        Reads the ``Gene Symbol``, ``Tumour Types(Somatic)`` and
        ``Tumour Types(Germline)`` columns. The tumour type columns are
        comma separated. Ends by calling ``makeGOList``, which queries
        g:Profiler once per tumour type.
        """
        self.result = {"somatic": {}, "germline": {}}
        self.diseaseList = []
        for rec in self.readTSV(fname=fname):
            geneSymbol = rec["Gene Symbol"]
            somaticTumors = [
                x.strip()
                for x in rec["Tumour Types(Somatic)"].strip().split(",")]
            germlineTumors = [
                x.strip()
                for x in rec["Tumour Types(Germline)"].strip().split(",")]
            self._register("somatic", somaticTumors, geneSymbol)
            self._register("germline", germlineTumors, geneSymbol)
        self.makeGOList()

    def getGenes(self, disease):
        """Return the somatic genes followed by the germline genes of ``disease``."""
        somaticGenes = self.result["somatic"].get(disease, [])
        germlineGenes = self.result["germline"].get(disease, [])
        return somaticGenes + germlineGenes

    def makeGOList(self):
        """Fill ``self.GOList`` with the sorted enriched terms of every disease."""
        self.GOList = {}
        for tumorType in self.diseaseList:
            print(".")  # progress marker, one per g:Profiler query
            goTerms = self.getGoTerms(self.getGenes(tumorType))
            self.GOList[tumorType] = self._sort_by_pvalue(goTerms)

    def writeCOSMICGO(self, fname):
        """Write ``self.GOList`` to ``fname`` as indented JSON."""
        with open(fname, "w") as w:
            json.dump(self.GOList, w, indent=4)

    def corr(self, genes, disease):
        """Correlate the term ranks of ``genes`` with those of ``disease``.

        Terms are ranked by p-value, for the query genes and for the
        stored terms of ``disease`` that also occur for the query. The two
        rank vectors, ordered by term name, are passed to ``np.correlate``
        in ``"same"`` mode and the result is returned (it used to be
        computed and discarded).
        """
        inputGO = self._sort_by_pvalue(self.getGoTerms(genes))
        inputGO_terms = [x["term"] for x in inputGO]
        answerGO = self._sort_by_pvalue(
            [x for x in self.GOList[disease] if x["term"] in inputGO_terms])
        answerGO_terms = [x["term"] for x in answerGO]
        # NOTE(review): this requires the two lists to DIFFER in length,
        # which looks inverted for a rank comparison - confirm the intent.
        assert(len(inputGO_terms) != len(answerGO_terms))
        inputGO_ranks_pair = [(x, inputGO_terms.index(x))
                              for x in inputGO_terms]
        answerGO_ranks_pair = [(x, answerGO_terms.index(x))
                               for x in answerGO_terms]
        inputGO_ranks = [
            x[1] for x in sorted(inputGO_ranks_pair, key=itemgetter(0))]
        answerGO_ranks = [
            x[1] for x in sorted(answerGO_ranks_pair, key=itemgetter(0))]
        return np.correlate(inputGO_ranks, answerGO_ranks, "same")

    def pvaluecomp(self, a, b):
        """Three-way comparison of two term dicts by ``pvalue`` (1, -1 or 0)."""
        x = a['pvalue']
        y = b['pvalue']
        if x > y:
            return 1
        elif x < y:
            return -1
        else:
            return 0
def lookup_enrichment(gene_set):
    """Return the g:Profiler enrichment of ``gene_set``, ignoring ``None`` entries."""
    known_genes = list(filter(lambda gene: gene is not None, gene_set))
    profiler = GProfiler("GTEx/wj")
    return profiler.gprofile(known_genes)
import gprofiler from gprofiler import GProfiler GProfiler? gp = GProfiler(return_dataframe=True) gp.profile(organism='mmusculus', query=genes) genes genes = """ENSMUSG00000076488 ENSMUSG00000065231 ENSMUSG00000079120 ENSMUSG00000047222 ENSMUSG00000097494 ENSMUSG00000064419 ENSMUSG00000095668 ENSMUSG00000059606""".split() gp.profile(organism='mmusculus', query=genes) import requests def mygprofiler(namelist, organism='mmusculus'): """Run gProfiler using POST api with a json query body Returns a pandas DataFrame with the result""" if type(namelist) is not list: namelist = list(namelist) r = requests.post( url='https://biit.cs.ut.ee/gprofiler/api/gost/profile/', json={ 'organism':organism, 'query': namelist, } ) df = pd.DataFrame(r.json()['result']) return df
def __init__(self):
    """Create the g:Profiler client and store it as ``self.gp``."""
    user_agent = "COSSY++/1.5"
    self.gp = GProfiler(user_agent)
def enrich(
    container: Iterable[str],
    *,
    org: str = "hsapiens",
    gprofiler_kwargs: Mapping[str, Any] = {},
) -> pd.DataFrame:
    """\
    Get enrichment for DE results.

    This is a thin convenience wrapper around the very useful gprofiler_.

    This method dispatches on the first argument, leading to the following two
    signatures::

        enrich(container, ...)
        enrich(adata: AnnData, group, key: str, ...)

    Where::

        enrich(adata, group, key, ...) = enrich(adata.uns[key]["names"][group], ...)

    .. _gprofiler: https://pypi.org/project/gprofiler-official/#description

    Parameters
    ----------
    container
        Contains genes you'd like to search.
    adata
        AnnData object whose group will be looked for.
    group
        The group whose genes should be used for enrichment.
    key
        Key in `uns` to find group under.
    {doc_org}
    gprofiler_kwargs
        Keyword arguments to pass to `GProfiler.profile`, see gprofiler_.

    Returns
    -------
    Dataframe of enrichment results.

    Examples
    --------
    Using `sc.queries.enrich` on a list of genes:

    >>> import scanpy as sc
    >>> sc.queries.enrich(['Klf4', 'Pax5', 'Sox2', 'Nanog'], org="hsapiens")

    Using `sc.queries.enrich` on an :class:`anndata.AnnData` object:

    >>> pbmcs = sc.datasets.pbmc68k_reduced()
    >>> sc.tl.rank_genes_groups(pbmcs, "bulk_labels")
    >>> sc.queries.enrich(pbmcs, "CD34+")
    """
    try:
        from gprofiler import GProfiler
    except ImportError:
        raise ImportError(
            "This method requires the `gprofiler-official` module to be installed."
        )

    client = GProfiler(user_agent="scanpy", return_dataframe=True)
    # Work on a copy so the caller's mapping (and the shared default) is
    # left untouched.
    extra_kwargs = copy(gprofiler_kwargs)

    # Arguments that have a dedicated parameter on this function must not
    # be passed through the pass-through mapping as well.
    reserved = ["organism"]
    clashing = [name for name in reserved
                if extra_kwargs.get(name) is not None]
    if clashing:
        k = clashing[0]
        raise ValueError(
            f"Argument `{k}` should be passed directly through `enrich`, "
            "not through `gprofiler_kwargs`")

    return client.profile(list(container), organism=org, **extra_kwargs)
# -*- coding: utf-8 -*- from gprofiler import GProfiler gp = GProfiler(return_dataframe=True) import numpy as np import scipy as sp import pandas as pd import os os.chdir("/home/conor/Documents/Git_Repositories/MSc_Project") DEG_list = pd.read_csv('data/DEG_list.csv') methyl_genes = pd.read_csv('data/Methylation_genes.csv') upreg = DEG_list[(DEG_list['adj.P.Val'] <= 0.1) & (DEG_list['logFC'] >= 0.1)] upreg = upreg['Gene'].astype(str).tolist() upreg[:] = map(str.strip, upreg) downreg = DEG_list[(DEG_list['adj.P.Val'] <= 0.1) & (DEG_list['logFC'] <= -0.1)] downreg = downreg['Gene'].astype(str).tolist() downreg[:] = map(str.strip, downreg) hyper = methyl_genes[methyl_genes['Methylation'] > 0] hyper = hyper['Gene'].tolist() hypo = methyl_genes[methyl_genes['Methylation'] < 0] hypo = hypo['Gene'].tolist() genelists = {'downreg': downreg, 'upreg': upreg, 'hyper': hyper, 'hypo': hypo} for i in genelists: print("Calculating", i, "enrichment...")