def add_gene_name_gprofiler(data_df: pd.DataFrame, col: str, organism: str) -> pd.DataFrame:
    """Add g:Profiler gene names to ``data_df``, matched on column ``col``.

    The values of ``data_df[col]`` are sent to ``GProfiler.convert`` with
    ``target_namespace='UNIPROTSWISSPROT'``. The ``incoming``, ``name`` and
    ``namespaces`` columns of the answer are renamed to ``col``,
    ``gene_name`` and ``gene_name_bank`` and left-merged onto ``data_df``.

    Args:
        data_df: Frame holding the identifiers to annotate.
        col: Name of the column of ``data_df`` containing the identifiers;
            it is also used as the merge key and the de-duplication key.
        organism: Organism identifier forwarded to ``GProfiler.convert``.

    Returns:
        A new frame with the columns of ``data_df`` plus ``gene_name`` and
        ``gene_name_bank``, keeping only the first row for each value of
        ``col``. ``data_df`` itself is not modified.
    """
    gp = GProfiler(return_dataframe=True)
    protein_list = data_df[col].tolist()

    # Description of the fields returned by convert():
    # https://pypi.org/project/gprofiler-official/
    res = gp.convert(
        organism=organism,
        query=protein_list,
        target_namespace='UNIPROTSWISSPROT',
    )

    # Build the renamed frame without ``inplace=True``: renaming a column
    # selection of ``res`` in place is the chained-assignment pattern that
    # pandas reports with SettingWithCopyWarning.
    res_f = res[['incoming', 'name', 'namespaces']].rename(columns={
        "incoming": col,
        "name": "gene_name",
        "namespaces": "gene_name_bank",
    })
    res_f = res_f.replace({
        'UNIPROTSWISSPROT,UNIPROT_GN_ACC': 'Swiss-Prot',
        'UNIPROTSPTREMBL,UNIPROT_GN_ACC': 'TrEMBL',
    })

    df = data_df.merge(res_f, how='left', on=col)

    # Per the original author's note, g:Profiler returns one line per gene
    # alias; keep only the first one. The key is ``col`` (the merge key)
    # rather than a hard-coded 'Accession' column, so the function also works
    # when the identifiers live in a differently named column.
    return df[~df[col].duplicated(keep='first')]
def get_gene_names(geneList):
    """Look up names and short descriptions for Arabidopsis gene IDs.

    ``geneList`` is passed as the query to ``GProfiler.convert`` with
    ``organism='athaliana'``. Only the ``incoming``, ``name`` and
    ``description`` columns of the answer are kept, and each description is
    cut at the first ``[`` and then at the first ``;``.

    Returns:
        The three-column frame with the shortened descriptions.
    """

    def _short_description(row):
        # Text before the first '[' and, within that, before the first ';'.
        before_bracket = row['description'].partition('[')[0]
        return before_bracket.partition(';')[0]

    client = GProfiler(return_dataframe=True)
    converted = client.convert(organism='athaliana', query=geneList)
    df = converted[['incoming', 'name', 'description']]
    df['description'] = df.apply(_short_description, axis=1)
    return df
def add_gene_name_gprofiler(data_df, col, organism):
    """Add g:Profiler gene names to ``data_df``, matched on column ``col``.

    The values of ``data_df[col]`` are sent to ``GProfiler.convert`` with
    ``target_namespace='UNIPROTSWISSPROT'``. The ``incoming``, ``name`` and
    ``namespaces`` columns of the answer are renamed to ``col``,
    ``gene_name`` and ``gene_name_bank`` and left-merged onto ``data_df``.

    Args:
        data_df: Frame holding the identifiers to annotate.
        col: Name of the identifier column in ``data_df`` (also the merge
            key).
        organism: Organism identifier forwarded to ``GProfiler.convert``.

    Returns:
        The merged frame (also printed to stdout before returning).
        ``data_df`` itself is not modified.
    """
    gp = GProfiler(return_dataframe=True)

    # Reference given by the original author for the fields returned by
    # convert(): caleydo.org/tools/
    res = gp.convert(
        organism=organism,
        query=data_df[col].tolist(),
        target_namespace='UNIPROTSWISSPROT',
    )

    # Rename on a fresh frame instead of ``inplace=True``: renaming a column
    # selection of ``res`` in place is the chained-assignment pattern that
    # pandas reports with SettingWithCopyWarning.
    res_f = res[['incoming', 'name', 'namespaces']].rename(columns={
        "incoming": col,
        "name": "gene_name",
        "namespaces": "gene_name_bank",
    })
    res_f = res_f.replace({
        'UNIPROTSWISSPROT,UNIPROT_GN_ACC': 'Swiss-Prot',
        'UNIPROTSPTREMBL,UNIPROT_GN_ACC': 'TrEMBL',
    })

    df = data_df.merge(res_f, how='left', on=col)

    # TODO: check that the converted gene name is concordant with the
    # description.

    # The merged frame is still printed, as before; only the leftover debug
    # print of the query's type (always ``<class 'list'>``) was removed.
    print(df)
    return df
def get_gene_list(self, samples_stat):
    """Write g:Profiler enrichment, ID conversion and gene list per sample.

    For every key ``sample`` of ``samples_stat`` whose ``['gene']`` entry is
    non-empty, this:

    * runs ``GProfiler.profile`` (organism ``hsapiens``) on the genes,
    * writes the rows whose ``native`` column contains ``GO`` / ``KEGG`` to
      ``GO_FuncTerm.csv`` / ``KEGG_FuncTerm.csv`` and hands each subset to
      ``self.plot_go``,
    * writes the ``ENTREZGENE_ACC`` conversion to
      ``Entrez_Gene_converted.csv``,
    * writes the genes, one per line, to ``gene_list.txt``.

    All files go under ``{self.module}/{sample}/``. Samples with an empty
    gene collection are skipped entirely. Nothing is returned.
    """
    # (label handed to plot_go / substring searched in 'native', output path)
    term_outputs = (
        ('GO', '{module}/{sample}/GO_FuncTerm.csv'),
        ('KEGG', '{module}/{sample}/KEGG_FuncTerm.csv'),
    )

    for sample in samples_stat:
        gene = samples_stat[sample]['gene']
        if len(gene) == 0:
            continue

        client = GProfiler(user_agent='ExampleTool', return_dataframe=True)
        enrichment = client.profile(organism='hsapiens', query=gene)

        for label, path_template in term_outputs:
            terms = enrichment[enrichment['native'].str.contains(label)]
            terms.to_csv(
                path_template.format(module=self.module, sample=sample),
                header=True, index=False, sep=',')
            self.plot_go(terms, sample, label)

        entrez = client.convert(
            organism='hsapiens', query=gene,
            target_namespace='ENTREZGENE_ACC')
        entrez.to_csv(
            '{module}/{sample}/Entrez_Gene_converted.csv'.format(
                module=self.module, sample=sample),
            header=True, index=False, sep=',')

        gene_list_path = '{module}/{sample}/gene_list.txt'.format(
            module=self.module, sample=sample)
        with open(gene_list_path, 'wt') as handle:
            handle.write('\n'.join(gene))
def gene_name_annotation_short(genes):
    """Return a compact g:Profiler annotation table for mouse genes.

    ``genes`` is converted with ``GProfiler.convert`` (organism
    ``mmusculus``, target namespace ``ENTREZGENE_ACC``). A
    ``short_description`` column is added, holding ``description`` with the
    bracketed ``[...]`` part removed, and the ``description``, ``name``,
    ``converted``, ``n_incoming``, ``n_converted``, ``namespaces`` and
    ``query`` columns are dropped.

    Returns:
        The remaining columns of the conversion result plus
        ``short_description``.
    """
    gp = GProfiler(return_dataframe=True)
    gene_annot = gp.convert(
        organism='mmusculus',
        query=genes,
        target_namespace='ENTREZGENE_ACC',
    )

    # Raw string: the previous non-raw literal relied on the invalid escape
    # sequences '\[' and '\]', which Python flags with a warning. The regex
    # itself is unchanged (greedy match from the first '[' to the last ']').
    bracketed = re.compile(r'\[.+\]')
    gene_annot['short_description'] = gene_annot['description'].map(
        lambda text: bracketed.sub('', text))

    return gene_annot.drop(
        ['description', 'name', 'converted', 'n_incoming', 'n_converted',
         'namespaces', 'query'],
        axis=1,
    )
def gene_name_annotation_long(genes):
    """Display the full g:Profiler annotation table for mouse genes.

    ``genes`` is converted with ``GProfiler.convert`` (organism
    ``mmusculus``, target namespace ``ENTREZGENE_ACC``). A
    ``short_description`` column is added, holding ``description`` with the
    bracketed ``[...]`` part removed, and the ``description``, ``name``,
    ``converted``, ``n_incoming``, ``n_converted``, ``namespaces`` and
    ``query`` columns are dropped. The table is then passed to ``display``
    with the pandas row and column display limits lifted, so nothing is
    truncated.

    Returns:
        Whatever ``display`` returns.
        NOTE(review): ``display`` is presumably IPython's, in which case this
        is ``None`` — confirm against the file's imports.
    """
    gp = GProfiler(return_dataframe=True)
    gene_annot = gp.convert(
        organism='mmusculus',
        query=genes,
        target_namespace='ENTREZGENE_ACC',
    )

    # Raw string: the previous non-raw literal relied on the invalid escape
    # sequences '\[' and '\]', which Python flags with a warning. The regex
    # itself is unchanged (greedy match from the first '[' to the last ']').
    bracketed = re.compile(r'\[.+\]')
    gene_annot['short_description'] = gene_annot['description'].map(
        lambda text: bracketed.sub('', text))

    gene_annot = gene_annot.drop(
        ['description', 'name', 'converted', 'n_incoming', 'n_converted',
         'namespaces', 'query'],
        axis=1,
    )

    # Lift the display limits only for this call so every line is printed.
    with pd.option_context('display.max_rows', None,
                           'display.max_columns', None):
        return display(gene_annot)
def make_tcga_gtex_id_mapping_file(tcga_gtex_id_df, tcga_gtex_id_addr):
    """Add gene symbols to ``tcga_gtex_id_df`` and write it as a TSV file.

    Two columns are added to ``tcga_gtex_id_df`` in place:

    * ``ensembl_gene``: the part of ``sample`` before the first ``.``;
    * ``gene_symbol``: the ``name`` that ``GProfiler.convert`` (organism
      ``hsapiens``, target namespace ``ENSG``) reports for that ID, or NaN
      when the ID is absent from the conversion result.

    Args:
        tcga_gtex_id_df: Frame with a string column ``sample``; it is
            modified in place.
        tcga_gtex_id_addr: Destination path of the tab-separated output
            (written without the index).
    """
    # Keep only the text before the first '.' of each sample identifier.
    id_parts = tcga_gtex_id_df['sample'].str.split(".", n=1, expand=True)
    tcga_gtex_id_df['ensembl_gene'] = id_parts[0]

    gp = GProfiler(return_dataframe=True)
    ensembl_2_symbol = gp.convert(
        organism='hsapiens',
        query=tcga_gtex_id_df['ensembl_gene'].tolist(),
        target_namespace='ENSG',
    )

    # Look the symbol up per ID instead of assigning the 'name' column of a
    # merge result: a merge returns a freshly indexed frame that can hold
    # more rows than the input (repeated IDs, several hits per ID), so
    # assigning it back aligned symbols by row position and could attach
    # them to the wrong rows. The first hit of each ID is used.
    symbol_by_id = (
        ensembl_2_symbol
        .drop_duplicates(subset='incoming', keep='first')
        .set_index('incoming')['name']
    )
    tcga_gtex_id_df['gene_symbol'] = (
        tcga_gtex_id_df['ensembl_gene'].map(symbol_by_id))

    tcga_gtex_id_df.to_csv(tcga_gtex_id_addr, sep='\t', index=False)
# Scratch statements exploring g:Profiler results and the convert endpoint.
# NOTE(review): the `gp.profile?` / `gp.convert?` lines are not plain Python;
# they look like IPython help lookups, so this is presumably pasted
# interactive-session history rather than importable module code — confirm
# before keeping it in a .py file.
# `res`, `myres`, `gp`, `genes` and `requests` are not defined in this span.

# Inspect the two result objects and probe several attribute spellings.
res
myres
res.columns
myres.columns
myres.goshv
res.pvalue
res.pval
res
res.columns
res.p_value
myres.p_value
res.significant
myres.significant
gp.profile?
myres.columns

# Try gp.convert with and without an explicit target namespace.
gp.convert(organism='mmusculus', query=genes)
gp.convert(organism='mmusculus', query=genes, target_namespace='name')
gp.convert?
gp.convert(organism='mmusculus', query=genes, target_namespace='name')
gp.convert(organism='mmusculus', query=genes, )
gp.convert(organism='mmusculus', query=genes, ).namespaces
gp.convert(organism='mmusculus', query=genes, target_namespace='ENSG' )

# Same kind of conversion request sent directly to the HTTP endpoint,
# here with target 'UCSC'; the JSON body of the response ends up in `x`.
r = requests.post(
    url='https://biit.cs.ut.ee/gprofiler/api/convert/convert/',
    json={
        'organism':'mmusculus',
        'target':'UCSC',
        'query':genes,
    }
)
x = r.json()