def run_go_enrichment(strain, genes_of_interest, significant=True, cutoff=0.05, use_parent_terms=True):
    """Run a GO-term enrichment study for one strain's genes of interest.

    Args:
        strain: strain name; used to locate the background gene list under
            data/, and for 'PA14' to remap gene identifiers first.
        genes_of_interest: study genes to test for enrichment.
        significant: when truthy, keep only results with FDR < cutoff.
        cutoff: significance threshold (used both as the study alpha and as
            the FDR filter).
        use_parent_terms: propagate gene counts to parent GO terms.

    Returns:
        [enrichment_results, goea_results] — the processed results table and
        the raw goatools result records.
    """
    # Load GO term association dictionary
    with open(os.path.join('data', 'go_association.pickle'), 'rb') as handle:
        go_association = pickle.load(handle)

    background_genes = get_genes(
        os.path.join('data', strain + '_all_genes.csv'))

    # Called for its side effect only: fetches go-basic.obo if missing.
    # (The returned filename was previously bound to an unused local.)
    download_go_basic_obo()
    obo_dag = GODag('go-basic.obo')

    if strain == 'PA14':
        genes_of_interest = map_pa14_genes(genes_of_interest)
        background_genes = map_pa14_genes(background_genes)

    goea_obj = GOEnrichmentStudyNS(background_genes, go_association, obo_dag,
                                   propagate_counts=use_parent_terms,
                                   alpha=cutoff, methods=['fdr_bh'])
    goea_results = goea_obj.run_study(genes_of_interest)

    # Idiom fix: truthiness test instead of `significant is True`.
    if significant:
        goea_results = [
            result for result in goea_results if result.p_fdr_bh < cutoff
        ]

    enrichment_results = get_enrichment_results(goea_results)
    return [enrichment_results, goea_results]
def gene_set_query(genes, fdr_threshold=0.10, return_header=False, species='mouse'):
    """
    Runs a GO enrichment analysis query using goatools. The GO dataset here
    is for mouse, but it might apply to human as well.
    """
    ns2assoc, ids_to_symbols, symbols_to_ids, genes_list = get_species_genes(
        species)

    enrichment_study = GOEnrichmentStudyNS(
        genes_list,             # background population of protein-coding genes
        ns2assoc,               # geneid/GO associations
        obodag,                 # ontologies
        propagate_counts=False,
        alpha=fdr_threshold,    # significance cut-off
        methods=['fdr_bh'])     # multiple-test correction method

    # Normalize symbol casing to match the species' lookup table.
    if species in ('mouse', 'mus_musculus'):
        genes = [symbol.capitalize() for symbol in genes]
    else:
        genes = [symbol.upper() for symbol in genes]
    gene_ids = [symbols_to_ids[symbol] for symbol in genes
                if symbol in symbols_to_ids]
    print('gene_ids:', gene_ids)

    study_results = enrichment_study.run_study(gene_ids)
    significant_results = [r for r in study_results
                           if r.p_fdr_bh < fdr_threshold]

    # One row per significant term: id, name, FDR, overlapping gene symbols.
    results_table = [
        [r.goterm.id, r.goterm.name, r.p_fdr_bh,
         [ids_to_symbols[gene_id] for gene_id in r.study_items]]
        for r in significant_results
    ]
    print(results_table)
    results_table.sort(key=lambda row: row[2])

    if return_header:
        results_table = [['GO ID', 'Name', 'FDR', 'Overlapping Genes']
                         ] + results_table
    print('GO results_table:', results_table)
    return results_table
def GOEA(genes, objanno):
    """Return GO term enrichment per ontology namespace.

    Keyword arguments:
    genes -- list of study gene ids
    objanno -- annotation reader supplying the background population and
               the geneid/GO associations

    Returns a dict mapping each namespace in the module-level ``ontologies``
    to a numpy array of [GO id, BH-corrected p-value] rows sorted by GO id.
    """
    goeaobj = GOEnrichmentStudyNS(
        objanno.get_id2gos().keys(),  # background population of gene ids
        objanno.get_ns2assc(),  # geneid/GO associations
        godag,  # Ontologies
        propagate_counts=True,
        alpha=0.05,  # default significance cut-off
        methods=['fdr_bh'])  # default multipletest correction method
    goea_quiet_all = goeaobj.run_study(genes, prt=None)

    goea_results = {ont: [] for ont in ontologies}
    for record in goea_quiet_all:
        goea_results[record.NS].append([record.GO, record.p_fdr_bh])
    for ont in goea_results:
        rows = np.array(goea_results[ont])
        # Robustness fix: a namespace with no results produced an empty
        # array, and indexing it with [:, 0] raised IndexError; leave the
        # empty array as-is instead of sorting it.
        if rows.size:
            rows = rows[rows[:, 0].argsort()]
        goea_results[ont] = rows
    return goea_results
# Build parallel arrays of gene symbols and their NCBI gene ids so the study
# gene symbols can be translated to the ids the enrichment object expects.
symbols = np.zeros(len(GeneID2nt_homo.keys()), dtype='U100')
geneids = np.zeros(len(GeneID2nt_homo.keys()), dtype=int)
# Creating a lookup table to convert the gene symbols to the gene ids needed
# for the gene enrichment analysis
for idx, key in enumerate(GeneID2nt_homo.keys()):
    symbols[idx] = GeneID2nt_homo[key].Symbol
    geneids[idx] = GeneID2nt_homo[key].GeneID

# Select the ids of the study genes by symbol membership in CDK1_gene_list.
boolean_symbol = np.isin(symbols, CDK1_gene_list)
matches_idx = np.where(boolean_symbol)[0]
geneids_matches = list(geneids[matches_idx])

# Run the enrichment study quietly (prt=None suppresses progress output).
goea_quiet_all = goeaobj.run_study(geneids_matches, prt=None)
# Keep only results significant at Benjamini-Hochberg FDR < 0.05.
goea_quiet_sig = [r for r in goea_quiet_all if r.p_fdr_bh < 0.05]

print('{N} of {M:,} results were significant'.format(
    N=len(goea_quiet_sig),
    M=len(goea_quiet_all)))

# 'e' = enriched (over-represented), 'p' = purified (under-represented).
print('Significant results: {E} enriched, {P} purified'.format(
    E=sum(1 for r in goea_quiet_sig if r.enrichment == 'e'),
    P=sum(1 for r in goea_quiet_sig if r.enrichment == 'p')))

# Tally significant results per GO namespace.
ctr = cx.Counter([r.NS for r in goea_quiet_sig])
print('Significant results[{TOTAL}] = {BP} BP + {MF} MF + {CC} CC'.format(
    TOTAL=len(goea_quiet_sig),
    BP=ctr['BP'],  # biological_process
    MF=ctr['MF'],  # molecular_function
    CC=ctr['CC']))  # cellular_component
# Read study genes from an Excel sheet with three columns per row:
# gene symbol, gene id, and adjusted p-value; build a geneid -> symbol map.
geneid2symbol = {}
# Get xlsx filename where data is stored
din_xlsx = r"C:\Users\krishna\Downloads\padj_converted.xlsx"
# Read data
if os.path.isfile(din_xlsx):
    import xlrd
    book = xlrd.open_workbook(din_xlsx)
    pg = book.sheet_by_index(0)
    for r in range(pg.nrows):
        symbol, geneid, pval = [pg.cell_value(r, c) for c in range(pg.ncols)]
        if geneid:
            geneid2symbol[int(geneid)] = symbol
    print('READ: {XLSX}'.format(XLSX=din_xlsx))
else:
    # BUG FIX: this previously formatted with the undefined name `fin_xlsx`,
    # so a missing file raised NameError instead of the intended error.
    raise RuntimeError('CANNOT READ: {XLSX}'.format(XLSX=din_xlsx))

### 5. Run Gene Ontology Enrichment Analysis (GOEA)
# 'p_' means "pvalue". 'fdr_bh' is the multipletest method we are currently using.
geneids_study = geneid2symbol.keys()
goea_results_all = goeaobj.run_study(geneids_study)
goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]

# Export the analysis results: one file labelled with gene symbols and a
# second labelled with gene ids.
goeaobj.wr_xlsx("GO_symbols.xlsx", goea_results_sig, itemid2name=geneid2symbol)
goeaobj.wr_xlsx("GO_geneids.xlsx", goea_results_sig)
class Pose(object):
    """Pretrained drug/protein interaction model with GO-enrichment support.

    NOTE(review): relies on module-level names defined elsewhere in this
    project (gdata, GeneID2nt_hum, PP, PD, Model, MultiInnerProductDecoder,
    PoseQuery, Pre_mask, remove_bidirection) — confirm against the full file.
    """

    def __init__(self, data_dir: str, device='cpu'):
        """Load pretrained weights from *data_dir* and prepare the GO study.

        data_dir -- directory (with trailing separator) holding the '-model.pt'
                    weights file.
        device   -- torch device string used later by predict/explain.
        """
        # load pretrained model
        self.model, self.name = self.__pretrained_model_construction__()
        self.model.load_state_dict(
            torch.load(data_dir + self.name + '-model.pt'))
        self.device = device
        self.__GO_enrich__()

    def __GO_enrich__(self):
        """Build self.goeaobj, downloading the GO data files if missing."""
        go_file = "go-basic.obo"
        if not os.path.exists(go_file):
            download_go_basic_obo()
        # Load gene ontologies
        obodag = GODag("go-basic.obo")
        # Read NCBI's gene2go. Store annotations in a list of namedtuples
        fin_gene2go = download_ncbi_associations()
        objanno = Gene2GoReader(fin_gene2go, taxids=[9606])
        # Get namespace2association where:
        #   namespace is:
        #     BP: biological_process
        #     MF: molecular_function
        #     CC: cellular_component
        #   association is a dict:
        #     key: NCBI GeneID
        #     value: A set of GO IDs associated with that gene
        ns2assoc = objanno.get_ns2assc()
        self.goeaobj = GOEnrichmentStudyNS(
            GeneID2nt_hum.keys(),  # List of human protein-coding genes
            ns2assoc,  # geneID/GO associations
            obodag,  # Ontologies
            propagate_counts=False,
            alpha=0.05,  # default significance cut-off
            methods=['fdr_bh'])  # default multipletest correction method

    def __pretrained_model_construction__(self):
        """Assemble the PP/PD/decoder model and derive its checkpoint name."""
        nhids_gcn = [64, 32, 32]
        prot_out_dim = sum(nhids_gcn)
        drug_dim = 128
        pp = PP(gdata.n_prot, nhids_gcn)
        pd = PD(prot_out_dim, drug_dim, gdata.n_drug)
        mip = MultiInnerProductDecoder(drug_dim + pd.d_dim_feat, gdata.n_et)
        # Checkpoint name encodes the GCN layout and drug embedding size.
        name = 'poly-' + str(nhids_gcn) + '-' + str(drug_dim)
        return Model(pp, pd, mip).to('cpu'), name

    def get_prediction_train(self, threshold=0.5):
        """Predict over the training edges (de-duplicated directions)."""
        train_idx, train_et = remove_bidirection(gdata.train_idx,
                                                 gdata.train_et)
        return self.predict(train_idx[0].tolist(), train_idx[1].tolist(),
                            train_et.tolist(), threshold=threshold)

    def get_prediction_test(self, threshold=0.5):
        """Predict over the test edges (de-duplicated directions)."""
        test_idx, test_et = remove_bidirection(gdata.test_idx, gdata.test_et)
        return self.predict(test_idx[0].tolist(), test_idx[1].tolist(),
                            test_et.tolist(), threshold=threshold)

    def predict(self, drug1, drug2, side_effect, threshold=0.5):
        """Score (drug1, drug2, side_effect) triples and keep those above
        *threshold*; also compute scores with protein info zeroed out to
        attribute each prediction to protein vs. drug information.

        Raises ValueError when no triple passes the threshold.
        """
        device = self.device
        data = gdata.to(device)
        model = self.model.to(device)
        model.eval()
        pp_static_edge_weights = torch.ones(
            (data.pp_index.shape[1])).to(device)
        pd_static_edge_weights = torch.ones(
            (data.pd_index.shape[1])).to(device)
        z = model.pp(data.p_feat, data.pp_index, pp_static_edge_weights)
        # Keep copies of the protein embeddings for the ablated re-scoring
        # passes below.
        z0 = z.clone()
        z1 = z.clone()

        # prediction based on all info
        z = model.pd(z, data.pd_index, pd_static_edge_weights)
        P = torch.sigmoid((z[drug1] * z[drug2] *
                           model.mip.weight[side_effect]).sum(dim=1)).to('cpu')
        index_filter = P > threshold
        drug1 = torch.Tensor(drug1)[index_filter].numpy().astype(int).tolist()
        if not drug1:
            raise ValueError(
                "No Satisfied Edges." +
                "\n - Suggestion: reduce the threshold probability." +
                "Current probability threshold is {}. ".format(threshold) +
                "\n - Please use -h for help")
        drug2 = torch.Tensor(drug2)[index_filter].numpy().astype(int).tolist()
        side_effect = torch.Tensor(side_effect)[index_filter].numpy().astype(
            int).tolist()

        # prediction based on protein info and their interactions
        # (zero out the non-protein part of the embedding)
        z0.data[:, 64:] *= 0
        z0 = model.pd(z0, data.pd_index, pd_static_edge_weights)
        P0 = torch.sigmoid(
            (z0[drug1] * z0[drug2] *
             model.mip.weight[side_effect]).sum(dim=1)).to("cpu")
        # Relative score drop when only protein/interaction info remains.
        ppiu_score = (P[index_filter] - P0) / P[index_filter]

        # prediction based on drug info only (protein embeddings zeroed)
        z1.data *= 0
        z1 = model.pd(z1, data.pd_index, pd_static_edge_weights)
        P1 = torch.sigmoid(
            (z1[drug1] * z1[drug2] *
             model.mip.weight[side_effect]).sum(dim=1)).to("cpu")
        piu_score = (P[index_filter] - P1) / P[index_filter]

        # return a query object
        query = PoseQuery(drug1, drug2, side_effect)
        query.set_pred_result(P[index_filter].tolist(), piu_score.tolist(),
                              ppiu_score.tolist())
        return query

    def explain_list(self, drug_list_1, drug_list_2, side_effect_list,
                     regulization=2, if_auto_tuning=True, if_pred=True):
        """Build a query (optionally via predict) and explain it."""
        if if_pred:
            query = self.predict(drug_list_1, drug_list_2, side_effect_list)
        else:
            query = PoseQuery(drug_list_1, drug_list_2, side_effect_list,
                              regulization)
        return self.explain_query(query,
                                  if_auto_tuning=if_auto_tuning,
                                  regulization=query.regulization)

    def explain_query(self, query, if_auto_tuning=True, regulization=2):
        """Run the mask-based explanation; optionally halve the
        regularization until at least one protein-protein edge survives."""
        query.regulization = regulization
        pp_left_index, pp_left_weight, pd_left_index, pd_left_weight = self.__explain(
            query)
        if if_auto_tuning:
            while pp_left_index.shape[1] == 0:
                if query.regulization < 0.0001:
                    print("Warning: auto tuning forced to stop.")
                    break
                query.regulization /= 2
                pp_left_index, pp_left_weight, pd_left_index, pd_left_weight = self.__explain(
                    query)
        query.set_exp_result(pp_left_index, pp_left_weight, pd_left_index,
                             pd_left_weight)
        goea_results_sig = self.enrich_go(pp_left_index)
        query.set_enrich_result(goea_results_sig)
        return query

    def enrich_go(self, pp_left_index):
        """GO enrichment over the proteins kept by the explanation mask."""
        # -------------- Go Enrichment --------------
        geneids_study = pp_left_index.flatten()  # geneid2symbol.keys()
        # Protein indices -> NCBI gene ids (strip the 'GeneID' prefix).
        geneids_study = [
            int(gdata.prot_idx_to_id[idx].replace('GeneID', ''))
            for idx in geneids_study
        ]
        goea_results_all = self.goeaobj.run_study(geneids_study)
        goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]
        return goea_results_sig

    def __explain(self, query):
        """Optimize edge masks so the predicted probability drops, keeping
        the sparsest set of edges that supports the prediction; returns the
        surviving pp/pd edges and their mask weights."""
        data = gdata
        model = self.model
        device = self.device
        drug_list_1, drug_list_2, side_effect_list, regulization = query.get_query(
        )
        pre_mask = Pre_mask(data.pp_index.shape[1] // 2,
                            data.pd_index.shape[1]).to(device)
        data = data.to(device)
        model = model.to(device)

        # Disable cached propagation so the masks take effect each pass.
        for gcn in self.model.pp.conv_list:
            gcn.cached = False
        self.model.pd.conv.cached = False
        self.model.eval()

        # pp_static_edge_weights = torch.ones((data.pp_index.shape[1])).to(device)
        # pd_static_edge_weights = torch.ones((data.pd_index.shape[1])).to(device)

        optimizer = torch.optim.Adam(pre_mask.parameters(), lr=0.01)
        # fake_optimizer only exists to zero model grads each step.
        fake_optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

        # z = model.pp(data.p_feat, data.pp_index, pp_static_edge_weights)
        # z = model.pd(z, data.pd_index, pd_static_edge_weights)
        # # P = torch.sigmoid((z[drug1, :] * z[drug2, :] * model.mip.weight[side_effect, :]).sum())
        # P = torch.sigmoid((z[drug_list_1] * z[drug_list_2] * model.mip.weight[side_effect_list]).sum(dim=1))
        # if len(drug_list_1) < 5:
        #     print(P.tolist())

        tmp = 0.0
        pre_mask.reset_parameters()
        for i in range(9999):
            model.train()
            pre_mask.desaturate()
            optimizer.zero_grad()
            fake_optimizer.zero_grad()
            # half_mask = torch.sigmoid(pre_mask.pp_weight)
            # Clamp mask weights to [0, 1]; pp mask is symmetric, so the
            # half-mask is duplicated for both edge directions.
            half_mask = torch.nn.Hardtanh(0, 1)(pre_mask.pp_weight)
            pp_mask = torch.cat([half_mask, half_mask])
            pd_mask = torch.nn.Hardtanh(0, 1)(pre_mask.pd_weight)

            z = model.pp(data.p_feat, data.pp_index, pp_mask)
            # TODO:
            # z = model.pd(z, data.pd_index, pd_static_edge_weights)
            z = model.pd(z, data.pd_index, pd_mask)

            # TODO:
            # P = torch.sigmoid((z[drug1, :] * z[drug2, :] * model.mip.weight[side_effect, :]).sum())
            P = torch.sigmoid((z[drug_list_1] * z[drug_list_2] *
                               model.mip.weight[side_effect_list]).sum(dim=1))

            EPS = 1e-7
            # TODO:
            # Loss trades prediction confidence against mask sparsity.
            loss = torch.log(1 - P + EPS).sum() / regulization \
                + 0.5 * (pp_mask * (2 - pp_mask)).sum() \
                + (pd_mask * (2 - pd_mask)).sum()
            # loss = - torch.log(P) + 0.5 * (pp_mask * (2 - pp_mask)).sum() + (pd_mask * (2 - pd_mask)).sum()
            # TODO:

            loss.backward()
            optimizer.step()

            # print("Epoch:{}, loss:{}, prob:{}, pp_link_sum:{}, pd_link_sum:{}".format(i, loss.tolist(), P.tolist(), pp_mask.sum().tolist(), pd_mask.sum().tolist()))
            if i % 100 == 0:
                print(
                    "Epoch:{:3d}, loss:{:0.2f}, prob:{:0.2f}, pp_link_sum:{:0.2f}, pd_link_sum:{:0.2f}"
                    .format(i, loss.tolist(), P.mean().tolist(),
                            pp_mask.sum().tolist(),
                            pd_mask.sum().tolist()))

            # until no weight need to be updated --> no sum of weights changes
            if tmp == (pp_mask.sum().tolist(), pd_mask.sum().tolist()):
                break
            else:
                tmp = (pp_mask.sum().tolist(), pd_mask.sum().tolist())

        pre_mask.saturate()

        # Keep edges whose mask survived above 0.2; for pp edges keep only
        # one direction (index[0] > index[1]).
        pp_left_mask = (pp_mask > 0.2).detach().cpu().numpy()
        tmp = (data.pp_index[0, :] >
               data.pp_index[1, :]).detach().cpu().numpy()
        pp_left_mask = np.logical_and(pp_left_mask, tmp)
        pd_left_mask = (pd_mask > 0.2).detach().cpu().numpy()

        pp_left_index = data.pp_index[:, pp_left_mask].cpu().numpy()
        pd_left_index = data.pd_index[:, pd_left_mask].cpu().numpy()
        pp_left_weight = pp_mask[pp_left_mask].detach().cpu().numpy()
        pd_left_weight = pd_mask[pd_left_mask].detach().cpu().numpy()

        return pp_left_index, pp_left_weight, pd_left_index, pd_left_weight
def go_enrichment(gene_list, taxid=9606, background_chrom=None, background_genes=None, terms=None, list_study_genes=False, alpha=0.05):
    """GO enrichment of *gene_list* against a cached NCBI background.

    Args:
        gene_list: study genes (symbols or NCBI ids; Series accepted).
        taxid: NCBI taxonomy id of the organism.
        background_chrom: optionally restrict the background to a chromosome.
        background_genes: optional custom background subset (ids or symbols).
        terms: optionally restrict the GO DAG to these terms and children.
        list_study_genes: add a column with the study genes per term.
        alpha: significance cut-off applied to the BH-corrected p-values.

    Returns:
        DataFrame of enrichment records with p_fdr_bh < alpha, sorted by
        (p_fdr_bh, ratio).
    """
    if type(gene_list) is pd.core.series.Series:
        gene_list = gene_list.tolist()
    if type(terms) is pd.core.series.Series:
        terms = terms.tolist()

    _assert_entrez_email()

    gene_list = list(gene_list)
    taxid = _tidy_taxid(taxid)

    # Make sure the cached background gene table exists for this taxon.
    ncbi_tsv = f'geneinfo_cache/{taxid}_protein_genes.txt'
    if not os.path.exists(ncbi_tsv):
        fetch_background_genes(taxid)

    # Silence goatools' progress chatter during download/parsing.
    with open(os.devnull, 'w') as null, redirect_stdout(null):
        obo_fname = download_and_move_go_basic_obo(prt=null)
        file_gene2go = download_ncbi_associations(prt=null)
        obodag = GODag("geneinfo_cache/go-basic.obo",
                       optional_attrs=['relationship', 'def'],
                       prt=null)

        # read NCBI's gene2go. Store annotations in a list of namedtuples
        objanno = Gene2GoReader(file_gene2go, taxids=[taxid])

        # get associations for each branch of the GO DAG (BP, MF, CC)
        ns2assoc = objanno.get_ns2assc()

        # limit go dag to a sub graph including only specified terms and
        # their children.
        # NOTE(review): str(hash(...)) is not stable across interpreter runs
        # (PYTHONHASHSEED), so these cache filenames are per-process only.
        if terms is not None:
            sub_obo_name = 'geneinfo_cache/' + str(
                hash(''.join(sorted(terms)).encode())) + '.obo'
            wrsobo = WrSubObo(obo_fname,
                              optional_attrs=['relationship', 'def'])
            wrsobo.wrobo(sub_obo_name, terms)
            obodag = GODag(sub_obo_name,
                           optional_attrs=['relationship', 'def'],
                           prt=null)

        # load background gene set of all genes
        background_genes_file = f'geneinfo_cache/{taxid}_protein_genes.txt'
        if not os.path.exists(background_genes_file):
            fetch_background_genes(taxid)

        # load any custom subset
        if background_genes:
            if not all(type(x) is int for x in background_genes):
                if all(x.isnumeric() for x in background_genes):
                    # NOTE(review): numeric strings are kept as strings here;
                    # confirm the GeneID.isin(...) match below works as
                    # intended (map(int, ...) may have been meant).
                    background_genes = list(map(str, background_genes))
                else:
                    background_genes = _cached_symbol2ncbi(background_genes,
                                                           taxid=taxid)
            df = pd.read_csv(background_genes_file, sep='\t')
            no_suffix = os.path.splitext(background_genes_file)[0]
            background_genes_file = f'{no_suffix}_{hash("".join(map(str, sorted(background_genes))))}.txt'
            df.loc[df.GeneID.isin(background_genes)].to_csv(
                background_genes_file, sep='\t', index=False)

        # limit background gene set
        if background_chrom is not None:
            df = pd.read_csv(background_genes_file, sep='\t')
            background_genes_file = f'{os.path.splitext(background_genes_file)[0]}_{background_chrom}.txt'
            df.loc[df.chromosome == background_chrom].to_csv(
                background_genes_file, sep='\t', index=False)

        # Convert the background TSV to an importable python module.
        output_py = f'geneinfo_cache/{taxid}_background.py'
        ncbi_tsv_to_py(background_genes_file, output_py, prt=null)

    background_genes_name = output_py.replace('.py', '').replace('/', '.')
    background_genes = importlib.import_module(background_genes_name)
    importlib.reload(background_genes)
    GeneID2nt = background_genes.GENEID2NT

    if not all(type(x) is int for x in gene_list):
        gene_list = _cached_symbol2ncbi(gene_list, taxid=taxid)

    # BUG FIX: alpha was hard-coded to 0.05 here, ignoring the function's
    # *alpha* parameter; pass the caller's cutoff through to goatools.
    goeaobj = GOEnrichmentStudyNS(
        GeneID2nt,  # background population for this taxon
        ns2assoc,  # geneid/GO associations
        obodag,  # Ontologies
        propagate_counts=False,
        alpha=alpha,  # significance cut-off
        methods=['fdr_bh'],
        pvalcalc='fisher_scipy_stats')

    goea_results_all = goeaobj.run_study(gene_list)

    rows = []
    columns = [
        'namespace', 'term_id', 'e/p', 'pval_uncorr', 'p_fdr_bh', 'ratio',
        'bg_ratio', 'obj'
    ]
    if list_study_genes:
        columns.append('study_genes')
    for ntd in goea_results_all:
        ntd.__class__ = My_GOEnrichemntRecord  # Hack. Changes __class__ of all instances...
        row = [
            ntd.NS, ntd.GO, ntd.enrichment, ntd.p_uncorrected, ntd.p_fdr_bh,
            ntd.ratio_in_study[0] / ntd.ratio_in_study[1],
            ntd.ratio_in_pop[0] / ntd.ratio_in_pop[1], ntd
        ]
        if list_study_genes:
            row.append(_cached_ncbi2symbol(sorted(ntd.study_items)))
        rows.append(row)
    df = (pd.DataFrame().from_records(rows, columns=columns).sort_values(
        by=['p_fdr_bh', 'ratio']).reset_index(drop=True))
    return df.loc[df.p_fdr_bh < alpha]
# Build the enrichment study over all observed genes, then test the
# significant genes against it.
goeaobj = GOEnrichmentStudyNS(
    # list of 'population' of genes looked at in total
    pop = all_genes['ens_gene'].tolist(),
    # geneid -> GO ID mapping
    ns2assoc = ns2assoc,
    # ontology DAG
    godag = obodag,
    propagate_counts = False,
    # multiple testing correction method (fdr_bh is false discovery rate control with Benjamini-Hochberg)
    methods = ['fdr_bh'],
    # significance cutoff for method named above
    alpha = fdr_level_go_term
)

goea_results_all = goeaobj.run_study(sig_genes['ens_gene'].tolist())

# write results to text file
goeaobj.wr_tsv(snakemake.output.enrichment, goea_results_all)

# plot results
ensembl_id_to_symbol = dict(zip(all_genes['ens_gene'], all_genes['ext_gene']))

# from first plot output file name, create generic file name to trigger
# separate plots for each of the gene ontology name spaces
outplot_generic = snakemake.output.plot[0].replace('_BP.','_{NS}.', 1).replace('_CC.','_{NS}.', 1).replace('_MF.', '_{NS}.', 1)

# keep only the results below the configured FDR cutoff for plotting
goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < fdr_level_go_term]
def pullGOenrichment(inputFile, project):
    """Run a human GO enrichment analysis on genes read from a CSV file.

    Args:
        inputFile: path to a CSV whose rows start with (NCBI gene id, symbol).
        project: label inserted into the output file names under Data/.

    Side effects:
        Downloads go-basic.obo and gene2go into the working directory, prints
        progress/summary lines, and writes the significant results to
        Data/go_enrichment<project>.xlsx and Data/go_enrichment<project>.txt.
    """
    import collections as cx  # hoisted from mid-function

    GeneID2nt_hum = genes_NCBI_9606_ProteinCoding.GENEID2NT

    obo_fname = download_go_basic_obo()
    fin_gene2go = download_ncbi_associations()
    obodag = GODag("go-basic.obo")

    # Read NCBI's gene2go. Store annotations in a list of namedtuples
    objanno = Gene2GoReader(fin_gene2go, taxids=[9606])

    # Get namespace2association where namespace is BP/MF/CC and the
    # association maps an NCBI GeneID to its set of GO IDs.
    ns2assoc = objanno.get_ns2assc()
    for nspc, id2gos in ns2assoc.items():
        print("{NS} {N:,} annotated human genes".format(NS=nspc,
                                                        N=len(id2gos)))
    print(len(GeneID2nt_hum))

    goeaobj = GOEnrichmentStudyNS(
        GeneID2nt_hum.keys(),  # List of human protein-coding genes
        ns2assoc,  # geneid/GO associations
        obodag,  # Ontologies
        propagate_counts=False,
        alpha=0.05,  # default significance cut-off
        methods=['fdr_bh'])  # default multipletest correction method

    # Read study genes. The with-statement closes the file; the original's
    # explicit infile.close() inside the block was redundant and is removed.
    geneid2symbol = {}
    with open(inputFile, 'r') as infile:
        for line in csv.reader(infile):
            geneid = line[0]
            symbol = line[1]
            if geneid:
                geneid2symbol[int(geneid)] = symbol

    geneids_study = geneid2symbol.keys()
    goea_results_all = goeaobj.run_study(geneids_study)
    goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]

    ctr = cx.Counter([r.NS for r in goea_results_sig])
    print('Significant results[{TOTAL}] = {BP} BP + {MF} MF + {CC} CC'.format(
        TOTAL=len(goea_results_sig),
        BP=ctr['BP'],  # biological_process
        MF=ctr['MF'],  # molecular_function
        CC=ctr['CC']))  # cellular_component

    # BUG FIX: wr_xlsx previously wrote Excel content to a ".csv" path;
    # give the workbook a matching .xlsx extension.
    goeaobj.wr_xlsx("Data/go_enrichment" + project + ".xlsx",
                    goea_results_sig)
    goeaobj.wr_txt("Data/go_enrichment" + project + ".txt", goea_results_sig)
# Select genes in module df_pca_tmp = df_pca.loc[((df_pca[ld1] > x0) & (df_pca[ld1] < x1) & (df_pca[ld2] > y0) & (df_pca[ld2] < y1) & (df_dian[l12d] > cut_radius)), :] if layer == 'DM': genes_flybase = set(df_pca_tmp.index.tolist()) genes_uniprot = set(df_pca_tmp.index.map(dfQ['UniProtKB/Swiss-Prot ID'].dropna().to_dict()).dropna().tolist()) genes = genes_flybase.union(genes_uniprot) elif layer == 'MM': genes_mgi = set(df_pca_tmp.index.map(dfQ['MGI ID'].dropna().to_dict()).to_list()) genes_uniprot = set(df_pca_tmp.index.map(dfQ['UniProtKB/Swiss-Prot ID'].dropna().to_dict()).dropna().to_list()) genes = genes_mgi.union(genes_uniprot) elif layer == 'HS': genes = set(df_pca_tmp.index.map(dfQ['UniProtKB/Swiss-Prot ID'].dropna().to_dict()).dropna().tolist()) # Run Comparison (only keep GO significant and from 'Biological Process') print("> Runnin GOEA test") goea_res = goea.run_study(genes, prt=None) cols2rem = ['method_flds', 'kws', 'study_items', 'pop_items', 'goterm'] # transform goea objs for DataFrame format res = [{k: v for k, v in i.__dict__.items() if k not in cols2rem} for i in goea_res] dfA = pd.DataFrame(res) if len(dfA): # Index: Biological Process, Significant at 0.01, GO tree depth < 10 dfS = dfA.loc[( (dfA['p_fdr_bh'] <= 0.05) & #(dfA['depth'] < 10) & (dfA['NS'] == 'BP')), :] # Redo Index n = len(dfS)
def plot_go_enrichment(coef_df, auc_vals, pheno_dict, args, mode='abs'):
    """Plot a heatmap of GO terms enriched among top classifier coefficients.

    For each (non-random) subgrouping in *coef_df*, selects a foreground gene
    set according to *mode* ('abs', 'high', 'low', or 'bayes'), runs a GO
    enrichment study against the genes present in both coef_df and the NCBI
    table, and draws one heatmap cell per (term, subgrouping) with
    log10(FDR) values.  Saves the figure as an SVG and returns None.

    NOTE(review): auc_vals and pheno_dict are accepted but not used in this
    body — confirm against callers.
    """
    # Fetch/parse the GO DAG and the NCBI gene2go associations (human).
    obo_fl = os.path.join(args.go_dir, "go-basic.obo")
    download_go_basic_obo(obo_fl)
    obodag = GODag(obo_fl)

    assoc_fl = os.path.join(args.go_dir, "gene2go")
    download_ncbi_associations(assoc_fl)
    objanno = Gene2GoReader(assoc_fl, taxids=[9606])
    ns2assoc = objanno.get_ns2assc()

    # Background = genes appearing both in the coefficient matrix and in the
    # NCBI symbol table, translated to NCBI ids.
    ncbi_map = {info.Symbol: ncbi_id for ncbi_id, info in GENEID2NT.items()}
    use_genes = set(coef_df.columns) & set(ncbi_map)
    bgrd_ids = [ncbi_map[gn] for gn in use_genes]

    goeaobj = GOEnrichmentStudyNS(bgrd_ids, ns2assoc, obodag,
                                  propagate_counts=False, alpha=0.05,
                                  methods=['fdr_bh'])

    plot_dict = dict()
    use_gos = set()  # NOTE(review): populated nowhere in this body
    coef_mat = coef_df.loc[:,
                           [gene in use_genes for gene in coef_df.columns]]

    # 'bayes' keeps per-gene mean and std; other modes collapse to the mean.
    if mode == 'bayes':
        coef_means = coef_mat.groupby(level=0, axis=1).mean()
        coef_stds = coef_mat.groupby(level=0, axis=1).std()
    else:
        coef_mat = coef_mat.groupby(level=0, axis=1).mean()

    for mtype, coefs in coef_mat.iterrows():
        if not isinstance(mtype, RandomType):
            # Pick the foreground genes and a colour-map hue per mode.
            if mode == 'abs':
                fgrd_ctf = coefs.abs().quantile(0.95)
                fgrd_genes = coefs.index[coefs.abs() > fgrd_ctf]
                use_clr = 3.17
            elif mode == 'high':
                fgrd_ctf = coefs.quantile(0.95)
                fgrd_genes = coefs.index[coefs > fgrd_ctf]
                use_clr = 2.03
            elif mode == 'low':
                fgrd_ctf = coefs.quantile(0.05)
                fgrd_genes = coefs.index[coefs < fgrd_ctf]
                use_clr = 1.03
            elif mode == 'bayes':
                gene_scrs = coef_means.loc[mtype].abs() - coef_stds.loc[mtype]
                fgrd_genes = gene_scrs.index[gene_scrs > 0]
                use_clr = 3.17
            else:
                raise ValueError(
                    "Unrecognized `mode` argument <{}>!".format(mode))

            fgrd_ids = [ncbi_map[gn] for gn in fgrd_genes]
            goea_out = goeaobj.run_study(fgrd_ids, prt=None)
            # Keep enriched ('e') terms significant at FDR < 0.05.
            plot_dict[mtype] = {
                rs.name: np.log10(rs.p_fdr_bh)
                for rs in goea_out
                if rs.enrichment == 'e' and rs.p_fdr_bh < 0.05
            }

    plot_df = pd.DataFrame(plot_dict, columns=plot_dict.keys())
    if plot_df.shape[0] == 0:
        print("Could not find any enriched GO terms across {} "
              "subgroupings!".format(plot_df.shape[1]))
        return None

    fig, ax = plt.subplots(figsize=(4.7 + plot_df.shape[0] / 2.3,
                                    2 + plot_df.shape[1] / 5.3))

    # Order terms by hierarchical clustering when there are enough of them.
    if plot_df.shape[0] > 2:
        plot_df = plot_df.iloc[dendrogram(linkage(distance.pdist(
            plot_df.fillna(0.0), metric='cityblock'),
                                                  method='centroid'),
                                          no_plot=True)['leaves']].transpose()
    else:
        plot_df = plot_df.transpose()

    xlabs = [rs_nm for rs_nm in plot_df.columns]
    ylabs = [
        get_fancy_label(tuple(mtype.subtype_iter())[0][1])
        for mtype in plot_df.index
    ]

    # NOTE(review): use_clr is set inside the loop above; if every mtype is a
    # RandomType it would be unbound here — confirm callers guarantee at
    # least one non-random subgrouping.
    pval_cmap = sns.cubehelix_palette(start=use_clr,
                                      rot=0, dark=0, light=1,
                                      reverse=True, as_cmap=True)

    sns.heatmap(plot_df, cmap=pval_cmap, vmin=-5, vmax=0,
                linewidths=0.23, linecolor='0.73',
                xticklabels=xlabs, yticklabels=ylabs)

    ax.set_xticklabels(xlabs, size=15, ha='right', rotation=31)
    ax.set_yticklabels(ylabs, size=9, ha='right', rotation=0)
    ax.set_xlim((plot_df.shape[1] / -83, plot_df.shape[1] * 1.009))
    ax.set_ylim((plot_df.shape[0] * 1.009, plot_df.shape[0] / -83))

    plt.savefig(os.path.join(
        plot_dir, '__'.join([args.expr_source, args.cohort]),
        "{}_go-{}-enrichment_{}.svg".format(args.gene, mode, args.classif)),
                bbox_inches='tight', format='svg')
    plt.close()
class GOEngine:
    """A wrapper around goatools' GOEnrichmentStudyNS for human/mouse GOEA.

    Downloads GO data into *work_dir* at construction, then accepts gene ids
    via load_data() and runs enrichment via run_analysis().
    """

    def __init__(
        self,
        work_dir: str = '.',
        clean_work_dir: bool = False,
        organism: str = 'human',
        study_parameters: Optional[Dict[str, Union[int, float, str, List, Dict]]] = None
    ) -> None:
        """A GOEngine that can be used for performing analysis using GOATOOLS

        Args:
            work_dir (str, optional): The path to a temp directory were
                intermediate-results and raw data will be downloaded/written
                to. Defaults to the current working directory.
            clean_work_dir (bool, optional): Whether or not to remove data
                written to the work directory at class termination, defaults
                to False.
            organism (str, optional): The organism. Defaults to 'human'.
            study_parameters (optional): A dict of parameters forwarded to
                GOEnrichmentStudyNS; defaults to
                {'propagate_counts': False, 'alpha': 0.05,
                 'methods': ['fdr_bh']}.
        """
        # BUG FIX: the default study_parameters dict was a mutable default
        # argument shared across calls; use a None sentinel instead.
        if study_parameters is None:
            study_parameters = {
                'propagate_counts': False,
                'alpha': 0.05,
                'methods': ['fdr_bh']
            }
        print("Creating a GO Engine ...")
        if not os.path.exists(work_dir):
            raise ValueError(
                f"The provided work path: {work_dir} does not exist!!!")
        self.work_dir = work_dir
        if organism != 'human' and organism != 'mouse':
            raise ValueError(
                f"The provided organism: {organism} is not support, current engine mainly work with human and moues only"
            )
        print(f"\t --> Downloading data ...")
        obo_fname = download_go_basic_obo(
            os.path.join(work_dir, 'go-basic.obo'))
        gene2go_fname = download_ncbi_associations(
            os.path.join(work_dir, 'gene2go'))
        ## parse the GO term
        print(
            f"\t --> parsing the data and intializing the base GOEA object...")
        obo_dag = GODag(obo_fname)
        # NOTE(review): both branches use the human background gene set
        # (gene2iden_human) — for 'mouse' only the taxid differs; confirm
        # this is intentional.
        taxid = 9606 if organism == 'human' else 10090
        self._goea_obj = GOEnrichmentStudyNS(
            gene2iden_human.keys(),
            Gene2GoReader(gene2go_fname, taxids=[taxid]).get_ns2assc(),
            obo_dag, **study_parameters)
        self._clean_work_dir = clean_work_dir
        self._gene_ids = None
        return

    def load_data(self, exp: "Experiment", num_proteins: int = -1) -> None:
        """Load the data to the Engine, so GOEA can be conducted

        Args:
            exp (Experiment): An Experimental object to extract uniprot ids
            num_proteins (int, optional): The number of proteins to include.
                Defaults to -1, which means use all proteins; otherwise the
                top *num_proteins* proteins by peptide count are used.

        Raises:
            ValueError: if called while data from a previous call is still
                associated with the engine.
        """
        if self._gene_ids is not None:
            raise ValueError(
                f"There some data still in the engine, the first 10 genes are: {','.join(self._gene_ids[:10])}\
            clean your engine from previous data using the function, clean_engine and try again."
            )
        print(
            f"Getting the number of peptide per protein ..., started at: {time.ctime()}"
        )
        num_protein_per_peptides = exp.get_peptides_per_protein()
        if num_proteins == -1:
            list_proteins = num_protein_per_peptides.iloc[:, 0].to_list()
        else:
            list_proteins = num_protein_per_peptides.iloc[:, 0].to_list(
            )[:num_proteins]
        print(
            f"Map uniprot to Entrez gene ids ..., starting at: {time.ctime()}")
        self._gene_ids = [
            int(gene_id) for gene_id in map_from_uniprot_to_Entrez_Gene(
                list_proteins).iloc[:, 1].to_list()
        ]
        print(f"{len(self._gene_ids)} Genes have been correctly loaded")
        return

    def run_analysis(
        self,
        quite: bool = False,
        only_signifcant: bool = True,
        significance_level: float = 0.05,
        get_list_term: bool = False
    ) -> Union[pd.DataFrame, List["GOEnrichmentRecord"]]:
        """Run the enrichment study on the loaded gene ids.

        (The parameter names 'quite' and 'only_signifcant' are misspelled in
        the original API and are kept for caller compatibility.)
        """
        if quite:
            goea_results = self._goea_obj.run_study(self._gene_ids, prt=None)
        else:
            goea_results = self._goea_obj.run_study(self._gene_ids)
        if only_signifcant:
            goea_results = [
                res for res in goea_results
                if res.p_fdr_bh < significance_level
            ]
        if get_list_term:
            return goea_results
        # Round-trip through a TSV file to obtain a DataFrame representation.
        temp_path = os.path.join(self.work_dir, 'temp_file.tsv')
        self._goea_obj.wr_tsv(temp_path, goea_results)
        results_df = pd.read_csv(temp_path, sep='\t')
        # Idiom fix: os.remove instead of shelling out via `rm -f`.
        os.remove(temp_path)
        return results_df

    def clean_engine(self) -> None:
        """Remove Current list of gene ids associated with the engine
        """
        self._gene_ids = None
        return

    def __del__(self) -> None:
        """class destructor, clean work directory if clean_work_dir is set to True

        BUG FIX: this previously read ``self.clean_work_dir``, which is never
        assigned (the constructor stores ``self._clean_work_dir``), so every
        destruction raised AttributeError.
        """
        if self._clean_work_dir:
            os.system(f"rm -f {self.work_dir}/*")
        return