# Script section: load the study genes from an Excel sheet, run the GO
# enrichment analysis on them and export the significant results.

# Maps gene id (int) -> gene symbol for every study gene read from the sheet.
geneid2symbol = {}

# Get xlsx filename where data is stored.
# The Excel file contains 3 columns: gene symbols (our test data), their
# respective gene ids, and their adjusted p-values.
# NOTE(review): the original comment called the ids "ENSEMBL gene ids", but
# int(geneid) below only works for purely numeric ids -- confirm the column
# holds numeric (e.g. NCBI) gene ids.
din_xlsx = r"C:\Users\krishna\Downloads\padj_converted.xlsx"

# Read data
if not os.path.isfile(din_xlsx):
    # The original message referenced an undefined name (fin_xlsx), which
    # turned this intended RuntimeError into a NameError.
    raise RuntimeError('CANNOT READ: {XLSX}'.format(XLSX=din_xlsx))

# NOTE(review): recent xlrd releases read only .xls files -- confirm the
# installed xlrd version can still open .xlsx workbooks.
import xlrd

book = xlrd.open_workbook(din_xlsx)
pg = book.sheet_by_index(0)
for r in range(pg.nrows):
    # Only the symbol (column 0) and gene id (column 1) are needed; the
    # adjusted p-value column is not used by this script.
    symbol = pg.cell_value(r, 0)
    geneid = pg.cell_value(r, 1)
    if geneid:  # skip rows whose gene-id cell is empty
        geneid2symbol[int(geneid)] = symbol
print('READ: {XLSX}'.format(XLSX=din_xlsx))

### 5. Run Gene Ontology Enrichment Analysis (GOEA)
# 'p_' means "pvalue". 'fdr_bh' is the multipletest method we are currently using.
# `goeaobj` is expected to have been created earlier in the file.
geneids_study = geneid2symbol.keys()
goea_results_all = goeaobj.run_study(geneids_study)
# Keep only the results whose FDR-corrected p-value is below 0.05.
goea_results_sig = [res for res in goea_results_all if res.p_fdr_bh < 0.05]

### Export the analysis results: one file with gene symbols (ids translated
### through geneid2symbol) and a second file with the raw gene ids.
goeaobj.wr_xlsx("GO_symbols.xlsx", goea_results_sig, itemid2name=geneid2symbol)
goeaobj.wr_xlsx("GO_geneids.xlsx", goea_results_sig)
def pullGOenrichment(inputFile, project):
    """Run a GO enrichment analysis on the genes in a CSV file and save the hits.

    Builds a ``GOEnrichmentStudyNS`` whose population is the keys of
    ``genes_NCBI_9606_ProteinCoding.GENEID2NT``, reads the study genes from
    ``inputFile``, runs the study and writes the results with an FDR-BH
    corrected p-value below 0.05 to two files under ``Data/``.

    Args:
        inputFile: Path to a CSV file whose first column is a gene id
            (convertible with ``int``) and whose second column is the gene
            symbol. Rows with an empty gene id, and completely blank rows,
            are skipped.
        project: String inserted into the output file names
            (``Data/go_enrichment<project>.csv`` and ``.txt``).

    Returns:
        None. Results are written to disk and a summary is printed.

    Raises:
        ValueError: If a non-empty gene id cannot be converted to ``int``.
        IndexError: If a non-blank row has fewer than two columns.
    """
    GeneID2nt_hum = genes_NCBI_9606_ProteinCoding.GENEID2NT
    obo_fname = download_go_basic_obo()
    fin_gene2go = download_ncbi_associations()
    # Load the ontology from the path reported by the download helper rather
    # than repeating the file name as a second hard-coded literal.
    obodag = GODag(obo_fname)

    # Read NCBI's gene2go. Store annotations in a list of namedtuples
    objanno = Gene2GoReader(fin_gene2go, taxids=[9606])

    # Get namespace2association where:
    #    namespace is:
    #        BP: biological_process
    #        MF: molecular_function
    #        CC: cellular_component
    #    association is a dict:
    #        key: NCBI GeneID
    #        value: A set of GO IDs associated with that gene
    ns2assoc = objanno.get_ns2assc()
    for nspc, id2gos in ns2assoc.items():
        print("{NS} {N:,} annotated human genes".format(NS=nspc, N=len(id2gos)))
    print(len(GeneID2nt_hum))

    goeaobj = GOEnrichmentStudyNS(
        GeneID2nt_hum.keys(),  # List of human protein-coding genes
        ns2assoc,  # geneid/GO associations
        obodag,  # Ontologies
        propagate_counts=False,
        alpha=0.05,  # default significance cut-off
        methods=['fdr_bh'])  # default multipletest correction method

    # Build gene id -> symbol for the study set. newline='' is what the csv
    # module asks for so quoted fields with embedded newlines parse correctly;
    # the with-block closes the file, so no explicit close() is needed.
    geneid2symbol = {}
    with open(inputFile, 'r', newline='') as infile:
        for row in csv.reader(infile):
            if not row:
                # csv.reader yields [] for blank lines; indexing it would fail.
                continue
            geneid = row[0]
            symbol = row[1]
            if geneid:
                geneid2symbol[int(geneid)] = symbol

    geneids_study = geneid2symbol.keys()
    goea_results_all = goeaobj.run_study(geneids_study)
    # Keep only the results whose FDR-corrected p-value is below 0.05.
    goea_results_sig = [res for res in goea_results_all if res.p_fdr_bh < 0.05]

    # Summarise how many significant terms fall in each namespace.
    import collections as cx
    ctr = cx.Counter(res.NS for res in goea_results_sig)
    print('Significant results[{TOTAL}] = {BP} BP + {MF} MF + {CC} CC'.format(
        TOTAL=len(goea_results_sig),
        BP=ctr['BP'],  # biological_process
        MF=ctr['MF'],  # molecular_function
        CC=ctr['CC']))  # cellular_component

    # NOTE(review): wr_xlsx is given a ".csv" file name; presumably the content
    # is still Excel-format -- confirm before renaming, since callers may
    # already rely on this path. The Data/ directory is not created here.
    goeaobj.wr_xlsx("Data/go_enrichment" + project + ".csv", goea_results_sig)
    goeaobj.wr_txt("Data/go_enrichment" + project + ".txt", goea_results_sig)