# Report how many of the tested GO terms are in the significant subset.
n_significant = len(goea_quiet_sig)
print('{N} of {M:,} results were significant'.format(
    N=n_significant,
    M=len(goea_quiet_all)))

# Tally the significant results by their enrichment flag:
# 'e' is reported as enriched and 'p' as purified.
enrichment_counts = cx.Counter(r.enrichment for r in goea_quiet_sig)
print('Significant results: {E} enriched, {P} purified'.format(
    E=enrichment_counts['e'],
    P=enrichment_counts['p']))

# Tally the significant results by GO namespace.
ctr = cx.Counter(r.NS for r in goea_quiet_sig)
print('Significant results[{TOTAL}] = {BP} BP + {MF} MF + {CC} CC'.format(
    TOTAL=n_significant,
    BP=ctr['BP'],  # biological_process
    MF=ctr['MF'],  # molecular_function
    CC=ctr['CC']))  # cellular_component

# Write the significant results as text (the Excel export is left disabled).
#goeaobj.wr_xlsx("CDK1_test.xlsx", goea_quiet_sig)
goeaobj.wr_txt("CDK1_test.txt", goea_quiet_sig)

# GO terms to use as the source ids for the plot.
goid_subset = [
    'GO:0003723',  # MF D04 RNA binding (32 genes)
    'GO:0044822',  # MF D05 poly(A) RNA binding (86 genes)
    'GO:0003729',  # MF D06 mRNA binding (11 genes)
    'GO:0019843',  # MF D05 rRNA binding (6 genes)
    'GO:0003746',  # MF D06 translation elongation factor activity (5 genes)
]

plot_gos(
    "nbt3102_MF_RNA_genecnt.png",
    goid_subset,  # Source GO ids
    obodag,
    goea_results=goea_quiet_all,  # Use pvals for coloring
)
def _read_geneid2symbol(csv_path):
    """Read a gene-list CSV into a ``{int GeneID: symbol}`` dict.

    The first column of each row is parsed as an integer gene ID and the
    second column is stored as its symbol.  Blank rows and rows whose first
    column is empty are skipped; a row without a second column is stored
    with an empty symbol.

    Raises:
        ValueError: If a non-empty first column is not an integer
            (for example a header row).
    """
    geneid2symbol = {}
    # newline='' lets the csv module do its own newline handling, as its
    # documentation recommends.  The ``with`` block closes the file.
    with open(csv_path, 'r', newline='') as infile:
        for row in csv.reader(infile):
            if not row:
                continue  # csv.reader yields [] for blank lines
            geneid = row[0].strip()
            if not geneid:
                continue
            geneid2symbol[int(geneid)] = row[1] if len(row) > 1 else ''
    return geneid2symbol


def pullGOenrichment(inputFile, project, alpha=0.05):
    """Run a GO enrichment study for a gene list and write significant terms.

    Downloads the GO ontology and NCBI gene2go associations, tests the gene
    IDs listed in ``inputFile`` against the human protein-coding gene
    population, prints summary counts, and writes the terms that pass the
    FDR cut-off to ``Data/go_enrichment<project>.csv`` and
    ``Data/go_enrichment<project>.txt``.

    Args:
        inputFile: Path to a CSV whose first column is an NCBI GeneID and
            whose second column is the gene symbol.
        project: Suffix appended to the output file names.
        alpha: Significance cut-off, used both for the study object and for
            filtering on the Benjamini-Hochberg FDR value.  Defaults to 0.05.

    Raises:
        ValueError: If a non-empty GeneID column in ``inputFile`` is not an
            integer.
    """
    import collections as cx

    GeneID2nt_hum = genes_NCBI_9606_ProteinCoding.GENEID2NT

    obo_fname = download_go_basic_obo()
    fin_gene2go = download_ncbi_associations()
    # Load the file that was just downloaded instead of repeating its name.
    obodag = GODag(obo_fname)

    # Read NCBI's gene2go. Store annotations in a list of namedtuples
    objanno = Gene2GoReader(fin_gene2go, taxids=[9606])

    # Get namespace2association where:
    #    namespace is:
    #        BP: biological_process
    #        MF: molecular_function
    #        CC: cellular_component
    #    association is a dict:
    #        key: NCBI GeneID
    #        value: A set of GO IDs associated with that gene
    ns2assoc = objanno.get_ns2assc()

    for nspc, id2gos in ns2assoc.items():
        print("{NS} {N:,} annotated human genes".format(NS=nspc, N=len(id2gos)))

    print(len(GeneID2nt_hum))

    goeaobj = GOEnrichmentStudyNS(
        GeneID2nt_hum.keys(),  # List of human protein-coding genes
        ns2assoc,  # geneid/GO associations
        obodag,  # Ontologies
        propagate_counts=False,
        alpha=alpha,  # significance cut-off
        methods=['fdr_bh'])  # default multipletest correction method

    geneid2symbol = _read_geneid2symbol(inputFile)
    geneids_study = geneid2symbol.keys()

    goea_results_all = goeaobj.run_study(geneids_study)
    goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < alpha]

    ctr = cx.Counter(r.NS for r in goea_results_sig)
    print('Significant results[{TOTAL}] = {BP} BP + {MF} MF + {CC} CC'.format(
        TOTAL=len(goea_results_sig),
        BP=ctr['BP'],  # biological_process
        MF=ctr['MF'],  # molecular_function
        CC=ctr['CC']))  # cellular_component

    out_prefix = "Data/go_enrichment" + project
    # NOTE(review): wr_xlsx presumably writes an Excel workbook, so the
    # ".csv" extension looks misleading.  The path is left unchanged because
    # other code may read it -- confirm before renaming.
    goeaobj.wr_xlsx(out_prefix + ".csv", goea_results_sig)
    goeaobj.wr_txt(out_prefix + ".txt", goea_results_sig)