def main(BgShelve, BgFile, BgGeneIDCol, BgTermCol, BgStartRow1,
         FgFile, FgGeneIDCol, FgTermCol, FgStartRow1, recomputeBg):
    """Print a term-enrichment table (p-value and FDR) for a foreground set.

    Background term data is taken from the "BG" entry of the shelve file
    ``BgShelve``.  It is rebuilt with ``calTerms`` from ``BgFile`` and written
    back to the shelve when ``recomputeBg`` is true, or when the cached entry
    is missing, empty, or lacks the "__$$$NGENES" key.  Foreground term data
    is always built with ``calTerms`` from ``FgFile``.

    For each foreground term (keys containing "__$$$" are bookkeeping entries
    and are skipped) ``pvalue_enrichment(pop, popt, sam, samt)`` is called
    with:

    * pop  -- ``BGData["__$$$NGENES"]``
    * popt -- length of the term's background entry
    * sam  -- ``FGData["__$$$NGENES"]``
    * samt -- length of the term's foreground entry

    Terms are then visited in ascending p-value order and
    ``FDR = pvalue * FGData["__$$$NTERMS"] / (number of terms with p-value <=
    this one)`` is computed.  One tab-separated row per term is printed to
    stdout with the columns FDR, p-value, term, popt, pop, popt/pop, samt,
    sam, samt/sam, genes.  A diagnostic line per term is printed to stderr.

    A foreground term that does not occur in the background is reported on
    stderr and left out of the table.

    Parameters:
        BgShelve: path of the shelve file caching the background data.
        BgFile, BgGeneIDCol, BgTermCol, BgStartRow1: passed unchanged to
            ``calTerms`` for the background (presumably input file, column
            selectors and first data row -- confirm against ``calTerms``).
        FgFile, FgGeneIDCol, FgTermCol, FgStartRow1: same, for the foreground.
        recomputeBg: when true, ignore the cached background and rebuild it.

    Returns:
        None.
    """
    BGData = None

    if not recomputeBg:
        saved = shelve.open(BgShelve)
        try:
            if "BG" in saved:
                BGData = saved["BG"]
        finally:
            # Release the shelve handle even if reading the entry fails.
            saved.close()
        # A missing, empty or incomplete cache entry forces a rebuild.
        if not BGData or "__$$$NGENES" not in BGData:
            recomputeBg = True

    if recomputeBg:
        print >> stderr, "recal BG"
        BGData = dict()
        calTerms(BGData, BgFile, BgGeneIDCol, BgTermCol, BgStartRow1)
        saved = shelve.open(BgShelve)
        try:
            saved["BG"] = BGData
        finally:
            saved.close()

    FGData = dict()
    calTerms(FGData, FgFile, FgGeneIDCol, FgTermCol, FgStartRow1)

    # now calculate GO enrichment and FDR
    nGenesBG = BGData["__$$$NGENES"]
    nTermsFG = FGData["__$$$NTERMS"]
    nGenesFG = FGData["__$$$NGENES"]

    # Maps each distinct p-value to the list of result vectors sharing it.
    pvalueTermMap = dict()
    for term in FGData:
        if "__$$$" in term:
            continue
        if term not in BGData:
            print >> stderr, "Error: term", term, "not found in background"
            # Skip the term: indexing BGData[term] here would raise KeyError
            # right after the message above.
            continue

        BGTermEntry = BGData[term]
        FGTermEntry = FGData[term]
        pop = nGenesBG
        popt = len(BGTermEntry)
        sam = nGenesFG
        samt = len(FGTermEntry)
        pvalue = pvalue_enrichment(pop, popt, sam, samt)

        resultVector = [term, pop, popt, sam, samt, FGTermEntry]
        pvalueTermMap.setdefault(pvalue, []).append(resultVector)
        print >> stderr, resultVector, ",pvalue=", pvalue

    # Header row; the print-chevron form is kept so the emitted bytes
    # (including the space the print statement puts before each tab) match
    # the data rows below.
    for column in ("FDR", "p-value", "term", "popt", "pop", "popt/pop",
                   "samt", "sam", "samt/sam"):
        print >> stdout, column, "\t",
    print >> stdout, "genes", "\t"

    # now sort pvalue and calculate FDR
    nInc = 0  # number of terms with a p-value <= the current one
    for pvalue in sorted(pvalueTermMap):
        termsWithThisPvalue = pvalueTermMap[pvalue]
        nInc += len(termsWithThisPvalue)
        FDR = (pvalue * nTermsFG) / nInc

        for term, pop, popt, sam, samt, FGTermEntry in termsWithThisPvalue:
            row = (FDR, pvalue, term, popt, pop, float(popt) / pop,
                   samt, sam, float(samt) / sam)
            for value in row:
                print >> stdout, value, "\t",
            print >> stdout, ",".join(FGTermEntry), "\t"
def main(BgShelve, BgFile, BgGeneIDCol, BgTermCol, BgStartRow1,
         FgFile, FgGeneIDCol, FgTermCol, FgStartRow1, recomputeBg):
    """Print a term-enrichment table (p-value and FDR) for a foreground set.

    Background term data is taken from the "BG" entry of the shelve file
    ``BgShelve``.  It is rebuilt with ``calTerms`` from ``BgFile`` and written
    back to the shelve when ``recomputeBg`` is true, or when the cached entry
    is missing, empty, or lacks the "__$$$NGENES" key.  Foreground term data
    is always built with ``calTerms`` from ``FgFile``.

    For each foreground term (keys containing "__$$$" are bookkeeping entries
    and are skipped) ``pvalue_enrichment(pop, popt, sam, samt)`` is called
    with:

    * pop  -- ``BGData["__$$$NGENES"]``
    * popt -- length of the term's background entry
    * sam  -- ``FGData["__$$$NGENES"]``
    * samt -- length of the term's foreground entry

    Terms are then visited in ascending p-value order and
    ``FDR = pvalue * FGData["__$$$NTERMS"] / (number of terms with p-value <=
    this one)`` is computed.  One tab-separated row per term is printed to
    stdout with the columns FDR, p-value, term, popt, pop, popt/pop, samt,
    sam, samt/sam, genes.  A diagnostic line per term is printed to stderr.

    A foreground term that does not occur in the background is reported on
    stderr and left out of the table.

    Parameters:
        BgShelve: path of the shelve file caching the background data.
        BgFile, BgGeneIDCol, BgTermCol, BgStartRow1: passed unchanged to
            ``calTerms`` for the background (presumably input file, column
            selectors and first data row -- confirm against ``calTerms``).
        FgFile, FgGeneIDCol, FgTermCol, FgStartRow1: same, for the foreground.
        recomputeBg: when true, ignore the cached background and rebuild it.

    Returns:
        None.
    """
    BGData = None

    if not recomputeBg:
        saved = shelve.open(BgShelve)
        try:
            if "BG" in saved:
                BGData = saved["BG"]
        finally:
            # Release the shelve handle even if reading the entry fails.
            saved.close()
        # A missing, empty or incomplete cache entry forces a rebuild.
        if not BGData or "__$$$NGENES" not in BGData:
            recomputeBg = True

    if recomputeBg:
        print >> stderr, "recal BG"
        BGData = dict()
        calTerms(BGData, BgFile, BgGeneIDCol, BgTermCol, BgStartRow1)
        saved = shelve.open(BgShelve)
        try:
            saved["BG"] = BGData
        finally:
            saved.close()

    FGData = dict()
    calTerms(FGData, FgFile, FgGeneIDCol, FgTermCol, FgStartRow1)

    # now calculate GO enrichment and FDR
    nGenesBG = BGData["__$$$NGENES"]
    nTermsFG = FGData["__$$$NTERMS"]
    nGenesFG = FGData["__$$$NGENES"]

    # Maps each distinct p-value to the list of result vectors sharing it.
    pvalueTermMap = dict()
    for term in FGData:
        if "__$$$" in term:
            continue
        if term not in BGData:
            print >> stderr, "Error: term", term, "not found in background"
            # Skip the term: indexing BGData[term] here would raise KeyError
            # right after the message above.
            continue

        BGTermEntry = BGData[term]
        FGTermEntry = FGData[term]
        pop = nGenesBG
        popt = len(BGTermEntry)
        sam = nGenesFG
        samt = len(FGTermEntry)
        pvalue = pvalue_enrichment(pop, popt, sam, samt)

        resultVector = [term, pop, popt, sam, samt, FGTermEntry]
        pvalueTermMap.setdefault(pvalue, []).append(resultVector)
        print >> stderr, resultVector, ",pvalue=", pvalue

    # Header row; the print-chevron form is kept so the emitted bytes
    # (including the space the print statement puts before each tab) match
    # the data rows below.
    for column in ("FDR", "p-value", "term", "popt", "pop", "popt/pop",
                   "samt", "sam", "samt/sam"):
        print >> stdout, column, "\t",
    print >> stdout, "genes", "\t"

    # now sort pvalue and calculate FDR
    nInc = 0  # number of terms with a p-value <= the current one
    for pvalue in sorted(pvalueTermMap):
        termsWithThisPvalue = pvalueTermMap[pvalue]
        nInc += len(termsWithThisPvalue)
        FDR = (pvalue * nTermsFG) / nInc

        for term, pop, popt, sam, samt, FGTermEntry in termsWithThisPvalue:
            row = (FDR, pvalue, term, popt, pop, float(popt) / pop,
                   samt, sam, float(samt) / sam)
            for value in row:
                print >> stdout, value, "\t",
            print >> stdout, ",".join(FGTermEntry), "\t"
#!/usr/bin/python
from hypergeom import pvalue_enrichment

# Call pvalue_enrichment 999 times with the same fixed counts and print every
# result (looks like a repeatability/timing exercise -- confirm intent).
for i in range(1, 1000):
    r = pvalue_enrichment(19507150, 11324, 22457, 17)
    print(r)