def updateCons(path, db, motifChrom='chr17'):
    """update conservation scores in gzipped fixedStep wiggle format

    For each ``*.wigFix.gz`` file in ``path`` one value per motif document of
    the ``"hg19" + motifChrom`` collection is computed and stored on that
    document under ``cons.<consName>``.

    Arguments:
    path -- directory that is globbed for ``*.wigFix.gz`` files
    db -- object indexed by collection name; the selected collection is used
          through ``find()``, ``count()`` and ``update()``
    motifChrom -- appended to ``"hg19"`` to pick the collection

    File names are split on ``.``: the first field is taken as the chromosome
    and the fields between it and the last two are joined with ``_`` to form
    ``consName``.

    Returns 0.  Motifs whose value ends up as the string 'NA' are not written.
    """
    mcollection = db["hg19"+motifChrom]
    for infile in glob.glob(os.path.join(path, "*.wigFix.gz")):
        (wigpath,wigfilename) = os.path.split(infile)
        chrom = wigfilename.split('.')[0]
        consName = '_'.join(wigfilename.split('.')[1:-2])
        print 'updating', consName
        with gzip.open(infile) as wigFile:
            # The ".bw" file is a cache: the wiggle file is only handed to
            # compressFixWig when the cache does not exist yet.
            # NOTE(review): the cache name contains no chromosome, so inputs
            # for different chromosomes of the same track map to one path.
            bwFile = os.path.join(wigpath,consName+'.bw')
            if not os.path.isfile(bwFile):
                countWig.compressFixWig(wigFile, consName, bwFile)
            stepDict, startDict, valuesDict = countWig.getBinFixStart(bwFile,consName)
            # start: sequence of block start positions for this chromosome
            # (presumably ascending -- the interval test below relies on it)
            start = startDict[consName][chrom]
            # arrayDict[blockStart] -> (xs, xvals, sums, ll); ll is used below
            # as the block length, i.e. the block spans ss .. ss+ll-1
            arrayDict = countWig.buildFixHist(chrom,stepDict,startDict,valuesDict,consName)
            # every document of the collection is scored, no filter
            cursor = mcollection.find()
            print mcollection.count()
            for test in cursor:
                motifStart, motifEnd = test["genomic_region"]["start"], test["genomic_region"]["end"]
                # avg is either a number or the sentinel string 'NA'
                # NOTE(review): a motif with an empty startlist keeps avg == 0
                # and is written as 0 rather than skipped -- confirm intent
                avg = 0
                # starts of the blocks whose [start[i], start[i+1]) range holds
                # the motif start or the motif end
                startlist = [start[i] for i in xrange(len(start)-1) if (motifStart >= start[i] and motifStart < start[i+1]) or (motifEnd >= start[i] and motifEnd < start[i+1])]
                # the last block has no successor, so it is added explicitly
                if motifEnd > start[-1]:
                    startlist.append(start[-1])
                for i in xrange(len(startlist)):
                    ss = startlist[i]
                    xs, xvals, sums, ll = arrayDict[ss]
                    # Partial overlaps add queryHist(...)[0] weighted by
                    # (overlap length / motif length); [0] is presumably the
                    # mean over the queried range -- confirm in countWig.
                    if motifStart < ss <= motifEnd <= ss+ll-1:##left out, right in
                        # an earlier block may have set 'NA'; reset it on the
                        # last candidate so the += below works on a number
                        if avg == 'NA' and i == len(startlist)-1:
                            avg = 0
                        avg += countWig.queryHist(xs,xvals, sums, ss, motifEnd)[0] *(motifEnd - ss + 1) /(motifEnd - motifStart + 1)
                    elif ss <= motifStart < motifEnd <= ss+ll-1:##in array
                        # whole motif inside one block: take the value directly
                        avg = countWig.queryHist(xs,xvals, sums, motifStart, motifEnd)[0]
                    elif motifStart < ss and ss+ll-1 < motifEnd:##motif > array
                        if avg == 'NA':
                            avg = 0
                        avg += countWig.queryHist(xs,xvals, sums, ss, ss+ll-1)[0] * ll /(motifEnd - motifStart + 1)
                    elif ss <= motifStart <= ss+ll-1 < motifEnd:##left in, right out
                        if avg == 'NA' and i == len(startlist)-1:
                            avg = 0
                        avg += countWig.queryHist(xs,xvals, sums, motifStart, ss+ll-1)[0] *(ss + ll - motifStart) /(motifEnd - motifStart + 1)
                    elif ss+ll-1 < motifStart:
                        # block ends before the motif starts: no data from this
                        # block; mark 'NA' unless something was accumulated
                        if avg == 0:
                            avg = 'NA'
                    elif motifEnd < ss:
                        print "this should not happen-- motifStart < motifEnd < ss "
                        if avg == 0:
                            avg = 'NA'
                if avg != 'NA':
                    mcollection.update({"_id":test["_id"]},{"$set":{"cons."+consName: avg}}, upsert = True)
    return 0
def updateFOS(path, db, motifChrom="chr17", dgfCutoff=20, method="Binom"):#, flankWin=35):
    """calculate fos from discontinuous variableStep wiggle files with two
    method options: NSD/Binomial test

    Every file matching ``wgEncodeUwDgf*_cut<motifChrom>`` in ``path`` is
    binned into ``<prefix><motifChrom>.bw`` (an existing file is reused) and
    queried for each motif document of the ``"hg19" + motifChrom`` collection
    whose ``tf_name`` is IRF3, MAFK, NFYA, SIN3A or ZNF384.  ``dgf.fos`` is
    set on a motif when fos > 0.95 and the count in the +/-100 window minus
    the count inside the motif exceeds ``dgfCutoff``.

    Arguments:
    path -- directory holding the per-chromosome wiggle files
    db -- object indexed by collection name
    motifChrom -- chromosome name; selects both the files and the collection
    dgfCutoff -- minimum flanking count required to store a score
    method -- "NSD" or "Binom"

    Returns 0.  Raises ValueError when a motif has to be scored with any
    other ``method``.
    """
    mcollection = db["hg19"+motifChrom]
    # one-argument print() gives the same output on Python 2 and Python 3
    print('updating fos %s' % motifChrom)
    for infile in glob.glob(os.path.join(path,"wgEncodeUwDgf*_cut"+motifChrom)):
        # The glob pattern ends with motifChrom, so cut it off the end of the
        # match; splitting on the first occurrence gives a wrong prefix when
        # the directory part of the path contains the chromosome name too.
        wigfilename = infile[:len(infile)-len(motifChrom)]
        expName = "fos"
        ctName = wigfilename.split('EncodeUwDgf')[-1].split('Aln')[0]
        bwFile = wigfilename+motifChrom+'.bw'
        if not os.path.isfile(bwFile):
            # open the wiggle file only when it must be binned and make sure
            # the handle is closed again
            with open(infile,'rt') as wigFile:
                countWig.compressVarWig(wigFile, expName, wigfilename)
        coordDict, valuesDict = countWig.getBinVarCoord(bwFile,ctName)
        hist = None  # built for the first motif, then shared
        cursor = mcollection.find({"tf_name":{"$in": ["IRF3","MAFK","NFYA","SIN3A","ZNF384"]}})
        for test in cursor:
            if hist is None:
                hist = countWig.buildVarHist(motifChrom,coordDict,valuesDict,ctName)
            xs, xvals, sums = hist
            motifStart = test["genomic_region"]["start"]
            motifEnd = test["genomic_region"]["end"]
            # flank width scales with the motif length
            flankWin = round((motifEnd - motifStart + 1)*1.75)
            flankL = max(0, motifStart - flankWin)
            flankR = motifEnd + flankWin
            # element [2] of the queryHist result is used as the count
            countTotL = countWig.queryHist(xs, xvals, sums, flankL, motifEnd, varWindow=True)[2]
            countTotR = countWig.queryHist(xs, xvals, sums, motifStart, flankR, varWindow=True)[2]
            countCent = countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[2]
            count = countWig.queryHist(xs, xvals, sums, motifStart-100, motifEnd+100, varWindow=True)[2]
            if method == "NSD":
                try:
                    # force true division: with plain ints Python 2 would
                    # truncate both fractions to 0 or 1
                    total = float(count)
                    fos = np.sqrt((count-countCent)/total)-np.sqrt(countCent/total)
                except ZeroDivisionError:
                    fos = 0
            elif method == "Binom":
                try:
                    fos = min(1 - binom.cdf(countCent,countTotL,float(motifEnd-motifStart)/(motifEnd-flankL)),
                              1 - binom.cdf(countCent,countTotR,float(motifEnd-motifStart)/(flankR-motifStart)))
                except ZeroDivisionError:
                    fos = 0
            else:
                # previously this surfaced as a NameError on ``fos``
                raise ValueError("unknown method %r, expected 'NSD' or 'Binom'" % (method,))
            if fos > 0.95 and count-countCent > dgfCutoff:
                mcollection.update({"_id":test["_id"]},{"$set":{"dgf.fos": fos}}, upsert = True)
    return 0
def updateCount(path, tfName):
    """update count data in discontinuous variableStep wiggle format

    For every ``*.wig`` file in ``path`` the file is binned with
    ``countWig.compressVarWig`` and each motif document with
    ``tf_name == tfName`` gets ``ct_info.accessibility_score.Dnase`` set to
    element [0] of the ``queryHist`` result for the motif range, after which
    the document is saved.

    ``mcollection`` is not defined here; it has to exist at module level.
    NOTE(review): a later definition of ``updateCount`` further down in this
    file replaces this one when the module is loaded.

    Arguments:
    path -- directory that is globbed for ``*.wig`` files
    tfName -- value matched against the ``tf_name`` field

    Returns 0.
    """
    for infile in glob.glob(os.path.join(path,"*.wig")):
        wigfilename = os.path.splitext(infile)[0]
        ##depends on the data type and source
        expName = "Dnase"##or add an expName parser line
        ctName = wigfilename.split('EncodeUwDnase')[-1].split('Aln')[0]
        # close the wiggle file as soon as it has been binned
        with open(infile,'rt') as wigFile:
            countWig.compressVarWig(wigFile, expName, wigfilename)
        arrayDict = {}  # chromosome -> (xs, xvals, sums)
        cursor = mcollection.find({"tf_name": tfName})
        for test in cursor:
            region = test["motif_genomic_regions_info"]
            motifChrom = region["chr"]
            motifStart = region["start"]
            motifEnd = region["end"]
            if motifChrom not in arrayDict:
                # The binned file name depends on the chromosome, which is
                # only known per motif.  Loading it before the loop read
                # motifChrom before it was assigned (UnboundLocalError).
                # presumably compressVarWig writes one <prefix><chrom>.bw per
                # chromosome -- confirm against countWig
                coordDict, valuesDict = countWig.getBinVarCoord(wigfilename+motifChrom+'.bw',ctName)
                arrayDict[motifChrom] = countWig.buildHist(motifChrom,coordDict,valuesDict,ctName)
            xs, xvals, sums = arrayDict[motifChrom]
            count = countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[0]
            test["ct_info"]["accessibility_score"][expName] = count
            mcollection.save(test)
    return 0
def updateCount(path, db, motifChrom='chr17', window=100):
    """update count data in discontinuous variableStep wiggle format

    Every file matching ``wgEncodeUwDgf*_cut<motifChrom>`` in ``path`` is
    binned into ``<prefix><motifChrom>.bw`` (an existing file is reused).
    For each document of the ``"hg19" + motifChrom`` collection, element [0]
    of the ``queryHist`` result over the motif range widened by ``window`` on
    both sides is stored under ``dgf.<ctName>``.

    Arguments:
    path -- directory holding the per-chromosome wiggle files
    db -- object indexed by collection name
    motifChrom -- chromosome name; selects both the files and the collection
    window -- number of positions added on each side of the motif

    Returns 0.
    """
    mcollection = db["hg19"+motifChrom]
    for infile in glob.glob(os.path.join(path,"wgEncodeUwDgf*_cut"+motifChrom)):
        # The glob pattern ends with motifChrom, so strip it from the end of
        # the match; splitting on the first occurrence gives a wrong prefix
        # when a directory in the path contains the chromosome name as well.
        wigfilename = infile[:len(infile)-len(motifChrom)]
        ##depends on the data type and source
        expName = "dgf"##or add an expName parser line
        ctName = wigfilename.split('EncodeUwDgf')[-1].split('Aln')[0]
        bwFile = wigfilename+motifChrom+'.bw'
        if not os.path.isfile(bwFile):
            # only open the wiggle file when it has to be binned, and close it
            with open(infile,'rt') as wigFile:
                countWig.compressVarWig(wigFile, expName, wigfilename)
        coordDict, valuesDict = countWig.getBinVarCoord(bwFile,ctName)
        hist = None  # built for the first motif, then shared
        cursor = mcollection.find()
        for test in cursor:
            motifStart = test["genomic_region"]["start"]
            motifEnd = test["genomic_region"]["end"]
            if hist is None:
                hist = countWig.buildVarHist(motifChrom,coordDict,valuesDict,ctName)
            xs, xvals, sums = hist
            count = countWig.queryHist(xs, xvals, sums, motifStart-window, motifEnd+window, varWindow=True)[0]
            mcollection.update({"_id":test["_id"]},{"$set":{expName+"."+ctName: count}}, upsert = True)
    return 0
def updateFOS(path, tfName, motifChrom, method="NSD", flankWin=35):
    """calculate fos from discontinuous variableStep wiggle files with two
    method options: NSD/Binomial test

    For every ``*.wig`` file in ``path`` the already binned file
    ``<file without extension><motifChrom>.bw`` is loaded and queried for each
    motif document matching ``tfName``, ``motif_score < 1e-4`` and
    ``motifChrom``.  Motifs with fos > 0.95 and more than 18 counts in the
    +/-100 window outside the motif are printed as a tab separated line
    (chromosome, start, end, fos); nothing is written to the database.

    ``mcollection`` is not defined here; it has to exist at module level.

    Arguments:
    path -- directory that is globbed for ``*.wig`` files
    tfName -- value matched against the ``tf_name`` field
    motifChrom -- chromosome name used in the query and the ``.bw`` file name
    method -- "NSD" or "Binom"
    flankWin -- number of positions added on each side of the motif

    Returns 0.  Raises ValueError when a motif has to be scored with any
    other ``method``.
    """
    for infile in glob.glob(os.path.join(path,"*.wig")):
        wigfilename = os.path.splitext(infile)[0]
        ctName = wigfilename.split('EncodeUwDgf')[-1].split('Aln')[0]
        coordDict, valuesDict = countWig.getBinVarCoord(wigfilename+motifChrom+'.bw',ctName)
        hist = None  # built for the first motif, then shared
        cursor = mcollection.find({"tf_name": tfName,
                                   "motif_score":{"$lt":1e-4},
                                   "motif_genomic_regions_info.chr": motifChrom})
        for test in cursor:
            if hist is None:
                hist = countWig.buildVarHist(motifChrom,coordDict,valuesDict,ctName)
            xs, xvals, sums = hist
            motifStart = test["motif_genomic_regions_info"]["start"]
            motifEnd = test["motif_genomic_regions_info"]["end"]
            flankL = max(0, motifStart - flankWin)
            flankR = motifEnd + flankWin
            # element [2] of the queryHist result is used as the count
            countTotL = countWig.queryHist(xs, xvals, sums, flankL, motifEnd)[2]
            countTotR = countWig.queryHist(xs, xvals, sums, motifStart, flankR)[2]
            countCent = countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[2]
            count = countWig.queryHist(xs, xvals, sums, motifStart-100, motifEnd+100)[2]
            if method == "NSD":
                try:
                    # The NSD formula used an undefined name ``countTot`` and
                    # so failed with NameError for the default method; the
                    # +/-100 window count is the total, as in the other
                    # updateFOS variants of this file.  float() forces true
                    # division for plain ints under Python 2.
                    total = float(count)
                    fos = np.sqrt((count-countCent)/total)-np.sqrt(countCent/total)
                except ZeroDivisionError:
                    fos = 0
            elif method == "Binom":
                try:
                    fos = min(1 - binom.cdf(countCent,countTotL,float(motifEnd-motifStart)/(motifEnd-flankL)),
                              1 - binom.cdf(countCent,countTotR,float(motifEnd-motifStart)/(flankR-motifStart)))
                except ZeroDivisionError:
                    fos = 0
            else:
                # previously this surfaced as a NameError on ``fos``
                raise ValueError("unknown method %r, expected 'NSD' or 'Binom'" % (method,))
            if fos > 0.95 and count-countCent > 18:
                # one-argument print() behaves the same on Python 2 and 3
                print(motifChrom+'\t'+str(motifStart)+'\t'+str(motifEnd)+'\t'+str(fos))
    return 0
def getCons(path, tfName):
    """Store one conservation value per motif for every ``*.wigFix`` file.

    For each uncompressed ``*.wigFix`` file in ``path`` the file is parsed
    with ``countWig.getFixStart`` and, for every motif document matching
    ``tfName`` on the file's chromosome, a value is written to the document's
    ``motif_cons_info`` field keyed by ``consName``.

    File names are split on ``.``: the first field is taken as the chromosome,
    the second and third joined with ``_`` form ``consName``.

    ``mcollection`` is not defined here; it has to exist at module level.

    Arguments:
    path -- directory that is globbed for ``*.wigFix`` files
    tfName -- value matched against the ``tf_name`` field

    Returns 0.
    """
    for infile in glob.glob(os.path.join(path, "*.wigFix")):
        (wigpath,wigfilename) = os.path.split(infile)
        chrom = wigfilename.split('.')[0]
        consName = '_'.join(wigfilename.split('.')[1:3])
        with open(infile,'rt') as wigFile:
            wig = csv.reader(wigFile,delimiter='\t')
            stepDict, startDict, valuesDict = countWig.getFixStart(wig,consName)
            # start: sequence of block start positions for this chromosome
            # (presumably ascending -- the interval test below relies on it)
            start = startDict[consName][chrom]
            # arrayDict[blockStart] -> (xs, xvals, sums)
            arrayDict = countWig.buildFixHist(chrom,stepDict,startDict,valuesDict,consName)
            cursor = mcollection.find({"tf_name": tfName, "motif_genomic_regions_info.chr": chrom})
            for test in cursor:
                motifStart, motifEnd = test["motif_genomic_regions_info"]["start"], test["motif_genomic_regions_info"]["end"]
                avg = 0
                # starts of the blocks whose open interval
                # (start[i], start[i+1]) holds the motif start or end; a motif
                # boundary lying exactly on a block start is not matched
                startlist = [start[i] for i in xrange(len(start)-1) if (motifStart > start[i] and motifStart < start[i+1]) or (motifEnd > start[i] and motifEnd < start[i+1])]
                if len(startlist) > 0:
                    print startlist
                # the last block has no successor and is always tried
                startlist.append(start[-1])
                for i in xrange(len(startlist)):
                    if avg != 0:
                        if motifEnd > startlist[i]:##cases of partial overlap need to renormalize over two fragments
                            xs, xvals, sums = arrayDict[startlist[i]]
                            # NOTE(review): only the second term is divided by
                            # the motif length; the first term scales the
                            # running value by a length without normalising --
                            # confirm this is the intended weighting
                            avg = avg * (startlist[i] - motifStart) + (countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[0] * (motifEnd - startlist[i] + 1)) / (motifEnd - motifStart + 1)
                        else:
                            # a value exists and this block starts at or after
                            # the motif end: stop looking
                            break
                    elif avg == 0:
                        # nothing found yet: query this block for the whole
                        # motif range
                        xs, xvals, sums = arrayDict[startlist[i]]
                        avg = countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[0]
                # Both branches issue the same update; the positive case only
                # prints the value in addition.
                # NOTE(review): $set on "motif_cons_info" assigns a new
                # one-key mapping to the field, so values stored earlier for
                # other tracks would be replaced -- confirm
                if avg > 0:
                    print avg, motifStart, motifEnd
                    mcollection.update({"_id":test["_id"]},{"$set":{"motif_cons_info":{consName: avg}}}, upsert = True)
                else:
                    mcollection.update({"_id":test["_id"]},{"$set":{"motif_cons_info":{consName: avg}}}, upsert = True)
    return 0
def getCount(path, tfName):
    """Store a DNase count for every motif of ``tfName``.

    For each ``*.wig`` file in ``path`` the file is parsed with
    ``countWig.getCoord`` and every motif document with
    ``tf_name == tfName`` gets ``ct_info.accessibility_score.Dnase`` set to
    element [0] of the ``queryHist`` result for the motif range, after which
    the document is saved.

    ``mcollection`` is not defined here; it has to exist at module level.

    Arguments:
    path -- directory that is globbed for ``*.wig`` files
    tfName -- value matched against the ``tf_name`` field

    Returns 0.
    """
    for infile in glob.glob(os.path.join(path,"*.wig")):
        wigfilename = os.path.split(infile)[1]
        ##depends on the data type and source
        methodName = "Dnase"
        ctName = wigfilename.split('EncodeUwDnase')[-1].split('Aln')[0]
        # The handle used to stay open for the rest of the process.  The whole
        # scoring runs inside the ``with`` so the reader stays valid whether
        # or not getCoord consumes it eagerly.
        with open(infile,'rt') as wigFile:
            wig = csv.reader(wigFile,delimiter='\t')
            coordDict, valuesDict = countWig.getCoord(wig,ctName)
            arrayDict = {}  # chromosome -> (xs, xvals, sums)
            cursor = mcollection.find({"tf_name": tfName})
            for test in cursor:
                region = test["motif_genomic_regions_info"]
                motifChrom, motifStart, motifEnd = region["chr"], region["start"], region["end"]
                if motifChrom not in arrayDict:
                    arrayDict[motifChrom] = countWig.buildHist(motifChrom,coordDict,valuesDict,ctName)
                xs, xvals, sums = arrayDict[motifChrom]
                count = countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[0]
                test["ct_info"]["accessibility_score"][methodName] = count
                mcollection.save(test)
    return 0
def updateFPS(infile, outpath, tfname= "CTCF", motifChrom="chr15", ctName="Gm12878", dgfCutoff=36, expName="atacseq"):#, stranded=False):
    """calculate footprint scores from discontinuous variableStep wiggle files

    Motif coordinates are read from
    ``<outpath>/bedMotifs/<tfname><motifChrom>.bed.gz`` (tab separated, start
    and end in the second and third column).  One tab separated row
    ``count, acces, fos, fosCutoff, fps`` per motif is written to
    ``<outpath>/fpsMotifs/<tfname><ctName><motifChrom>fps.txt.gz``:

    count     -- count in the +/-100 window minus the count inside the motif
    acces     -- 1.0 when count >= dgfCutoff, else 0.0
    fos       -- product of the two binomial cdf values; for motifs below the
                 cutoff 1.0 + 1/(count + 1)
    fosCutoff -- fifth smallest fos among 500 shuffles of the per-position
                 counts (-1.0 below the cutoff)
    fps       -- 1.0 when fos <= fosCutoff, 0.0 otherwise, -1.0 below the
                 cutoff

    With ``expName == "atacseq"`` a single wiggle file (``infile``) is used;
    any other value switches to the strand specific files ``<short>_p.wig``
    and ``<short>_n.wig`` derived from ``infile``.

    Arguments:
    infile -- wiggle file; its name up to ".wig" is the prefix of the ".bw"
              cache files
    outpath -- directory containing ``bedMotifs``; ``fpsMotifs`` is created
    tfname, motifChrom, ctName -- used to build file names and passed on to
              countWig
    dgfCutoff -- minimum flanking count for a motif to be tested
    expName -- "atacseq" for the unstranded call, anything else for stranded

    Returns 0.  Exits the interpreter via ``sys.exit()`` when
    ``<outpath>/bedMotifs`` does not exist.
    """
    #check if directories exists
    motifdir = os.path.join(outpath,"bedMotifs")
    fpsdir = os.path.join(outpath,"fpsMotifs")
    if not os.path.isdir(motifdir):
        # one-argument print() behaves the same on Python 2 and 3
        print("Error: path-to-motif-bed-files invalid, please specify a valid outpath to store all calculated scores.")
        sys.exit()
    if not os.path.isdir(fpsdir):
        os.mkdir(fpsdir)
    print('updating fps for  %s %s' % (ctName, motifChrom))
    # NOTE(review): the "." in this pattern matches any character, so the
    # split happens at the first "<char>wig" -- left as is to keep the
    # existing cache file names
    wigfilename = re.split(".wig",infile)[0]
    # both gzip files are closed even when scoring fails half way
    with gzip.open(os.path.join(motifdir,tfname+motifChrom+".bed.gz"),'r') as gcoordsfile:
        with gzip.open(os.path.join(fpsdir,tfname+ctName+motifChrom+'fps.txt.gz'),'w') as fpsfile:
            gcoords = csv.reader(gcoordsfile, delimiter='\t')
            writer = csv.writer(fpsfile, delimiter='\t')
            if expName=="atacseq":
                ##non-strand-specific fps for atac-seq
                _fpsUnstranded(infile, wigfilename, gcoords, writer, motifChrom, ctName, dgfCutoff)
            else:
                #run strand-specific calls for DNase-Seq or DGF
                _fpsStranded(wigfilename, gcoords, writer, motifChrom, ctName, dgfCutoff, expName)
    return 0


def _fpsPerBaseCounts(hist, first, last):
    """Return an array("d") with one queryHist count per position in [first, last)."""
    xs, xvals, sums = hist
    frag = array("d")
    for i in xrange(first, last):
        frag.append(countWig.queryHist(xs, xvals, sums, i, i+1, varWindow=True)[2])
    return frag


def _fpsPermutationCall(fragP, fragN, countCentL, countCentR, motifStart, motifEnd, flankL, flankR):
    """Return (fos, fosCutoff, fps) for one motif; shuffles fragP and fragN in place."""
    motifLen = motifEnd+1-motifStart
    spanL = motifEnd+1-flankL
    spanR = flankR+1-motifStart
    centL = motifStart-flankL  # index in fragP at which the motif begins
    try:
        pp = binom.cdf(countCentL,sum(fragP),float(motifLen)/spanL)
        pn = binom.cdf(countCentR,sum(fragN),float(motifLen)/spanR)
        fos = pp*pn
    except ZeroDivisionError:
        fos = 1.0
    ##fdr correction: same statistic on 500 shuffles of the per-position
    ##counts; fragP is always shuffled before fragN
    fosArray = array("d")
    for s in xrange(500):
        random.shuffle(fragP)
        random.shuffle(fragN)
        try:
            pp = binom.cdf(sum(fragP[centL:]),sum(fragP),float(motifLen)/spanL)
            pn = binom.cdf(sum(fragN[:motifLen]),sum(fragN),float(motifLen)/spanR)
            fosArray.append(pp*pn)
        except ZeroDivisionError:
            fosArray.append(1.0)
    # fifth smallest of 500 shuffled scores
    fosCutoff = np.sort(fosArray)[4]
    if fos <= fosCutoff:
        fps = 1.0##profile
    else:
        fps = 0.0##no profile
    return fos, fosCutoff, fps


def _fpsUnstranded(infile, wigfilename, gcoords, writer, motifChrom, ctName, dgfCutoff):
    """Score every motif row of gcoords against a single wiggle file and write the rows."""
    bwFile = wigfilename+motifChrom+'.bw'
    if not os.path.isfile(bwFile):
        # open the wiggle file only when it has to be binned, and close it
        with open(infile,'rt') as wigFile:
            countWig.compressVarWig(wigFile, ctName, wigfilename)
    coordDict, valuesDict = countWig.getBinVarCoord(bwFile,ctName)
    hist = None  # built for the first motif, then shared
    for test in gcoords:
        if hist is None:
            hist = countWig.buildVarHist(motifChrom,coordDict,valuesDict,ctName)
        xs, xvals, sums = hist
        motifStart, motifEnd = int(test[1]), int(test[2])
        flankWin = round((motifEnd - motifStart + 1)*1.75)#35
        flankL = max(0, int(motifStart - flankWin))
        flankR = int(motifEnd + flankWin)
        countCent = countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[2]
        count = countWig.queryHist(xs, xvals, sums, motifStart-100, motifEnd+100, varWindow=True)[2]
        count = count-countCent
        if count >= dgfCutoff:
            acces = 1.0
            fragP = _fpsPerBaseCounts(hist, flankL, motifEnd)
            fragN = _fpsPerBaseCounts(hist, motifStart, flankR)
            fos, fosCutoff, fps = _fpsPermutationCall(fragP, fragN, countCent, countCent,
                                                      motifStart, motifEnd, flankL, flankR)
        else:
            fos = 1.0+(1/(count+1.0))
            fps = -1.0
            fosCutoff = -1.0
            acces = 0.0
        writer.writerow([count,acces,fos,fosCutoff,fps])


def _fpsStranded(wigfilename, gcoords, writer, motifChrom, ctName, dgfCutoff, expName):
    """Score every motif row of gcoords against the _p/_n strand wiggle files and write the rows."""
    shortname = os.path.join(os.path.split(wigfilename)[0],re.split("_",os.path.split(wigfilename)[1])[0])
    hists = []
    for strand in ('_p', '_n'):
        bwFile = shortname+strand+motifChrom+'.bw'
        if not os.path.isfile(bwFile):
            # the strand wiggle file is only needed (and opened) to build a
            # missing cache file
            with open(shortname+strand+".wig",'rt') as wigFile:
                countWig.compressVarWig(wigFile,expName,shortname+strand)
        hists.append(countWig.getBinVarCoord(bwFile,ctName))
    (coordDictp, valueDictp), (coordDictn, valueDictn) = hists
    histp = histn = None  # built for the first motif, then shared
    for test in gcoords:
        if histp is None:
            histp = countWig.buildVarHist(motifChrom,coordDictp,valueDictp,ctName)
        if histn is None:
            histn = countWig.buildVarHist(motifChrom,coordDictn,valueDictn,ctName)
        motifStart,motifEnd = int(test[1]),int(test[2])
        flankWin = round((motifEnd - motifStart + 1)*2)#1.75)
        flankL= max(0, int(motifStart-flankWin))
        flankR = int(motifEnd + flankWin)
        xsp, xvalsp, sumsp = histp
        xsn, xvalsn, sumsn = histn
        countCentp = countWig.queryHist(xsp, xvalsp, sumsp, motifStart, motifEnd)[2]
        countCentn = countWig.queryHist(xsn, xvalsn, sumsn, motifStart, motifEnd)[2]
        countp = countWig.queryHist(xsp, xvalsp, sumsp, motifStart-100, motifEnd+100, varWindow=True)[2]
        countn = countWig.queryHist(xsn, xvalsn, sumsn, motifStart-100, motifEnd+100, varWindow=True)[2]
        count = countp+countn-countCentp-countCentn
        if count >= dgfCutoff:
            acces = 1.0
            # left flank + motif from the "p" track, motif + right flank from
            # the "n" track
            fragP = _fpsPerBaseCounts(histp, flankL, motifEnd)
            fragN = _fpsPerBaseCounts(histn, motifStart, flankR)
            fos, fosCutoff, fps = _fpsPermutationCall(fragP, fragN, countCentp, countCentn,
                                                      motifStart, motifEnd, flankL, flankR)
        else:
            fos = 1.0+(1/(count+1.0))
            fps = -1.0
            fosCutoff = -1.0
            acces = 0.0
        writer.writerow([count,acces,fos,fosCutoff,fps])
def updateCons(inpath, tfname, motifChrom, outpath):
    """update conservation scores in gzipped fixedStep wiggle format

    Motif coordinates are read from
    ``<outpath>/bedMotifs/<tfname><motifChrom>.bed.gz`` (tab separated, start
    and end in the second and third column).  For each of eight fixed
    phastCons/phyloP track names the file
    ``<inpath>/<motifChrom>.<track>.wigFix.gz`` is binned into a ``.bw`` file
    next to it (an existing one is reused) and one value per motif is
    computed.  The result is written to
    ``<outpath>/consMotifs/<tfname><motifChrom>cons.txt.gz`` with one tab
    separated row per motif and one column per track; a value is either a
    number or the string 'NA'.

    Arguments:
    inpath -- directory holding the ``.wigFix.gz`` files
    tfname, motifChrom -- used to build the input and output file names
    outpath -- directory containing ``bedMotifs``; ``consMotifs`` is created

    Returns 0.  Exits the interpreter via ``sys.exit()`` when
    ``<outpath>/bedMotifs`` does not exist.
    """
    #check if directories exists
    motifdir = os.path.join(outpath,"bedMotifs")
    consdir = os.path.join(outpath,"consMotifs")
    if not os.path.isdir(motifdir):
        # one-argument print() behaves the same on Python 2 and 3
        print("Error: path-to-motif-bed-files invalid, please specify a valid outpath to store all calculated scores.")
        sys.exit()
    if not os.path.isdir(consdir):
        os.mkdir(consdir)
    consTypes = ["phastCons100way","phastCons46way","phastCons46way.placental","phastCons46way.primates",
                 "phyloP100way","phyloP46way","phyloP46way.placental","phyloP46way.primate"]
    columns = []  # one tuple of per-motif values for each track
    # both gzip files are closed even when a track fails half way
    with gzip.open(os.path.join(motifdir,tfname+motifChrom+".bed.gz")) as gcoordsfile:
        with gzip.open(os.path.join(consdir,tfname+motifChrom+'cons.txt.gz'),'w') as consfile:
            writer = csv.writer(consfile, delimiter='\t')
            for consType in consTypes:
                infile = os.path.join(inpath, motifChrom+"."+consType+".wigFix.gz")
                (wigpath,wigfilename) = os.path.split(infile)
                chrom = wigfilename.split('.')[0]
                consName = '_'.join(wigfilename.split('.')[1:-2])
                print('updating %s' % consName)
                # the motif list is read again from the top for every track
                gcoordsfile.seek(0)
                gcoords = csv.reader(gcoordsfile, delimiter='\t')
                with gzip.open(infile) as wigFile:
                    bwFile = os.path.join(wigpath,motifChrom+"."+consName+'.bw')
                    if not os.path.isfile(bwFile):
                        countWig.compressFixWig(wigFile, consName, bwFile)
                stepDict, startDict, valuesDict = countWig.getBinFixStart(bwFile,consName)
                start = startDict[consName][chrom]
                arrayDict = countWig.buildFixHist(chrom,stepDict,startDict,valuesDict,consName)
                scores = []
                for test in gcoords:
                    motifStart, motifEnd = int(test[1]), int(test[2])
                    scores.append(_consMotifAverage(start, arrayDict, motifStart, motifEnd))
                columns.append(tuple(scores))
            # transpose: tracks become columns, motifs become rows
            for row in zip(*columns):
                writer.writerow(list(row))
    return 0


def _consMotifAverage(start, arrayDict, motifStart, motifEnd):
    """Return the length-weighted value of one motif over the blocks it touches, or 'NA'."""
    # avg is either a number or the sentinel string 'NA'
    avg = 0
    # starts of the blocks whose [start[i], start[i+1]) range holds the motif
    # start or the motif end
    startlist = [start[i] for i in xrange(len(start)-1)
                 if (motifStart >= start[i] and motifStart < start[i+1])
                 or (motifEnd >= start[i] and motifEnd < start[i+1])]
    # the last block has no successor, so it is added explicitly
    if motifEnd > start[-1]:
        startlist.append(start[-1])
    for i in xrange(len(startlist)):
        ss = startlist[i]
        # ll is used as the block length: the block spans ss .. ss+ll-1
        xs, xvals, sums, ll = arrayDict[ss]
        if motifStart < ss <= motifEnd <= ss+ll-1:##left out, right in
            # an earlier block may have set 'NA'; reset it on the last
            # candidate so the += below works on a number
            if avg == 'NA' and i == len(startlist)-1:
                avg = 0
            avg += countWig.queryHist(xs,xvals, sums, ss, motifEnd)[0] *(motifEnd - ss + 1) /(motifEnd - motifStart + 1)
        elif ss <= motifStart < motifEnd <= ss+ll-1:##in array
            avg = countWig.queryHist(xs,xvals, sums, motifStart, motifEnd)[0]
        elif motifStart < ss and ss+ll-1 < motifEnd:##motif > array
            if avg == 'NA':
                avg = 0
            avg += countWig.queryHist(xs,xvals, sums, ss, ss+ll-1)[0] * ll /(motifEnd - motifStart + 1)
        elif ss <= motifStart <= ss+ll-1 < motifEnd:##left in, right out
            if avg == 'NA' and i == len(startlist)-1:
                avg = 0
            avg += countWig.queryHist(xs,xvals, sums, motifStart, ss+ll-1)[0] *(ss + ll - motifStart) /(motifEnd - motifStart + 1)
        elif ss+ll-1 < motifStart:
            # block ends before the motif starts: nothing to add
            if avg == 0:
                avg = 'NA'
        elif motifEnd < ss:
            print("Error: motifStart < motifEnd < ss ")
            if avg == 0:
                avg = 'NA'
    return avg
def updateFOS(path, db, motifChrom="chr17", dgfCutoff=11, method="Binom", stranded=False):#, flankWin=35):
    """calculate fos from discontinuous variableStep wiggle files with two
    method options: NSD/Binomial test

    For the file ``wgEncodeUwDnaseGm12878Aln_cut<motifChrom>`` in ``path``
    every document of the ``"hg19" + motifChrom`` collection is scored.

    Unstranded (default): the file is binned into ``<prefix><motifChrom>.bw``
    (an existing file is reused); ``dnase.fos`` is set on motifs with
    fos > 0.95 whose +/-100 window count minus the motif count exceeds
    ``dgfCutoff``.

    Stranded: the ``_p_cut`` / ``_n_cut`` wiggle files in ``path`` are binned
    the same way.  fos is the product of the two per-strand binomial terms and
    is stored under ``dnase.fosStrand`` with the same thresholds; stored
    motifs are also printed as a tab separated line.  Only "Binom" is
    available in this mode.

    Arguments:
    path -- directory holding the wiggle files
    db -- object indexed by collection name
    motifChrom -- chromosome name; selects both the file and the collection
    dgfCutoff -- minimum flanking count required to store a score
    method -- "NSD" or "Binom" ("Binom" only when ``stranded``)
    stranded -- use the strand specific files

    Returns 0.  Raises ValueError when a motif has to be scored with an
    unsupported ``method``.
    """
    mcollection = db["hg19"+motifChrom]
    # one-argument print() gives the same output on Python 2 and Python 3
    print('updating fos %s' % motifChrom)
    expName="fos"
    for infile in glob.glob(os.path.join(path,"wgEncodeUwDnaseGm12878Aln_cut"+motifChrom)):
        # The glob pattern ends with motifChrom, so strip it from the end of
        # the match.  Splitting at its first occurrence (and as a regular
        # expression) gives a wrong prefix when a directory in the path
        # contains the chromosome name.
        wigfilename = infile[:len(infile)-len(motifChrom)]
        ctName = wigfilename.split('EncodeUwDnase')[-1].split('Aln')[0]
        if not stranded:
            bwFile = wigfilename+motifChrom+'.bw'
            if not os.path.isfile(bwFile):
                # open the wiggle file only when it has to be binned, and
                # close it again
                with open(infile,'rt') as wigFile:
                    countWig.compressVarWig(wigFile, expName, wigfilename)
            coordDict, valuesDict = countWig.getBinVarCoord(bwFile,ctName)
            hist = None  # built for the first motif, then shared
            cursor = mcollection.find()
            for test in cursor:
                if hist is None:
                    hist = countWig.buildVarHist(motifChrom,coordDict,valuesDict,ctName)
                xs, xvals, sums = hist
                motifStart = test["genomic_region"]["start"]
                motifEnd = test["genomic_region"]["end"]
                # flank width scales with the motif length
                flankWin = round((motifEnd - motifStart + 1)*1.75)
                flankL = max(0, motifStart - flankWin)
                flankR = motifEnd + flankWin
                # element [2] of the queryHist result is used as the count
                countTotL = countWig.queryHist(xs, xvals, sums, flankL, motifEnd, varWindow=True)[2]
                countTotR = countWig.queryHist(xs, xvals, sums, motifStart, flankR, varWindow=True)[2]
                countCent = countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[2]
                count = countWig.queryHist(xs, xvals, sums, motifStart-100, motifEnd+100, varWindow=True)[2]
                if method == "NSD":
                    try:
                        # force true division: with plain ints Python 2 would
                        # truncate both fractions to 0 or 1
                        total = float(count)
                        fos = np.sqrt((count-countCent)/total)-np.sqrt(countCent/total)
                    except ZeroDivisionError:
                        fos = 0
                elif method == "Binom":
                    try:
                        fos = min(1 - binom.cdf(countCent,countTotL,float(motifEnd-motifStart)/(motifEnd-flankL)),
                                  1 - binom.cdf(countCent,countTotR,float(motifEnd-motifStart)/(flankR-motifStart)))
                    except ZeroDivisionError:
                        fos = 0
                else:
                    # previously this surfaced as a NameError on ``fos``
                    raise ValueError("unknown method %r, expected 'NSD' or 'Binom'" % (method,))
                if fos > 0.95 and count-countCent > dgfCutoff:
                    mcollection.update({"_id":test["_id"]},{"$set":{"dnase.fos": fos}}, upsert = True)
        else:
            infilep = os.path.join(path,"wgEncodeUwDnaseGm12878Aln_p_cut.wig")
            infilen = os.path.join(path,"wgEncodeUwDnaseGm12878Aln_n_cut.wig")
            bwFilep = wigfilename+'_p_cut'+motifChrom+'.bw'
            bwFilen = wigfilename+'_n_cut'+motifChrom+'.bw'
            # the strand wiggle files are only needed (and opened) to build a
            # missing cache file
            if not os.path.isfile(bwFilep):
                with open(infilep,'rt') as wigFilep:
                    countWig.compressVarWig(wigFilep,expName,wigfilename+'_p_cut')
            if not os.path.isfile(bwFilen):
                with open(infilen,'rt') as wigFilen:
                    countWig.compressVarWig(wigFilen,expName,wigfilename+'_n_cut')
            coordDictp, valueDictp = countWig.getBinVarCoord(bwFilep,ctName)
            coordDictn, valueDictn = countWig.getBinVarCoord(bwFilen,ctName)
            histp = histn = None  # built for the first motif, then shared
            cursor = mcollection.find()
            for test in cursor:
                if histp is None:
                    histp = countWig.buildVarHist(motifChrom,coordDictp,valueDictp,ctName)
                if histn is None:
                    histn = countWig.buildVarHist(motifChrom,coordDictn,valueDictn,ctName)
                motifStart,motifEnd = test["genomic_region"]["start"],test["genomic_region"]["end"]
                flankWin = round((motifEnd - motifStart + 1)*1.75)
                flankL= max(0, motifStart-flankWin)
                flankR = motifEnd + flankWin
                # left flank + motif from the "p" track
                xs, xvals, sums = histp
                countTotLp = countWig.queryHist(xs, xvals, sums, flankL, motifEnd, varWindow=True)[2]
                countCentp = countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[2]
                countp = countWig.queryHist(xs, xvals, sums, motifStart-100, motifEnd+100, varWindow=True)[2]
                # motif + right flank from the "n" track
                xs, xvals, sums = histn
                countTotRn = countWig.queryHist(xs, xvals, sums, motifStart, flankR, varWindow=True)[2]
                countCentn = countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[2]
                countn = countWig.queryHist(xs, xvals, sums, motifStart-100, motifEnd+100, varWindow=True)[2]
                if method == "Binom":
                    try:
                        pp = 1 - binom.cdf(countCentp,countTotLp,float(motifEnd-motifStart)/(motifEnd-flankL))
                        pn = 1 - binom.cdf(countCentn,countTotRn,float(motifEnd-motifStart)/(flankR-motifStart))
                        fos = pp*pn
                    except ZeroDivisionError:
                        fos = 0
                else:
                    # any other method used to leave ``fos`` unassigned and
                    # failed with a NameError on the comparison below
                    raise ValueError("unknown method %r, stranded scoring supports only 'Binom'" % (method,))
                if fos > 0.95 and countp+countn-countCentp-countCentn > dgfCutoff:
                    mcollection.update({"_id":test["_id"]},{"$set":{"dnase.fosStrand": fos}}, upsert = True)
                    print(motifChrom+'\t'+str(motifStart)+'\t'+str(motifEnd)+'\t'+str(fos))
    return 0