def updateCount(path, tfName):
    """Update per-motif count data from discontinuous variableStep wiggle files.

    Every ``*.wig`` file found directly under ``path`` is passed to
    ``countWig.compressVarWig``.  For each document in ``mcollection`` whose
    ``tf_name`` equals ``tfName``, the first element of the
    ``countWig.queryHist`` result for the motif interval is written to
    ``ct_info.accessibility_score.Dnase`` and the document is saved back.

    NOTE(review): ``mcollection`` is not defined in this function; it is
    presumably a module-level MongoDB collection -- confirm.
    NOTE(review): a second ``updateCount`` with a different signature is
    defined further down in this file; being bound last it replaces this one.

    Args:
        path: directory that is globbed for ``*.wig`` files.
        tfName: value matched against the ``tf_name`` field of the documents.

    Returns:
        0, always.
    """
    # depends on the data type and source (could be parsed from the file name)
    expName = "Dnase"
    for infile in glob.glob(os.path.join(path, "*.wig")):
        wigfilename = os.path.splitext(infile)[0]
        ctName = wigfilename.split('EncodeUwDnase')[-1].split('Aln')[0]
        # Close the wiggle file as soon as it has been compressed.
        with open(infile, 'rt') as wigFile:
            countWig.compressVarWig(wigFile, expName, wigfilename)
        # The chromosome is only known once a motif document has been read,
        # so the per-chromosome '.bw' file is loaded lazily here.  The old
        # code built that file name from motifChrom before it was assigned.
        histByChrom = {}
        for test in mcollection.find({"tf_name": tfName}):
            region = test["motif_genomic_regions_info"]
            motifChrom = region["chr"]
            if motifChrom not in histByChrom:
                coordDict, valuesDict = countWig.getBinVarCoord(
                    wigfilename + motifChrom + '.bw', ctName)
                histByChrom[motifChrom] = countWig.buildHist(
                    motifChrom, coordDict, valuesDict, ctName)
            xs, xvals, sums = histByChrom[motifChrom]
            count = countWig.queryHist(
                xs, xvals, sums, region["start"], region["end"])[0]
            test["ct_info"]["accessibility_score"][expName] = count
            mcollection.save(test)
    return 0
def updateCount(path, db, motifChrom='chr17', window=100):
    """Update per-motif count data from discontinuous variableStep wiggle files.

    For every file under ``path`` matching ``wgEncodeUwDgf*_cut<motifChrom>``
    the matching ``.bw`` file is created with ``countWig.compressVarWig`` if
    it does not exist yet.  For every document of the ``hg19<motifChrom>``
    collection, the first element of the ``countWig.queryHist`` result over
    the motif interval widened by ``window`` on both sides is stored under
    ``dgf.<ctName>``.

    Args:
        path: directory containing the per-chromosome cut files.
        db: mapping-like database handle; ``db["hg19" + motifChrom]`` must
            provide ``find`` and ``update`` (presumably a pymongo database --
            confirm).
        motifChrom: chromosome name, used as file-name suffix and as
            collection-name suffix.
        window: number of positions added on each side of the motif interval.

    Returns:
        0, always.
    """
    mcollection = db["hg19" + motifChrom]
    # depends on the data type and source (could be parsed from the file name)
    expName = "dgf"
    pattern = os.path.join(path, "wgEncodeUwDgf*_cut" + motifChrom)
    for infile in glob.glob(pattern):
        # The glob pattern ends with motifChrom, so strip only that trailing
        # occurrence; a plain split() would cut at an earlier match such as a
        # directory named after the chromosome.
        wigfilename = infile.rsplit(motifChrom, 1)[0]
        ctName = wigfilename.split('EncodeUwDgf')[-1].split('Aln')[0]
        bwFile = wigfilename + motifChrom + '.bw'
        if not os.path.isfile(bwFile):
            # Open the wiggle file only when it has to be compressed and
            # close it right afterwards.
            with open(infile, 'rt') as wigFile:
                countWig.compressVarWig(wigFile, expName, wigfilename)
        coordDict, valuesDict = countWig.getBinVarCoord(bwFile, ctName)
        hist = None  # built on the first document, then reused
        for test in mcollection.find():
            if hist is None:
                hist = countWig.buildVarHist(
                    motifChrom, coordDict, valuesDict, ctName)
            xs, xvals, sums = hist
            motifStart = test["genomic_region"]["start"]
            motifEnd = test["genomic_region"]["end"]
            count = countWig.queryHist(
                xs, xvals, sums,
                motifStart - window, motifEnd + window,
                varWindow=True)[0]
            mcollection.update(
                {"_id": test["_id"]},
                {"$set": {expName + "." + ctName: count}},
                upsert=True)
    return 0
def updateFOS(path, db, motifChrom="chr17", dgfCutoff=20, method="Binom"):
    """Calculate FOS values from discontinuous variableStep wiggle files.

    Two scoring methods are available: ``"NSD"`` and ``"Binom"`` (binomial
    test).  For every file under ``path`` matching
    ``wgEncodeUwDgf*_cut<motifChrom>`` and every document of the
    ``hg19<motifChrom>`` collection whose ``tf_name`` is one of IRF3, MAFK,
    NFYA, SIN3A or ZNF384, a score is computed from ``countWig.queryHist``
    counts.  The score is stored under ``dgf.fos`` when it exceeds 0.95 and
    the count outside the motif (wide window minus centre) exceeds
    ``dgfCutoff``.

    NOTE(review): every matching input file writes to the same ``dgf.fos``
    key, so with several files the last qualifying one wins -- confirm this
    is intended.
    NOTE(review): a second ``updateFOS`` is defined further down in this file;
    being bound last it replaces this one.

    Args:
        path: directory containing the per-chromosome cut files.
        db: mapping-like database handle; ``db["hg19" + motifChrom]`` must
            provide ``find`` and ``update`` (presumably a pymongo database --
            confirm).
        motifChrom: chromosome name, used as file-name and collection suffix.
        dgfCutoff: minimum flanking count required to store a score.
        method: ``"NSD"`` or ``"Binom"``.

    Returns:
        0, always.

    Raises:
        ValueError: if ``method`` is not one of the supported names.  The old
            code left ``fos`` unbound, or stale from the previous motif.
    """
    if method not in ("NSD", "Binom"):
        raise ValueError("method must be 'NSD' or 'Binom', got %r" % (method,))
    mcollection = db["hg19" + motifChrom]
    print(' '.join(['updating fos', str(motifChrom)]))
    expName = "fos"
    pattern = os.path.join(path, "wgEncodeUwDgf*_cut" + motifChrom)
    for infile in glob.glob(pattern):
        # Strip only the trailing chromosome suffix; split() would cut at an
        # earlier occurrence anywhere in the path.
        wigfilename = infile.rsplit(motifChrom, 1)[0]
        ctName = wigfilename.split('EncodeUwDgf')[-1].split('Aln')[0]
        bwFile = wigfilename + motifChrom + '.bw'
        if not os.path.isfile(bwFile):
            with open(infile, 'rt') as wigFile:
                countWig.compressVarWig(wigFile, expName, wigfilename)
        coordDict, valuesDict = countWig.getBinVarCoord(bwFile, ctName)
        hist = None  # built on the first document, then reused
        cursor = mcollection.find(
            {"tf_name": {"$in": ["IRF3", "MAFK", "NFYA", "SIN3A", "ZNF384"]}})
        for test in cursor:
            if hist is None:
                hist = countWig.buildVarHist(
                    motifChrom, coordDict, valuesDict, ctName)
            xs, xvals, sums = hist
            motifStart = test["genomic_region"]["start"]
            motifEnd = test["genomic_region"]["end"]
            # Each flank is 1.75 motif lengths wide, clamped at position 0.
            flankWin = round((motifEnd - motifStart + 1) * 1.75)
            flankL = max(0, motifStart - flankWin)
            flankR = motifEnd + flankWin
            countTotL = countWig.queryHist(
                xs, xvals, sums, flankL, motifEnd, varWindow=True)[2]
            countTotR = countWig.queryHist(
                xs, xvals, sums, motifStart, flankR, varWindow=True)[2]
            countCent = countWig.queryHist(
                xs, xvals, sums, motifStart, motifEnd)[2]
            count = countWig.queryHist(
                xs, xvals, sums, motifStart - 100, motifEnd + 100,
                varWindow=True)[2]
            if method == "NSD":
                try:
                    # float() keeps the ratios from being truncated by
                    # Python 2 integer division when the counts are ints.
                    fos = (np.sqrt(float(count - countCent) / count)
                           - np.sqrt(float(countCent) / count))
                except ZeroDivisionError:
                    fos = 0
            else:  # "Binom"
                motifLen = float(motifEnd - motifStart)
                try:
                    fos = min(
                        1 - binom.cdf(countCent, countTotL,
                                      motifLen / (motifEnd - flankL)),
                        1 - binom.cdf(countCent, countTotR,
                                      motifLen / (flankR - motifStart)))
                except ZeroDivisionError:
                    fos = 0
            if fos > 0.95 and count - countCent > dgfCutoff:
                mcollection.update(
                    {"_id": test["_id"]},
                    {"$set": {"dgf.fos": fos}},
                    upsert=True)
    return 0
def _fpsBaseProfile(hist, first, last):
    """Collect queryHist()[2] for every 1-position window in [first, last)."""
    xs, xvals, sums = hist
    profile = array("d")
    for pos in xrange(first, last):
        profile.append(
            countWig.queryHist(xs, xvals, sums, pos, pos + 1,
                               varWindow=True)[2])
    return profile


def _fpsCallFootprint(fragP, fragN, centP, centN,
                      motifStart, motifEnd, flankL, flankR,
                      nShuffles=500, cutoffRank=4):
    """Score one motif and call it against a shuffled-profile null.

    Returns ``(fos, fosCutoff, fps)``.  ``fragP`` and ``fragN`` are shuffled
    in place.
    """
    motifLen = float(motifEnd + 1 - motifStart)
    splitP = motifStart - flankL         # first motif index inside fragP
    splitN = motifEnd + 1 - motifStart   # number of motif entries in fragN
    try:
        pp = binom.cdf(centP, sum(fragP), motifLen / (motifEnd + 1 - flankL))
        pn = binom.cdf(centN, sum(fragN), motifLen / (flankR + 1 - motifStart))
        fos = pp * pn
    except ZeroDivisionError:
        fos = 1.0
    # Null distribution: shuffle the per-position counts and rescore.
    fosArray = array("d")
    for _ in xrange(nShuffles):
        random.shuffle(fragP)
        random.shuffle(fragN)
        try:
            pp = binom.cdf(sum(fragP[splitP:]), sum(fragP),
                           motifLen / (motifEnd + 1 - flankL))
            pn = binom.cdf(sum(fragN[:splitN]), sum(fragN),
                           motifLen / (flankR + 1 - motifStart))
            fosArray.append(pp * pn)
        except ZeroDivisionError:
            fosArray.append(1.0)
    # With the defaults this is the 5th smallest of 500 shuffled scores.
    fosCutoff = np.sort(fosArray)[cutoffRank]
    fps = 1.0 if fos <= fosCutoff else 0.0  # 1.0: profile, 0.0: no profile
    return fos, fosCutoff, fps


def _fpsEnsureCompressed(wigPath, bwFile, label, prefix):
    """Run compressVarWig on wigPath unless bwFile already exists."""
    if not os.path.isfile(bwFile):
        with open(wigPath, 'rt') as wigFile:
            countWig.compressVarWig(wigFile, label, prefix)


def _fpsWriteRow(writer, count, dgfCutoff, scoreMotif):
    """Write one output row; scoreMotif() is only called for accessible motifs."""
    if count >= dgfCutoff:
        acces = 1.0
        fos, fosCutoff, fps = scoreMotif()
    else:
        fos = 1.0 + (1 / (count + 1.0))
        fps = -1.0
        fosCutoff = -1.0
        acces = 0.0
    writer.writerow([count, acces, fos, fosCutoff, fps])


def _fpsScoreUnstranded(gcoords, writer, infile, wigfilename,
                        motifChrom, ctName, dgfCutoff):
    """Non-strand-specific scoring: one histogram serves both flanks."""
    bwFile = wigfilename + motifChrom + '.bw'
    # NOTE(review): ctName (not expName) is passed as the label here, unlike
    # the stranded path -- kept as in the original, confirm.
    _fpsEnsureCompressed(infile, bwFile, ctName, wigfilename)
    coordDict, valuesDict = countWig.getBinVarCoord(bwFile, ctName)
    hist = None  # built on the first motif row, then reused
    for test in gcoords:
        if hist is None:
            hist = countWig.buildVarHist(
                motifChrom, coordDict, valuesDict, ctName)
        xs, xvals, sums = hist
        motifStart, motifEnd = int(test[1]), int(test[2])
        flankWin = round((motifEnd - motifStart + 1) * 1.75)
        flankL = max(0, int(motifStart - flankWin))
        flankR = int(motifEnd + flankWin)
        countCent = countWig.queryHist(
            xs, xvals, sums, motifStart, motifEnd)[2]
        count = countWig.queryHist(
            xs, xvals, sums, motifStart - 100, motifEnd + 100,
            varWindow=True)[2] - countCent

        def scoreMotif():
            fragP = _fpsBaseProfile(hist, flankL, motifEnd)
            fragN = _fpsBaseProfile(hist, motifStart, flankR)
            return _fpsCallFootprint(fragP, fragN, countCent, countCent,
                                     motifStart, motifEnd, flankL, flankR)

        _fpsWriteRow(writer, count, dgfCutoff, scoreMotif)


def _fpsScoreStranded(gcoords, writer, wigfilename,
                      motifChrom, ctName, expName, dgfCutoff):
    """Strand-specific scoring from the ``_p`` / ``_n`` wiggle pair."""
    wigDir, wigBase = os.path.split(wigfilename)
    shortname = os.path.join(wigDir, wigBase.split("_")[0])
    bwFilep = shortname + '_p' + motifChrom + '.bw'
    bwFilen = shortname + '_n' + motifChrom + '.bw'
    _fpsEnsureCompressed(shortname + "_p.wig", bwFilep, expName,
                         shortname + '_p')
    _fpsEnsureCompressed(shortname + "_n.wig", bwFilen, expName,
                         shortname + '_n')
    coordDictp, valueDictp = countWig.getBinVarCoord(bwFilep, ctName)
    coordDictn, valueDictn = countWig.getBinVarCoord(bwFilen, ctName)
    histP = None
    histN = None
    for test in gcoords:
        if histP is None:
            histP = countWig.buildVarHist(
                motifChrom, coordDictp, valueDictp, ctName)
        if histN is None:
            histN = countWig.buildVarHist(
                motifChrom, coordDictn, valueDictn, ctName)
        motifStart, motifEnd = int(test[1]), int(test[2])
        flankWin = round((motifEnd - motifStart + 1) * 2)
        flankL = max(0, int(motifStart - flankWin))
        flankR = int(motifEnd + flankWin)
        xsp, xvalsp, sumsp = histP
        xsn, xvalsn, sumsn = histN
        countCentp = countWig.queryHist(
            xsp, xvalsp, sumsp, motifStart, motifEnd)[2]
        countCentn = countWig.queryHist(
            xsn, xvalsn, sumsn, motifStart, motifEnd)[2]
        countp = countWig.queryHist(
            xsp, xvalsp, sumsp, motifStart - 100, motifEnd + 100,
            varWindow=True)[2]
        countn = countWig.queryHist(
            xsn, xvalsn, sumsn, motifStart - 100, motifEnd + 100,
            varWindow=True)[2]
        count = countp + countn - countCentp - countCentn

        def scoreMotif():
            fragP = _fpsBaseProfile(histP, flankL, motifEnd)
            fragN = _fpsBaseProfile(histN, motifStart, flankR)
            return _fpsCallFootprint(fragP, fragN, countCentp, countCentn,
                                     motifStart, motifEnd, flankL, flankR)

        _fpsWriteRow(writer, count, dgfCutoff, scoreMotif)


def updateFPS(infile, outpath, tfname= "CTCF", motifChrom="chr15", ctName="Gm12878", dgfCutoff=36, expName="atacseq"):
    """Calculate footprint scores from discontinuous variableStep wiggle files.

    Motif intervals are read from
    ``<outpath>/bedMotifs/<tfname><motifChrom>.bed.gz`` (columns 2 and 3 are
    parsed as integer start and end).  One tab-separated row per motif is
    written to ``<outpath>/fpsMotifs/<tfname><ctName><motifChrom>fps.txt.gz``
    with the columns ``count, acces, fos, fosCutoff, fps``:

    * ``count``  -- count in the motif +/- 100 window minus the motif count.
    * ``acces``  -- 1.0 if ``count >= dgfCutoff`` else 0.0.
    * ``fos``    -- product of two ``binom.cdf`` values (left and right flank)
      for accessible motifs, otherwise ``1 + 1/(count + 1)``.
    * ``fosCutoff`` -- 5th smallest of 500 shuffled-profile scores, or -1.0.
    * ``fps``    -- 1.0 if ``fos <= fosCutoff``, 0.0 if not, -1.0 when the
      motif is not accessible.

    ``expName == "atacseq"`` selects the non-strand-specific path, with flanks
    of 1.75 motif lengths.  Any other value selects the strand-specific path,
    with ``<prefix>_p.wig`` / ``<prefix>_n.wig`` and flanks of 2 motif lengths.

    Args:
        infile: path of the input ``.wig`` file.
        outpath: directory that must already contain ``bedMotifs``;
            ``fpsMotifs`` is created when missing.
        tfname: transcription-factor name used in both file names.
        motifChrom: chromosome name used in file names and histogram lookups.
        ctName: cell-type label used in the output file name and passed to
            the ``countWig`` helpers.
        dgfCutoff: minimum flanking count for a motif to be scored.
        expName: experiment type, see above.

    Returns:
        0, always.

    Raises:
        SystemExit: with status 1 when ``<outpath>/bedMotifs`` is missing.
            The old code exited with status 0 after printing the error.
    """
    # check that the directories exist
    motifdir = os.path.join(outpath, "bedMotifs")
    fpsdir = os.path.join(outpath, "fpsMotifs")
    if not os.path.isdir(motifdir):
        print("Error: path-to-motif-bed-files invalid, please specify a valid outpath to store all calculated scores.")
        sys.exit(1)
    if not os.path.isdir(fpsdir):
        os.mkdir(fpsdir)
    print(' '.join(['updating fps for ', str(ctName), str(motifChrom)]))
    # Literal split: the old regex ".wig" also matched e.g. "twig" or "_wig".
    wigfilename = infile.split(".wig")[0]
    gcoordsfile = gzip.open(
        os.path.join(motifdir, tfname + motifChrom + ".bed.gz"), 'r')
    try:
        fpsfile = gzip.open(
            os.path.join(fpsdir, tfname + ctName + motifChrom + 'fps.txt.gz'),
            'w')
        try:
            gcoords = csv.reader(gcoordsfile, delimiter='\t')
            writer = csv.writer(fpsfile, delimiter='\t')
            if expName == "atacseq":
                # non-strand-specific fps for atac-seq
                _fpsScoreUnstranded(gcoords, writer, infile, wigfilename,
                                    motifChrom, ctName, dgfCutoff)
            else:
                # strand-specific calls for DNase-Seq or DGF
                _fpsScoreStranded(gcoords, writer, wigfilename,
                                  motifChrom, ctName, expName, dgfCutoff)
        finally:
            # Close (and flush) the output even if scoring fails.
            fpsfile.close()
    finally:
        gcoordsfile.close()
    return 0
def updateFOS(path, db, motifChrom="chr17", dgfCutoff=11, method="Binom", stranded=False):
    """Calculate FOS values from discontinuous variableStep wiggle files.

    Two scoring methods are available for the unstranded path: ``"NSD"`` and
    ``"Binom"`` (binomial test).  The stranded path supports ``"Binom"``
    only.  The input is ``<path>/wgEncodeUwDnaseGm12878Aln_cut<motifChrom>``.
    For every document of the ``hg19<motifChrom>`` collection a score is
    computed from ``countWig.queryHist`` counts.  It is stored under
    ``dnase.fos`` (unstranded) or ``dnase.fosStrand`` (stranded) when it
    exceeds 0.95 and the count outside the motif exceeds ``dgfCutoff``.

    Args:
        path: directory containing the cut files (and, for the stranded path,
            ``wgEncodeUwDnaseGm12878Aln_p_cut.wig`` / ``..._n_cut.wig``).
        db: mapping-like database handle; ``db["hg19" + motifChrom]`` must
            provide ``find`` and ``update`` (presumably a pymongo database --
            confirm).
        motifChrom: chromosome name, used as file-name and collection suffix.
        dgfCutoff: minimum flanking count required to store a score.
        method: ``"NSD"`` or ``"Binom"``.
        stranded: use the separate plus/minus strand wiggle files.

    Returns:
        0, always.

    Raises:
        ValueError: if ``method`` is not supported for the chosen path.  The
            old code left ``fos`` unbound, or stale from the previous motif.
    """
    supported = ("Binom",) if stranded else ("NSD", "Binom")
    if method not in supported:
        raise ValueError("method must be one of %r when stranded=%r, got %r"
                         % (supported, stranded, method))
    mcollection = db["hg19" + motifChrom]
    print(' '.join(['updating fos', str(motifChrom)]))
    expName = "fos"
    pattern = os.path.join(path, "wgEncodeUwDnaseGm12878Aln_cut" + motifChrom)
    for infile in glob.glob(pattern):
        # Strip only the trailing chromosome suffix; the old regex split cut
        # at the first occurrence anywhere in the path.
        wigfilename = infile.rsplit(motifChrom, 1)[0]
        ctName = wigfilename.split('EncodeUwDnase')[-1].split('Aln')[0]
        if not stranded:
            bwFile = wigfilename + motifChrom + '.bw'
            if not os.path.isfile(bwFile):
                with open(infile, 'rt') as wigFile:
                    countWig.compressVarWig(wigFile, expName, wigfilename)
            coordDict, valuesDict = countWig.getBinVarCoord(bwFile, ctName)
            hist = None  # built on the first document, then reused
            for test in mcollection.find():
                if hist is None:
                    hist = countWig.buildVarHist(
                        motifChrom, coordDict, valuesDict, ctName)
                xs, xvals, sums = hist
                motifStart = test["genomic_region"]["start"]
                motifEnd = test["genomic_region"]["end"]
                # Each flank is 1.75 motif lengths wide, clamped at 0.
                flankWin = round((motifEnd - motifStart + 1) * 1.75)
                flankL = max(0, motifStart - flankWin)
                flankR = motifEnd + flankWin
                countTotL = countWig.queryHist(
                    xs, xvals, sums, flankL, motifEnd, varWindow=True)[2]
                countTotR = countWig.queryHist(
                    xs, xvals, sums, motifStart, flankR, varWindow=True)[2]
                countCent = countWig.queryHist(
                    xs, xvals, sums, motifStart, motifEnd)[2]
                count = countWig.queryHist(
                    xs, xvals, sums, motifStart - 100, motifEnd + 100,
                    varWindow=True)[2]
                if method == "NSD":
                    try:
                        # float() keeps the ratios from being truncated by
                        # Python 2 integer division when the counts are ints.
                        fos = (np.sqrt(float(count - countCent) / count)
                               - np.sqrt(float(countCent) / count))
                    except ZeroDivisionError:
                        fos = 0
                else:  # "Binom"
                    motifLen = float(motifEnd - motifStart)
                    try:
                        fos = min(
                            1 - binom.cdf(countCent, countTotL,
                                          motifLen / (motifEnd - flankL)),
                            1 - binom.cdf(countCent, countTotR,
                                          motifLen / (flankR - motifStart)))
                    except ZeroDivisionError:
                        fos = 0
                if fos > 0.95 and count - countCent > dgfCutoff:
                    mcollection.update(
                        {"_id": test["_id"]},
                        {"$set": {"dnase.fos": fos}},
                        upsert=True)
        else:
            infilep = os.path.join(path, "wgEncodeUwDnaseGm12878Aln_p_cut.wig")
            infilen = os.path.join(path, "wgEncodeUwDnaseGm12878Aln_n_cut.wig")
            bwFilep = wigfilename + '_p_cut' + motifChrom + '.bw'
            bwFilen = wigfilename + '_n_cut' + motifChrom + '.bw'
            if not os.path.isfile(bwFilep):
                with open(infilep, 'rt') as wigFilep:
                    countWig.compressVarWig(
                        wigFilep, expName, wigfilename + '_p_cut')
            if not os.path.isfile(bwFilen):
                with open(infilen, 'rt') as wigFilen:
                    countWig.compressVarWig(
                        wigFilen, expName, wigfilename + '_n_cut')
            coordDictp, valueDictp = countWig.getBinVarCoord(bwFilep, ctName)
            coordDictn, valueDictn = countWig.getBinVarCoord(bwFilen, ctName)
            histP = None
            histN = None
            for test in mcollection.find():
                if histP is None:
                    histP = countWig.buildVarHist(
                        motifChrom, coordDictp, valueDictp, ctName)
                if histN is None:
                    histN = countWig.buildVarHist(
                        motifChrom, coordDictn, valueDictn, ctName)
                motifStart = test["genomic_region"]["start"]
                motifEnd = test["genomic_region"]["end"]
                flankWin = round((motifEnd - motifStart + 1) * 1.75)
                flankL = max(0, motifStart - flankWin)
                flankR = motifEnd + flankWin
                # Plus strand: left flank through the motif.
                xs, xvals, sums = histP
                countTotLp = countWig.queryHist(
                    xs, xvals, sums, flankL, motifEnd, varWindow=True)[2]
                countCentp = countWig.queryHist(
                    xs, xvals, sums, motifStart, motifEnd)[2]
                countp = countWig.queryHist(
                    xs, xvals, sums, motifStart - 100, motifEnd + 100,
                    varWindow=True)[2]
                # Minus strand: motif through the right flank.
                xs, xvals, sums = histN
                countTotRn = countWig.queryHist(
                    xs, xvals, sums, motifStart, flankR, varWindow=True)[2]
                countCentn = countWig.queryHist(
                    xs, xvals, sums, motifStart, motifEnd)[2]
                countn = countWig.queryHist(
                    xs, xvals, sums, motifStart - 100, motifEnd + 100,
                    varWindow=True)[2]
                motifLen = float(motifEnd - motifStart)
                try:
                    pp = 1 - binom.cdf(countCentp, countTotLp,
                                       motifLen / (motifEnd - flankL))
                    pn = 1 - binom.cdf(countCentn, countTotRn,
                                       motifLen / (flankR - motifStart))
                    fos = pp * pn
                except ZeroDivisionError:
                    fos = 0
                flankCount = countp + countn - countCentp - countCentn
                if fos > 0.95 and flankCount > dgfCutoff:
                    mcollection.update(
                        {"_id": test["_id"]},
                        {"$set": {"dnase.fosStrand": fos}},
                        upsert=True)
                    # NOTE(review): the original nesting of this print was
                    # not recoverable; it is assumed to report stored hits
                    # only -- confirm.
                    print(motifChrom + '\t' + str(motifStart) + '\t'
                          + str(motifEnd) + '\t' + str(fos))
    return 0