示例#1
0
def updateCount(path, tfName):
    """Update per-motif count data from discontinuous variableStep wiggle files.

    For every ``*.wig`` file found directly under *path*, the file is passed to
    ``countWig.compressVarWig`` and every document of the module-level
    ``mcollection`` whose ``tf_name`` equals *tfName* gets the queried count
    stored under ``ct_info.accessibility_score["Dnase"]`` and saved back.

    Args:
        path: directory that is searched for ``*.wig`` files.
        tfName: value matched against the ``tf_name`` field of the documents.

    Returns:
        Always 0.

    Note:
        ``mcollection`` and ``countWig`` are not defined here; they must be
        provided by the enclosing module.
    """
    # depends on the data type and source (or add an expName parser line)
    expName = "Dnase"
    for infile in glob.glob(os.path.join(path, "*.wig")):
        wigfilename = os.path.splitext(infile)[0]
        ctName = wigfilename.split('EncodeUwDnase')[-1].split('Aln')[0]
        # Close the wiggle file as soon as it has been compressed.
        with open(infile, 'rt') as wigFile:
            countWig.compressVarWig(wigFile, expName, wigfilename)

        # One histogram per chromosome, loaded on first use.  The chromosome
        # is only known once a motif document has been read, so the binned
        # file (<wigfilename><chrom>.bw) cannot be opened before the loop;
        # doing so referenced motifChrom before it was ever assigned.
        arrayDict = {}
        for test in mcollection.find({"tf_name": tfName}):
            region = test["motif_genomic_regions_info"]
            motifChrom = region["chr"]
            if motifChrom not in arrayDict:
                coordDict, valuesDict = countWig.getBinVarCoord(
                    wigfilename + motifChrom + '.bw', ctName)
                arrayDict[motifChrom] = countWig.buildHist(
                    motifChrom, coordDict, valuesDict, ctName)
            xs, xvals, sums = arrayDict[motifChrom]
            count = countWig.queryHist(
                xs, xvals, sums, region["start"], region["end"])[0]
            test["ct_info"]["accessibility_score"][expName] = count
            mcollection.save(test)
    return 0
示例#2
0
def updateCount(path, db, motifChrom='chr17', window=100):
    """Update per-motif count data from discontinuous variableStep wiggle files.

    Every file matching ``wgEncodeUwDgf*_cut<motifChrom>`` under *path* is
    processed: it is compressed with ``countWig.compressVarWig`` unless the
    matching ``.bw`` file already exists, and each document of the
    ``"hg19" + motifChrom`` collection gets the count queried over its
    genomic region, widened by *window* on both sides, stored under the
    field ``"dgf.<ctName>"``.

    Args:
        path: directory that is searched for the cut wiggle files.
        db: mapping-like database handle indexed by collection name.
        motifChrom: chromosome name; selects both the files and the collection.
        window: number of positions added on each side of the motif region.

    Returns:
        Always 0.
    """
    # depends on the data type and source (or add an expName parser line)
    expName = "dgf"
    mcollection = db["hg19" + motifChrom]
    pattern = os.path.join(path, "wgEncodeUwDgf*_cut" + motifChrom)
    for infile in glob.glob(pattern):
        # The glob pattern guarantees the name ENDS with motifChrom; strip
        # only that trailing occurrence so a chromosome name appearing in a
        # directory component does not truncate the prefix.
        wigfilename = infile.rsplit(motifChrom, 1)[0]
        ctName = wigfilename.split('EncodeUwDgf')[-1].split('Aln')[0]
        bwFile = wigfilename + motifChrom + '.bw'
        if not os.path.isfile(bwFile):
            # Only open the wiggle file when it is actually needed, and
            # close it once compression has finished.
            with open(infile, 'rt') as wigFile:
                countWig.compressVarWig(wigFile, expName, wigfilename)
        coordDict, valuesDict = countWig.getBinVarCoord(bwFile, ctName)

        hist = None  # built on the first document only
        for test in mcollection.find():
            if hist is None:
                hist = countWig.buildVarHist(
                    motifChrom, coordDict, valuesDict, ctName)
            xs, xvals, sums = hist
            region = test["genomic_region"]
            count = countWig.queryHist(
                xs, xvals, sums,
                region["start"] - window, region["end"] + window,
                varWindow=True)[0]
            mcollection.update(
                {"_id": test["_id"]},
                {"$set": {expName + "." + ctName: count}},
                upsert=True)
    return 0
示例#3
0
def updateFOS(path, db, motifChrom="chr17", dgfCutoff=20, method="Binom"):
    """Calculate fos from discontinuous variableStep wiggle files.

    Two scoring methods are available: "NSD" and "Binom" (binomial test).
    For each file matching ``wgEncodeUwDgf*_cut<motifChrom>`` under *path*,
    documents of the ``"hg19" + motifChrom`` collection whose ``tf_name`` is
    one of IRF3, MAFK, NFYA, SIN3A or ZNF384 are scored, and ``dgf.fos`` is
    written when the score exceeds 0.95 and the count outside the motif
    (within +/-100 positions) exceeds *dgfCutoff*.

    Args:
        path: directory that is searched for the cut wiggle files.
        db: mapping-like database handle indexed by collection name.
        motifChrom: chromosome name; selects both the files and the collection.
        dgfCutoff: minimum flanking count required before a score is stored.
        method: "NSD" or "Binom".

    Returns:
        Always 0.

    Raises:
        ValueError: if *method* is not "NSD" or "Binom".
    """
    # Reject unknown methods up front; otherwise ``fos`` would be unbound
    # when the first document is scored.
    if method not in ("NSD", "Binom"):
        raise ValueError(
            "unsupported method %r; expected 'NSD' or 'Binom'" % (method,))

    mcollection = db["hg19" + motifChrom]
    print('updating fos %s' % motifChrom)
    expName = "fos"
    tfNames = ["IRF3", "MAFK", "NFYA", "SIN3A", "ZNF384"]
    pattern = os.path.join(path, "wgEncodeUwDgf*_cut" + motifChrom)
    for infile in glob.glob(pattern):
        # Strip only the trailing chromosome name (see the glob pattern).
        wigfilename = infile.rsplit(motifChrom, 1)[0]
        ctName = wigfilename.split('EncodeUwDgf')[-1].split('Aln')[0]
        bwFile = wigfilename + motifChrom + '.bw'
        if not os.path.isfile(bwFile):
            with open(infile, 'rt') as wigFile:
                countWig.compressVarWig(wigFile, expName, wigfilename)
        coordDict, valuesDict = countWig.getBinVarCoord(bwFile, ctName)

        hist = None  # built on the first document only
        for test in mcollection.find({"tf_name": {"$in": tfNames}}):
            if hist is None:
                hist = countWig.buildVarHist(
                    motifChrom, coordDict, valuesDict, ctName)
            xs, xvals, sums = hist
            motifStart = test["genomic_region"]["start"]
            motifEnd = test["genomic_region"]["end"]
            # Flanks are 1.75x the motif length on each side.
            flankWin = round((motifEnd - motifStart + 1) * 1.75)
            flankL = max(0, motifStart - flankWin)
            flankR = motifEnd + flankWin
            countTotL = countWig.queryHist(
                xs, xvals, sums, flankL, motifEnd, varWindow=True)[2]
            countTotR = countWig.queryHist(
                xs, xvals, sums, motifStart, flankR, varWindow=True)[2]
            countCent = countWig.queryHist(
                xs, xvals, sums, motifStart, motifEnd)[2]
            count = countWig.queryHist(
                xs, xvals, sums, motifStart - 100, motifEnd + 100,
                varWindow=True)[2]

            if method == "NSD":
                # Explicit zero check plus float division: integer counts
                # would floor-divide on Python 2, and numpy zeros produce
                # nan instead of raising ZeroDivisionError.
                total = float(count)
                if total == 0:
                    fos = 0
                else:
                    fos = (np.sqrt((total - countCent) / total)
                           - np.sqrt(countCent / total))
            else:  # "Binom"
                motifLen = float(motifEnd - motifStart)
                try:
                    fos = min(
                        1 - binom.cdf(countCent, countTotL,
                                      motifLen / (motifEnd - flankL)),
                        1 - binom.cdf(countCent, countTotR,
                                      motifLen / (flankR - motifStart)))
                except ZeroDivisionError:
                    fos = 0

            if fos > 0.95 and count - countCent > dgfCutoff:
                mcollection.update(
                    {"_id": test["_id"]},
                    {"$set": {"dgf.fos": fos}},
                    upsert=True)
    return 0
示例#4
0
文件: getFps.py 项目: xc406/Mocap
def _ensureBinned(wigPath, bwPath, label, prefix):
    """Run countWig.compressVarWig on *wigPath* unless *bwPath* already exists."""
    if os.path.isfile(bwPath):
        return
    with open(wigPath, 'rt') as wigFile:
        countWig.compressVarWig(wigFile, label, prefix)


def _perBaseCounts(hist, start, end):
    """Collect element [2] of countWig.queryHist for every 1-position window in [start, end)."""
    xs, xvals, sums = hist
    counts = array("d")
    for pos in range(start, end):
        counts.append(
            countWig.queryHist(xs, xvals, sums, pos, pos + 1, varWindow=True)[2])
    return counts


def _fosProduct(centP, totL, centN, totR, motifStart, motifEnd, flankL, flankR):
    """Return the product of the left and right binomial CDFs (1.0 on a zero-width window)."""
    motifLen = float(motifEnd + 1 - motifStart)
    try:
        pp = binom.cdf(centP, totL, motifLen / (motifEnd + 1 - flankL))
        pn = binom.cdf(centN, totR, motifLen / (flankR + 1 - motifStart))
    except ZeroDivisionError:
        return 1.0
    return pp * pn


def _footprintStats(centP, centN, fragP, fragN, motifStart, motifEnd,
                    flankL, flankR, nShuffle=500, cutoffRank=4):
    """Return (fos, fosCutoff): the observed score and a shuffled-null cutoff.

    *fragP* and *fragN* are shuffled in place *nShuffle* times; the cutoff is
    the value at index *cutoffRank* of the sorted null scores (the 5th
    smallest of 500 by default).
    """
    fos = _fosProduct(centP, sum(fragP), centN, sum(fragN),
                      motifStart, motifEnd, flankL, flankR)
    idxP = motifStart - flankL         # fragP[idxP:] is the motif part
    idxN = motifEnd + 1 - motifStart   # fragN[:idxN] is the motif part
    nullScores = array("d")
    for _ in range(nShuffle):
        random.shuffle(fragP)
        random.shuffle(fragN)
        nullScores.append(_fosProduct(
            sum(fragP[idxP:]), sum(fragP), sum(fragN[:idxN]), sum(fragN),
            motifStart, motifEnd, flankL, flankR))
    return fos, np.sort(nullScores)[cutoffRank]


def updateFPS(infile, outpath, tfname="CTCF", motifChrom="chr15", ctName="Gm12878", dgfCutoff=36, expName="atacseq"):
    """Calculate footprint scores from discontinuous variableStep wiggle files.

    Motif coordinates are read from
    ``<outpath>/bedMotifs/<tfname><motifChrom>.bed.gz`` (tab separated, start
    and end in columns 2 and 3) and one tab-separated row
    ``[count, acces, fos, fosCutoff, fps]`` per motif is written to
    ``<outpath>/fpsMotifs/<tfname><ctName><motifChrom>fps.txt.gz``.

    With ``expName == "atacseq"`` a single, non-strand-specific signal derived
    from *infile* is used.  Any other *expName* runs the strand-specific path,
    which uses ``<shortname>_p.wig`` / ``<shortname>_n.wig`` next to *infile*,
    where ``shortname`` is the file name up to its first underscore.

    Args:
        infile: path of the input wiggle file.
        outpath: directory holding ``bedMotifs`` (must exist) and
            ``fpsMotifs`` (created when missing).
        tfname: transcription factor name used in the file names.
        motifChrom: chromosome name used in the file names and histograms.
        ctName: cell type name used in the output file name.
        dgfCutoff: minimum flanking count for a motif to be scored.
        expName: "atacseq" for the unstranded path, anything else for the
            strand-specific path.

    Returns:
        Always 0.

    Raises:
        SystemExit: with status 1 when ``<outpath>/bedMotifs`` does not exist.

    Note:
        Written for Python 2: the gzip files are opened in 'r'/'w' mode and
        handed directly to the csv module.
    """
    # check that the directories exist
    motifdir = os.path.join(outpath, "bedMotifs")
    fpsdir = os.path.join(outpath, "fpsMotifs")
    if not os.path.isdir(motifdir):
        print("Error: path-to-motif-bed-files invalid, please specify a valid outpath to store all calculated scores.")
        # Non-zero status so calling scripts can detect the failure.
        sys.exit(1)
    if not os.path.isdir(fpsdir):
        os.mkdir(fpsdir)

    print('updating fps for  %s %s' % (ctName, motifChrom))

    # Literal split: the previous regex ".wig" also matched e.g. "gwig".
    wigfilename = infile.split(".wig")[0]
    stranded = expName != "atacseq"
    bedPath = os.path.join(motifdir, tfname + motifChrom + ".bed.gz")
    fpsPath = os.path.join(fpsdir, tfname + ctName + motifChrom + 'fps.txt.gz')

    with gzip.open(bedPath, 'r') as gcoordsfile:
        with gzip.open(fpsPath, 'w') as fpsfile:
            gcoords = csv.reader(gcoordsfile, delimiter='\t')
            writer = csv.writer(fpsfile, delimiter='\t')

            if stranded:
                # strand-specific calls for DNase-Seq or DGF
                baseDir, baseName = os.path.split(wigfilename)
                shortname = os.path.join(baseDir, baseName.split("_")[0])
                prefixes = [shortname + '_p', shortname + '_n']
                flankFactor = 2
                for prefix in prefixes:
                    _ensureBinned(prefix + ".wig", prefix + motifChrom + '.bw',
                                  expName, prefix)
            else:
                # non-strand-specific fps for atac-seq
                prefixes = [wigfilename]
                flankFactor = 1.75
                _ensureBinned(infile, wigfilename + motifChrom + '.bw',
                              ctName, wigfilename)

            binned = [countWig.getBinVarCoord(prefix + motifChrom + '.bw', ctName)
                      for prefix in prefixes]

            hists = None  # built on the first motif row only
            for test in gcoords:
                if hists is None:
                    hists = [
                        countWig.buildVarHist(motifChrom, coordDict, valuesDict, ctName)
                        for coordDict, valuesDict in binned]
                # In unstranded mode both names refer to the same histogram.
                histP, histN = hists[0], hists[-1]

                motifStart, motifEnd = int(test[1]), int(test[2])
                flankWin = round((motifEnd - motifStart + 1) * flankFactor)
                flankL = max(0, int(motifStart - flankWin))
                flankR = int(motifEnd + flankWin)

                cents = [
                    countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[2]
                    for xs, xvals, sums in hists]
                wides = [
                    countWig.queryHist(xs, xvals, sums, motifStart - 100,
                                       motifEnd + 100, varWindow=True)[2]
                    for xs, xvals, sums in hists]
                # Count in the +/-100 window with the motif itself excluded.
                if stranded:
                    count = wides[0] + wides[1] - cents[0] - cents[1]
                else:
                    count = wides[0] - cents[0]

                if count >= dgfCutoff:
                    acces = 1.0
                    fragP = _perBaseCounts(histP, flankL, motifEnd)
                    fragN = _perBaseCounts(histN, motifStart, flankR)
                    fos, fosCutoff = _footprintStats(
                        cents[0], cents[-1], fragP, fragN,
                        motifStart, motifEnd, flankL, flankR)
                    # 1.0 = profile, 0.0 = no profile
                    fps = 1.0 if fos <= fosCutoff else 0.0
                else:
                    fos = 1.0 + (1 / (count + 1.0))
                    fps = -1.0
                    fosCutoff = -1.0
                    acces = 0.0
                writer.writerow([count, acces, fos, fosCutoff, fps])
    return 0
示例#5
0
文件: mongodb_fos.py 项目: xc406/code
def updateFOS(path, db, motifChrom="chr17", dgfCutoff=11, method="Binom", stranded=False):
    """Calculate fos from discontinuous variableStep wiggle files.

    Two scoring methods are available: "NSD" and "Binom" (binomial test);
    the strand-specific mode implements "Binom" only.  For each file matching
    ``wgEncodeUwDnaseGm12878Aln_cut<motifChrom>`` under *path*, every document
    of the ``"hg19" + motifChrom`` collection is scored.  A score above 0.95
    whose flanking count (within +/-100 positions, motif excluded) exceeds
    *dgfCutoff* is stored as ``dnase.fos`` (unstranded) or
    ``dnase.fosStrand`` (stranded).

    Args:
        path: directory holding the cut wiggle files.
        db: mapping-like database handle indexed by collection name.
        motifChrom: chromosome name; selects both the files and the collection.
        dgfCutoff: minimum flanking count required before a score is stored.
        method: "NSD" or "Binom".
        stranded: use the ``_p_cut`` / ``_n_cut`` strand-specific wiggle files.

    Returns:
        Always 0.

    Raises:
        ValueError: if *method* is not supported for the selected mode.
    """
    # Reject unsupported methods up front; otherwise ``fos`` would be
    # unbound when the first document is scored.
    allowed = ("Binom",) if stranded else ("NSD", "Binom")
    if method not in allowed:
        raise ValueError(
            "unsupported method %r (stranded=%r); expected one of %r"
            % (method, stranded, allowed))

    mcollection = db["hg19" + motifChrom]
    print('updating fos %s' % motifChrom)
    expName = "fos"
    pattern = os.path.join(path, "wgEncodeUwDnaseGm12878Aln_cut" + motifChrom)
    for infile in glob.glob(pattern):
        # Strip only the trailing chromosome name, as a literal rather than
        # as a regular expression.
        wigfilename = infile.rsplit(motifChrom, 1)[0]
        ctName = wigfilename.split('EncodeUwDnase')[-1].split('Aln')[0]

        if not stranded:
            bwFile = wigfilename + motifChrom + '.bw'
            if not os.path.isfile(bwFile):
                with open(infile, 'rt') as wigFile:
                    countWig.compressVarWig(wigFile, expName, wigfilename)
            coordDict, valuesDict = countWig.getBinVarCoord(bwFile, ctName)

            hist = None  # built on the first document only
            for test in mcollection.find():
                if hist is None:
                    hist = countWig.buildVarHist(
                        motifChrom, coordDict, valuesDict, ctName)
                xs, xvals, sums = hist
                motifStart = test["genomic_region"]["start"]
                motifEnd = test["genomic_region"]["end"]
                # Flanks are 1.75x the motif length on each side.
                flankWin = round((motifEnd - motifStart + 1) * 1.75)
                flankL = max(0, motifStart - flankWin)
                flankR = motifEnd + flankWin
                countTotL = countWig.queryHist(
                    xs, xvals, sums, flankL, motifEnd, varWindow=True)[2]
                countTotR = countWig.queryHist(
                    xs, xvals, sums, motifStart, flankR, varWindow=True)[2]
                countCent = countWig.queryHist(
                    xs, xvals, sums, motifStart, motifEnd)[2]
                count = countWig.queryHist(
                    xs, xvals, sums, motifStart - 100, motifEnd + 100,
                    varWindow=True)[2]

                if method == "NSD":
                    # Explicit zero check plus float division: integer
                    # counts would floor-divide on Python 2, and numpy zeros
                    # produce nan instead of raising ZeroDivisionError.
                    total = float(count)
                    if total == 0:
                        fos = 0
                    else:
                        fos = (np.sqrt((total - countCent) / total)
                               - np.sqrt(countCent / total))
                else:  # "Binom"
                    motifLen = float(motifEnd - motifStart)
                    try:
                        fos = min(
                            1 - binom.cdf(countCent, countTotL,
                                          motifLen / (motifEnd - flankL)),
                            1 - binom.cdf(countCent, countTotR,
                                          motifLen / (flankR - motifStart)))
                    except ZeroDivisionError:
                        fos = 0

                if fos > 0.95 and count - countCent > dgfCutoff:
                    mcollection.update(
                        {"_id": test["_id"]},
                        {"$set": {"dnase.fos": fos}},
                        upsert=True)
        else:
            strandInputs = (
                (os.path.join(path, "wgEncodeUwDnaseGm12878Aln_p_cut.wig"),
                 wigfilename + '_p_cut'),
                (os.path.join(path, "wgEncodeUwDnaseGm12878Aln_n_cut.wig"),
                 wigfilename + '_n_cut'),
            )
            binned = []
            for wigPath, prefix in strandInputs:
                bwPath = prefix + motifChrom + '.bw'
                if not os.path.isfile(bwPath):
                    # Open the strand wiggle file only when it is needed.
                    with open(wigPath, 'rt') as wigFile:
                        countWig.compressVarWig(wigFile, expName, prefix)
                binned.append(bwPath)
            coordDictp, valueDictp = countWig.getBinVarCoord(binned[0], ctName)
            coordDictn, valueDictn = countWig.getBinVarCoord(binned[1], ctName)

            histP = histN = None  # built on the first document only
            for test in mcollection.find():
                if histP is None:
                    histP = countWig.buildVarHist(
                        motifChrom, coordDictp, valueDictp, ctName)
                if histN is None:
                    histN = countWig.buildVarHist(
                        motifChrom, coordDictn, valueDictn, ctName)
                motifStart = test["genomic_region"]["start"]
                motifEnd = test["genomic_region"]["end"]
                flankWin = round((motifEnd - motifStart + 1) * 1.75)
                flankL = max(0, motifStart - flankWin)
                flankR = motifEnd + flankWin

                # Left flank comes from the "p" signal ...
                xs, xvals, sums = histP
                countTotLp = countWig.queryHist(
                    xs, xvals, sums, flankL, motifEnd, varWindow=True)[2]
                countCentp = countWig.queryHist(
                    xs, xvals, sums, motifStart, motifEnd)[2]
                countp = countWig.queryHist(
                    xs, xvals, sums, motifStart - 100, motifEnd + 100,
                    varWindow=True)[2]
                # ... and the right flank from the "n" signal.
                xs, xvals, sums = histN
                countTotRn = countWig.queryHist(
                    xs, xvals, sums, motifStart, flankR, varWindow=True)[2]
                countCentn = countWig.queryHist(
                    xs, xvals, sums, motifStart, motifEnd)[2]
                countn = countWig.queryHist(
                    xs, xvals, sums, motifStart - 100, motifEnd + 100,
                    varWindow=True)[2]

                motifLen = float(motifEnd - motifStart)
                try:
                    pp = 1 - binom.cdf(countCentp, countTotLp,
                                       motifLen / (motifEnd - flankL))
                    pn = 1 - binom.cdf(countCentn, countTotRn,
                                       motifLen / (flankR - motifStart))
                    fos = pp * pn
                except ZeroDivisionError:
                    fos = 0

                if fos > 0.95 and countp + countn - countCentp - countCentn > dgfCutoff:
                    mcollection.update(
                        {"_id": test["_id"]},
                        {"$set": {"dnase.fosStrand": fos}},
                        upsert=True)
                    print(motifChrom + '\t' + str(motifStart) + '\t'
                          + str(motifEnd) + '\t' + str(fos))
    return 0