def updateCons(path, db, motifChrom='chr17'):
    """Update conservation scores from gzipped fixedStep wiggle files.

    For every ``*.wigFix.gz`` file in ``path`` the chromosome and the
    conservation track name are taken from the file name
    (``<chrom>.<track...>.wigFix.gz``).  A ``<track>.bw`` cache is built next
    to the wig file when it does not exist yet, and each document of the
    ``"hg19" + motifChrom`` collection gets its average score stored under
    ``cons.<track>``.  Documents whose average ends up as ``'NA'`` are left
    untouched.

    Args:
        path: directory searched for ``*.wigFix.gz`` files.
        db: object indexable by collection name; the collection must offer
            ``find()``, ``count()`` and ``update()``.
        motifChrom: chromosome suffix of the collection name.

    Returns:
        Always 0.

    NOTE(review): a second ``updateCons`` with a different signature is
    defined further down in this file; if both live in the same module the
    later definition replaces this one -- confirm which is meant to be used.
    NOTE(review): files of every chromosome found in ``path`` are applied to
    the single ``motifChrom`` collection, and the cache name does not contain
    the chromosome -- confirm ``path`` only ever holds one chromosome.
    """
    mcollection = db["hg19" + motifChrom]
    for infile in glob.glob(os.path.join(path, "*.wigFix.gz")):
        wigpath, wigfilename = os.path.split(infile)
        nameParts = wigfilename.split('.')
        chrom = nameParts[0]
        consName = '_'.join(nameParts[1:-2])
        # Single-argument print(...) emits the same text as the Python 2
        # print statement used elsewhere in this file.
        print('updating %s' % consName)

        bwFile = os.path.join(wigpath, consName + '.bw')
        if not os.path.isfile(bwFile):
            # The wig file is only needed to build the cache, so it is opened
            # here and closed again before the collection is scanned.
            with gzip.open(infile) as wigFile:
                countWig.compressFixWig(wigFile, consName, bwFile)
        stepDict, startDict, valuesDict = countWig.getBinFixStart(bwFile, consName)
        start = startDict[consName][chrom]
        arrayDict = countWig.buildFixHist(chrom, stepDict, startDict, valuesDict, consName)

        # (fragment start, next fragment start) pairs; built once per file
        # instead of once per document.
        fragmentBounds = list(zip(start, start[1:]))

        cursor = mcollection.find()
        print(mcollection.count())
        for test in cursor:
            region = test["genomic_region"]
            motifStart, motifEnd = region["start"], region["end"]
            motifLen = motifEnd - motifStart + 1
            avg = 0

            # Fragments whose [start, next start) window holds either motif end.
            startlist = [lo for lo, hi in fragmentBounds
                         if lo <= motifStart < hi or lo <= motifEnd < hi]
            if motifEnd > start[-1]:
                startlist.append(start[-1])
            lastIdx = len(startlist) - 1

            for i, ss in enumerate(startlist):
                xs, xvals, sums, ll = arrayDict[ss]
                fragEnd = ss + ll - 1  # last position covered by the fragment
                # queryHist(...)[0] is weighted by covered length below, so it
                # is presumably a mean over the queried range -- TODO confirm.
                if motifStart < ss <= motifEnd <= fragEnd:
                    # motif starts before the fragment and ends inside it
                    if avg == 'NA' and i == lastIdx:
                        avg = 0
                    avg += countWig.queryHist(xs, xvals, sums, ss, motifEnd)[0] * (motifEnd - ss + 1) / motifLen
                elif ss <= motifStart < motifEnd <= fragEnd:
                    # motif lies completely inside the fragment
                    avg = countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[0]
                elif motifStart < ss and fragEnd < motifEnd:
                    # fragment lies completely inside the motif
                    if avg == 'NA':
                        avg = 0
                    avg += countWig.queryHist(xs, xvals, sums, ss, fragEnd)[0] * ll / motifLen
                elif ss <= motifStart <= fragEnd < motifEnd:
                    # motif starts inside the fragment and ends after it
                    if avg == 'NA' and i == lastIdx:
                        avg = 0
                    avg += countWig.queryHist(xs, xvals, sums, motifStart, fragEnd)[0] * (ss + ll - motifStart) / motifLen
                elif fragEnd < motifStart:
                    # fragment ends before the motif: no data unless another
                    # fragment already contributed
                    if avg == 0:
                        avg = 'NA'
                elif motifEnd < ss:
                    print("this should not happen-- motifStart < motifEnd < ss ")
                    # NOTE(review): the original indentation was lost; this
                    # guard is assumed to belong to this branch -- confirm.
                    if avg == 0:
                        avg = 'NA'

            if avg != 'NA':
                mcollection.update({"_id": test["_id"]},
                                   {"$set": {"cons." + consName: avg}},
                                   upsert=True)
    return 0
def getCons(path, tfName):
    """Compute conservation averages for one factor from plain ``*.wigFix`` files.

    For every ``*.wigFix`` file in ``path`` the chromosome and track name are
    taken from the file name (``<chrom>.<a>.<b>...`` gives track ``a_b``).
    Every document matching ``tf_name == tfName`` on that chromosome gets its
    average written to ``motif_cons_info.<track>``.

    Args:
        path: directory searched for ``*.wigFix`` files.
        tfName: value matched against the ``tf_name`` field.

    Returns:
        Always 0.

    NOTE(review): ``mcollection`` is not defined in this function; it has to
    exist as a module-level name when this runs -- confirm.
    """
    for infile in glob.glob(os.path.join(path, "*.wigFix")):
        wigpath, wigfilename = os.path.split(infile)
        nameParts = wigfilename.split('.')
        chrom = nameParts[0]
        consName = '_'.join(nameParts[1:3])

        with open(infile, 'rt') as wigFile:
            wig = csv.reader(wigFile, delimiter='\t')
            stepDict, startDict, valuesDict = countWig.getFixStart(wig, consName)
            start = startDict[consName][chrom]
            arrayDict = countWig.buildFixHist(chrom, stepDict, startDict, valuesDict, consName)

        # (fragment start, next fragment start) pairs; built once per file.
        fragmentBounds = list(zip(start, start[1:]))

        cursor = mcollection.find({"tf_name": tfName,
                                   "motif_genomic_regions_info.chr": chrom})
        for test in cursor:
            region = test["motif_genomic_regions_info"]
            motifStart, motifEnd = region["start"], region["end"]
            motifLen = motifEnd - motifStart + 1
            avg = 0

            # Fragments with a motif end strictly between two fragment starts.
            startlist = [lo for lo, hi in fragmentBounds
                         if lo < motifStart < hi or lo < motifEnd < hi]
            if len(startlist) > 0:
                print(startlist)
            # The last fragment has no successor, so it is always tried too.
            startlist.append(start[-1])

            for fragStart in startlist:
                if avg != 0:
                    if motifEnd <= fragStart:
                        # motif ended before this fragment: nothing to add
                        break
                    # Motif continues into this fragment: combine both parts as
                    # a length-weighted mean.  The whole weighted sum has to be
                    # divided by the motif length; previously only the second
                    # term was divided.
                    xs, xvals, sums = arrayDict[fragStart]
                    part = countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[0]
                    avg = (avg * (fragStart - motifStart)
                           + part * (motifEnd - fragStart + 1)) / motifLen
                else:
                    xs, xvals, sums = arrayDict[fragStart]
                    avg = countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[0]

            if avg > 0:
                print('%s %s %s' % (avg, motifStart, motifEnd))
            # Set only this track's field so that scores of other tracks kept
            # in motif_cons_info are not replaced.
            mcollection.update({"_id": test["_id"]},
                               {"$set": {"motif_cons_info." + consName: avg}},
                               upsert=True)
    return 0
def updateCons(inpath, tfname, motifChrom, outpath):
    """Write per-motif conservation averages for one factor and chromosome.

    Motif coordinates are read from
    ``<outpath>/bedMotifs/<tfname><motifChrom>.bed.gz`` (tab separated, start
    in column 2 and end in column 3).  For each conservation track listed in
    ``consTypes`` the file ``<inpath>/<motifChrom>.<track>.wigFix.gz`` is
    used; a ``.bw`` cache is built next to it when missing.  The result is
    written to ``<outpath>/consMotifs/<tfname><motifChrom>cons.txt.gz`` with
    one row per motif and one tab-separated column per track, in
    ``consTypes`` order.  A cell holds the average or the string ``NA``.

    Args:
        inpath: directory holding the ``.wigFix.gz`` files (and ``.bw`` caches).
        tfname: factor name used as file-name prefix.
        motifChrom: chromosome name used in the file names.
        outpath: directory that must already contain ``bedMotifs``;
            ``consMotifs`` is created inside it when absent.

    Returns:
        Always 0.

    Raises:
        SystemExit: with status 1 when ``<outpath>/bedMotifs`` does not exist.
    """
    motifdir = os.path.join(outpath, "bedMotifs")
    consdir = os.path.join(outpath, "consMotifs")
    if not os.path.isdir(motifdir):
        print("Error: path-to-motif-bed-files invalid, please specify a valid outpath to store all calculated scores.")
        # Non-zero status so callers and shells can see the failure.
        sys.exit(1)
    if not os.path.isdir(consdir):
        os.mkdir(consdir)

    # Read the motif coordinates once and close the file right away instead of
    # rewinding and re-parsing the gzip stream for every track.
    with gzip.open(os.path.join(motifdir, tfname + motifChrom + ".bed.gz")) as gcoordsfile:
        motifs = [(int(row[1]), int(row[2]))
                  for row in csv.reader(gcoordsfile, delimiter='\t')]

    consTypes = ["phastCons100way", "phastCons46way", "phastCons46way.placental",
                 "phastCons46way.primates", "phyloP100way", "phyloP46way",
                 "phyloP46way.placental", "phyloP46way.primate"]
    columns = []  # one tuple of per-motif values for each track
    for consType in consTypes:
        infile = os.path.join(inpath, motifChrom + "." + consType + ".wigFix.gz")
        wigpath, wigfilename = os.path.split(infile)
        nameParts = wigfilename.split('.')
        chrom = nameParts[0]
        consName = '_'.join(nameParts[1:-2])
        # Single-argument print(...) emits the same text as the Python 2
        # print statement used elsewhere in this file.
        print('updating %s' % consName)

        bwFile = os.path.join(wigpath, motifChrom + "." + consName + '.bw')
        if not os.path.isfile(bwFile):
            # The wig file is only needed to build the cache.
            with gzip.open(infile) as wigFile:
                countWig.compressFixWig(wigFile, consName, bwFile)
        stepDict, startDict, valuesDict = countWig.getBinFixStart(bwFile, consName)
        start = startDict[consName][chrom]
        arrayDict = countWig.buildFixHist(chrom, stepDict, startDict, valuesDict, consName)

        # (fragment start, next fragment start) pairs; built once per track.
        fragmentBounds = list(zip(start, start[1:]))

        column = []
        for motifStart, motifEnd in motifs:
            motifLen = motifEnd - motifStart + 1
            avg = 0

            # Fragments whose [start, next start) window holds either motif end.
            startlist = [lo for lo, hi in fragmentBounds
                         if lo <= motifStart < hi or lo <= motifEnd < hi]
            if motifEnd > start[-1]:
                startlist.append(start[-1])
            lastIdx = len(startlist) - 1

            for i, ss in enumerate(startlist):
                xs, xvals, sums, ll = arrayDict[ss]
                fragEnd = ss + ll - 1  # last position covered by the fragment
                # queryHist(...)[0] is weighted by covered length below, so it
                # is presumably a mean over the queried range -- TODO confirm.
                if motifStart < ss <= motifEnd <= fragEnd:
                    # motif starts before the fragment and ends inside it
                    if avg == 'NA' and i == lastIdx:
                        avg = 0
                    avg += countWig.queryHist(xs, xvals, sums, ss, motifEnd)[0] * (motifEnd - ss + 1) / motifLen
                elif ss <= motifStart < motifEnd <= fragEnd:
                    # motif lies completely inside the fragment
                    avg = countWig.queryHist(xs, xvals, sums, motifStart, motifEnd)[0]
                elif motifStart < ss and fragEnd < motifEnd:
                    # fragment lies completely inside the motif
                    if avg == 'NA':
                        avg = 0
                    avg += countWig.queryHist(xs, xvals, sums, ss, fragEnd)[0] * ll / motifLen
                elif ss <= motifStart <= fragEnd < motifEnd:
                    # motif starts inside the fragment and ends after it
                    if avg == 'NA' and i == lastIdx:
                        avg = 0
                    avg += countWig.queryHist(xs, xvals, sums, motifStart, fragEnd)[0] * (ss + ll - motifStart) / motifLen
                elif fragEnd < motifStart:
                    # fragment ends before the motif: no data unless another
                    # fragment already contributed
                    if avg == 0:
                        avg = 'NA'
                elif motifEnd < ss:
                    print("Error: motifStart < motifEnd < ss ")
                    # NOTE(review): the original indentation was lost; this
                    # guard is assumed to belong to this branch -- confirm.
                    if avg == 0:
                        avg = 'NA'
            column.append(avg)
        columns.append(tuple(column))

    # Open the output only once every track has been computed, so a failure
    # above does not leave a truncated result file; `with` closes it on error.
    with gzip.open(os.path.join(consdir, tfname + motifChrom + 'cons.txt.gz'), 'w') as consfile:
        writer = csv.writer(consfile, delimiter='\t')
        # Transpose: per-track columns become per-motif rows.
        for row in zip(*columns):
            writer.writerow(list(row))
    return 0