def makeWig(fN, assembly, format=None, name=None):
    '''Build coverage wigs from a mapped-read file, one chromosome/strand at a time.

    Suitable for medium mapped files; takes longer, because the input file
    is read again for every acceptable chromosome and strand so that only
    one coverage map is held in memory at a time.

    fN       -- path of the mapped-read file
    assembly -- assembly name passed to cg.returnChromLengthDict
    format   -- parser name for returnParserFunction (defaults to 'Bowtie')
    name     -- track/file name prefix (defaults to the base name of fN)
    '''
    # assume bowtie
    if not format:
        format = 'Bowtie'
    parserFunction = returnParserFunction(format)
    if not name:
        name = cg.getBaseFileName(fN, naked=True)
    lDict = cg.returnChromLengthDict(assembly)

    for chrom in lDict:
        if chrom not in cg.acceptableChroms:
            continue
        for strand in ['1', '-1']:
            # create hitmap of chrom and strand: {coord: number of reads}
            print('%s %s hitmap' % (chrom, strand))
            coverage = {}
            with open(fN, 'r') as f:
                for line in f:
                    lChrom, lStrand, start, end = cg.tccSplit(parserFunction(line))
                    if lChrom != chrom or str(lStrand) != strand:
                        continue
                    # the end coordinate itself is counted as covered
                    for i in range(int(start), int(end) + 1):
                        coverage[i] = coverage.get(i, 0) + 1

            # write results to wig file; writeWigFromHitDict indexes its
            # input as hitDict[chrom][strand][coord] and requires a name.
            # NOTE(review): writeWigFromHitDict calls
            # cg.clearDirectory(directory, overwrite=False) on every call;
            # presumably that only creates a missing directory -- confirm it
            # does not remove wigs written for earlier chromosomes.
            writeWigFromHitDict({chrom: {strand: coverage}}, assembly, name)
def makePeakInputQ(cName, minExpression=2000):
    '''Submit one qsub job per genomic window so peaks are collected quickly.

    For every acceptable chromosome of the assembly configured in cName,
    and for both strands, consecutive points from
    rangePoints(1, chromLength, 30) delimit windows; q.sh is submitted
    through qsub with the window tcc, cName and minExpression as arguments.
    Each submission is waited on before the next one is issued.
    '''
    c.getConfig('Main.conf')  # result is not used below
    assembly = c.getConfig(cName).conf['assembly']
    chromLens = cg.returnChromLengthDict(assembly)

    for chrom in chromLens:
        if chrom in cg.acceptableChroms:
            for strand in ['1', '-1']:
                print('%s %s %s' % ('Getting Peaks for ', chrom, strand))
                lastPoint = 0
                for point in rangePoints(1, chromLens[chrom], 30):
                    windowStart, lastPoint = lastPoint, point
                    if point == 1:
                        # a point equal to 1 only seeds the next window start
                        continue

                    windowTcc = cg.makeTcc(chrom, strand, windowStart, point)
                    # NOTE(review): the stdout log name depends on the window
                    # start only, unlike the stderr log -- confirm that jobs
                    # of different chromosomes/strands sharing it is intended.
                    outLog = 'logs/o-' + str(windowStart)
                    errLog = 'logs/e-%s-%s-%s-%s' % (chrom, strand, windowStart, point)

                    command = ['qsub', '-V', '-cwd']
                    command += ['-e', errLog, '-o', outLog]
                    command += ['-l', 'mem=3G', '-l', 'rt=3600']
                    command += ['q.sh', windowTcc, cName, str(minExpression)]
                    job = subprocess.Popen(command)
                    job.wait()
def makePeakInputQ(cName, minExpression = 2000):
    '''Submit one qsub job per genomic window so peaks are collected quickly.

    For every acceptable chromosome of the assembly configured in cName,
    and for both strands, consecutive points from
    rangePoints(1, chromLength, 30) delimit windows; q.sh is submitted
    through qsub with the window tcc, cName and minExpression as arguments.
    Each submission is waited on before the next one is issued.
    '''
    c.getConfig('Main.conf')  # result is not used below
    assembly = c.getConfig(cName).conf['assembly']
    chromLens = cg.returnChromLengthDict(assembly)

    for chrom in chromLens:
        if chrom in cg.acceptableChroms:
            for strand in ['1', '-1']:
                print('%s %s %s' % ('Getting Peaks for ', chrom, strand))
                lastPoint = 0
                for point in rangePoints(1, chromLens[chrom], 30):
                    windowStart, lastPoint = lastPoint, point
                    if point == 1:
                        # a point equal to 1 only seeds the next window start
                        continue

                    windowTcc = cg.makeTcc(chrom, strand, windowStart, point)
                    # NOTE(review): the stdout log name depends on the window
                    # start only, unlike the stderr log -- confirm that jobs
                    # of different chromosomes/strands sharing it is intended.
                    outLog = 'logs/o-' + str(windowStart)
                    errLog = 'logs/e-%s-%s-%s-%s' % (chrom, strand, windowStart, point)

                    command = ['qsub', '-V', '-cwd']
                    command += ['-e', errLog, '-o', outLog]
                    command += ['-l', 'mem=3G', '-l', 'rt=3600']
                    command += ['q.sh', windowTcc, cName, str(minExpression)]
                    job = subprocess.Popen(command)
                    job.wait()
def makeWig(fN, assembly, format = None, name = None):
    '''Build coverage wigs from a mapped-read file, one chromosome/strand at a time.

    Suitable for medium mapped files; takes longer, because the input file
    is read again for every acceptable chromosome and strand so that only
    one coverage map is held in memory at a time.

    fN       -- path of the mapped-read file
    assembly -- assembly name passed to cg.returnChromLengthDict
    format   -- parser name for returnParserFunction (defaults to 'Bowtie')
    name     -- track/file name prefix (defaults to the base name of fN)
    '''
    # assume bowtie
    if not format:
        format = 'Bowtie'
    parserFunction = returnParserFunction(format)
    if not name:
        name = cg.getBaseFileName(fN, naked = True)
    lDict = cg.returnChromLengthDict(assembly)

    for chrom in lDict:
        if chrom not in cg.acceptableChroms:
            continue
        for strand in ['1', '-1']:
            # create hitmap of chrom and strand: {coord: number of reads}
            print('%s %s hitmap' % (chrom, strand))
            coverage = {}
            with open(fN, 'r') as f:
                for line in f:
                    lChrom, lStrand, start, end = cg.tccSplit(parserFunction(line))
                    if lChrom != chrom or str(lStrand) != strand:
                        continue
                    # the end coordinate itself is counted as covered
                    for i in range(int(start), int(end) + 1):
                        coverage[i] = coverage.get(i, 0) + 1

            # write results to wig file; writeWigFromHitDict indexes its
            # input as hitDict[chrom][strand][coord] and requires a name.
            # NOTE(review): writeWigFromHitDict calls
            # cg.clearDirectory(directory, overwrite = False) on every call;
            # presumably that only creates a missing directory -- confirm it
            # does not remove wigs written for earlier chromosomes.
            writeWigFromHitDict({chrom: {strand: coverage}}, assembly, name)
def updateWigLength(fN, assembly):
    '''Pad every chromosome of a wig file out to its full chromosome length.

    The file is rewritten in place.  Its first line is kept as the header.
    The remaining lines are read as tab-separated chrom, start, end, value.
    For each chromosome whose last line ends before the chromosome length,
    one line with value 0.000000 running from that end to the chromosome
    length is appended.  Chromosomes are written back in the order in
    which they first appear in the file; blank lines are dropped.

    fN       -- path of the wig file to update
    assembly -- assembly name passed to cg.returnChromLengthDict

    Raises KeyError if the file names a chromosome the assembly lacks.
    '''
    chromLengths = cg.returnChromLengthDict(assembly)

    chromOrder = []  # first-seen order, so the rewritten file keeps its layout
    lineDict = {}  # chr : [lines]
    with open(fN, 'r') as f:
        header = f.readline()
        for line in f:
            if not line.strip():
                continue
            chrom = line.split('\t')[0]
            if chrom not in lineDict:
                chromOrder.append(chrom)
                lineDict[chrom] = []
            lineDict[chrom].append(line)

    for chrom in chromOrder:
        print('extending %s' % chrom)
        chromLength = chromLengths[chrom]
        chromLines = lineDict[chrom]
        if not chromLines[-1].endswith('\n'):
            # a final line without newline would otherwise be fused with
            # whatever is written after it
            chromLines[-1] += '\n'
        lastValue = int(chromLines[-1].split('\t')[2])
        print(lastValue)
        if lastValue < chromLength:
            print('  %s %s' % (lastValue, chromLength))
            chromLines.append('%s\t%s\t%s\t0.000000\n' % (chrom, lastValue, chromLength))
            print(' updated')

    with open(fN, 'w') as f:
        f.write(header)
        for chrom in chromOrder:
            f.writelines(lineDict[chrom])
def writeWigDictToWig(wigDict, chrom, strand, assembly, name, outDir, blankValue = 0):
    '''Write a {coord: value} dict as tab-separated blocks covering one chromosome.

    Coords are taken as zero based positions; each one is written as the
    half-open interval [coord, coord + 1).  Runs of consecutive coords
    with equal values are merged into a single block, and every stretch
    without a value (before, between and after the blocks, up to the
    chromosome length) is written with blankValue.  Zero-length blocks
    are not written; an empty wigDict yields one blank block spanning the
    chromosome.

    wigDict    -- {coord: value}
    chrom      -- chromosome name, used for the length lookup and output
    strand     -- only used in the output file name
    assembly   -- assembly for the chromosome length lookup ('hg19' if empty)
    name       -- output file name prefix
    outDir     -- directory receiving <name>.<chrom>.<strand>.wig
    blankValue -- value written for uncovered stretches
    '''
    lDict = bioLibCG.returnChromLengthDict(assembly or 'hg19')
    chromEnd = lDict[chrom]
    outFN = outDir + '/%s.%s.%s.wig' % (name, chrom, strand)
    lineFormat = '%s\t%s\t%s\t%s\n'

    with open(outFN, 'w') as f:
        cursor = 0  # first position not yet written
        blockStart = blockEnd = blockValue = None
        for coord in sorted(wigDict):
            value = wigDict[coord]
            if blockStart is not None:
                if coord == blockEnd and value == blockValue:
                    # keep extending block
                    blockEnd = coord + 1
                    continue
                # finish last block
                f.write(lineFormat % (chrom, blockStart, blockEnd, blockValue))
                cursor = blockEnd
            if coord > cursor:
                # blank block for the gap in front of this coord
                f.write(lineFormat % (chrom, cursor, coord, blankValue))
            # init next block
            blockStart, blockEnd, blockValue = coord, coord + 1, value

        # write last block and last blank block line
        if blockStart is not None:
            f.write(lineFormat % (chrom, blockStart, blockEnd, blockValue))
            cursor = blockEnd
        if cursor < chromEnd:
            f.write(lineFormat % (chrom, cursor, chromEnd, blankValue))
def writeWigFromHitDict(hitDict, assembly, name, directory=None):
    '''Write one bedGraph track file per chromosome and strand of a hit map.

    hitDict   -- {chrom: {strand: {coord: value}}}
    assembly  -- assembly name passed to cg.returnChromLengthDict
    name      -- prefix of the track name and of <name>.<chrom>.<strand>.wig
    directory -- output directory; defaults to the 'wigs' entry of Main.conf

    Each coord is written as the interval [coord, coord + 1).  Runs of
    consecutive coords with equal values are merged into one block, and
    uncovered stretches (from 0, between blocks, and up to the chromosome
    length) are written as blocks of value 0.  The last block of every
    track is flushed after the loop so that it is not lost.
    '''
    mConf = c.getConfig('Main.conf')
    if not directory:
        directory = mConf.conf['wigs']
    if not name:
        # NOTE(review): this derives the name from the empty name itself;
        # no file name is available in this function -- confirm intent.
        name = cg.getBaseFileName(name, naked=True)
    lDict = cg.returnChromLengthDict(assembly)
    cg.clearDirectory(directory, overwrite=False)

    blockFormat = '%s\t%s\t%s\t%s\n'

    # write results to wig file
    for chrom in hitDict:
        for strand in hitDict[chrom]:
            coverage = hitDict[chrom][strand]
            chromEnd = lDict[chrom]
            with open(directory + '/%s.%s.%s.wig' % (name, chrom, strand), 'w') as oF:
                oF.write('track type=bedGraph name=%s.%s.%s\n' % (name, chrom, strand))

                cursor = 0  # first position not yet written
                blockStart = blockEnd = prevVal = None
                for key in sorted(coverage):
                    val = coverage[key]
                    if blockStart is not None:
                        if key == blockEnd and val == prevVal:
                            # should be combined
                            blockEnd = key + 1
                            continue
                        # write old block
                        oF.write(blockFormat % (chrom, blockStart, blockEnd, prevVal))
                        cursor = blockEnd
                    if key > cursor:
                        # write zero block
                        oF.write(blockFormat % (chrom, cursor, key, 0))
                    # start new block
                    blockStart, blockEnd, prevVal = key, key + 1, val

                # flush the block still open when the keys ran out
                if blockStart is not None:
                    oF.write(blockFormat % (chrom, blockStart, blockEnd, prevVal))
                    cursor = blockEnd
                # pad with zeros up to the chromosome length
                if cursor < chromEnd:
                    oF.write(blockFormat % (chrom, cursor, chromEnd, 0))
def makeWigMem(fN, assembly, format=None, name=None, directory=None):
    '''Build degradome wigs from a mapped-read file, holding all hits in memory.

    Suitable for small mapped files.  Every parsed read on an acceptable
    chromosome adds one count at a single coordinate: start + 20 for
    strand '1', start for any other strand.  The first line of the file
    is always skipped as a header.

    fN        -- path of the mapped-read file
    assembly  -- assembly name, forwarded to writeWigFromHitDict
    format    -- parser name for returnParserFunction (defaults to 'Bowtie')
    name      -- track/file name prefix (defaults to the base name of fN)
    directory -- output directory, forwarded to writeWigFromHitDict
    '''
    if not name:
        name = cg.getBaseFileName(fN, naked=True)
    if not format:
        format = 'Bowtie'
    parserFunction = returnParserFunction(format)

    # create hitmap of chrom and strand
    hitDict = {}  # format = chr: { strand : { coord : value } }
    with open(fN, 'r') as f:
        f.readline()  # header...file might not have one but its one read...
        for line in f:
            try:
                lChrom, lStrand, start, end = cg.tccSplit(parserFunction(line))
            except AttributeError:
                # lines that fail this way are skipped; presumably the
                # parser could not match them -- TODO confirm
                continue
            if lChrom not in cg.acceptableChroms:
                continue
            lStrand = str(lStrand)
            start = int(start)

            # wig for degradome
            if lStrand == '1':
                i = start + 20
            else:
                i = start
            strandHits = hitDict.setdefault(lChrom, {}).setdefault(lStrand, {})
            strandHits[i] = strandHits.get(i, 0) + 1

    # write results to wig file
    writeWigFromHitDict(hitDict, assembly, name, directory)
def makeWigMem(fN, assembly, format = None, name = None, directory = None):
    '''Build degradome wigs from a mapped-read file, holding all hits in memory.

    Suitable for small mapped files.  Every parsed read on an acceptable
    chromosome adds one count at a single coordinate: start + 20 for
    strand '1', start for any other strand.  The first line of the file
    is always skipped as a header.

    fN        -- path of the mapped-read file
    assembly  -- assembly name, forwarded to writeWigFromHitDict
    format    -- parser name for returnParserFunction (defaults to 'Bowtie')
    name      -- track/file name prefix (defaults to the base name of fN)
    directory -- output directory, forwarded to writeWigFromHitDict
    '''
    if not name:
        name = cg.getBaseFileName(fN, naked = True)
    if not format:
        format = 'Bowtie'
    parserFunction = returnParserFunction(format)

    # create hitmap of chrom and strand
    hitDict = {}  # format = chr: { strand : { coord : value } }
    with open(fN, 'r') as f:
        f.readline()  # header...file might not have one but its one read...
        for line in f:
            try:
                lChrom, lStrand, start, end = cg.tccSplit(parserFunction(line))
            except AttributeError:
                # lines that fail this way are skipped; presumably the
                # parser could not match them -- TODO confirm
                continue
            if lChrom not in cg.acceptableChroms:
                continue
            lStrand = str(lStrand)
            start = int(start)

            # wig for degradome
            if lStrand == '1':
                i = start + 20
            else:
                i = start
            strandHits = hitDict.setdefault(lChrom, {}).setdefault(lStrand, {})
            strandHits[i] = strandHits.get(i, 0) + 1

    # write results to wig file
    writeWigFromHitDict(hitDict, assembly, name, directory)
def writeWigFromHitDict(hitDict, assembly, name, directory = None):
    '''Write one bedGraph track file per chromosome and strand of a hit map.

    hitDict   -- {chrom: {strand: {coord: value}}}
    assembly  -- assembly name passed to cg.returnChromLengthDict
    name      -- prefix of the track name and of <name>.<chrom>.<strand>.wig
    directory -- output directory; defaults to the 'wigs' entry of Main.conf

    Each coord is written as the interval [coord, coord + 1).  Runs of
    consecutive coords with equal values are merged into one block, and
    uncovered stretches (from 0, between blocks, and up to the chromosome
    length) are written as blocks of value 0.  The last block of every
    track is flushed after the loop so that it is not lost.
    '''
    mConf = c.getConfig('Main.conf')
    if not directory:
        directory = mConf.conf['wigs']
    if not name:
        # NOTE(review): this derives the name from the empty name itself;
        # no file name is available in this function -- confirm intent.
        name = cg.getBaseFileName(name, naked = True)
    lDict = cg.returnChromLengthDict(assembly)
    cg.clearDirectory(directory, overwrite = False)

    blockFormat = '%s\t%s\t%s\t%s\n'

    # write results to wig file
    for chrom in hitDict:
        for strand in hitDict[chrom]:
            coverage = hitDict[chrom][strand]
            chromEnd = lDict[chrom]
            with open(directory + '/%s.%s.%s.wig' % (name, chrom, strand), 'w') as oF:
                oF.write('track type=bedGraph name=%s.%s.%s\n' % (name, chrom, strand))

                cursor = 0  # first position not yet written
                blockStart = blockEnd = prevVal = None
                for key in sorted(coverage):
                    val = coverage[key]
                    if blockStart is not None:
                        if key == blockEnd and val == prevVal:
                            # should be combined
                            blockEnd = key + 1
                            continue
                        # write old block
                        oF.write(blockFormat % (chrom, blockStart, blockEnd, prevVal))
                        cursor = blockEnd
                    if key > cursor:
                        # write zero block
                        oF.write(blockFormat % (chrom, cursor, key, 0))
                    # start new block
                    blockStart, blockEnd, prevVal = key, key + 1, val

                # flush the block still open when the keys ran out
                if blockStart is not None:
                    oF.write(blockFormat % (chrom, blockStart, blockEnd, prevVal))
                    cursor = blockEnd
                # pad with zeros up to the chromosome length
                if cursor < chromEnd:
                    oF.write(blockFormat % (chrom, cursor, chromEnd, 0))
def writeSetToWig(wigSet, chrom, strand, assembly, name, outDir):
    '''Write a set of coordinates as tab-separated 0/1 blocks covering one chromosome.

    Each coord in wigSet is written as the interval [coord, coord + 1).
    Runs of consecutive coords are merged into one block of value 1, and
    every stretch not in the set (before, between and after the blocks, up
    to the chromosome length) is written with value 0.  Zero-length blocks
    are not written; an empty wigSet yields one 0 block spanning the
    chromosome.

    wigSet   -- iterable of integer coordinates
    chrom    -- chromosome name, used for the length lookup and output
    strand   -- only used in the output file name
    assembly -- assembly for the chromosome length lookup ('hg19' if empty)
    name     -- output file name prefix
    outDir   -- directory receiving <name>.<chrom>.<strand>.wig
    '''
    lDict = bioLibCG.returnChromLengthDict(assembly or 'hg19')
    chromEnd = lDict[chrom]
    outFN = outDir + '/%s.%s.%s.wig' % (name, chrom, strand)
    lineFormat = '%s\t%s\t%s\t%s\n'

    with open(outFN, 'w') as f:
        cursor = 0  # first position not yet written
        blockStart = blockEnd = None
        for coord in sorted(wigSet):
            if blockStart is not None:
                if coord == blockEnd:
                    # keep extending block
                    blockEnd = coord + 1
                    continue
                # finish last block
                f.write(lineFormat % (chrom, blockStart, blockEnd, 1))
                cursor = blockEnd
            if coord > cursor:
                # zero block for the gap in front of this coord
                f.write(lineFormat % (chrom, cursor, coord, 0))
            # init next block
            blockStart, blockEnd = coord, coord + 1

        # write last block and last 0 block line
        if blockStart is not None:
            f.write(lineFormat % (chrom, blockStart, blockEnd, 1))
            cursor = blockEnd
        if cursor < chromEnd:
            f.write(lineFormat % (chrom, cursor, chromEnd, 0))
def spacerDistData(tranFN, outFN):
    '''Write the gaps between neighbouring transcript intervals to outFN.

    tranFN is read as tab-separated lines whose columns 1 to 4 (counting
    from 0) are chr, strand, tranStart, tranEnd.  Intervals are collected
    in one IntervalSet per hg19 chromosome and strand ('+' / '-'); for
    every interval except the last of its set, the lower bound of the
    following interval minus its own upper bound is written, one value
    per line.
    '''
    # one empty interval set per chromosome and strand
    chrom_strand_iSet = {}
    for chromName in bioLibCG.returnChromLengthDict('hg19'):
        chrom_strand_iSet[chromName] = {}
        for strandName in ('+', '-'):
            chrom_strand_iSet[chromName][strandName] = IntervalSet()

    print('making intervals')
    tranFile = open(tranFN, 'r')
    for rawLine in tranFile:
        fields = rawLine.strip().split('\t')
        tranStart = int(fields[3])
        tranEnd = int(fields[4])
        chrom_strand_iSet[fields[1]][fields[2]].add(Interval(tranStart, tranEnd))
    tranFile.close()

    print('creating spacer data')
    gaps = []
    for chromName in chrom_strand_iSet:
        for strandName in chrom_strand_iSet[chromName]:
            intervals = chrom_strand_iSet[chromName][strandName]
            for position, current in enumerate(intervals):
                if current == intervals[-1]:
                    # the last interval has no successor
                    break
                following = intervals[position + 1]
                gaps.append(following.lower_bound - current.upper_bound)

    outFile = open(outFN, 'w')
    outFile.write(''.join([str(gap) + '\n' for gap in gaps]))
    outFile.close()
def makePeakInput(cName, minExpression=2000):
    '''Scan the genome for roof-shaped peaks and write them to peakData.<minExpression>.

    For every acceptable chromosome of the assembly configured in cName,
    and both strands, consecutive points of
    rangePoints(1, chromLength, 1000) delimit windows.  Peaks of each
    window come from cgPeaks.stretch(tcc, cName) after
    createPeaks(span=3, minVal=minExpression).  For every peak x that does
    not lie before the end of the previously examined roof:

    - a ratio profile around x is taken with
      stepVectorScan.profileAroundPoint(..., 30, cName, ratio=True);
    - the longest run of offsets in [-29, 29] whose ratio exceeds 0.80 is
      located (a run still open at the last offset is not considered);
    - low = run start - 4 and high = run end + 4 must lie strictly inside
      the scanned offsets, otherwise the peak is skipped;
    - the tcc from x + low to x + high is written, one per line, when the
      run length is between 15 and 25 and the ratio at both low and high
      is below 0.2.

    A run that starts or ends at offset 0 is a valid roof, so the run
    bounds are tested against None and not by truthiness.
    '''
    conf = c.getConfig(cName)
    assembly = conf.conf['assembly']
    chromLens = cg.returnChromLengthDict(assembly)

    pRange = 30   # scan a 30 bp range around each peak
    minVal = .80  # ratio a position must exceed to belong to the roof

    with open('peakData.%s' % minExpression, 'w') as f:
        for chrom in chromLens:
            if chrom not in cg.acceptableChroms:
                continue
            for strand in ['1', '-1']:
                print('Getting Peaks for  %s %s' % (chrom, strand))
                prevI = 0
                endCheck = 0
                for i in rangePoints(1, chromLens[chrom], 1000):
                    if i == 1:
                        prevI = i
                        continue
                    start, end, prevI = prevI, i, i
                    tcc = cg.makeTcc(chrom, strand, start, end)

                    peaks = cgPeaks.stretch(tcc, cName)
                    peaks.createPeaks(span=3, minVal=minExpression)

                    for x in peaks.peaks:
                        if x < endCheck:
                            # lies before the end of the last examined roof
                            continue

                        # now make profile for roof...
                        rTcc = cg.makeTcc(chrom, strand, x, x + 1)
                        ratios = stepVectorScan.profileAroundPoint(rTcc, pRange, cName, ratio=True)

                        # now get highest stretch length and its bounds
                        highest = 0
                        stretch = 0
                        startCurrent = None
                        startFinal = None
                        endFinal = None
                        for offset in range(1 - pRange, pRange):
                            if ratios[offset] > minVal:
                                stretch += 1
                                if startCurrent is None:
                                    startCurrent = offset
                            else:
                                if stretch > highest:
                                    # stretch ended and was higher than previous
                                    highest = stretch
                                    endFinal = offset - 1
                                    startFinal = startCurrent
                                startCurrent = None
                                stretch = 0

                        if startFinal is None or endFinal is None:
                            continue

                        # get +/- 4 value...
                        low = startFinal - 4
                        high = endFinal + 4
                        if not (low > (1 - pRange) and high < pRange):
                            continue
                        lowVal = float(ratios[low])
                        highVal = float(ratios[high])

                        endCheck = x + high

                        # filter out peaks that look a certain way.
                        if 14 < highest < 26:  # rooflength
                            if lowVal < 0.2 and highVal < .2:  # drop values
                                goodTcc = cg.makeTcc(chrom, strand, x + low, x + high)
                                f.write('%s\n' % goodTcc)
def makeWigMem(fN, assembly, format = None, name = None, directory = None, degWig = False, switchStrand = True, normalized = False):
    '''Build wigs from a mapped-read file, holding all hits in memory.

    Suitable for small mapped files.  The first line of the file is always
    skipped as a header.

    fN           -- path of the mapped-read file
    assembly     -- assembly name, forwarded to writeWigFromHitDict
    format       -- parser name for returnParserFunction (defaults to 'Bowtie')
    name         -- track/file name prefix (defaults to the base name of fN)
    directory    -- output directory, forwarded to writeWigFromHitDict
    degWig       -- if true each read counts at one coordinate only (wig
                    for degradome); otherwise it counts at every
                    coordinate of range(start, end)
    switchStrand -- does not switch the strands; it selects which strand
                    is counted at its end coordinate in degWig mode ('1'
                    when true, '-1' when false, for data that is
                    backwards such as HeLa); the other strand is counted
                    at start + 1
    normalized   -- if true each read counts 1 / (n + 1), with n read from
                    the 7th tab-separated column of its line (presumably
                    the number of other places the read mapped -- TODO
                    confirm); the column is only parsed in this mode
    '''
    print('degWig Value %s' % (degWig,))
    print('switch strands? %s' % (switchStrand,))

    if not name:
        name = cg.getBaseFileName(fN, naked = True)
    if not format:
        format = 'Bowtie'
    parserFunction = returnParserFunction(format)

    # strand whose reads are placed at their end coordinate in degWig mode
    if switchStrand:
        endStrand = '1'
    else:
        endStrand = '-1'

    # create hitmap of chrom and strand
    hitDict = {}  # format = chr: { strand : { coord : value } }
    with open(fN, 'r') as f:
        f.readline()  # header...file might not have one but its one read...
        for line in f:
            lChrom, lStrand, start, end = cg.tccSplit(parserFunction(line))
            lStrand = str(lStrand)
            start = int(start)
            end = int(end)

            readCount = 1
            if normalized:
                numPlacesMapped = int(line.strip().split('\t')[6]) + 1
                readCount = float(readCount) / numPlacesMapped

            if lChrom not in cg.acceptableChroms:
                continue

            if not degWig:
                # wig for regular
                coords = range(start, end)
            elif lStrand == endStrand:
                coords = [end]
            else:
                coords = [start + 1]

            if coords:
                strandHits = hitDict.setdefault(lChrom, {}).setdefault(lStrand, {})
                for i in coords:
                    strandHits[i] = strandHits.get(i, 0) + readCount

    # write results to wig file
    writeWigFromHitDict(hitDict, assembly, name, directory)
def writeWigDictToWig(wigDict, chrom, strand, assembly, name, outDir, blankValue=0):
    '''Write a {coord: value} dict as tab-separated blocks covering one chromosome.

    Coords are taken as zero based positions; each one is written as the
    half-open interval [coord, coord + 1).  Runs of consecutive coords
    with equal values are merged into a single block, and every stretch
    without a value (before, between and after the blocks, up to the
    chromosome length) is written with blankValue.  Zero-length blocks
    are not written; an empty wigDict yields one blank block spanning the
    chromosome.

    wigDict    -- {coord: value}
    chrom      -- chromosome name, used for the length lookup and output
    strand     -- only used in the output file name
    assembly   -- assembly for the chromosome length lookup ('hg19' if empty)
    name       -- output file name prefix
    outDir     -- directory receiving <name>.<chrom>.<strand>.wig
    blankValue -- value written for uncovered stretches
    '''
    lDict = bioLibCG.returnChromLengthDict(assembly or 'hg19')
    chromEnd = lDict[chrom]
    outFN = outDir + '/%s.%s.%s.wig' % (name, chrom, strand)
    lineFormat = '%s\t%s\t%s\t%s\n'

    with open(outFN, 'w') as f:
        cursor = 0  # first position not yet written
        blockStart = blockEnd = blockValue = None
        for coord in sorted(wigDict):
            value = wigDict[coord]
            if blockStart is not None:
                if coord == blockEnd and value == blockValue:
                    # keep extending block
                    blockEnd = coord + 1
                    continue
                # finish last block
                f.write(lineFormat % (chrom, blockStart, blockEnd, blockValue))
                cursor = blockEnd
            if coord > cursor:
                # blank block for the gap in front of this coord
                f.write(lineFormat % (chrom, cursor, coord, blankValue))
            # init next block
            blockStart, blockEnd, blockValue = coord, coord + 1, value

        # write last block and last blank block line
        if blockStart is not None:
            f.write(lineFormat % (chrom, blockStart, blockEnd, blockValue))
            cursor = blockEnd
        if cursor < chromEnd:
            f.write(lineFormat % (chrom, cursor, chromEnd, blankValue))
def makeWigMem(fN, assembly, format=None, name=None, directory=None, degWig=False, switchStrand=True, normalized=False):
    '''Build wigs from a mapped-read file, holding all hits in memory.

    Suitable for small mapped files.  The first line of the file is always
    skipped as a header.

    fN           -- path of the mapped-read file
    assembly     -- assembly name, forwarded to writeWigFromHitDict
    format       -- parser name for returnParserFunction (defaults to 'Bowtie')
    name         -- track/file name prefix (defaults to the base name of fN)
    directory    -- output directory, forwarded to writeWigFromHitDict
    degWig       -- if true each read counts at one coordinate only (wig
                    for degradome); otherwise it counts at every
                    coordinate of range(start, end)
    switchStrand -- does not switch the strands; it selects which strand
                    is counted at its end coordinate in degWig mode ('1'
                    when true, '-1' when false, for data that is
                    backwards such as HeLa); the other strand is counted
                    at start + 1
    normalized   -- if true each read counts 1 / (n + 1), with n read from
                    the 7th tab-separated column of its line (presumably
                    the number of other places the read mapped -- TODO
                    confirm); the column is only parsed in this mode
    '''
    print('degWig Value %s' % (degWig,))
    print('switch strands? %s' % (switchStrand,))

    if not name:
        name = cg.getBaseFileName(fN, naked=True)
    if not format:
        format = 'Bowtie'
    parserFunction = returnParserFunction(format)

    # strand whose reads are placed at their end coordinate in degWig mode
    if switchStrand:
        endStrand = '1'
    else:
        endStrand = '-1'

    # create hitmap of chrom and strand
    hitDict = {}  # format = chr: { strand : { coord : value } }
    with open(fN, 'r') as f:
        f.readline()  # header...file might not have one but its one read...
        for line in f:
            lChrom, lStrand, start, end = cg.tccSplit(parserFunction(line))
            lStrand = str(lStrand)
            start = int(start)
            end = int(end)

            readCount = 1
            if normalized:
                numPlacesMapped = int(line.strip().split('\t')[6]) + 1
                readCount = float(readCount) / numPlacesMapped

            if lChrom not in cg.acceptableChroms:
                continue

            if not degWig:
                # wig for regular
                coords = range(start, end)
            elif lStrand == endStrand:
                coords = [end]
            else:
                coords = [start + 1]

            if coords:
                strandHits = hitDict.setdefault(lChrom, {}).setdefault(lStrand, {})
                for i in coords:
                    strandHits[i] = strandHits.get(i, 0) + readCount

    # write results to wig file
    writeWigFromHitDict(hitDict, assembly, name, directory)
def makePeakInput(cName, minExpression = 2000):
    '''Scan the genome for roof-shaped peaks and write them to peakData.<minExpression>.

    For every acceptable chromosome of the assembly configured in cName,
    and both strands, consecutive points of
    rangePoints(1, chromLength, 1000) delimit windows.  Peaks of each
    window come from cgPeaks.stretch(tcc, cName) after
    createPeaks(span = 3, minVal = minExpression).  For every peak x that
    does not lie before the end of the previously examined roof:

    - a ratio profile around x is taken with
      stepVectorScan.profileAroundPoint(..., 30, cName, ratio = True);
    - the longest run of offsets in [-29, 29] whose ratio exceeds 0.80 is
      located (a run still open at the last offset is not considered);
    - low = run start - 4 and high = run end + 4 must lie strictly inside
      the scanned offsets, otherwise the peak is skipped;
    - the tcc from x + low to x + high is written, one per line, when the
      run length is between 15 and 25 and the ratio at both low and high
      is below 0.2.

    A run that starts or ends at offset 0 is a valid roof, so the run
    bounds are tested against None and not by truthiness.
    '''
    conf = c.getConfig(cName)
    assembly = conf.conf['assembly']
    chromLens = cg.returnChromLengthDict(assembly)

    pRange = 30   # scan a 30 bp range around each peak
    minVal = .80  # ratio a position must exceed to belong to the roof

    with open('peakData.%s' % minExpression, 'w') as f:
        for chrom in chromLens:
            if chrom not in cg.acceptableChroms:
                continue
            for strand in ['1', '-1']:
                print('Getting Peaks for  %s %s' % (chrom, strand))
                prevI = 0
                endCheck = 0
                for i in rangePoints(1, chromLens[chrom], 1000):
                    if i == 1:
                        prevI = i
                        continue
                    start, end, prevI = prevI, i, i
                    tcc = cg.makeTcc(chrom, strand, start, end)

                    peaks = cgPeaks.stretch(tcc, cName)
                    peaks.createPeaks(span = 3, minVal = minExpression)

                    for x in peaks.peaks:
                        if x < endCheck:
                            # lies before the end of the last examined roof
                            continue

                        # now make profile for roof...
                        rTcc = cg.makeTcc(chrom, strand, x, x + 1)
                        ratios = stepVectorScan.profileAroundPoint(rTcc, pRange, cName, ratio = True)

                        # now get highest stretch length and its bounds
                        highest = 0
                        stretch = 0
                        startCurrent = None
                        startFinal = None
                        endFinal = None
                        for offset in range(1 - pRange, pRange):
                            if ratios[offset] > minVal:
                                stretch += 1
                                if startCurrent is None:
                                    startCurrent = offset
                            else:
                                if stretch > highest:
                                    # stretch ended and was higher than previous
                                    highest = stretch
                                    endFinal = offset - 1
                                    startFinal = startCurrent
                                startCurrent = None
                                stretch = 0

                        if startFinal is None or endFinal is None:
                            continue

                        # get +/- 4 value...
                        low = startFinal - 4
                        high = endFinal + 4
                        if not (low > (1 - pRange) and high < pRange):
                            continue
                        lowVal = float(ratios[low])
                        highVal = float(ratios[high])

                        endCheck = x + high

                        # filter out peaks that look a certain way.
                        if 14 < highest < 26:  # rooflength
                            if lowVal < 0.2 and highVal < .2:  # drop values
                                goodTcc = cg.makeTcc(chrom, strand, x + low, x + high)
                                f.write('%s\n' % goodTcc)