def plotASProfile(tcc, cName, directory = None, min = 0, extra = "0"):
    """Plot the sense and antisense expression profiles of a region into one PNG.

    The image is named ``<extra>.<tcc>.png`` and is placed in *directory*
    when one is given, otherwise in the current working directory.

    tcc       -- 'chrom:strand:start:end' string of the sense region
    cName     -- configuration name handed to cgPeaks.stretch
    directory -- optional output directory
    min       -- both strands must reach at least this highest level
    extra     -- prefix of the file name

    Returns 0 (and plots nothing) as soon as one strand's highest level is
    below *min*; otherwise falls off the end after writing the image.
    """
    fileName = extra + '.' + tcc + '.png'
    if directory:
        fileName = directory + '/' + fileName

    # Sense strand profile
    senseStretch = cgPeaks.stretch(tcc, cName)
    if senseStretch.getHighestLevel() < min:
        return 0
    senseX = sorted(senseStretch.profile.keys())
    senseY = [senseStretch.profile[coord] for coord in senseX]

    # Antisense strand profile: identical coordinates, flipped strand
    chrom, strand, start, end = tcc.strip().split(':')
    flipped = '-1' if strand == '1' else '1'
    antiStretch = cgPeaks.stretch(cg.makeTcc(chrom, flipped, start, end), cName)
    if antiStretch.getHighestLevel() < min:
        return 0
    antiX = sorted(antiStretch.profile.keys())
    antiY = [antiStretch.profile[coord] for coord in antiX]

    # Two stacked panels: sense on top, antisense below
    grDevices = importr('grDevices')
    grDevices.png(file=fileName, width=1680, height=1050)
    r('split.screen(c(2,1))')
    r('screen(1)')
    r.plot(senseX, senseY, xlab = "Coordinates", ylab = "(Syn) Expression Level")
    r.lines(senseX, senseY, type = "b")
    r('screen(2)')
    r.plot(antiX, antiY, xlab = "Coordinates", ylab = "(Anti) Expression Level")
    r.lines(antiX, antiY, type = "b")
    grDevices.dev_off()
def profileTargetsHistoAS(tccList, cName, name='boxplot'):
    """Box-plot the expression around the top sense and antisense peaks.

    For every tcc in *tccList* the highest peak on the given strand and on
    the antisense strand is located.  Regions lacking a peak on either
    strand are skipped.  The profile around the sense peak (50 positions to
    each side, as ratios) is pooled per offset, and likewise for the
    antisense strand, where ratios are taken relative to the antisense peak
    coordinate.  Both pools are handed to plot.boxPlotHistoAS.

    tccList -- iterable of 'chrom:strand:start:end' strings
    cName   -- configuration name passed to cgPeaks / svs
    name    -- forwarded to plot.boxPlotHistoAS
    """
    flank = 50
    histDict = {}    # {coord: [values]}
    histDictAS = {}

    for tcc in tccList:
        chrom, strand, start, end = cg.tccSplit(tcc)

        # Highest peak on the sense strand
        senseStretch = cgPeaks.stretch(tcc, cName)
        senseStretch.createPeaks(span=2)
        highestCoord = senseStretch.getHighestPeak()
        if highestCoord is None:
            continue

        # Highest peak on the antisense strand
        antiStretch = cgPeaks.stretch(cg.convertToAS(tcc), cName)
        antiStretch.createPeaks(span=2)
        highestCoordAS = antiStretch.getHighestPeak()
        if highestCoordAS is None:
            continue

        # Profile around the sense peak
        zPoint = cg.makeTcc(chrom, strand, highestCoord, end)
        cProfile = svs.profileAroundPoint(zPoint, flank, cName, ratio=True)
        for coord in cProfile:
            # setdefault replaces a bare except that hid every kind of error
            histDict.setdefault(coord, []).append(cProfile[coord])

        # Same window on the antisense strand, scaled by the antisense peak
        zPointAS = cg.convertToAS(zPoint)
        cProfileAS = svs.profileAroundPoint(zPointAS, flank, cName, ratio=True,
                                            ratioCoord=highestCoordAS)
        for coord in cProfileAS:
            histDictAS.setdefault(coord, []).append(cProfileAS[coord])

    plot.boxPlotHistoAS(histDict, histDictAS, name=name)
def profileTargetsHistoAS(tccList, cName, name = 'boxplot'):
    """Collect per-offset expression ratios around sense/antisense peaks and box-plot them.

    Each tcc contributes only when a highest peak exists on both its own
    strand and the antisense strand.  The window is 50 positions to each
    side of the sense peak; the antisense window covers the same coordinates
    and is scaled by the antisense peak coordinate (ratioCoord).

    tccList -- iterable of 'chrom:strand:start:end' strings
    cName   -- configuration name passed to cgPeaks / svs
    name    -- forwarded to plot.boxPlotHistoAS
    """
    windowSize = 50
    histDict = {}    # {coord: [values]} for the sense strand
    histDictAS = {}  # {coord: [values]} for the antisense strand

    for tcc in tccList:
        chrom, strand, start, end = cg.tccSplit(tcc)

        # Sense peak
        tccStretch = cgPeaks.stretch(tcc, cName)
        tccStretch.createPeaks(span = 2)
        highestCoord = tccStretch.getHighestPeak()
        if highestCoord is None:
            continue

        # Antisense peak
        tccStretchAS = cgPeaks.stretch(cg.convertToAS(tcc), cName)
        tccStretchAS.createPeaks(span = 2)
        highestCoordAS = tccStretchAS.getHighestPeak()
        if highestCoordAS is None:
            continue

        # Sense profile
        zPoint = cg.makeTcc(chrom, strand, highestCoord, end)
        senseProfile = svs.profileAroundPoint(zPoint, windowSize, cName, ratio = True)
        for coord in senseProfile:
            # explicit initialisation instead of a catch-all except
            histDict.setdefault(coord, []).append(senseProfile[coord])

        # Antisense profile
        antiProfile = svs.profileAroundPoint(cg.convertToAS(zPoint), windowSize, cName,
                                             ratio = True, ratioCoord = highestCoordAS)
        for coord in antiProfile:
            histDictAS.setdefault(coord, []).append(antiProfile[coord])

    plot.boxPlotHistoAS(histDict, histDictAS, name = name)
def updateSmallExpression(oFN, cName, rn = None, tn = None):
    """Store the highest small-library level of every origin RNA in a Nexus file.

    Loads the 'eLevel' and 'tcc' attributes from *oFN*, replaces each eLevel
    entry with the highest level of the cgPeaks stretch over that entry's
    tcc, then saves the Nexus.

    oFN    -- Nexus file name
    cName  -- configuration name passed to cgPeaks.stretch
    rn, tn -- passed straight through to Nexus.load
              (presumably run number / total runs -- confirm)
    """
    nexus = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    nexus.load(['eLevel', 'tcc'], [rn, tn])

    for oID in nexus.eLevel:
        # the stretch holds the values of the small library
        smallStretch = cgPeaks.stretch(nexus.tcc[oID], cName)
        nexus.eLevel[oID] = smallStretch.getHighestLevel()

    nexus.save()
def updateSmallExpression(aDir, cName):
    """Set eLevel of every stored origin RNA to its highest small-library level.

    aDir  -- directory given to cgDB.dataController
    cName -- configuration name passed to cgPeaks.stretch

    All objects are loaded, updated in memory and committed in one go.
    """
    controller = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = controller.load()

    for oRNA in id_oRNA.values():
        # the stretch holds the values of the small library
        oRNA.eLevel = cgPeaks.stretch(oRNA.tcc, cName).getHighestLevel()

    controller.commit(id_oRNA)
def plotPairs(oDir, aDir, cName):
    """Plot the expression profile under a filtered origin-RNA/target alignment.

    Walks the origin RNAs that passed the filter and their filtered targets,
    prints debugging information for the pair, derives the window of the
    target covered by the small RNA and plots the profile over that window
    with matplotlib.

    oDir  -- directory of stored origin RNAs
    aDir  -- directory of stored alignments
    cName -- configuration name passed to cgPeaks.stretch

    NOTE(review): the function returns 0 right after showing the first plot,
    so only one pair is ever displayed -- confirm this early exit is wanted.
    """
    id_oRNA = cgDB.dataController(oDir, cgOriginRNA.OriginRNA).load()
    id_alignment = cgDB.dataController(aDir, cgAlignment.cgAlignment).load()

    for oRNA in id_oRNA.values():
        if not oRNA.passedFilter:
            continue

        for aID in oRNA.filteredTargets:
            alignment = id_alignment[aID]
            chrom, strand, start, end = bioLibCG.tccSplit(alignment.tTcc)
            offset = alignment.tStart
            sLen = alignment.sLength

            print(sLen)
            print(oRNA.sequence)
            print(oRNA.tcc)
            print(alignment.tTcc)

            # 19 is a fixed flank; presumably the amount the target region
            # was extended on each side -- TODO confirm
            if strand == '1':
                start = start - 19 + offset
                end = start + sLen
            else:
                end = end + 19 - offset
                start = end - sLen
            print('%s %s %s %s' % (chrom, strand, start, end))

            stretch = cgPeaks.stretch(bioLibCG.makeTcc(chrom, strand, start, end), cName)

            # Coordinates ascending, or descending on the minus strand
            xVals = sorted(stretch.profile.keys())
            if strand == '-1':
                xVals.reverse()
            yVals = [stretch.profile[coord] for coord in xVals]

            print('%s %s' % (xVals, len(xVals)))
            print('%s %s' % (yVals, len(yVals)))

            plt.plot(xVals, yVals)
            plt.show()
            return 0
def plotSmallDeg(tcc, smallCName, degCName, outDir = None, description = "None", nameNum = "0"):
    """Plot the degradome and small-RNA profiles of one region into a PNG.

    The image is named ``<nameNum>.<tcc>.png`` and goes into *outDir* when
    given, otherwise into the current working directory.

    tcc         -- region to profile
    smallCName  -- configuration name of the small-RNA library
    degCName    -- configuration name of the degradome library
    outDir      -- optional output directory
    description -- used as the x-axis label of the small-RNA panel
    nameNum     -- prefix of the file name
    """
    fileName = nameNum + "." + tcc + '.png'
    if outDir:
        fileName = outDir + '/' + fileName

    # Degradome profile
    degStretch = cgPeaks.stretch(tcc, degCName)
    degX = sorted(degStretch.profile.keys())
    degY = [degStretch.profile[coord] for coord in degX]

    # Small-RNA profile over the same region
    smallStretch = cgPeaks.stretch(tcc, smallCName)
    smallX = sorted(smallStretch.profile.keys())
    smallY = [smallStretch.profile[coord] for coord in smallX]

    # Two stacked panels: degradome on top, small RNA below
    grDevices = importr('grDevices')
    grDevices.png(file=fileName, width=1680, height=1050)
    r('split.screen(c(2,1))')
    r('screen(1)')
    r.plot(degX, degY, xlab = "Coordinates", ylab = "Degradome Expression")
    r.lines(degX, degY, type = "b")
    r('screen(2)')
    r.plot(smallX, smallY, xlab = description, ylab = "Small Expression")
    r.lines(smallX, smallY, type = "b")
    grDevices.dev_off()
def updateSmallExpression(oFN, cName, rn=None, tn=None):
    """Write each origin RNA's highest small-library level into its eLevel.

    oFN    -- Nexus file holding the origin RNAs
    cName  -- configuration name passed to cgPeaks.stretch
    rn, tn -- forwarded unchanged to Nexus.load
              (presumably run number / total runs -- confirm)

    Only the 'eLevel' and 'tcc' attributes are loaded; the Nexus is saved
    after every entry has been updated.
    """
    originNexus = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    originNexus.load(['eLevel', 'tcc'], [rn, tn])

    for oID in originNexus.eLevel:
        # this stretch contains the values of the small library
        regionStretch = cgPeaks.stretch(originNexus.tcc[oID], cName)
        originNexus.eLevel[oID] = regionStretch.getHighestLevel()

    originNexus.save()
def updateSmallExpression(aDir, cName):
    """Record the highest small-library level of each stored origin RNA.

    aDir  -- directory handed to cgDB.dataController
    cName -- configuration name passed to cgPeaks.stretch

    Loads every origin RNA, overwrites its eLevel and commits the whole
    mapping back through the same controller.
    """
    oRNA_DC = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = oRNA_DC.load()

    for originRNA in id_oRNA.values():
        # this stretch contains the values of the small library
        regionStretch = cgPeaks.stretch(originRNA.tcc, cName)
        originRNA.eLevel = regionStretch.getHighestLevel()

    oRNA_DC.commit(id_oRNA)
def markCenterExpression(aDir, cName):
    """Store the share of expression found at the centre of each alignment.

    For every alignment the window of the target covered by the small RNA is
    profiled.  centerExpression becomes a three-item list: the summed level
    of the ordered positions [8:12], [7:13] and [6:14], each divided by the
    total level of the window.  The list stays [0.0, 0.0, 0.0] when the
    window has no expression at all.

    aDir  -- directory of stored alignments
    cName -- configuration name passed to cgPeaks.stretch
    """
    # Slices of the ordered window, narrowest first
    centerSlices = ((8, 12), (7, 13), (6, 14))

    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    for alignment in id_alignment.values():
        alignment.centerExpression = [0.0, 0.0, 0.0]

        chrom, strand, start, end = bioLibCG.tccSplit(alignment.tTcc)
        offset = alignment.tStart
        sLen = alignment.sLength

        # 19 is a fixed flank; presumably the amount the target region was
        # extended on each side -- TODO confirm
        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen

        stretch = cgPeaks.stretch(bioLibCG.makeTcc(chrom, strand, start, end), cName)
        expressionSum = stretch.getSumOfLevels()

        # Order positions 5'->3': ascending, reversed on the minus strand
        orderedKeys = sorted(stretch.profile.keys())
        if strand == '-1':
            orderedKeys.reverse()

        if expressionSum == 0:
            continue

        for slot, (first, last) in enumerate(centerSlices):
            centerSum = sum((stretch.profile[key] for key in orderedKeys[first:last]), 0.0)
            alignment.centerExpression[slot] = centerSum/expressionSum

    aDC.commit(id_alignment)
def markCenterExpression(aDir, cName):
    """Compute centre-of-window expression fractions for all stored alignments.

    aDir  -- directory of stored alignments
    cName -- configuration name passed to cgPeaks.stretch

    Each alignment gets centerExpression = [f0, f1, f2], where the three
    fractions are the levels summed over ordered positions [8:12], [7:13]
    and [6:14] of the scanned window divided by the window's total level.
    Alignments whose window sums to zero keep [0.0, 0.0, 0.0].
    """
    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    for alignment in id_alignment.values():
        alignment.centerExpression = [0.0, 0.0, 0.0]

        chrom, strand, start, end = bioLibCG.tccSplit(alignment.tTcc)
        offset = alignment.tStart
        sLen = alignment.sLength

        # Window of length sLen anchored by the alignment offset.  The 19 is
        # a fixed flank -- presumably the target extension; TODO confirm.
        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen

        scanRange = bioLibCG.makeTcc(chrom, strand, start, end)
        stretch = cgPeaks.stretch(scanRange, cName)
        expressionSum = stretch.getSumOfLevels()

        positions = sorted(stretch.profile.keys())
        if strand == '-1':
            positions.reverse()

        if expressionSum != 0:
            slot = 0
            for first, last in ((8, 12), (7, 13), (6, 14)):
                subtotal = 0.0
                for key in positions[first:last]:
                    subtotal += stretch.profile[key]
                alignment.centerExpression[slot] = subtotal / expressionSum
                slot += 1

    aDC.commit(id_alignment)
def markCenterExpression(aFN, cName, rn = None, tn = None):
    """Store centre-of-window expression fractions for the alignments of a Nexus file.

    aFN    -- Nexus file of alignments
    cName  -- configuration name passed to cgPeaks.stretch
    rn, tn -- passed straight through to Nexus.load
              (presumably run number / total runs -- confirm)

    centerExpression[aID] becomes [f0, f1, f2]: the levels summed over the
    ordered positions [8:12], [7:13] and [6:14] of the scanned window, each
    divided by the window's total level.  It stays [0.0, 0.0, 0.0] when the
    total is zero.
    """
    # Slices of the ordered window, narrowest first
    centerSlices = ((8, 12), (7, 13), (6, 14))

    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['centerExpression', 'tTcc', 'tStart', 'sLength'], [rn, tn])

    for aID in aNX.centerExpression:
        aNX.centerExpression[aID] = [0.0, 0.0, 0.0]

        chrom, strand, start, end = bioLibCG.tccSplit(aNX.tTcc[aID])
        offset = aNX.tStart[aID]
        sLen = aNX.sLength[aID]

        # 19 is a fixed flank; presumably the amount the target region was
        # extended on each side -- TODO confirm
        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen

        stretch = cgPeaks.stretch(bioLibCG.makeTcc(chrom, strand, start, end), cName)
        expressionSum = stretch.getSumOfLevels()

        # Ascending coordinates, reversed on the minus strand
        orderedKeys = sorted(stretch.profile.keys())
        if strand == '-1':
            orderedKeys.reverse()

        if expressionSum == 0:
            continue

        for slot, (first, last) in enumerate(centerSlices):
            centerSum = sum((stretch.profile[key] for key in orderedKeys[first:last]), 0.0)
            aNX.centerExpression[aID][slot] = centerSum/expressionSum

    aNX.save()
def markCenterExpression(aFN, cName, rn=None, tn=None):
    """Fill centerExpression for every alignment loaded from a Nexus file.

    aFN    -- Nexus file of alignments
    cName  -- configuration name passed to cgPeaks.stretch
    rn, tn -- forwarded unchanged to Nexus.load
              (presumably run number / total runs -- confirm)

    Each entry becomes a list of three fractions: the summed levels of the
    ordered window positions [8:12], [7:13] and [6:14] over the window's
    total level.  Entries with a zero total remain [0.0, 0.0, 0.0].
    """
    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['centerExpression', 'tTcc', 'tStart', 'sLength'], [rn, tn])

    for aID in aNX.centerExpression:
        aNX.centerExpression[aID] = [0.0, 0.0, 0.0]

        chrom, strand, start, end = bioLibCG.tccSplit(aNX.tTcc[aID])
        offset = aNX.tStart[aID]
        sLen = aNX.sLength[aID]

        # Window of length sLen anchored by the alignment offset.  The 19 is
        # a fixed flank -- presumably the target extension; TODO confirm.
        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen

        scanRange = bioLibCG.makeTcc(chrom, strand, start, end)
        stretch = cgPeaks.stretch(scanRange, cName)
        expressionSum = stretch.getSumOfLevels()

        positions = sorted(stretch.profile.keys())
        if strand == '-1':
            positions.reverse()

        if expressionSum != 0:
            slot = 0
            for first, last in ((8, 12), (7, 13), (6, 14)):
                subtotal = 0.0
                for key in positions[first:last]:
                    subtotal += stretch.profile[key]
                aNX.centerExpression[aID][slot] = subtotal / expressionSum
                slot += 1

    aNX.save()
def parallelMakePeaks(tcc, cName, minExpression):
    """Test every peak of a region with extendPeakTest and save the survivors.

    tcc           -- region to scan ('chrom:strand:start:end')
    cName         -- configuration name (also supplies the assembly used in
                     the output file name)
    minExpression -- minimum peak level; converted with int() before use

    Writes one line per truthy extendPeakTest result to
    ``out/peakData.<tcc>.<minExpression>.<assembly>``.
    """
    conf = c.getConfig(cName)
    outName = 'out/peakData.%s.%s.%s' % (tcc, minExpression, conf.conf['assembly'])

    # The with-block closes the output even when a peak test raises
    with open(outName, 'w') as f:
        chrom, strand, start, end = cg.tccSplit(tcc)
        peaks = cgPeaks.stretch(tcc, cName)
        print('getting peaks')
        peaks.createPeaks(span = 1, minVal = int(minExpression))

        for x in peaks.peaks:
            print(x)

            # Each peak is tested as the single-position region x..x+1
            newTcc = cg.makeTcc(chrom, strand, x, x + 1)
            testedPeak = extendPeakTest(newTcc, 20, .2, .05, 0, 6, cName)
            if testedPeak:
                f.write('%s\n' % testedPeak)
def profileTargetsHisto(tccList, cName, name='boxplot'):
    """Box-plot the expression ratios around the highest peak of each region.

    For every tcc with a highest peak, the profile 200 positions to each
    side of that peak (as ratios) is pooled per offset; the pooled values
    are handed to plot.boxPlotHisto.  Regions without a peak are skipped.

    tccList -- iterable of 'chrom:strand:start:end' strings
    cName   -- configuration name passed to cgPeaks / svs
    name    -- forwarded to plot.boxPlotHisto
    """
    histDict = {}  # {coord: [values]}

    for tcc in tccList:
        chrom, strand, start, end = cg.tccSplit(tcc)

        # Highest peak of the region
        tccStretch = cgPeaks.stretch(tcc, cName)
        tccStretch.createPeaks(span=2)
        highestCoord = tccStretch.getHighestPeak()
        if highestCoord is None:
            continue

        # Profile around that peak
        zPoint = cg.makeTcc(chrom, strand, highestCoord, end)
        cProfile = svs.profileAroundPoint(zPoint, 200, cName, ratio=True)
        for coord in cProfile:
            # setdefault replaces a bare except that hid every kind of error
            histDict.setdefault(coord, []).append(cProfile[coord])

    plot.boxPlotHisto(histDict, name=name)
def plotProfile(tcc, cName, directory = None, min = 0):
    """Plot the expression profile of one region into ``<tcc>.png``.

    tcc       -- region to profile
    cName     -- configuration name passed to cgPeaks.stretch
    directory -- optional output directory (default: working directory)
    min       -- the region's highest level must reach this value

    Returns 0 without plotting when the highest level is below *min*.
    """
    fileName = tcc + '.png'
    if directory:
        fileName = directory + '/' + fileName

    regionStretch = cgPeaks.stretch(tcc, cName)
    if regionStretch.getHighestLevel() < min:
        return 0

    coords = sorted(regionStretch.profile.keys())
    levels = [regionStretch.profile[coord] for coord in coords]

    grDevices = importr('grDevices')
    grDevices.png(file=fileName, width=1680, height=1050)
    r.plot(coords, levels, xlab = "Coordinates", ylab = "Expression Level")
    r.lines(coords, levels, type = "b")
    grDevices.dev_off()
def parallelMakePeaks(tcc, cName, minExpression):
    """Run extendPeakTest on every peak of a region and save the survivors.

    tcc           -- region to scan ('chrom:strand:start:end')
    cName         -- configuration name (also supplies the assembly used in
                     the output file name)
    minExpression -- minimum peak level; converted with int() before use

    Writes one line per truthy extendPeakTest result to
    ``out/peakData.<tcc>.<minExpression>.<assembly>``.
    """
    import sys

    conf = c.getConfig(cName)
    outName = 'out/peakData.%s.%s.%s' % (tcc, minExpression, conf.conf['assembly'])

    # The with-block closes the output even when a peak test raises
    with open(outName, 'w') as f:
        chrom, strand, start, end = cg.tccSplit(tcc)
        peaks = cgPeaks.stretch(tcc, cName)
        print('getting peaks')
        peaks.createPeaks(span = 1, minVal = int(minExpression))

        for x in peaks.peaks:
            # Begin a new line with the peak position and leave it open, so
            # anything extendPeakTest prints lands on the same line
            sys.stdout.write('\n%s %s %s ' % (chrom, strand, x))

            newTcc = cg.makeTcc(chrom, strand, x, x + 1)
            testedPeak = extendPeakTest(newTcc, 20, .2, .05, 0, 6, cName)
            if testedPeak:
                f.write('%s\n' % testedPeak)
def profileTargetsHisto(tccList, cName, name = 'boxplot'):
    """Pool expression ratios around each region's highest peak and box-plot them.

    tccList -- iterable of 'chrom:strand:start:end' strings
    cName   -- configuration name passed to cgPeaks / svs
    name    -- forwarded to plot.boxPlotHisto

    Regions for which no highest peak is found contribute nothing.  The
    profile covers 200 positions to each side of the peak.
    """
    histDict = {}  # {coord: [values]}

    for tcc in tccList:
        chrom, strand, start, end = cg.tccSplit(tcc)

        # Locate the highest peak
        regionStretch = cgPeaks.stretch(tcc, cName)
        regionStretch.createPeaks(span = 2)
        highestCoord = regionStretch.getHighestPeak()
        if highestCoord is None:
            continue

        # Collect the profile around it
        zPoint = cg.makeTcc(chrom, strand, highestCoord, end)
        peakProfile = svs.profileAroundPoint(zPoint, 200, cName, ratio = True)
        for coord in peakProfile:
            # explicit initialisation instead of a catch-all except
            histDict.setdefault(coord, []).append(peakProfile[coord])

    plot.boxPlotHisto(histDict, name = name)
def parallelMakePeaks(tcc, cName, minExpression):
    """Scan a region for short, sharply bounded 'roof' peaks and save them.

    For every peak of the region the ratio profile around it is scanned for
    the longest run of consecutive offsets whose ratio exceeds 0.70 (the
    roof).  A peak is written to the output file when
      * the roof is 1 to 4 positions long,
      * the ratio one position outside each roof edge is below 0.20, and
      * the mean ratio of all remaining offsets is below 0.3.
    Progress is printed verbosely.

    tcc           -- region to scan ('chrom:strand:start:end')
    cName         -- configuration name (also supplies the assembly used in
                     the output file name)
    minExpression -- minimum peak level; converted with int() before use

    Output file: ``out/peakData.<tcc>.<minExpression>.<assembly>``, one tcc
    per kept peak.
    """
    pRange = 40   # offsets 1 - pRange .. pRange - 1 around a peak are scanned
    minVal = .70  # ratio an offset must exceed to belong to a roof
    extend = 1    # distance outside the roof at which the drop is sampled

    conf = c.getConfig(cName)
    outName = 'out/peakData.%s.%s.%s' % (tcc, minExpression, conf.conf['assembly'])

    # The with-block closes the output even when the scan raises
    with open(outName, 'w') as f:
        print('scanning range %s' % (tcc,))

        chrom, strand, start, end = cg.tccSplit(tcc)
        peaks = cgPeaks.stretch(tcc, cName)
        peaks.createPeaks(span = 1, minVal = int(minExpression))
        print('len peaks %s' % (len(peaks.peaks),))

        endCheck = 0  # only reported in the progress output
        for x in peaks.peaks:
            print('%s %s' % (x, endCheck))

            # Ratio profile around this peak
            rTcc = cg.makeTcc(chrom, strand, x, x + 1)
            cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName, ratio = True)

            # Find the longest run of offsets above minVal.  A run is only
            # recorded once an offset below the threshold terminates it.
            highest = 0
            stretch = 0
            startCurrent = None
            startFinal = None
            endFinal = None
            for i in range(1 - pRange, pRange):
                print('  %s %s' % (x + i, cProfile[i]))
                if cProfile[i] > minVal:
                    print(' extending stretch')
                    stretch += 1
                    if startCurrent is None:
                        startCurrent = i
                else:
                    if stretch > 0:
                        print('end of stretch')
                        if stretch > highest:
                            highest = stretch
                            endFinal = i - 1
                            startFinal = startCurrent
                        startCurrent = None
                    stretch = 0

            # Compare against None: offset 0 (the peak itself) is a valid
            # roof edge and must not be mistaken for "no roof found".
            if startFinal is None or endFinal is None:
                print('no start and end of peak')
                continue

            low = startFinal - extend
            high = endFinal + extend
            if not (low > (1 - pRange) and high < pRange):
                print('out of range')
                continue
            val = [float(cProfile[low]), float(cProfile[high])]

            print('%s %s %s %s' % (low, high, x, endFinal))
            endCheck = x + endFinal

            # Mean ratio of everything outside the roof and its drop points
            noiseOffsets = list(range(1 - pRange, low)) + list(range(high + 1, pRange))
            noiseExpression = 0
            for i in noiseOffsets:
                noiseExpression += cProfile[i]
            avgNoise = noiseExpression/float(len(noiseOffsets))

            # Keep peaks with a short roof, a sharp drop and low background
            print('%s %s %s %s' % (highest, val[0], val[1], avgNoise))
            if 0 < highest < 5 and val[0] < 0.20 and val[1] < .20 and avgNoise < .3:
                goodTcc = cg.makeTcc(chrom, strand, x + low, x + high)
                print('*KEEPER')
                f.write('%s\n' % goodTcc)

    print('DONE %s' % (tcc,))
def makePeakInput(cName, minExpression=2000):
    """Scan all acceptable chromosomes for roof-shaped peaks and save them.

    Every chromosome/strand is walked in chunks delimited by consecutive
    points from rangePoints(1, length, 1000).  For each peak of a chunk
    (span 3, level >= minExpression) the ratio profile around it is scanned
    for the longest run of consecutive offsets above 0.80 (the roof).  A
    peak is kept when the roof is 15 to 25 positions long and the ratio four
    positions outside each roof edge is below 0.2.

    cName         -- configuration name (supplies the assembly)
    minExpression -- minimum peak level, also part of the output file name

    Output file: ``peakData.<minExpression>``, one tcc per kept peak.
    """
    pRange = 30   # offsets 1 - pRange .. pRange - 1 around a peak are scanned
    minVal = .80  # ratio an offset must exceed to belong to a roof

    conf = c.getConfig(cName)
    assembly = conf.conf['assembly']
    chromLens = cg.returnChromLengthDict(assembly)

    # The with-block closes the output even when the scan raises
    with open('peakData.%s' % minExpression, 'w') as f:
        for chrom in chromLens:
            if chrom not in cg.acceptableChroms:
                continue

            for strand in ['1', '-1']:
                print('Getting Peaks for  %s %s' % (chrom, strand))

                prevPoint = 0
                endCheck = 0  # peaks below this coordinate were already covered
                for point in rangePoints(1, chromLens[chrom], 1000):
                    if point == 1:
                        # the first point only opens the first chunk
                        prevPoint = point
                        continue
                    start = prevPoint
                    end = point
                    prevPoint = point

                    tcc = cg.makeTcc(chrom, strand, start, end)
                    peaks = cgPeaks.stretch(tcc, cName)
                    peaks.createPeaks(span=3, minVal=minExpression)

                    for x in peaks.peaks:
                        if x < endCheck:
                            continue

                        # Ratio profile around this peak
                        rTcc = cg.makeTcc(chrom, strand, x, x + 1)
                        cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName, ratio=True)

                        # Longest run of offsets above minVal; a run is only
                        # recorded once a lower offset terminates it.
                        highest = 0
                        stretch = 0
                        startCurrent = None
                        startFinal = None
                        endFinal = None
                        for offset in range(1 - pRange, pRange):
                            if cProfile[offset] > minVal:
                                stretch += 1
                                if startCurrent is None:
                                    startCurrent = offset
                            else:
                                if stretch > 0:
                                    if stretch > highest:
                                        highest = stretch
                                        endFinal = offset - 1
                                        startFinal = startCurrent
                                    startCurrent = None
                                stretch = 0

                        # Compare against None: offset 0 (the peak itself) is
                        # a valid roof edge, not a missing one.
                        if startFinal is None or endFinal is None:
                            continue

                        # Drop values four positions outside the roof
                        low = startFinal - 4
                        high = endFinal + 4
                        if not (low > (1 - pRange) and high < pRange):
                            continue
                        val = [float(cProfile[low]), float(cProfile[high])]

                        endCheck = x + high

                        # Keep roofs of plausible length with a sharp drop
                        if 14 < highest < 26 and val[0] < 0.2 and val[1] < .2:
                            goodTcc = cg.makeTcc(chrom, strand, x + low, x + high)
                            f.write('%s\n' % goodTcc)
def findPeaks(pType, cName = None):
    """Annotate each predicted hairpin with its best roof-shaped peak.

    pType -- 'E' selects the sorted exon results file, anything else the
             sorted intron results file
    cName -- configuration name

    Steps:
      1. read the hairpin tcc of every cluster ID (CID) from the results file
      2. create the peaks of every hairpin and index them by chrom/strand
      3. for each peak inside a hairpin, measure the roof (longest run of
         offsets with ratio > 0.80 around the peak) and the ratio four
         positions outside each roof edge
      4. per hairpin keep the peak with a roof length of 15..25 and a drop
         strictly between 0.0 and 0.2 whose roof length is closest to 22
      5. rewrite the results file with the peak description (or 'None')
         placed in slot 13 of every line
    """
    pRange = 30   # offsets 1 - pRange .. pRange - 1 around a peak are scanned
    minVal = .80  # ratio an offset must exceed to belong to a roof

    conf = c.getConfig(cName)
    if pType == 'E':
        predName = conf.conf['resultsExonsSorted']
    else:
        predName = conf.conf['resultsIntronsSorted']
    print(predName)

    # CID -> [hairpin tcc, best combo or 'None']
    cHairs = getHairpins.getHairpins(predName)
    peakDict = {}
    for CID in cHairs:
        peakDict[CID] = [cHairs[CID], 'None']

    timer = cg.cgTimer()
    timer.start()

    # Index every peak coordinate: peaks[chrom][strand][coord] = 0
    print('Creating peak data')
    peaks = {}
    for CID in cHairs:
        tcc = cHairs[CID]
        chrom, strand, start, end = cg.tccSplit(tcc)
        strandPeaks = peaks.setdefault(chrom, {}).setdefault(strand, {})

        stretch = cgPeaks.stretch(tcc, cName)
        stretch.createPeaks()
        for peakCoord in stretch.peaks:
            strandPeaks[peakCoord] = 0
    print(timer.split())

    print('finding best combos')
    for CID in peakDict:
        tcc = peakDict[CID][0]
        tccParts = cg.ss(tcc, ':')
        chrom = tccParts[0]
        strand = tccParts[1]
        start = int(tccParts[2])
        end = int(tccParts[3])

        # All indexed peaks that fall inside this hairpin
        strandPeaks = peaks[chrom][strand]
        tccPeaks = [coord for coord in range(start, end + 1) if coord in strandPeaks]

        peakCombos = []
        for x in tccPeaks:
            rTcc = cg.makeTcc(chrom, strand, x, x + 1)

            # Absolute level at the peak itself
            cProfile = stepVectorScan.profileAroundPoint(rTcc, 1, cName, ratio = False)
            peakLevel = cProfile[0]

            # Ratio profile used for the roof scan
            cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName, ratio = True)

            # Longest run of offsets above minVal; a run is only recorded
            # once a lower offset terminates it.
            highest = 0
            stretch = 0
            startCurrent = None
            startFinal = None
            endFinal = None
            for offset in range(1 - pRange, pRange):
                if cProfile[offset] > minVal:
                    stretch += 1
                    if startCurrent is None:
                        startCurrent = offset
                else:
                    if stretch > 0:
                        if stretch > highest:
                            highest = stretch
                            endFinal = offset - 1
                            startFinal = startCurrent
                        startCurrent = None
                    stretch = 0

            # Drop values four positions outside the roof; they stay at 1.0
            # (which fails the filter below) when they cannot be measured.
            # Compare against None: offset 0 is a valid roof edge.
            val = [1.0, 1.0]
            if startFinal is not None and endFinal is not None:
                low = startFinal - 4
                high = endFinal + 4
                if low > (1 - pRange) and high < pRange:
                    val[0] = float(cProfile[low])
                    val[1] = float(cProfile[high])

            # [tcc, peak, y, dist, ratio, level, roof length, drop values];
            # the three 'S' entries are fixed placeholders
            peakCombos.append([tcc, x, 'S', 'S', 'S', peakLevel, highest, val])

        # Choose the acceptable combo whose roof length is closest to 22
        topCombo = None
        for combo in peakCombos:
            roofLength = combo[6]
            dropValue = max(combo[7][0], combo[7][1])
            if 14 < roofLength < 26 and 0.0 < dropValue < 0.2:
                if topCombo is None or math.fabs(22 - roofLength) < math.fabs(22 - topCombo[6]):
                    topCombo = combo

        if topCombo:
            peakDict[CID][1] = topCombo
            print(topCombo)

    print(timer.split())

    # Update the prediction file (slot 13)
    newLines = []
    with open(predName, 'r') as predFile:
        for line in predFile:
            CID = cg.ss(line)[7]
            best = peakDict[CID][1]
            if best == 'None':
                peakInfo = 'None'
            else:
                peakInfo = '%s:%s:%s:%s:%s:%s' % (str(best[1])[-3:], 'S',
                                                  str(best[4]).split('.')[0],
                                                  best[5], best[6], best[7])
            newLines.append(cg.appendToLine(line, peakInfo, 13))

    with open(predName, 'w') as predFile:
        predFile.writelines(newLines)
def markCenterExpressionOLD(smallFN, targetFN, alignmentFN, cName, outFN):
    """Write centre-of-window expression fractions for small RNA/target pairs.

    smallFN     -- tab-separated file: column 0 small ID, column 1 its
                   sequence/location string (its length is used as the
                   window length), column 4 comma-separated target IDs
    targetFN    -- tab-separated file: target ID, target tcc
    alignmentFN -- space-separated file: column 0 small ID, column 1 target
                   ID, column 4 alignment offset
    cName       -- configuration name passed to cgPeaks.stretch
    outFN       -- output file; one line per pair:
                   sID, tID and the three fractions, tab-separated

    The fractions are the levels summed over ordered window positions
    [8:12], [7:13] and [6:14], each divided by the total level of the
    window; all three are 0.0 when the window has no expression.
    """
    # tID -> target tcc
    targetDict = {}
    with open(targetFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            targetDict[int(ls[0])] = ls[1]

    # sID -> {tID: offset}; assumes one alignment per source/target pair
    alignDict = {}
    with open(alignmentFN, 'r') as f:
        for line in f:
            ls = line.strip().split(' ')
            sID = int(ls[0])
            tID = int(ls[1])
            alignDict.setdefault(sID, {})[tID] = int(ls[4])

    # Slices of the ordered window, narrowest first
    centerSlices = ((8, 12), (7, 13), (6, 14))

    # Both handles are closed (and the output flushed) on every exit path
    with open(smallFN, 'r') as f:
        with open(outFN, 'w') as fOut:
            for line in f:
                ls = line.strip().split('\t')
                sID = int(ls[0])
                sLen = len(ls[1])  # this is the sequence for simulated reads

                for tID in ls[4].split(','):
                    tID = int(tID)
                    chrom, strand, start, end = bioLibCG.tccSplit(targetDict[tID])
                    offset = alignDict[sID][tID]

                    # 19 is a fixed flank; presumably the amount the target
                    # region was extended on each side -- TODO confirm
                    if strand == '1':
                        start = start - 19 + offset
                        end = start + sLen
                    else:
                        end = end + 19 - offset
                        start = end - sLen

                    scanRange = bioLibCG.makeTcc(chrom, strand, start, end)
                    stretch = cgPeaks.stretch(scanRange, cName)
                    expressionSum = stretch.getSumOfLevels()

                    # Ascending coordinates, reversed on the minus strand
                    sortedKeys = sorted(stretch.profile.keys())
                    if strand == '-1':
                        sortedKeys.reverse()

                    fractions = [0.0, 0.0, 0.0]
                    if expressionSum != 0:
                        for slot, (first, last) in enumerate(centerSlices):
                            centerSum = sum((stretch.profile[key]
                                             for key in sortedKeys[first:last]), 0.0)
                            fractions[slot] = centerSum / expressionSum

                    fOut.write('%s\t%s\t%s\t%s\t%s\n'
                               % (sID, tID, fractions[0], fractions[1], fractions[2]))
import math
import bioLibCG as cg

# Report known miRNAs whose highest peak has a nearby highest peak on the
# opposite strand ("mirrored" expression).
knowns = compare.tccFileToList("mouseKnownMirs.tcc", 0)
eLevels = []  # level at the highest peak of every known that has one

for known in knowns:
    chrom, strand, start, end = cg.tccSplit(known, True)  # text...

    # Same coordinates on the opposite strand
    strand = "-1" if strand == "1" else "1"
    oppTcc = cg.makeTcc(chrom, strand, start, end)

    knownStretch = cgPeaks.stretch(known)
    knownStretch.createPeaks(1, 20)
    kPos = knownStretch.getHighestPeak()
    if kPos:
        eLevels.append(knownStretch.profile[kPos])

    oppStretch = cgPeaks.stretch(oppTcc)
    oppStretch.createPeaks(1, 20)
    oPos = oppStretch.getHighestPeak()

    # Peaks less than 12 positions apart count as mirrored
    if oPos and kPos and math.fabs(int(kPos) - int(oPos)) < 12:
        print("%s %s %s %s %s" % (known, oPos, kPos,
                                  oppStretch.profile[oPos],
                                  knownStretch.profile[kPos]))
def makeFigure(fN, targetFN, alignmentFN, cName):
    """Plot a histogram of expression by position within aligned windows.

    fN          -- tab-separated file: column 0 small ID, column 1 small tcc,
                   column 4 comma-separated target IDs
    targetFN    -- tab-separated file: target ID, target tcc
    alignmentFN -- space-separated file: column 0 small ID, column 1 target
                   ID, column 4 alignment offset
    cName       -- configuration name passed to cgPeaks.stretch

    For every small RNA/target pair the window of the target covered by the
    small RNA is profiled.  Each window position (0, 1, 2, ... along the
    ordered window) is added to the histogram input once per unit of
    expression level found there.  cgPlot.plotHistogram receives the pooled
    positions of all pairs.
    """
    # tID -> target tcc
    targetDict = {}
    with open(targetFN, "r") as f:
        for line in f:
            ls = line.strip().split("\t")
            targetDict[int(ls[0])] = ls[1]

    # sID -> {tID: offset}; assumes one alignment per source/target pair
    alignDict = {}
    with open(alignmentFN, "r") as f:
        for line in f:
            ls = line.strip().split(" ")
            sID = int(ls[0])
            tID = int(ls[1])
            alignDict.setdefault(sID, {})[tID] = int(ls[4])

    histoVals = []
    with open(fN, "r") as f:
        for line in f:
            ls = line.strip().split("\t")
            sID = int(ls[0])
            sChrom, sStrand, sStart, sEnd = bioLibCG.tccSplit(ls[1])
            sLen = sEnd - sStart

            for tID in ls[4].split(","):
                tID = int(tID)
                chrom, strand, start, end = bioLibCG.tccSplit(targetDict[tID])
                offset = alignDict[sID][tID]

                # NOTE(review): the orientation is chosen from the small
                # RNA's strand (sStrand) while the window is built on the
                # target's strand -- confirm this is intended.
                # 19 is a fixed flank; presumably the amount the target
                # region was extended on each side -- TODO confirm
                if sStrand == "1":
                    start = start - 19 + offset
                    end = start + sLen
                else:
                    end = end + 19 - offset
                    start = end - sLen

                scanRange = bioLibCG.makeTcc(chrom, strand, start, end)
                stretch = cgPeaks.stretch(scanRange, cName)

                # Sort the coordinates: plain dict order is arbitrary, which
                # made the window positions below meaningless.
                sortedKeys = sorted(stretch.profile.keys())
                if sStrand == "-1":
                    sortedKeys.reverse()

                for position, key in enumerate(sortedKeys):
                    level = stretch.profile[key]
                    for _ in range(0, level):
                        histoVals.append(position)

    cgPlot.plotHistogram(histoVals)
def updateReadDensity(tType, cName):
    """Add per-prediction and per-cluster read density to a results file.

    tType -- 'E' for the exon results file, 'I' for the intron results file
    cName -- configuration name

    For every line of the results file (tab-separated; column 1 is the
    mature tcc, column 7 the cluster ID):
      * slot 11 receives the highest profile level found inside the tcc
        (start inclusive, end exclusive; 0 when nothing is expressed)
      * slot 12 receives the highest slot-11 value of the line's cluster
    The lines are then ordered by numeric cluster ID, keeping the original
    order inside a cluster, and the file is rewritten in place.

    Raises ValueError when tType is neither 'E' nor 'I'.
    """
    conf = cgConfig.getConfig(cName)

    # Differentiate between exon and intron results
    if tType == 'E':
        pFileName = conf.conf['resultsExons']
    elif tType == 'I':
        pFileName = conf.conf['resultsIntrons']
    else:
        # Fail here with a clear error; continuing would only crash later
        # on the unset file name.
        print('READ UPDATE FAIL')
        raise ValueError("tType must be 'E' or 'I', got %r" % (tType,))

    print(' Updating Read Density: %s' % (tType,))
    print(' calculating hits for mature seqs')

    # Slot 11: highest hit inside each mature sequence
    hitLines = []
    with open(pFileName, 'r') as mirFile:
        for line in mirFile:
            mTcc = line.strip().split('\t')[1]
            tccParts = mTcc.split(':')
            tccStretch = cgPeaks.stretch(mTcc, cName)

            highestHit = 0
            for coord in range(int(tccParts[2]), int(tccParts[3])):
                if coord in tccStretch.profile and tccStretch.profile[coord] > highestHit:
                    highestHit = tccStretch.profile[coord]

            hitLines.append(cg.appendToLine(line, str(highestHit), 11))

    # Highest hit per cluster
    clusterCount = {}
    for line in hitLines:
        fields = line.strip().split('\t')
        predictionCount = int(fields[11])
        CID = fields[7]
        if CID not in clusterCount or clusterCount[CID] < predictionCount:
            clusterCount[CID] = predictionCount

    # Slot 12: the cluster maximum
    newLines = []
    for line in hitLines:
        CID = line.strip().split('\t')[7]
        newLines.append(cg.appendToLine(line, str(clusterCount[CID]), 12))

    # Order by numeric cluster ID; list.sort is stable, so lines of one
    # cluster keep their relative order.
    newLines.sort(key=lambda line: int(line.strip().split('\t')[7]))

    print('Writing New File')
    with open(pFileName, 'w') as outFile:
        outFile.writelines(newLines)
def makePeakInput(cName, minExpression = 2000):
    """Write every roof-shaped peak of the acceptable chromosomes to a file.

    cName         -- configuration name (supplies the assembly)
    minExpression -- minimum peak level, also part of the output file name

    Each chromosome/strand is processed in chunks between consecutive points
    of rangePoints(1, length, 1000).  Around every peak of a chunk (span 3,
    level >= minExpression) the longest run of offsets with a ratio above
    0.80 is taken as the roof.  The peak is written to
    ``peakData.<minExpression>`` when the roof is 15 to 25 positions long
    and the ratio four positions beyond each roof edge is below 0.2.
    """
    pRange = 30   # offsets 1 - pRange .. pRange - 1 around a peak are scanned
    minVal = .80  # ratio an offset must exceed to belong to a roof
    dropDist = 4  # distance outside the roof at which the drop is sampled

    conf = c.getConfig(cName)
    chromLens = cg.returnChromLengthDict(conf.conf['assembly'])

    # The with-block closes the output even when the scan raises
    with open('peakData.%s' % minExpression, 'w') as f:
        for chrom in chromLens:
            if chrom not in cg.acceptableChroms:
                continue

            for strand in ['1', '-1']:
                print('Getting Peaks for  %s %s' % (chrom, strand))

                chunkStart = None
                endCheck = 0  # peaks below this coordinate were already covered
                for chunkEnd in rangePoints(1, chromLens[chrom], 1000):
                    if chunkEnd == 1:
                        # the first point only opens the first chunk
                        chunkStart = chunkEnd
                        continue
                    if chunkStart is None:
                        # no opening point was seen; chunks then start at 0
                        chunkStart = 0

                    tcc = cg.makeTcc(chrom, strand, chunkStart, chunkEnd)
                    chunkStart = chunkEnd

                    peaks = cgPeaks.stretch(tcc, cName)
                    peaks.createPeaks(span = 3, minVal = minExpression)

                    for x in peaks.peaks:
                        if x < endCheck:
                            continue

                        # Ratio profile around this peak
                        rTcc = cg.makeTcc(chrom, strand, x, x + 1)
                        cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName, ratio = True)

                        # Longest run of offsets above minVal; a run is only
                        # recorded once a lower offset terminates it.
                        highest = 0
                        runLength = 0
                        runStart = None
                        startFinal = None
                        endFinal = None
                        for offset in range(1 - pRange, pRange):
                            if cProfile[offset] > minVal:
                                runLength += 1
                                if runStart is None:
                                    runStart = offset
                                continue
                            if runLength > highest:
                                highest = runLength
                                endFinal = offset - 1
                                startFinal = runStart
                            runStart = None
                            runLength = 0

                        # Compare against None: offset 0 (the peak itself) is
                        # a valid roof edge, not a missing one.
                        if startFinal is None or endFinal is None:
                            continue

                        low = startFinal - dropDist
                        high = endFinal + dropDist
                        if not (low > (1 - pRange) and high < pRange):
                            continue
                        val = [float(cProfile[low]), float(cProfile[high])]

                        endCheck = x + high

                        # Keep roofs of plausible length with a sharp drop
                        if 14 < highest < 26 and val[0] < 0.2 and val[1] < .2:
                            f.write('%s\n' % cg.makeTcc(chrom, strand, x + low, x + high))
def markCenterExpressionOLD(smallFN, targetFN, alignmentFN, cName, outFN):
    """Compute centre-of-window expression fractions from flat input files.

    smallFN     -- tab-separated file: column 0 small ID, column 1 its
                   sequence/location string (its length is used as the
                   window length), column 4 comma-separated target IDs
    targetFN    -- tab-separated file: target ID, target tcc
    alignmentFN -- space-separated file: column 0 small ID, column 1 target
                   ID, column 4 alignment offset
    cName       -- configuration name passed to cgPeaks.stretch
    outFN       -- output file; per pair one tab-separated line with
                   sID, tID, lowE, midE, highE

    lowE, midE and highE are the levels summed over ordered window positions
    [8:12], [7:13] and [6:14] divided by the window's total level, or 0.0
    when that total is zero.
    """
    # Make the target dictionary: tID -> tLoc
    targetDict = {}
    with open(targetFN, 'r') as targetFile:
        for line in targetFile:
            ls = line.strip().split('\t')
            targetDict[int(ls[0])] = ls[1]

    # Make the alignment dictionary: sID -> {tID: offset}
    # (assumes one alignment per source/target pair)
    alignDict = {}
    with open(alignmentFN, 'r') as alignFile:
        for line in alignFile:
            ls = line.strip().split(' ')
            sID = int(ls[0])
            tID = int(ls[1])
            offset = int(ls[4])
            if sID not in alignDict:
                alignDict[sID] = {}
            alignDict[sID][tID] = offset

    def centerFraction(stretch, orderedKeys, first, last, total):
        # Share of the total level that lies in orderedKeys[first:last]
        subtotal = 0.0
        for key in orderedKeys[first:last]:
            subtotal += stretch.profile[key]
        return subtotal/total

    # Both handles are closed (and the output flushed) on every exit path
    with open(smallFN, 'r') as smallFile:
        with open(outFN, 'w') as fOut:
            for line in smallFile:
                ls = line.strip().split('\t')
                sID = int(ls[0])
                sLoc = ls[1]
                sLen = len(sLoc)  # this is the sequence for simulated reads

                for tID in ls[4].split(','):
                    tID = int(tID)
                    tLoc = targetDict[tID]
                    chrom, strand, start, end = bioLibCG.tccSplit(tLoc)
                    offset = alignDict[sID][tID]

                    # Window of length sLen anchored by the alignment offset.
                    # The 19 is a fixed flank -- presumably the target
                    # extension; TODO confirm.
                    if strand == '1':
                        start = start - 19 + offset
                        end = start + sLen
                    else:
                        end = end + 19 - offset
                        start = end - sLen

                    scanRange = bioLibCG.makeTcc(chrom, strand, start, end)
                    stretch = cgPeaks.stretch(scanRange, cName)
                    expressionSum = stretch.getSumOfLevels()

                    orderedKeys = sorted(stretch.profile.keys())
                    if strand == '-1':
                        orderedKeys.reverse()

                    lowE = 0.0
                    midE = 0.0
                    highE = 0.0
                    if expressionSum != 0:
                        lowE = centerFraction(stretch, orderedKeys, 8, 12, expressionSum)
                        midE = centerFraction(stretch, orderedKeys, 7, 13, expressionSum)
                        highE = centerFraction(stretch, orderedKeys, 6, 14, expressionSum)

                    fOut.write('%s\t%s\t%s\t%s\t%s\n' % (sID, tID, lowE, midE, highE))
import math import bioLibCG as cg knowns = compare.tccFileToList('mouseKnownMirs.tcc', 0) eLevels = [] for known in knowns: chrom, strand, start, end = cg.tccSplit(known, True) #text... if strand == '1': strand = '-1' else: strand = '1' oppTcc = cg.makeTcc(chrom, strand, start, end) knownStretch = cgPeaks.stretch(known) knownStretch.createPeaks(1, 20) kPos = knownStretch.getHighestPeak() if kPos: eLevels.append(knownStretch.profile[kPos]) oppStretch = cgPeaks.stretch(oppTcc) oppStretch.createPeaks(1, 20) oPos = oppStretch.getHighestPeak() if oPos and kPos: #determine if they are close enough to be considered mirrored... if math.fabs(int(kPos) - int(oPos)) < 12: print known, oPos, kPos, oppStretch.profile[ oPos], knownStretch.profile[kPos] print eLevels
def parallelMakePeaks(tcc, cName, minExpression): conf = c.getConfig(cName) f = open( 'out/peakData.%s.%s.%s' % (tcc, minExpression, conf.conf['assembly']), 'w') print 'scanning range', tcc chrom, strand, start, end = cg.tccSplit(tcc) peaks = cgPeaks.stretch(tcc, cName) #print 'getting peaks' peaks.createPeaks(span=1, minVal=int(minExpression)) print 'len peaks', len(peaks.peaks) endCheck = 0 for x in peaks.peaks: print x, endCheck ''' if x < endCheck: print 'endChecked' continue ''' #scan a 30 bp range around this point and find the best roof... pRange = 40 rTcc = cg.makeTcc(chrom, strand, x, x + 1) #now make profile for roof... cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName, ratio=True) #now get highest stretch length and the rNext coord. minVal = .70 highest = 0 stretch = 0 startCurrent = None startFinal = None endFinal = None for i in range(1 - pRange, pRange): print ' ', x + i, cProfile[i] if cProfile[i] > minVal: print ' extending stretch' stretch += 1 if startCurrent == None: startCurrent = i else: if stretch > 0: print 'end of stretch' if stretch > highest: #stretch ended and was higher than previous highest = stretch endFinal = i - 1 startFinal = startCurrent startCurrent = None else: startCurrent = None stretch = 0 #get +/- extend value... val = [1.0, 1.0] extend = 1 if (startFinal) and (endFinal): low = startFinal - extend high = endFinal + extend if low > (1 - pRange) and high < pRange: val[0] = float(cProfile[startFinal - extend]) val[1] = float(cProfile[endFinal + extend]) else: print 'out of range' continue else: print 'no start and end of peak' continue print low, high, x, endFinal endCheck = x + endFinal #avg expression around peak check... 
#get total expression before peak noiseExpression = 0 lowRange = range(1 - pRange, low) highRange = range(high + 1, pRange) totalLength = len(lowRange) + len(highRange) for i in lowRange: noiseExpression += cProfile[i] for i in highRange: noiseExpression += cProfile[i] avgNoise = noiseExpression / float(totalLength) #filter out peaks that look a certain way. print highest, val[0], val[1], avgNoise if 0 < highest < 5: #rooflength 14/26 if val[0] < 0.20 and val[1] < .20: #drop values if avgNoise < .3: goodTcc = cg.makeTcc(chrom, strand, x + low, x + high) print '*KEEPER' f.write('%s\n' % goodTcc) f.close() print 'DONE', tcc
def makeFigure(fN, targetFN, alignmentFN, cName):
    """Plot a histogram of expression by position along the scanned ranges.

    fN          -- tab-separated file; column 0 is the integer small id,
                   column 1 its location (tcc), column 4 a comma-separated
                   list of target ids.
    targetFN    -- tab-separated file; column 0 is an integer target id,
                   column 1 its location (tcc).
    alignmentFN -- space-separated file; columns 0, 1 and 4 are read as
                   integer small id, target id and offset.  Only one offset
                   is kept per (small id, target id) pair.
    cName       -- configuration name handed to ``cgPeaks.stretch``.

    For every small/target pair a scan range is derived from the target
    location, the alignment offset and the small's length.  Each position of
    that range contributes its 0-based index (counted from the far end when
    the small's strand is '-1') to the histogram once per unit of level.
    The collected values are handed to ``cgPlot.plotHistogram``.
    """
    #make targetDict
    targetDict = {} # tID: tLoc
    with open(targetFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            targetDict[int(ls[0])] = ls[1]

    #make alignmentDict
    alignDict = {} # sid: {target: offset}
    with open(alignmentFN, 'r') as f:
        for line in f:
            ls = line.strip().split(' ')
            sID = int(ls[0])
            tID = int(ls[1])
            offset = int(ls[4])
            if sID not in alignDict:
                alignDict[sID] = {}
            alignDict[sID][tID] = offset #assumes one source to target...

    histoVals = []
    #the handle on fN used to be left open; the with-block closes it
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            sID = int(ls[0])
            sLoc = ls[1]
            sChrom, sStrand, sStart, sEnd = bioLibCG.tccSplit(sLoc)
            sLen = sEnd - sStart

            for tID in ls[4].split(','):
                tID = int(tID)
                tLoc = targetDict[tID]
                chrom, strand, start, end = bioLibCG.tccSplit(tLoc)
                offset = alignDict[sID][tID]

                if sStrand == '1':
                    start = start - 19 + offset
                    end = start + sLen
                else:
                    end = end + 19 - offset
                    start = end - sLen

                scanRange = bioLibCG.makeTcc(chrom, strand, start, end)
                stretch = cgPeaks.stretch(scanRange, cName)
                #NOTE(review): the result is unused; the call is kept in case
                #it has side effects on the stretch -- confirm and drop.
                stretch.getHighestLevel()

                #The keys must be sorted before positions are numbered;
                #plain dict order is arbitrary and scrambled the histogram.
                sortedKeys = sorted(stretch.profile)
                if sStrand == '-1':
                    sortedKeys.reverse()

                for i, key in enumerate(sortedKeys):
                    level = stretch.profile[key]
                    #one histogram entry per unit of level at this position
                    histoVals.extend([i] * level)

    cgPlot.plotHistogram(histoVals)
def findPeaks(pType, cName=None): #init mConf = c.cgConfig('Main.conf') conf = c.getConfig(cName) if pType == 'E': predName = conf.conf['resultsExonsSorted'] else: predName = conf.conf['resultsIntronsSorted'] print predName #make CID:hairpin:peak dictionary cHairs = getHairpins.getHairpins(predName) peakDict = {} for CID in cHairs: peakDict[CID] = [cHairs[CID], 'None'] timer = cg.cgTimer() timer.start() #put peaks in memory print 'Creating peak data' peaks = {} # chr:peak:value for CID in cHairs: chrom, strand, start, end = cg.tccSplit(cHairs[CID]) tcc = cHairs[CID] #init dictionary if chrom not in peaks: peaks[chrom] = {} if strand not in peaks[chrom]: peaks[chrom][strand] = {} #create peaks for tcc and add to peak dictionary stretch = cgPeaks.stretch(tcc, cName) stretch.createPeaks() for peakCoord in stretch.peaks: peaks[chrom][strand][peakCoord] = 0 print timer.split() print 'finding best combos' bestCombos = [] aPass = 0 bPass = 0 cPass = 0 numT = 0 for CID in peakDict: cgFlag = False if CID == '538': cgFlag = True tcc = peakDict[CID][0] #print tcc tccPeaks = [] chrom = cg.ss(tcc, ':')[0] strand = cg.ss(tcc, ':')[1] start = int(cg.ss(tcc, ':')[2]) end = int(cg.ss(tcc, ':')[3]) #get all peaks for i in range(start, end + 1): if i in peaks[chrom][strand]: #print ' peak added', i tccPeaks.append(i) #Calculate parameters... pairStrings = [] #used to check if pair already added peakCombos = [] for x in tccPeaks: #scan a 30 bp range around this point and find the best roof... pRange = 30 rTcc = cg.makeTcc(chrom, strand, x, x + 1) #quickly get max value...kinda a long way to do it but whatever cProfile = stepVectorScan.profileAroundPoint(rTcc, 1, cName, ratio=False) xval = cProfile[0] max = xval highestValueCoord = x #now make profile for roof... cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName, ratio=True) #now get highest stretch length and the rNext coord. 
minVal = .80 highest = 0 stretch = 0 startCurrent = None startFinal = None endFinal = None for i in range(1 - pRange, pRange): if cProfile[i] > minVal: stretch += 1 if startCurrent == None: startCurrent = i else: if stretch > 0: if stretch > highest: #stretch ended and was higher than previous highest = stretch endFinal = i - 1 startFinal = startCurrent startCurrent = None else: startCurrent = None stretch = 0 #get +/- 4 value... val = [1.0, 1.0] if (startFinal) and (endFinal): low = startFinal - 4 high = endFinal + 4 if low > (1 - pRange): if high < pRange: val[0] = float(cProfile[startFinal - 4]) val[1] = float(cProfile[endFinal + 4]) #fill in other details... y = 'S' dist = 'S' ratio = 'S' peakCombos.append([tcc, x, y, dist, ratio, max, highest, val]) #print ' ', peakCombos[-1] #find best combo... topCombo = None for combo in peakCombos: roofLength = combo[6] dropValue = combo[7][0] if combo[7][1] > dropValue: dropValue = combo[7][1] #print roofLength, dropValue if 14 < roofLength < 26: if 0.0 < dropValue < 0.2: #pick one with rooflength nearest 20: if topCombo: if (math.fabs(22 - roofLength)) < ( math.fabs(22 - topCombo[6])): topCombo = combo else: topCombo = combo if topCombo: peakDict[CID][1] = topCombo bestCombos.append(topCombo) print bestCombos[-1] else: #print 'None' pass print timer.split() #now update predFile (SLOT 13) predFile = open(predName, 'r') newLines = [] for line in predFile: CID = cg.ss(line)[7] if peakDict[CID][1] == 'None': peakInfo = 'None' else: peakInfo = '%s:%s:%s:%s:%s:%s' % ( str(peakDict[CID][1][1])[-3:], 'S', str( peakDict[CID][1][4]).split('.')[0], peakDict[CID][1][5], peakDict[CID][1][6], peakDict[CID][1][7]) newLines.append(cg.appendToLine(line, peakInfo, 13)) predFile.close() predFile = open(predName, 'w') predFile.writelines(newLines) predFile.close()