示例#1
0
def plotASProfile(tcc, cName, directory = None, min = 0, extra = "0"):
    """Plot the sense and antisense expression profiles of ``tcc`` to a PNG.

    The image is named ``<extra>.<tcc>.png`` and is written into
    ``directory`` when one is given.  The antisense region has the same
    chromosome and coordinates with the strand flipped ('1' becomes '-1',
    anything else becomes '1').

    Args:
        tcc: colon-separated ``chrom:strand:start:end`` string.
        cName: passed through to ``cgPeaks.stretch``.
        directory: optional output directory.
        min: lowest acceptable "highest level"; applied to both strands.
        extra: prefix of the output file name.

    Returns:
        0 if either strand's highest level is below ``min`` (nothing is
        plotted); otherwise None.
    """
    baseName = extra + '.' + tcc + '.png'
    fN = directory + '/' + baseName if directory else baseName

    # Sense profile, ordered by coordinate.
    senseStretch = cgPeaks.stretch(tcc, cName)
    if senseStretch.getHighestLevel() < min:
        return 0
    senseX = sorted(senseStretch.profile.keys())
    senseY = [senseStretch.profile[coord] for coord in senseX]

    # Antisense profile: same region, opposite strand.
    chrom, strand, start, end = tcc.strip().split(':')
    flipped = '-1' if strand == '1' else '1'
    antiStretch = cgPeaks.stretch(cg.makeTcc(chrom, flipped, start, end), cName)
    if antiStretch.getHighestLevel() < min:
        return 0  # the antisense strand is held to the same minimum
    antiX = sorted(antiStretch.profile.keys())
    antiY = [antiStretch.profile[coord] for coord in antiX]

    # One panel per strand, stacked vertically (r/importr: presumably rpy2).
    panels = (
        ('screen(1)', senseX, senseY, "(Syn) Expression Level"),
        ('screen(2)', antiX, antiY, "(Anti) Expression Level"),
    )
    gDevice = importr('grDevices')
    gDevice.png(file=fN, width=1680, height=1050)
    r('split.screen(c(2,1))')
    for screenCmd, xs, ys, yLabel in panels:
        r(screenCmd)
        r.plot(xs, ys, xlab = "Coordinates", ylab = yLabel)
        r.lines(xs, ys, type = "b")
    gDevice.dev_off()
示例#2
0
def profileTargetsHistoAS(tccList, cName, name='boxplot'):
    """Collect sense/antisense profiles around each region's top peak and plot.

    For every tcc in ``tccList`` the highest peak is located on the given
    strand and on the antisense strand.  Regions lacking a peak on either
    strand are skipped.  A ratio profile of +/- 50 positions is then taken
    around the sense peak on both strands, and the values are accumulated per
    relative coordinate before being handed to ``plot.boxPlotHistoAS``.

    Args:
        tccList: iterable of tcc strings accepted by ``cg.tccSplit``.
        cName: passed through to ``cgPeaks.stretch`` and
            ``svs.profileAroundPoint``.
        name: forwarded to ``plot.boxPlotHistoAS``.
    """
    span = 50  # was named ``range``, which shadowed the builtin
    histDict = {}  # {coord: [values]} for the sense strand
    histDictAS = {}  # {coord: [values]} for the antisense strand
    for tcc in tccList:

        chrom, strand, _start, end = cg.tccSplit(tcc)

        # Highest peak on the sense strand.
        tccStretch = cgPeaks.stretch(tcc, cName)
        tccStretch.createPeaks(span=2)
        highestCoord = tccStretch.getHighestPeak()
        if highestCoord is None:
            continue

        # Highest peak on the antisense strand.
        tccStretchAS = cgPeaks.stretch(cg.convertToAS(tcc), cName)
        tccStretchAS.createPeaks(span=2)
        highestCoordAS = tccStretchAS.getHighestPeak()
        if highestCoordAS is None:
            continue

        # Profile around the sense peak (sense strand).
        zPoint = cg.makeTcc(chrom, strand, highestCoord, end)
        cProfile = svs.profileAroundPoint(zPoint, span, cName, ratio=True)
        for coord in cProfile:
            # setdefault replaces the former bare ``except:`` initialiser,
            # which would also have swallowed unrelated errors.
            histDict.setdefault(coord, []).append(cProfile[coord])

        # Same window on the antisense strand; the ratio is taken against
        # the antisense peak coordinate.
        zPointAS = cg.convertToAS(zPoint)
        cProfileAS = svs.profileAroundPoint(zPointAS,
                                            span,
                                            cName,
                                            ratio=True,
                                            ratioCoord=highestCoordAS)
        for coord in cProfileAS:
            histDictAS.setdefault(coord, []).append(cProfileAS[coord])

    plot.boxPlotHistoAS(histDict, histDictAS, name=name)
def profileTargetsHistoAS(tccList, cName, name = 'boxplot'):
    """Collect sense/antisense profiles around each region's top peak and plot.

    For every tcc in ``tccList`` the highest peak is located on the given
    strand and on the antisense strand; a region missing a peak on either
    strand is skipped.  A ratio profile of +/- 50 positions around the sense
    peak is gathered for both strands, grouped by relative coordinate, and
    passed to ``plot.boxPlotHistoAS``.

    Args:
        tccList: iterable of tcc strings accepted by ``cg.tccSplit``.
        cName: passed through to ``cgPeaks.stretch`` and
            ``svs.profileAroundPoint``.
        name: forwarded to ``plot.boxPlotHistoAS``.
    """
    span = 50  # was named ``range``, which shadowed the builtin
    histDict = {}  # {coord: [values]} for the sense strand
    histDictAS = {}  # {coord: [values]} for the antisense strand
    for tcc in tccList:

        chrom, strand, _start, end = cg.tccSplit(tcc)

        # Highest peak on the sense strand.
        tccStretch = cgPeaks.stretch(tcc, cName)
        tccStretch.createPeaks(span = 2)
        highestCoord = tccStretch.getHighestPeak()
        if highestCoord is None:
            continue

        # Highest peak on the antisense strand.
        tccStretchAS = cgPeaks.stretch(cg.convertToAS(tcc), cName)
        tccStretchAS.createPeaks(span = 2)
        highestCoordAS = tccStretchAS.getHighestPeak()
        if highestCoordAS is None:
            continue

        # Profile around the sense peak (sense strand).
        zPoint = cg.makeTcc(chrom, strand, highestCoord, end)
        cProfile = svs.profileAroundPoint(zPoint, span, cName, ratio = True)
        for coord in cProfile:
            # setdefault replaces the former bare ``except:`` initialiser,
            # which would also have swallowed unrelated errors.
            histDict.setdefault(coord, []).append(cProfile[coord])

        # Same window on the antisense strand; the ratio is taken against
        # the antisense peak coordinate.
        zPointAS = cg.convertToAS(zPoint)
        cProfileAS = svs.profileAroundPoint(zPointAS, span, cName, ratio = True, ratioCoord = highestCoordAS)
        for coord in cProfileAS:
            histDictAS.setdefault(coord, []).append(cProfileAS[coord])

    plot.boxPlotHistoAS(histDict, histDictAS, name = name)
def updateSmallExpression(oFN, cName, rn = None, tn = None):
    """Set each record's ``eLevel`` to the highest level over its ``tcc``.

    Loads the ``eLevel`` and ``tcc`` fields of the Nexus file ``oFN``,
    overwrites every ``eLevel`` entry with
    ``cgPeaks.stretch(tcc, cName).getHighestLevel()``, then saves.

    Args:
        oFN: file name handed to ``cgNexusFlat.Nexus``.
        cName: passed through to ``cgPeaks.stretch``.
        rn, tn: forwarded unchanged to ``Nexus.load``.
    """
    nexus = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    nexus.load(['eLevel', 'tcc'], [rn, tn])

    for rnaID in nexus.eLevel:
        # this stretch contains values for the small library
        smallStretch = cgPeaks.stretch(nexus.tcc[rnaID], cName)
        nexus.eLevel[rnaID] = smallStretch.getHighestLevel()

    nexus.save()
def updateSmallExpression(aDir, cName):
    """Set every origin RNA's ``eLevel`` to the highest level over its tcc.

    Loads all ``OriginRNA`` objects from ``aDir`` through a
    ``cgDB.dataController``, assigns
    ``cgPeaks.stretch(oRNA.tcc, cName).getHighestLevel()`` to each object's
    ``eLevel`` and commits the collection back.

    Args:
        aDir: directory handed to ``cgDB.dataController``.
        cName: passed through to ``cgPeaks.stretch``.
    """
    # Indentation is normalised to spaces: the original mixed tabs and spaces
    # at the same level, which Python 3 (and ``python -tt``) rejects.
    oRNA_DC = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = oRNA_DC.load()

    # The key is unused; it was previously bound to ``id``, shadowing the
    # builtin.
    for _oID, oRNA in id_oRNA.items():
        # this stretch contains values for the small library
        stretch = cgPeaks.stretch(oRNA.tcc, cName)
        oRNA.eLevel = stretch.getHighestLevel()

    oRNA_DC.commit(id_oRNA)
def plotPairs(oDir, aDir, cName):
    """Plot the expression profile across the first filtered target site.

    Loads origin RNAs from ``oDir`` and alignments from ``aDir``.  For the
    first origin RNA that passed the filter and has a filtered target, the
    scan range on the target is derived from the alignment, the range's
    profile is printed and shown with ``plt`` (presumably matplotlib.pyplot),
    and the function returns.

    Args:
        oDir: directory of ``OriginRNA`` objects.
        aDir: directory of ``cgAlignment`` objects.
        cName: passed through to ``cgPeaks.stretch``.

    Returns:
        0 after the first plot; None if nothing qualified.
    """
    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    for oID, oRNA in id_oRNA.items():

        if not oRNA.passedFilter:
            continue

        for aID in oRNA.filteredTargets:

            alignment = id_alignment[aID]
            chrom, strand, start, end = bioLibCG.tccSplit(alignment.tTcc)
            offset = alignment.tStart
            sLen = alignment.sLength
            # Single-argument print calls behave the same on Python 2 and 3.
            print(sLen)
            print(oRNA.sequence)
            print(oRNA.tcc)
            print(alignment.tTcc)

            # Shift the target coordinates onto the aligned site.
            # NOTE(review): 19 is presumably the flank added around targets
            # upstream - confirm.
            if strand == '1':
                start = start - 19 + offset
                end = start + sLen
            else:
                end = end + 19 - offset
                start = end - sLen

            print('%s %s %s %s' % (chrom, strand, start, end))
            scanRange = bioLibCG.makeTcc(chrom, strand, start, end)

            stretch = cgPeaks.stretch(scanRange, cName)
            sortedKeys = sorted(stretch.profile.keys())

            # Read the minus strand from its far end.
            if strand == '-1':
                sortedKeys.reverse()

            # The former ``xVals = range(1, sLen + 2)`` was overwritten on
            # the next line and has been dropped.
            xVals = sortedKeys
            yVals = [stretch.profile[x] for x in sortedKeys]
            print('%s %s' % (xVals, len(xVals)))
            print('%s %s' % (yVals, len(yVals)))

            plt.plot(xVals, yVals)
            plt.show()

            # NOTE(review): returns after the first plotted target; this
            # looks like a debugging shortcut - confirm before removing.
            return 0
示例#7
0
def plotSmallDeg(tcc, smallCName, degCName, outDir = None, description = "None", nameNum = "0"):
    """Plot the degradome and small-RNA profiles of ``tcc`` into one PNG.

    The image is named ``<nameNum>.<tcc>.png`` and is written into ``outDir``
    when one is given.  The top panel shows the profile obtained with
    ``degCName``, the bottom panel the one obtained with ``smallCName``.

    Args:
        tcc: region string accepted by ``cgPeaks.stretch``.
        smallCName: configuration name used for the small-RNA profile.
        degCName: configuration name used for the degradome profile.
        outDir: optional output directory.
        description: x-axis label of the small-RNA panel.
        nameNum: prefix of the output file name.
    """
    # Indentation is normalised to spaces: the original mixed tabs and spaces
    # inconsistently, which Python 3 (and ``python -tt``) rejects.
    if not outDir:
        fN = nameNum + "." + tcc + '.png'
    else:
        fN = outDir + '/' + nameNum + "." + tcc + '.png'

    # Degradome profile, ordered by coordinate.
    tccStretch = cgPeaks.stretch(tcc, degCName)
    sortedX = sorted(tccStretch.profile.keys())
    sortedY = [tccStretch.profile[X] for X in sortedX]

    # Small-RNA profile, ordered by coordinate.
    tccStretchSmall = cgPeaks.stretch(tcc, smallCName)
    sortedXSmall = sorted(tccStretchSmall.profile.keys())
    sortedYSmall = [tccStretchSmall.profile[X] for X in sortedXSmall]

    # Two stacked panels (r/importr: presumably rpy2).
    gDevice = importr('grDevices')
    gDevice.png(file=fN, width=1680, height=1050)
    r('split.screen(c(2,1))')
    r('screen(1)')
    r.plot(sortedX, sortedY, xlab = "Coordinates", ylab = "Degradome Expression" )
    r.lines(sortedX, sortedY, type = "b")
    r('screen(2)')
    r.plot(sortedXSmall, sortedYSmall, xlab = description, ylab = "Small Expression")
    r.lines(sortedXSmall, sortedYSmall, type = "b")
    gDevice.dev_off()
def plotPairs(oDir, aDir, cName):
    """Plot the expression profile across the first filtered target site.

    Loads origin RNAs from ``oDir`` and alignments from ``aDir``.  For the
    first origin RNA that passed the filter and has a filtered target, the
    scan range on the target is derived from the alignment, the range's
    profile is printed and shown with ``plt`` (presumably matplotlib.pyplot),
    and the function returns.

    Args:
        oDir: directory of ``OriginRNA`` objects.
        aDir: directory of ``cgAlignment`` objects.
        cName: passed through to ``cgPeaks.stretch``.

    Returns:
        0 after the first plot; None if nothing qualified.
    """
    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    for oID, oRNA in id_oRNA.items():

        if not oRNA.passedFilter:
            continue

        for aID in oRNA.filteredTargets:

            alignment = id_alignment[aID]
            chrom, strand, start, end = bioLibCG.tccSplit(alignment.tTcc)
            offset = alignment.tStart
            sLen = alignment.sLength
            # Single-argument print calls behave the same on Python 2 and 3.
            print(sLen)
            print(oRNA.sequence)
            print(oRNA.tcc)
            print(alignment.tTcc)

            # Shift the target coordinates onto the aligned site.
            # NOTE(review): 19 is presumably the flank added around targets
            # upstream - confirm.
            if strand == '1':
                start = start - 19 + offset
                end = start + sLen
            else:
                end = end + 19 - offset
                start = end - sLen

            print('%s %s %s %s' % (chrom, strand, start, end))
            scanRange = bioLibCG.makeTcc(chrom, strand, start, end)

            stretch = cgPeaks.stretch(scanRange, cName)
            sortedKeys = sorted(stretch.profile.keys())

            # Read the minus strand from its far end.
            if strand == '-1':
                sortedKeys.reverse()

            # The former ``xVals = range(1, sLen + 2)`` was overwritten on
            # the next line and has been dropped.
            xVals = sortedKeys
            yVals = [stretch.profile[x] for x in sortedKeys]
            print('%s %s' % (xVals, len(xVals)))
            print('%s %s' % (yVals, len(yVals)))

            plt.plot(xVals, yVals)
            plt.show()

            # NOTE(review): returns after the first plotted target; this
            # looks like a debugging shortcut - confirm before removing.
            return 0
示例#9
0
def updateSmallExpression(oFN, cName, rn=None, tn=None):
    """Overwrite each record's ``eLevel`` with the peak level over its tcc.

    The ``eLevel`` and ``tcc`` fields of the Nexus file ``oFN`` are loaded,
    every ``eLevel`` entry is replaced by
    ``cgPeaks.stretch(tcc, cName).getHighestLevel()``, and the file is saved.

    Args:
        oFN: file name handed to ``cgNexusFlat.Nexus``.
        cName: passed through to ``cgPeaks.stretch``.
        rn, tn: forwarded unchanged to ``Nexus.load``.
    """
    records = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    records.load(['eLevel', 'tcc'], [rn, tn])

    for recordID in records.eLevel:
        # the stretch holds the values of the small library
        peakLevel = cgPeaks.stretch(records.tcc[recordID], cName).getHighestLevel()
        records.eLevel[recordID] = peakLevel

    records.save()
示例#10
0
def updateSmallExpression(aDir, cName):
    """Overwrite every origin RNA's ``eLevel`` with its highest level.

    All ``OriginRNA`` objects are loaded from ``aDir``, each one's ``eLevel``
    is set to ``cgPeaks.stretch(oRNA.tcc, cName).getHighestLevel()``, and the
    collection is committed back through the data controller.

    Args:
        aDir: directory handed to ``cgDB.dataController``.
        cName: passed through to ``cgPeaks.stretch``.
    """
    controller = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    originRNAs = controller.load()

    for _rnaID, originRNA in originRNAs.items():
        # the stretch holds the values of the small library
        originRNA.eLevel = cgPeaks.stretch(originRNA.tcc, cName).getHighestLevel()

    controller.commit(originRNAs)
def markCenterExpression(aDir, cName):
    """Record what fraction of a site's expression lies in its centre.

    For each alignment in ``aDir`` the scan range on the target is derived
    from the alignment, its profile is read in strand order, and
    ``centerExpression`` is set to three fractions of the total expression:
    the sums over positions [8:12], [7:13] and [6:14] divided by the sum of
    all levels.  All three stay 0.0 when the total is zero.

    Args:
        aDir: directory handed to ``cgDB.dataController``.
        cName: passed through to ``cgPeaks.stretch``.
    """
    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    # Slice bounds of the three nested central windows, narrowest first.
    centerWindows = ((8, 12), (7, 13), (6, 14))

    for alignment in id_alignment.values():
        alignment.centerExpression = [0.0, 0.0, 0.0]
        chrom, strand, start, end = bioLibCG.tccSplit(alignment.tTcc)
        offset = alignment.tStart
        sLen = alignment.sLength

        # Shift the target coordinates onto the aligned site.
        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen

        scanRange = bioLibCG.makeTcc(chrom, strand, start, end)

        stretch = cgPeaks.stretch(scanRange, cName)
        expressionSum = stretch.getSumOfLevels()
        orderedCoords = sorted(stretch.profile.keys())

        # Minus-strand sites are read from the far end.
        if strand == '-1':
            orderedCoords.reverse()

        if expressionSum != 0:
            for slot, (first, last) in enumerate(centerWindows):
                windowTotal = 0.0
                for coord in orderedCoords[first:last]:
                    windowTotal += stretch.profile[coord]
                alignment.centerExpression[slot] = windowTotal/expressionSum

    aDC.commit(id_alignment)
示例#12
0
def markCenterExpression(aDir, cName):
    """Record what fraction of a site's expression lies in its centre.

    For each alignment in ``aDir`` the scan range on the target is derived
    from the alignment, its profile is read in strand order, and
    ``centerExpression`` is set to three fractions of the total expression:
    the sums over positions [8:12], [7:13] and [6:14] divided by the sum of
    all levels.  All three stay 0.0 when the total is zero.

    Args:
        aDir: directory handed to ``cgDB.dataController``.
        cName: passed through to ``cgPeaks.stretch``.
    """
    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    # Slice bounds of the three nested central windows, narrowest first.
    centerWindows = ((8, 12), (7, 13), (6, 14))

    for alignment in id_alignment.values():
        alignment.centerExpression = [0.0, 0.0, 0.0]
        chrom, strand, start, end = bioLibCG.tccSplit(alignment.tTcc)
        offset = alignment.tStart
        sLen = alignment.sLength

        # Shift the target coordinates onto the aligned site.
        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen

        scanRange = bioLibCG.makeTcc(chrom, strand, start, end)

        stretch = cgPeaks.stretch(scanRange, cName)
        expressionSum = stretch.getSumOfLevels()
        orderedCoords = sorted(stretch.profile.keys())

        # Minus-strand sites are read from the far end.
        if strand == '-1':
            orderedCoords.reverse()

        if expressionSum != 0:
            for slot, (first, last) in enumerate(centerWindows):
                windowTotal = 0.0
                for coord in orderedCoords[first:last]:
                    windowTotal += stretch.profile[coord]
                alignment.centerExpression[slot] = windowTotal / expressionSum

    aDC.commit(id_alignment)
def markCenterExpression(aFN, cName, rn = None, tn = None):
    """Record what fraction of a site's expression lies in its centre.

    For each alignment in the Nexus file ``aFN`` the scan range on the target
    is derived from ``tTcc``, ``tStart`` and ``sLength``, its profile is read
    in strand order, and ``centerExpression`` is set to three fractions of
    the total expression: the sums over positions [8:12], [7:13] and [6:14]
    divided by the sum of all levels.  All three stay 0.0 when the total is
    zero.  The file is saved at the end.

    Args:
        aFN: file name handed to ``cgNexusFlat.Nexus``.
        cName: passed through to ``cgPeaks.stretch``.
        rn, tn: forwarded unchanged to ``Nexus.load``.
    """
    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['centerExpression', 'tTcc', 'tStart', 'sLength'], [rn, tn])

    # Slice bounds of the three nested central windows, narrowest first.
    centerWindows = ((8, 12), (7, 13), (6, 14))

    for aID in aNX.centerExpression:
        aNX.centerExpression[aID] = [0.0, 0.0, 0.0]
        chrom, strand, start, end = bioLibCG.tccSplit(aNX.tTcc[aID])
        offset = aNX.tStart[aID]
        sLen = aNX.sLength[aID]

        # Shift the target coordinates onto the aligned site.
        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen

        scanRange = bioLibCG.makeTcc(chrom, strand, start, end)

        stretch = cgPeaks.stretch(scanRange, cName)
        expressionSum = stretch.getSumOfLevels()
        orderedCoords = sorted(stretch.profile.keys())

        # Minus-strand sites are read from the far end.
        if strand == '-1':
            orderedCoords.reverse()

        if expressionSum != 0:
            for slot, (first, last) in enumerate(centerWindows):
                windowTotal = 0.0
                for coord in orderedCoords[first:last]:
                    windowTotal += stretch.profile[coord]
                aNX.centerExpression[aID][slot] = windowTotal/expressionSum

    aNX.save()
def markCenterExpression(aFN, cName, rn=None, tn=None):
    """Record what fraction of a site's expression lies in its centre.

    For each alignment in the Nexus file ``aFN`` the scan range on the target
    is derived from ``tTcc``, ``tStart`` and ``sLength``, its profile is read
    in strand order, and ``centerExpression`` is set to three fractions of
    the total expression: the sums over positions [8:12], [7:13] and [6:14]
    divided by the sum of all levels.  All three stay 0.0 when the total is
    zero.  The file is saved at the end.

    Args:
        aFN: file name handed to ``cgNexusFlat.Nexus``.
        cName: passed through to ``cgPeaks.stretch``.
        rn, tn: forwarded unchanged to ``Nexus.load``.
    """
    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['centerExpression', 'tTcc', 'tStart', 'sLength'], [rn, tn])

    # Slice bounds of the three nested central windows, narrowest first.
    centerWindows = ((8, 12), (7, 13), (6, 14))

    for aID in aNX.centerExpression:
        aNX.centerExpression[aID] = [0.0, 0.0, 0.0]
        chrom, strand, start, end = bioLibCG.tccSplit(aNX.tTcc[aID])
        offset = aNX.tStart[aID]
        sLen = aNX.sLength[aID]

        # Shift the target coordinates onto the aligned site.
        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen

        scanRange = bioLibCG.makeTcc(chrom, strand, start, end)

        stretch = cgPeaks.stretch(scanRange, cName)
        expressionSum = stretch.getSumOfLevels()
        orderedCoords = sorted(stretch.profile.keys())

        # Minus-strand sites are read from the far end.
        if strand == '-1':
            orderedCoords.reverse()

        if expressionSum != 0:
            for slot, (first, last) in enumerate(centerWindows):
                windowTotal = 0.0
                for coord in orderedCoords[first:last]:
                    windowTotal += stretch.profile[coord]
                aNX.centerExpression[aID][slot] = windowTotal / expressionSum

    aNX.save()
示例#15
0
def parallelMakePeaks(tcc, cName, minExpression):
    """Write the peaks of ``tcc`` that pass ``extendPeakTest`` to a file.

    Peaks are created over ``tcc`` with ``span=1`` and
    ``minVal=int(minExpression)``.  Each peak coordinate ``x`` is tested as
    the one-base region ``x..x+1``; every truthy test result is written, one
    per line, to ``out/peakData.<tcc>.<minExpression>.<assembly>``.

    Args:
        tcc: region string accepted by ``cg.tccSplit``.
        cName: configuration name; also supplies ``conf['assembly']``.
        minExpression: minimum peak value (converted with ``int``).
    """
    # Indentation is normalised to spaces: the original mixed tabs and spaces
    # at the same level, which Python 3 (and ``python -tt``) rejects.
    conf = c.getConfig(cName)
    outFN = 'out/peakData.%s.%s.%s' % (tcc, minExpression, conf.conf['assembly'])

    # ``with`` closes the output file even if a peak test raises.
    with open(outFN, 'w') as f:
        chrom, strand, _start, _end = cg.tccSplit(tcc)
        peaks = cgPeaks.stretch(tcc, cName)

        print('getting peaks')
        peaks.createPeaks(span = 1, minVal = int(minExpression))

        for x in peaks.peaks:
            print(x)

            newTcc = cg.makeTcc(chrom, strand, x, x + 1)
            # Alternative test, disabled in the original:
            #   roofPeakTest(newTcc, 30, .85, .9, .2, 6, 17, 24, cName)
            testedPeak = extendPeakTest(newTcc, 20, .2, .05, 0, 6, cName)

            if testedPeak:
                f.write('%s\n' % testedPeak)
示例#16
0
def profileTargetsHisto(tccList, cName, name='boxplot'):
    """Collect ratio profiles around each region's top peak and box-plot them.

    For every tcc in ``tccList`` the highest peak is located (regions without
    one are skipped), a ratio profile of +/- 200 positions is taken around
    it, and the values are grouped by relative coordinate before being handed
    to ``plot.boxPlotHisto``.

    Args:
        tccList: iterable of tcc strings accepted by ``cg.tccSplit``.
        cName: passed through to ``cgPeaks.stretch`` and
            ``svs.profileAroundPoint``.
        name: forwarded to ``plot.boxPlotHisto``.
    """
    histDict = {}  # {coord: [values]}
    for tcc in tccList:

        chrom, strand, _start, end = cg.tccSplit(tcc)

        # Highest peak of the region.
        tccStretch = cgPeaks.stretch(tcc, cName)
        tccStretch.createPeaks(span=2)
        highestCoord = tccStretch.getHighestPeak()
        if highestCoord is None:
            continue

        # Profile around that peak.
        zPoint = cg.makeTcc(chrom, strand, highestCoord, end)
        cProfile = svs.profileAroundPoint(zPoint, 200, cName, ratio=True)

        for coord in cProfile:
            # setdefault replaces the former bare ``except:`` initialiser,
            # which would also have swallowed unrelated errors.
            histDict.setdefault(coord, []).append(cProfile[coord])

    plot.boxPlotHisto(histDict, name=name)
示例#17
0
def plotProfile(tcc, cName, directory = None, min = 0):
    """Plot the expression profile of ``tcc`` to ``<tcc>.png``.

    The image is written into ``directory`` when one is given.

    Args:
        tcc: region string accepted by ``cgPeaks.stretch``; also used as the
            file name stem.
        cName: passed through to ``cgPeaks.stretch``.
        directory: optional output directory.
        min: lowest acceptable "highest level" of the profile.

    Returns:
        0 if the profile's highest level is below ``min`` (nothing is
        plotted); otherwise None.
    """
    imageName = tcc + '.png'
    fN = directory + '/' + imageName if directory else imageName

    regionStretch = cgPeaks.stretch(tcc, cName)
    if regionStretch.getHighestLevel() < min:
        return 0

    # Coordinates in ascending order with their matching levels.
    coords = sorted(regionStretch.profile.keys())
    levels = [regionStretch.profile[coord] for coord in coords]

    # r/importr: presumably rpy2.
    gDevice = importr('grDevices')
    gDevice.png(file=fN, width=1680, height=1050)
    r.plot(coords, levels, xlab = "Coordinates", ylab = "Expression Level")
    r.lines(coords, levels, type = "b")
    gDevice.dev_off()
示例#18
0
def parallelMakePeaks(tcc, cName, minExpression):
    """Write the peaks of ``tcc`` that pass ``extendPeakTest`` to a file.

    Peaks are created over ``tcc`` with ``span=1`` and
    ``minVal=int(minExpression)``.  Each peak coordinate ``x`` is tested as
    the one-base region ``x..x+1``; every truthy test result is written, one
    per line, to ``out/peakData.<tcc>.<minExpression>.<assembly>``.

    Args:
        tcc: region string accepted by ``cg.tccSplit``.
        cName: configuration name; also supplies ``conf['assembly']``.
        minExpression: minimum peak value (converted with ``int``).
    """
    import sys

    # Indentation is normalised to spaces: the original mixed tabs and spaces
    # at the same level, which Python 3 (and ``python -tt``) rejects.
    conf = c.getConfig(cName)
    outFN = 'out/peakData.%s.%s.%s' % (tcc, minExpression, conf.conf['assembly'])

    # ``with`` closes the output file even if a peak test raises.
    with open(outFN, 'w') as f:
        chrom, strand, _start, _end = cg.tccSplit(tcc)
        peaks = cgPeaks.stretch(tcc, cName)

        print('getting peaks')
        peaks.createPeaks(span = 1, minVal = int(minExpression))

        for x in peaks.peaks:

            # End the previous peak's line, then start this one without a
            # newline so output printed by the peak test continues on it
            # (the original used a trailing-comma print statement).
            print("")
            sys.stdout.write('%s %s %s ' % (chrom, strand, x))

            newTcc = cg.makeTcc(chrom, strand, x, x + 1)
            # Alternative test, disabled in the original:
            #   roofPeakTest(newTcc, 30, .85, .9, .2, 8, 16, 25, cName)
            testedPeak = extendPeakTest(newTcc, 20, .2, .05, 0, 6, cName)

            if testedPeak:
                f.write('%s\n' % testedPeak)
示例#19
0
def profileTargetsHisto(tccList, cName, name = 'boxplot'):
    """Collect ratio profiles around each region's top peak and box-plot them.

    For every tcc in ``tccList`` the highest peak is located (regions without
    one are skipped), a ratio profile of +/- 200 positions is taken around
    it, and the values are grouped by relative coordinate before being handed
    to ``plot.boxPlotHisto``.

    Args:
        tccList: iterable of tcc strings accepted by ``cg.tccSplit``.
        cName: passed through to ``cgPeaks.stretch`` and
            ``svs.profileAroundPoint``.
        name: forwarded to ``plot.boxPlotHisto``.
    """
    histDict = {}  # {coord: [values]}
    for tcc in tccList:

        chrom, strand, _start, end = cg.tccSplit(tcc)

        # Highest peak of the region.
        tccStretch = cgPeaks.stretch(tcc, cName)
        tccStretch.createPeaks(span = 2)
        highestCoord = tccStretch.getHighestPeak()
        if highestCoord is None:
            continue

        # Profile around that peak.
        zPoint = cg.makeTcc(chrom, strand, highestCoord, end)
        cProfile = svs.profileAroundPoint(zPoint, 200, cName, ratio = True)

        for coord in cProfile:
            # setdefault replaces the former bare ``except:`` initialiser,
            # which would also have swallowed unrelated errors.
            histDict.setdefault(coord, []).append(cProfile[coord])

    plot.boxPlotHisto(histDict, name = name)
def parallelMakePeaks(tcc, cName, minExpression):
    """Scan ``tcc`` for narrow, isolated peaks and write them to a file.

    Peaks are created with ``span=1`` and ``minVal=int(minExpression)``.
    Around every peak a ratio profile of +/- 40 positions is taken and the
    longest run of consecutive positions above 0.70 (the "roof") is located.
    A peak is kept when

    * the roof is 1 to 4 positions long,
    * the ratio one position beyond each end of the roof is below 0.20, and
    * the average ratio over the rest of the window is below 0.3.

    Kept peaks are written as tcc strings covering roof +/- 1, one per line,
    to ``out/peakData.<tcc>.<minExpression>.<assembly>``.  Progress and the
    scanned values are printed.

    Behaviour change: the roof bounds used to be tested for truthiness, so a
    roof starting or ending exactly at the peak position (offset 0) was
    reported as "no start and end of peak" and skipped.  They are now
    compared against None.

    Args:
        tcc: region string accepted by ``cg.tccSplit``.
        cName: configuration name; also supplies ``conf['assembly']``.
        minExpression: minimum peak value (converted with ``int``).
    """
    pRange = 40   # half-width of the window scanned around each peak
    minVal = .70  # ratio a position must exceed to count as roof
    extend = 1    # distance beyond the roof at which the drop is sampled

    conf = c.getConfig(cName)
    outFN = 'out/peakData.%s.%s.%s' % (tcc, minExpression, conf.conf['assembly'])

    # ``with`` closes the output file even if the scan raises.
    with open(outFN, 'w') as f:
        print('scanning range %s' % (tcc,))
        chrom, strand, _start, _end = cg.tccSplit(tcc)
        peaks = cgPeaks.stretch(tcc, cName)

        peaks.createPeaks(span = 1, minVal = int(minExpression))

        print('len peaks %s' % len(peaks.peaks))
        endCheck = 0
        for x in peaks.peaks:
            print('%s %s' % (x, endCheck))

            # The skip of peaks lying before ``endCheck`` was disabled in the
            # original (kept inside a string literal); ``endCheck`` is
            # therefore only reported, never used to skip.

            rTcc = cg.makeTcc(chrom, strand, x, x + 1)

            # Ratio profile around the peak, indexed by offset from it.
            cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName, ratio = True)

            # Find the longest run above minVal.  A run is only recorded when
            # a position at or below minVal ends it, so a run reaching the
            # end of the window is never recorded.
            highest = 0
            stretch = 0
            startCurrent = None
            startFinal = None
            endFinal = None
            for i in range(1 - pRange, pRange):
                print('  %s %s' % (x + i, cProfile[i]))
                if cProfile[i] > minVal:
                    print('  extending stretch')
                    stretch += 1
                    if startCurrent is None:
                        startCurrent = i
                else:
                    if stretch > 0:
                        print('end of stretch')
                        if stretch > highest:  # longest run so far
                            highest = stretch
                            endFinal = i - 1
                            startFinal = startCurrent
                        startCurrent = None
                    stretch = 0

            # Sample the ratio just beyond each end of the roof.
            val = [1.0, 1.0]
            if startFinal is None or endFinal is None:
                print('no start and end of peak')
                continue
            low = startFinal - extend
            high = endFinal + extend
            if not (low > (1 - pRange) and high < pRange):
                print('out of range')
                continue
            val[0] = float(cProfile[low])
            val[1] = float(cProfile[high])

            print('%s %s %s %s' % (low, high, x, endFinal))
            endCheck = x + endFinal

            # Average ratio over the window outside roof +/- extend.
            noiseExpression = 0
            lowRange = range(1 - pRange, low)
            highRange = range(high + 1, pRange)
            totalLength = len(lowRange) + len(highRange)
            for i in lowRange:
                noiseExpression += cProfile[i]
            for i in highRange:
                noiseExpression += cProfile[i]
            avgNoise = noiseExpression/float(totalLength)

            # Keep peaks with a short roof, a sharp drop on both sides and
            # low surrounding noise.
            print('%s %s %s %s' % (highest, val[0], val[1], avgNoise))
            if 0 < highest < 5 and val[0] < 0.20 and val[1] < .20 and avgNoise < .3:
                goodTcc = cg.makeTcc(chrom, strand, x + low, x + high)
                print('*KEEPER')
                f.write('%s\n' % goodTcc)

    print('DONE %s' % (tcc,))
示例#21
0
def makePeakInput(cName, minExpression=2000):
    """Scan every acceptable chromosome for roof-shaped peaks.

    Each strand of each chromosome in ``cg.acceptableChroms`` is walked in
    windows whose boundaries come from ``rangePoints(1, length, 1000)``.
    Peaks are created per window with ``span=3`` and ``minVal=minExpression``.
    Around every peak a ratio profile of +/- 30 positions is taken and the
    longest run of consecutive positions above 0.80 (the "roof") is located.
    A peak is kept when the roof is 15 to 25 positions long and the ratio
    four positions beyond each end of the roof is below 0.2.  Kept peaks are
    written as tcc strings covering roof +/- 4, one per line, to
    ``peakData.<minExpression>``.

    Behaviour change: the roof bounds used to be tested for truthiness, so a
    roof starting or ending exactly at the peak position (offset 0) was
    silently skipped.  They are now compared against None.

    Args:
        cName: configuration name; supplies ``conf['assembly']``.
        minExpression: minimum peak value passed to ``createPeaks``.
    """
    pRange = 30   # half-width of the window scanned around each peak
    minVal = .80  # ratio a position must exceed to count as roof
    margin = 4    # distance beyond the roof at which the drop is sampled

    # Result unused here; the call is retained in case loading the main
    # configuration has side effects - TODO confirm and remove.
    c.getConfig('Main.conf')
    conf = c.getConfig(cName)

    assembly = conf.conf['assembly']

    chromLens = cg.returnChromLengthDict(assembly)

    # ``with`` closes the output file even if the scan raises.
    with open('peakData.%s' % minExpression, 'w') as f:
        for chrom in chromLens:
            if chrom not in cg.acceptableChroms:
                continue
            for strand in ['1', '-1']:
                print('Getting Peaks for  %s %s' % (chrom, strand))
                prevI = 0
                endCheck = 0
                for i in rangePoints(1, chromLens[chrom], 1000):
                    # The first point only seeds the left window boundary.
                    if i == 1:
                        prevI = i
                        continue

                    start = prevI
                    end = i
                    prevI = i

                    tcc = cg.makeTcc(chrom, strand, start, end)
                    peaks = cgPeaks.stretch(tcc, cName)
                    peaks.createPeaks(span=3, minVal=minExpression)

                    for x in peaks.peaks:

                        # Skip peaks inside the last range that was examined.
                        if x < endCheck:
                            continue

                        rTcc = cg.makeTcc(chrom, strand, x, x + 1)

                        # Ratio profile around the peak, indexed by offset.
                        cProfile = stepVectorScan.profileAroundPoint(rTcc,
                                                                     pRange,
                                                                     cName,
                                                                     ratio=True)

                        # Find the longest run above minVal.  A run is only
                        # recorded when a position at or below minVal ends it.
                        highest = 0
                        stretch = 0
                        startCurrent = None
                        startFinal = None
                        endFinal = None
                        # ``offset`` replaces an inner ``i`` that shadowed
                        # the window loop variable.
                        for offset in range(1 - pRange, pRange):
                            if cProfile[offset] > minVal:
                                stretch += 1
                                if startCurrent is None:
                                    startCurrent = offset
                            else:
                                if stretch > 0:
                                    if stretch > highest:  # longest so far
                                        highest = stretch
                                        endFinal = offset - 1
                                        startFinal = startCurrent
                                    startCurrent = None
                                stretch = 0

                        # Sample the ratio ``margin`` beyond each roof end.
                        if startFinal is None or endFinal is None:
                            continue
                        low = startFinal - margin
                        high = endFinal + margin
                        if not (low > (1 - pRange) and high < pRange):
                            continue
                        val = [float(cProfile[low]), float(cProfile[high])]

                        endCheck = x + high

                        # Keep peaks with the expected roof length and a
                        # sharp drop on both sides.
                        if 14 < highest < 26 and val[0] < 0.2 and val[1] < .2:
                            goodTcc = cg.makeTcc(chrom, strand, x + low,
                                                 x + high)
                            f.write('%s\n' % goodTcc)
def findPeaks(pType, cName = None):
    """Annotate each predicted hairpin with its best roof-shaped peak.

    Hairpins are read from the sorted results file named in the
    configuration (``resultsExonsSorted`` when ``pType`` is 'E', otherwise
    ``resultsIntronsSorted``).  For every peak inside a hairpin a ratio
    profile of +/- 30 positions is taken and the longest run of consecutive
    positions above 0.80 (the "roof") is located, together with the ratio
    four positions beyond each end of the roof.  Per hairpin, the peak whose
    roof is 15 to 25 positions long, whose larger drop value lies strictly
    between 0.0 and 0.2, and whose roof length is closest to 22 is chosen.
    The results file is then rewritten with the peak information (or
    'None') appended to each line at slot 13.

    Behaviour change: the roof bounds used to be tested for truthiness, so a
    roof starting or ending exactly at the peak position (offset 0) never
    had its drop values measured and was always rejected.  They are now
    compared against None.

    Args:
        pType: 'E' for exons; any other value selects introns.
        cName: configuration name passed to ``c.getConfig``.
    """
    pRange = 30   # half-width of the window scanned around each peak
    minVal = .80  # ratio a position must exceed to count as roof
    margin = 4    # distance beyond the roof at which the drop is sampled

    # Result unused here; the call is retained in case constructing the main
    # configuration has side effects - TODO confirm and remove.
    c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    if pType == 'E':
        predName = conf.conf['resultsExonsSorted']
    else:
        predName = conf.conf['resultsIntronsSorted']

    print(predName)
    # {CID: [hairpin tcc, best combo or 'None']}
    cHairs = getHairpins.getHairpins(predName)
    peakDict = {}
    for CID in cHairs:
        peakDict[CID] = [cHairs[CID], 'None']

    timer = cg.cgTimer()
    timer.start()

    # Put every peak coordinate in memory: {chrom: {strand: {coord: 0}}}
    print('Creating peak data')
    peaks = {}
    for CID in cHairs:
        tcc = cHairs[CID]
        chrom, strand, start, end = cg.tccSplit(tcc)
        strandPeaks = peaks.setdefault(chrom, {}).setdefault(strand, {})

        stretch = cgPeaks.stretch(tcc, cName)
        stretch.createPeaks()
        for peakCoord in stretch.peaks:
            strandPeaks[peakCoord] = 0
    print(timer.split())

    print('finding best combos')
    for CID in peakDict:
        tcc = peakDict[CID][0]
        fields = cg.ss(tcc, ':')
        chrom = fields[0]
        strand = fields[1]
        start = int(fields[2])
        end = int(fields[3])

        # All known peaks that fall inside this hairpin.
        tccPeaks = [i for i in range(start, end + 1) if i in peaks[chrom][strand]]

        # Measure roof length and drop values for each peak.
        peakCombos = []
        for x in tccPeaks:
            rTcc = cg.makeTcc(chrom, strand, x, x + 1)

            # Raw (non-ratio) value at the peak itself.
            cProfile = stepVectorScan.profileAroundPoint(rTcc, 1, cName, ratio = False)
            peakValue = cProfile[0]

            # Ratio profile around the peak, indexed by offset from it.
            cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName, ratio = True)

            # Find the longest run above minVal.  A run is only recorded
            # when a position at or below minVal ends it.
            highest = 0
            stretch = 0
            startCurrent = None
            startFinal = None
            endFinal = None
            for i in range(1 - pRange, pRange):
                if cProfile[i] > minVal:
                    stretch += 1
                    if startCurrent is None:
                        startCurrent = i
                else:
                    if stretch > 0:
                        if stretch > highest:  # longest run so far
                            highest = stretch
                            endFinal = i - 1
                            startFinal = startCurrent
                        startCurrent = None
                    stretch = 0

            # Drop values stay at 1.0 (rejected later) when no roof was
            # found or the sample points fall outside the window.
            val = [1.0, 1.0]
            if startFinal is not None and endFinal is not None:
                low = startFinal - margin
                high = endFinal + margin
                if low > (1 - pRange) and high < pRange:
                    val[0] = float(cProfile[low])
                    val[1] = float(cProfile[high])

            # Slots 2-4 are fixed 'S' placeholders.
            peakCombos.append([tcc, x, 'S', 'S', 'S', peakValue, highest, val])

        # Pick the acceptable combo whose roof length is closest to 22.
        # (The original comment said "nearest 20"; the code has always
        # compared against 22.)
        topCombo = None
        for combo in peakCombos:
            roofLength = combo[6]
            dropValue = max(combo[7])  # the worse of the two drop values

            if 14 < roofLength < 26 and 0.0 < dropValue < 0.2:
                if topCombo is None or math.fabs(22 - roofLength) < math.fabs(22 - topCombo[6]):
                    topCombo = combo

        if topCombo:
            peakDict[CID][1] = topCombo
            print(topCombo)

    print(timer.split())

    # Now update the prediction file (SLOT 13).
    newLines = []
    with open(predName, 'r') as predFile:
        for line in predFile:
            CID = cg.ss(line)[7]
            combo = peakDict[CID][1]
            if combo == 'None':
                peakInfo = 'None'
            else:
                peakInfo = '%s:%s:%s:%s:%s:%s' % (str(combo[1])[-3:], 'S', str(combo[4]).split('.')[0], combo[5], combo[6], combo[7])
            newLines.append(cg.appendToLine(line, peakInfo, 13))

    with open(predName, 'w') as predFile:
        predFile.writelines(newLines)
示例#23
0
def markCenterExpressionOLD(smallFN, targetFN, alignmentFN, cName, outFN):
    """Write the fraction of expression found in the centre of each scan window.

    For every (small, target) pair listed in ``smallFN`` a window of
    ``len(small sequence)`` positions is placed on the target location
    using the alignment offset.  The expression profile of that window is
    fetched with ``cgPeaks.stretch`` and three nested central slices of it
    are summed and divided by the window's total expression.

    Parameters:
        smallFN: tab-separated file; column 0 is the small id, column 1 the
            small sequence (only its length is used) and column 4 a
            comma-separated list of target ids.
        targetFN: tab-separated file mapping target id (column 0) to the
            target location string (column 1).
        alignmentFN: space-separated file; columns 0, 1 and 4 hold the
            small id, the target id and the alignment offset.
        cName: configuration name handed to ``cgPeaks.stretch``.
        outFN: output path.  One tab-separated line
            ``sID, tID, lowE, midE, highE`` is written per pair.

    The three fractions are left at 0.0 when the window has no expression.
    Raises KeyError when a pair has no entry in the target or alignment
    file.
    """
    # tID -> target location
    targetDict = {}
    with open(targetFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            targetDict[int(ls[0])] = ls[1]

    # sID -> {tID: offset}; a repeated (sID, tID) pair keeps the last offset
    alignDict = {}
    with open(alignmentFN, 'r') as f:
        for line in f:
            ls = line.strip().split(' ')
            alignDict.setdefault(int(ls[0]), {})[int(ls[1])] = int(ls[4])

    # index slices of the strand-ordered window used for lowE, midE, highE
    centerSlices = ((8, 12), (7, 13), (6, 14))

    # context managers make sure the output is flushed and closed even
    # when a lookup fails half way through
    with open(smallFN, 'r') as f:
        with open(outFN, 'w') as fOut:
            for line in f:
                ls = line.strip().split('\t')
                sID = int(ls[0])
                sLen = len(ls[1])  # column 1 is the sequence for simulated reads

                for tID in ls[4].split(','):
                    tID = int(tID)
                    chrom, strand, start, end = bioLibCG.tccSplit(targetDict[tID])
                    offset = alignDict[sID][tID]

                    if strand == '1':
                        start = start - 19 + offset
                        end = start + sLen
                    else:
                        end = end + 19 - offset
                        start = end - sLen

                    scanRange = bioLibCG.makeTcc(chrom, strand, start, end)

                    stretch = cgPeaks.stretch(scanRange, cName)
                    expressionSum = stretch.getSumOfLevels()

                    # walk the window in strand orientation
                    sortedKeys = sorted(stretch.profile.keys())
                    if strand == '-1':
                        sortedKeys.reverse()

                    fractions = [0.0, 0.0, 0.0]
                    if expressionSum != 0:
                        for n, (first, last) in enumerate(centerSlices):
                            levels = [stretch.profile[key]
                                      for key in sortedKeys[first:last]]
                            fractions[n] = sum(levels, 0.0) / expressionSum

                    fOut.write('%s\t%s\t%s\t%s\t%s\n' %
                               (sID, tID, fractions[0], fractions[1], fractions[2]))
示例#24
0
import math
import bioLibCG as cg

# For every known miRNA location, find the highest peak on its own strand and
# on the opposite strand, and report the pairs whose peaks are close together.
knowns = compare.tccFileToList("mouseKnownMirs.tcc", 0)

eLevels = []
for known in knowns:

    # second argument True: presumably returns the fields as text -- confirm
    chrom, strand, start, end = cg.tccSplit(known, True)
    strand = "-1" if strand == "1" else "1"
    oppTcc = cg.makeTcc(chrom, strand, start, end)

    oppStretch = cgPeaks.stretch(oppTcc)
    knownStretch = cgPeaks.stretch(known)

    knownStretch.createPeaks(1, 20)
    kPos = knownStretch.getHighestPeak()
    if kPos:
        # expression level at the highest peak of the known strand
        eLevels.append(knownStretch.profile[kPos])

    oppStretch.createPeaks(1, 20)
    oPos = oppStretch.getHighestPeak()

    if not (oPos and kPos):
        continue

    # peaks fewer than 12 positions apart are considered mirrored
    if math.fabs(int(kPos) - int(oPos)) < 12:
        print("%s %s %s %s %s" % (known, oPos, kPos,
                                  oppStretch.profile[oPos],
                                  knownStretch.profile[kPos]))


def makeFigure(fN, targetFN, alignmentFN, cName):
    """Plot a histogram of expression by position inside the scan windows.

    For every (small, target) pair listed in ``fN`` a window as long as
    the small location is placed on the target using the alignment
    offset.  Each position index of the window (0 = first position in the
    small's strand orientation) is added to the histogram data once per
    unit of expression found there, and the pooled data is handed to
    ``cgPlot.plotHistogram``.

    Parameters:
        fN: tab-separated file; column 0 is the small id, column 1 the
            small location string and column 4 a comma-separated list of
            target ids.
        targetFN: tab-separated file mapping target id (column 0) to the
            target location string (column 1).
        alignmentFN: space-separated file; columns 0, 1 and 4 hold the
            small id, the target id and the alignment offset.
        cName: configuration name handed to ``cgPeaks.stretch``.
    """
    # tID -> target location
    targetDict = {}
    with open(targetFN, "r") as f:
        for line in f:
            ls = line.strip().split("\t")
            targetDict[int(ls[0])] = ls[1]

    # sID -> {tID: offset}; a repeated (sID, tID) pair keeps the last offset
    alignDict = {}
    with open(alignmentFN, "r") as f:
        for line in f:
            ls = line.strip().split(" ")
            alignDict.setdefault(int(ls[0]), {})[int(ls[1])] = int(ls[4])

    histoVals = []

    with open(fN, "r") as f:
        for line in f:
            ls = line.strip().split("\t")
            sID = int(ls[0])
            sChrom, sStrand, sStart, sEnd = bioLibCG.tccSplit(ls[1])
            sLen = sEnd - sStart

            for tID in ls[4].split(","):
                tID = int(tID)
                chrom, strand, start, end = bioLibCG.tccSplit(targetDict[tID])
                offset = alignDict[sID][tID]

                if sStrand == "1":
                    start = start - 19 + offset
                    end = start + sLen
                else:
                    end = end + 19 - offset
                    start = end - sLen

                scanRange = bioLibCG.makeTcc(chrom, strand, start, end)
                stretch = cgPeaks.stretch(scanRange, cName)

                # The keys must be sorted: the running index below is only a
                # position within the window if the coordinates are visited
                # in order, and dict key order is arbitrary.
                sortedKeys = sorted(stretch.profile.keys())
                if sStrand == "-1":
                    sortedKeys.reverse()

                for i, key in enumerate(sortedKeys):
                    # one histogram entry per unit of expression
                    histoVals.extend([i] * int(stretch.profile[key]))

    cgPlot.plotHistogram(histoVals)
def updateReadDensity(tType, cName):
    """Record peak read levels per prediction and per cluster in the results file.

    The results file is rewritten in place in two steps:

    1. For every line, the highest profile level inside the location in
       column 1 is passed to ``cg.appendToLine(line, value, 11)``.
    2. The file is read back, the largest column-11 value of each cluster
       (cluster id = column 7) is passed to
       ``cg.appendToLine(line, value, 12)``, and the lines are written
       ordered by numeric cluster id, keeping the original order inside
       each cluster.

    Parameters:
        tType: 'E' selects ``conf['resultsExons']``, 'I' selects
            ``conf['resultsIntrons']``.  Any other value prints
            'READ UPDATE FAIL' and returns without touching a file.
        cName: configuration name for ``cgConfig.getConfig`` and
            ``cgPeaks.stretch``.
    """
    conf = cgConfig.getConfig(cName)

    # Differentiate between exon or intron...
    if tType == 'E':
        pFileName = conf.conf['resultsExons']
    elif tType == 'I':
        pFileName = conf.conf['resultsIntrons']
    else:
        # there is no file to update; stop here instead of failing later on
        # an unbound pFileName
        print('READ UPDATE FAIL')
        return

    print('  Updating Read Density: %s' % (tType,))

    # highest hit inside each mature sequence
    print('  calculating hits for mature seqs')
    newLines = []
    with open(pFileName, 'r') as mirFile:
        for line in mirFile:
            mTcc = line.strip().split('\t')[1]
            bounds = mTcc.split(':')

            tccStretch = cgPeaks.stretch(mTcc, cName)
            highestHit = 0
            for i in range(int(bounds[2]), int(bounds[3])):
                if i in tccStretch.profile:
                    if tccStretch.profile[i] > highestHit:
                        highestHit = tccStretch.profile[i]

            newLines.append(cg.appendToLine(line, str(highestHit), 11))

    print('Writing New File')
    with open(pFileName, 'w') as outFile:
        outFile.writelines(newLines)

    ####NOW UPDATE HIGHEST HIT PER CLUSTER####

    with open(pFileName, 'r') as pFile:
        lines = pFile.readlines()

    # cluster id -> highest per-prediction value seen in that cluster
    clusterCount = {}
    for line in lines:
        fields = line.strip().split('\t')
        predictionCount = int(fields[11])
        CID = fields[7]
        if CID not in clusterCount or clusterCount[CID] < predictionCount:
            clusterCount[CID] = predictionCount

    newLines = []
    for line in lines:
        CID = line.strip().split('\t')[7]
        newLines.append(cg.appendToLine(line, str(clusterCount[CID]), 12))

    # list.sort is stable, so lines of one cluster keep their relative order
    newLines.sort(key=lambda line: int(line.strip().split('\t')[7]))

    with open(pFileName, 'w') as newFile:
        newFile.writelines(newLines)
示例#27
0
def makePeakInput(cName, minExpression = 2000):
    """Scan every acceptable chromosome for roof-shaped peaks and list them.

    Both strands of each chromosome are walked in windows delimited by
    ``rangePoints(1, length, 1000)``.  Peaks of every window are found
    with ``createPeaks(span = 3, minVal = minExpression)``.  Around each
    peak a ratio profile over offsets ``1 - 30 .. 29`` is inspected for
    its longest run of values above 0.80 (the "roof").  A peak is kept
    when

    * the roof is 15 to 25 positions long, and
    * the profile 4 positions outside either roof edge is below 0.2.

    Kept peaks are written, one location string per line, to
    ``peakData.<minExpression>`` in the current directory.

    Parameters:
        cName: configuration name; its 'assembly' entry selects the
            chromosome lengths.
        minExpression: minimum level passed to ``createPeaks``.
    """
    conf = c.getConfig(cName)
    assembly = conf.conf['assembly']
    chromLens = cg.returnChromLengthDict(assembly)

    pRange = 30    # offsets 1 - pRange .. pRange - 1 around a peak are profiled
    minVal = .80   # a position belongs to the roof above this ratio
    flank = 4      # distance outside the roof at which the drop is measured

    with open('peakData.%s' % minExpression, 'w') as f:
        for chrom in chromLens:
            if chrom not in cg.acceptableChroms:
                continue
            for strand in ['1', '-1']:
                print('Getting Peaks for  %s %s' % (chrom, strand))
                prevI = 0
                endCheck = 0
                for i in rangePoints(1, chromLens[chrom], 1000):
                    if i == 1:
                        prevI = i
                        continue

                    start = prevI
                    end = i
                    prevI = i

                    tcc = cg.makeTcc(chrom, strand, start, end)
                    peaks = cgPeaks.stretch(tcc, cName)
                    peaks.createPeaks(span = 3, minVal = minExpression)

                    for x in peaks.peaks:

                        # still inside the span of the previously handled peak
                        if x < endCheck:
                            continue

                        rTcc = cg.makeTcc(chrom, strand, x, x + 1)
                        cProfile = stepVectorScan.profileAroundPoint(
                            rTcc, pRange, cName, ratio = True)

                        # longest run above minVal; a run still open at the
                        # last offset is never recorded
                        highest = 0
                        runLength = 0
                        startCurrent = None
                        startFinal = None
                        endFinal = None
                        for off in range(1 - pRange, pRange):
                            if cProfile[off] > minVal:
                                runLength += 1
                                if startCurrent is None:
                                    startCurrent = off
                            else:
                                if runLength > highest:
                                    highest = runLength
                                    endFinal = off - 1
                                    startFinal = startCurrent
                                startCurrent = None
                                runLength = 0

                        # Compare against None explicitly: offset 0 is a
                        # legitimate roof edge and must not count as "no roof".
                        if startFinal is None or endFinal is None:
                            continue

                        low = startFinal - flank
                        high = endFinal + flank
                        if not (low > (1 - pRange) and high < pRange):
                            continue
                        val = [float(cProfile[low]), float(cProfile[high])]

                        endCheck = x + high

                        # filter out peaks that look a certain way.
                        if 14 < highest < 26:  # rooflength
                            if val[0] < 0.2 and val[1] < .2:  # drop values
                                goodTcc = cg.makeTcc(chrom, strand, x + low, x + high)
                                f.write('%s\n' % goodTcc)
def markCenterExpressionOLD(smallFN, targetFN, alignmentFN, cName, outFN):
    """Write the fraction of expression found in the centre of each scan window.

    For every (small, target) pair listed in ``smallFN`` a window of
    ``len(small sequence)`` positions is placed on the target location
    using the alignment offset.  The expression profile of that window is
    fetched with ``cgPeaks.stretch`` and three nested central slices of it
    are summed and divided by the window's total expression.

    Parameters:
        smallFN: tab-separated file; column 0 is the small id, column 1 the
            small sequence (only its length is used) and column 4 a
            comma-separated list of target ids.
        targetFN: tab-separated file mapping target id (column 0) to the
            target location string (column 1).
        alignmentFN: space-separated file; columns 0, 1 and 4 hold the
            small id, the target id and the alignment offset.
        cName: configuration name handed to ``cgPeaks.stretch``.
        outFN: output path.  One tab-separated line
            ``sID, tID, lowE, midE, highE`` is written per pair.

    The three fractions are left at 0.0 when the window has no expression.
    Raises KeyError when a pair has no entry in the target or alignment
    file.
    """
    # tID -> target location
    targetDict = {}
    with open(targetFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            targetDict[int(ls[0])] = ls[1]

    # sID -> {tID: offset}; a repeated (sID, tID) pair keeps the last offset
    alignDict = {}
    with open(alignmentFN, 'r') as f:
        for line in f:
            ls = line.strip().split(' ')
            alignDict.setdefault(int(ls[0]), {})[int(ls[1])] = int(ls[4])

    # index slices of the strand-ordered window used for lowE, midE, highE
    centerSlices = ((8, 12), (7, 13), (6, 14))

    # context managers make sure the output is flushed and closed even
    # when a lookup fails half way through
    with open(smallFN, 'r') as f:
        with open(outFN, 'w') as fOut:
            for line in f:
                ls = line.strip().split('\t')
                sID = int(ls[0])
                sLen = len(ls[1])  # column 1 is the sequence for simulated reads

                for tID in ls[4].split(','):
                    tID = int(tID)
                    chrom, strand, start, end = bioLibCG.tccSplit(targetDict[tID])
                    offset = alignDict[sID][tID]

                    if strand == '1':
                        start = start - 19 + offset
                        end = start + sLen
                    else:
                        end = end + 19 - offset
                        start = end - sLen

                    scanRange = bioLibCG.makeTcc(chrom, strand, start, end)

                    stretch = cgPeaks.stretch(scanRange, cName)
                    expressionSum = stretch.getSumOfLevels()

                    # walk the window in strand orientation
                    sortedKeys = sorted(stretch.profile.keys())
                    if strand == '-1':
                        sortedKeys.reverse()

                    fractions = [0.0, 0.0, 0.0]
                    if expressionSum != 0:
                        for n, (first, last) in enumerate(centerSlices):
                            levels = [stretch.profile[key]
                                      for key in sortedKeys[first:last]]
                            fractions[n] = sum(levels, 0.0) / expressionSum

                    fOut.write('%s\t%s\t%s\t%s\t%s\n' %
                               (sID, tID, fractions[0], fractions[1], fractions[2]))
示例#29
0
def updateReadDensity(tType, cName):
    """Record peak read levels per prediction and per cluster in the results file.

    The results file is rewritten in place in two steps:

    1. For every line, the highest profile level inside the location in
       column 1 is passed to ``cg.appendToLine(line, value, 11)``.
    2. The file is read back, the largest column-11 value of each cluster
       (cluster id = column 7) is passed to
       ``cg.appendToLine(line, value, 12)``, and the lines are written
       ordered by numeric cluster id, keeping the original order inside
       each cluster.

    Parameters:
        tType: 'E' selects ``conf['resultsExons']``, 'I' selects
            ``conf['resultsIntrons']``.  Any other value prints
            'READ UPDATE FAIL' and returns without touching a file.
        cName: configuration name for ``cgConfig.getConfig`` and
            ``cgPeaks.stretch``.
    """
    conf = cgConfig.getConfig(cName)

    #Differentiate between exon or intron...
    if tType == 'E':
        pFileName = conf.conf['resultsExons']
    elif tType == 'I':
        pFileName = conf.conf['resultsIntrons']
    else:
        # there is no file to update; stop here instead of failing later on
        # an unbound pFileName
        print('READ UPDATE FAIL')
        return

    print('  Updating Read Density: %s' % (tType,))

    #highest hit inside each mature sequence
    print('  calculating hits for mature seqs')
    newLines = []
    with open(pFileName, 'r') as mirFile:
        for line in mirFile:
            mTcc = line.strip().split('\t')[1]
            bounds = mTcc.split(':')

            tccStretch = cgPeaks.stretch(mTcc, cName)
            highestHit = 0
            for i in range(int(bounds[2]), int(bounds[3])):
                if i in tccStretch.profile:
                    if tccStretch.profile[i] > highestHit:
                        highestHit = tccStretch.profile[i]

            newLines.append(cg.appendToLine(line, str(highestHit), 11))

    print('Writing New File')
    with open(pFileName, 'w') as outFile:
        outFile.writelines(newLines)

    ####NOW UPDATE HIGHEST HIT PER CLUSTER####

    with open(pFileName, 'r') as pFile:
        lines = pFile.readlines()

    #cluster id -> highest per-prediction value seen in that cluster
    clusterCount = {}
    for line in lines:
        fields = line.strip().split('\t')
        predictionCount = int(fields[11])
        CID = fields[7]
        if CID not in clusterCount or clusterCount[CID] < predictionCount:
            clusterCount[CID] = predictionCount

    newLines = []
    for line in lines:
        CID = line.strip().split('\t')[7]
        newLines.append(cg.appendToLine(line, str(clusterCount[CID]), 12))

    #list.sort is stable, so lines of one cluster keep their relative order
    newLines.sort(key=lambda line: int(line.strip().split('\t')[7]))

    with open(pFileName, 'w') as newFile:
        newFile.writelines(newLines)
示例#30
0
import math
import bioLibCG as cg

#For every known miRNA location, find the highest peak on its own strand and
#on the opposite strand, report the pairs whose peaks are close together and
#finally print the peak levels collected for the known strand.
knowns = compare.tccFileToList('mouseKnownMirs.tcc', 0)

eLevels = []
for known in knowns:

    #second argument True: presumably returns the fields as text -- confirm
    chrom, strand, start, end = cg.tccSplit(known, True)
    strand = '-1' if strand == '1' else '1'
    oppTcc = cg.makeTcc(chrom, strand, start, end)

    oppStretch = cgPeaks.stretch(oppTcc)
    knownStretch = cgPeaks.stretch(known)

    knownStretch.createPeaks(1, 20)
    kPos = knownStretch.getHighestPeak()
    if kPos:
        #expression level at the highest peak of the known strand
        eLevels.append(knownStretch.profile[kPos])

    oppStretch.createPeaks(1, 20)
    oPos = oppStretch.getHighestPeak()

    if not (oPos and kPos):
        continue

    #peaks fewer than 12 positions apart are considered mirrored
    if math.fabs(int(kPos) - int(oPos)) < 12:
        print('%s %s %s %s %s' % (known, oPos, kPos,
                                  oppStretch.profile[oPos],
                                  knownStretch.profile[kPos]))

print(eLevels)
示例#31
0
def parallelMakePeaks(tcc, cName, minExpression):
    """Find narrow roof-shaped peaks inside one location and write them out.

    Peaks of ``tcc`` are found with
    ``createPeaks(span=1, minVal=int(minExpression))``.  Around each peak a
    ratio profile over offsets ``1 - 40 .. 39`` is inspected for its
    longest run of values above 0.70 (the "roof").  A peak is kept when

    * the roof is 1 to 4 positions long,
    * the profile one position outside either roof edge is below 0.20, and
    * the mean profile value outside the roof and its flanks is below 0.3.

    Kept peaks are written, one location string per line, to
    ``out/peakData.<tcc>.<minExpression>.<assembly>``.  Progress and
    per-offset debugging output is printed to stdout.

    Parameters:
        tcc: location string to scan.
        cName: configuration name; its 'assembly' entry is part of the
            output file name.
        minExpression: minimum peak level; converted with ``int``.
    """
    conf = c.getConfig(cName)
    outName = 'out/peakData.%s.%s.%s' % (tcc, minExpression, conf.conf['assembly'])

    pRange = 40    # offsets 1 - pRange .. pRange - 1 around a peak are profiled
    minVal = .70   # a position belongs to the roof above this ratio
    extend = 1     # distance outside the roof at which the drop is measured

    with open(outName, 'w') as f:
        print('scanning range %s' % (tcc,))
        chrom, strand, start, end = cg.tccSplit(tcc)
        peaks = cgPeaks.stretch(tcc, cName)
        peaks.createPeaks(span=1, minVal=int(minExpression))

        print('len peaks %s' % (len(peaks.peaks),))
        # endCheck is only reported; skipping peaks below it is disabled
        endCheck = 0
        for x in peaks.peaks:
            print('%s %s' % (x, endCheck))

            rTcc = cg.makeTcc(chrom, strand, x, x + 1)
            cProfile = stepVectorScan.profileAroundPoint(rTcc,
                                                         pRange,
                                                         cName,
                                                         ratio=True)

            # longest run above minVal; a run still open at the last offset
            # is never recorded
            highest = 0
            stretch = 0
            startCurrent = None
            startFinal = None
            endFinal = None
            for i in range(1 - pRange, pRange):
                print('  %s %s' % (x + i, cProfile[i]))
                if cProfile[i] > minVal:
                    print('  extending stretch')
                    stretch += 1
                    if startCurrent is None:
                        startCurrent = i
                else:
                    if stretch > 0:
                        print('end of stretch')
                        if stretch > highest:
                            highest = stretch
                            endFinal = i - 1
                            startFinal = startCurrent
                        startCurrent = None
                    stretch = 0

            # Compare against None explicitly: offset 0 is a legitimate roof
            # edge and must not count as "no roof".
            if startFinal is None or endFinal is None:
                print('no start and end of peak')
                continue

            low = startFinal - extend
            high = endFinal + extend
            if not (low > (1 - pRange) and high < pRange):
                print('out of range')
                continue
            val = [float(cProfile[low]), float(cProfile[high])]

            print('%s %s %s %s' % (low, high, x, endFinal))
            endCheck = x + endFinal

            # average profile value outside [low, high]; low > 1 - pRange
            # guarantees at least one offset, so the division is safe
            noiseOffsets = list(range(1 - pRange, low)) + list(range(high + 1, pRange))
            noiseExpression = 0
            for i in noiseOffsets:
                noiseExpression += cProfile[i]
            avgNoise = noiseExpression / float(len(noiseOffsets))

            # filter out peaks that look a certain way.
            print('%s %s %s %s' % (highest, val[0], val[1], avgNoise))
            if 0 < highest < 5:  # rooflength
                if val[0] < 0.20 and val[1] < .20:  # drop values
                    if avgNoise < .3:
                        goodTcc = cg.makeTcc(chrom, strand, x + low, x + high)
                        print('*KEEPER')
                        f.write('%s\n' % goodTcc)

    print('DONE %s' % (tcc,))
def makeFigure(fN, targetFN, alignmentFN, cName):
    """Plot a histogram of expression by position inside the scan windows.

    For every (small, target) pair listed in ``fN`` a window as long as
    the small location is placed on the target using the alignment
    offset.  Each position index of the window (0 = first position in the
    small's strand orientation) is added to the histogram data once per
    unit of expression found there, and the pooled data is handed to
    ``cgPlot.plotHistogram``.

    Parameters:
        fN: tab-separated file; column 0 is the small id, column 1 the
            small location string and column 4 a comma-separated list of
            target ids.
        targetFN: tab-separated file mapping target id (column 0) to the
            target location string (column 1).
        alignmentFN: space-separated file; columns 0, 1 and 4 hold the
            small id, the target id and the alignment offset.
        cName: configuration name handed to ``cgPeaks.stretch``.
    """
    #tID -> target location
    targetDict = {}
    with open(targetFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            targetDict[int(ls[0])] = ls[1]

    #sID -> {tID: offset}; a repeated (sID, tID) pair keeps the last offset
    alignDict = {}
    with open(alignmentFN, 'r') as f:
        for line in f:
            ls = line.strip().split(' ')
            alignDict.setdefault(int(ls[0]), {})[int(ls[1])] = int(ls[4])

    histoVals = []

    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            sID = int(ls[0])
            sChrom, sStrand, sStart, sEnd = bioLibCG.tccSplit(ls[1])
            sLen = sEnd - sStart

            for tID in ls[4].split(','):
                tID = int(tID)
                chrom, strand, start, end = bioLibCG.tccSplit(targetDict[tID])
                offset = alignDict[sID][tID]

                if sStrand == '1':
                    start = start - 19 + offset
                    end = start + sLen
                else:
                    end = end + 19 - offset
                    start = end - sLen

                scanRange = bioLibCG.makeTcc(chrom, strand, start, end)
                stretch = cgPeaks.stretch(scanRange, cName)

                #The keys must be sorted: the running index below is only a
                #position within the window if the coordinates are visited
                #in order, and dict key order is arbitrary.
                sortedKeys = sorted(stretch.profile.keys())
                if sStrand == '-1':
                    sortedKeys.reverse()

                for i, key in enumerate(sortedKeys):
                    #one histogram entry per unit of expression
                    histoVals.extend([i] * int(stretch.profile[key]))

    cgPlot.plotHistogram(histoVals)
示例#33
0
def findPeaks(pType, cName=None):
    """Pick the best roof-shaped peak of every hairpin and store it in the file.

    Steps:

    1. Hairpin locations are loaded per cluster id with
       ``getHairpins.getHairpins`` and the peaks of each hairpin are
       collected per chromosome and strand.
    2. For each peak inside a hairpin, a ratio profile over offsets
       ``1 - 30 .. 29`` is inspected for its longest run of values above
       0.80 (the "roof"), and the profile 4 positions outside either roof
       edge is recorded (1.0 when no roof was found or the flank falls
       outside the profiled window).
    3. Per cluster, the candidate with roof length 15 to 25, larger flank
       value strictly between 0.0 and 0.2 and roof length nearest 22 wins;
       on ties the first one found is kept.
    4. Every line of the predictions file gets the winner of its cluster
       (column 7), or 'None', via ``cg.appendToLine(line, info, 13)`` and
       the file is rewritten in place.

    Parameters:
        pType: 'E' selects ``conf['resultsExonsSorted']``; any other value
            selects ``conf['resultsIntronsSorted']``.
        cName: configuration name for ``c.getConfig`` and the profile
            helpers.
    """
    conf = c.getConfig(cName)

    if pType == 'E':
        predName = conf.conf['resultsExonsSorted']
    else:
        predName = conf.conf['resultsIntronsSorted']

    print(predName)

    # CID -> [hairpin location, best combo or the placeholder 'None']
    cHairs = getHairpins.getHairpins(predName)
    peakDict = {}
    for CID in cHairs:
        peakDict[CID] = [cHairs[CID], 'None']

    timer = cg.cgTimer()
    timer.start()

    # put peaks in memory: chrom -> strand -> {peak coordinate: 0}
    print('Creating peak data')
    peaks = {}
    for CID in cHairs:
        tcc = cHairs[CID]
        chrom, strand, start, end = cg.tccSplit(tcc)
        strandPeaks = peaks.setdefault(chrom, {}).setdefault(strand, {})

        hairpinStretch = cgPeaks.stretch(tcc, cName)
        hairpinStretch.createPeaks()
        for peakCoord in hairpinStretch.peaks:
            strandPeaks[peakCoord] = 0
    print(timer.split())

    pRange = 30    # offsets 1 - pRange .. pRange - 1 around a peak are profiled
    minVal = .80   # a position belongs to the roof above this ratio
    flank = 4      # distance outside the roof at which the drop is measured

    print('finding best combos')
    bestCombos = []
    for CID in peakDict:
        tcc = peakDict[CID][0]
        fields = cg.ss(tcc, ':')
        chrom = fields[0]
        strand = fields[1]
        start = int(fields[2])
        end = int(fields[3])

        # all known peaks that fall inside this hairpin
        tccPeaks = [coord for coord in range(start, end + 1)
                    if coord in peaks[chrom][strand]]

        peakCombos = []
        for x in tccPeaks:
            rTcc = cg.makeTcc(chrom, strand, x, x + 1)

            # non-ratio value at the peak position itself
            peakLevel = stepVectorScan.profileAroundPoint(rTcc,
                                                          1,
                                                          cName,
                                                          ratio=False)[0]

            # ratio profile used to measure the roof
            cProfile = stepVectorScan.profileAroundPoint(rTcc,
                                                         pRange,
                                                         cName,
                                                         ratio=True)

            # longest run above minVal; a run still open at the last offset
            # is never recorded
            highest = 0
            runLength = 0
            startCurrent = None
            startFinal = None
            endFinal = None
            for i in range(1 - pRange, pRange):
                if cProfile[i] > minVal:
                    runLength += 1
                    if startCurrent is None:
                        startCurrent = i
                else:
                    if runLength > highest:
                        highest = runLength
                        endFinal = i - 1
                        startFinal = startCurrent
                    startCurrent = None
                    runLength = 0

            # Compare against None explicitly: offset 0 is a legitimate roof
            # edge and must not count as "no roof".
            val = [1.0, 1.0]
            if startFinal is not None and endFinal is not None:
                low = startFinal - flank
                high = endFinal + flank
                if low > (1 - pRange) and high < pRange:
                    val = [float(cProfile[low]), float(cProfile[high])]

            # slots 2-4 are 'S' placeholders; the layout is kept because the
            # output line below is built from these indices
            peakCombos.append([tcc, x, 'S', 'S', 'S', peakLevel, highest, val])

        # find best combo...
        topCombo = None
        for combo in peakCombos:
            roofLength = combo[6]
            dropValue = max(combo[7])

            if 14 < roofLength < 26 and 0.0 < dropValue < 0.2:
                # pick the one with roof length nearest 22
                if topCombo is None or (math.fabs(22 - roofLength) <
                                        math.fabs(22 - topCombo[6])):
                    topCombo = combo

        if topCombo is not None:
            peakDict[CID][1] = topCombo
            bestCombos.append(topCombo)
            print(topCombo)

    print(timer.split())

    # now update predFile (SLOT 13)
    newLines = []
    with open(predName, 'r') as predFile:
        for line in predFile:
            CID = cg.ss(line)[7]
            combo = peakDict[CID][1]
            if combo == 'None':
                peakInfo = 'None'
            else:
                peakInfo = '%s:%s:%s:%s:%s:%s' % (
                    str(combo[1])[-3:], 'S', str(combo[4]).split('.')[0],
                    combo[5], combo[6], combo[7])
            newLines.append(cg.appendToLine(line, peakInfo, 13))

    with open(predName, 'w') as predFile:
        predFile.writelines(newLines)