Пример #1
0
def smallAnalyze(inFile = None):
	'''Print, for every id in inFile, how many lines each library contributed.

	Each usable line of inFile looks like "library:id[:...]": the first
	colon-separated field is the library and the second is the id.  Lines
	with fewer than two fields (for example blank lines) are skipped.

	Ids are printed in sorted order, each followed by one
	"library<TAB>count" line per library, libraries sorted by name.

	Returns 1 (after printing a message) when inFile is missing, otherwise
	None.
	'''
	if not inFile:
		print('need inFile')
		return 1

	# id -> {library: number of lines seen for that id}
	countDict = {}
	with open(inFile, 'r') as countFile:
		for line in countFile:
			fields = line.strip().split(':')
			if len(fields) < 2:
				continue
			library, hitID = fields[0], fields[1]
			libCounts = countDict.setdefault(hitID, {})
			# every line counts, including the first one seen for an id
			libCounts[library] = libCounts.get(library, 0) + 1

	for hitID in sorted(countDict):
		print('%s' % hitID)
		for lib in sorted(countDict[hitID]):
			print('%s\t%s' % (lib, countDict[hitID][lib]))
def markCenterExpression(aFN, wigDir, rn = None, tn = None):
        '''Record how much of a window's expression sits at its centre.

        For every alignment loaded from aFN a window of sLength positions is
        placed on the target tcc (moved by `extend` and the alignment's
        tStart; the direction depends on the strand) and its expression
        profile is fetched from the wig data loaded from wigDir.

        centerExpression[aID] is set to three fractions of the window's total
        expression, summed over the sorted positions [8:12], [7:13] and
        [6:14] (positions are reversed for strand '-1').  The entry stays
        [0.0, 0.0, 0.0] when the total is zero or no value in the window
        equals the alignment's tELevel.

        rn and tn are forwarded unchanged to aNX.load.  The Nexus is saved at
        the end; nothing is returned.
        '''
        extend = 25

        timer = bioLibCG.cgTimer()
        timer.start()

        aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
        aNX.load(['centerExpression', 'tTcc', 'tStart', 'sLength', 'tELevel'], [rn, tn])

        # expression of the degradome
        wigDict = cgWig.loadWigDict(wigDir)

        # (slot in centerExpression, slice of ordered positions to sum)
        centerWindows = ((0, slice(8, 12)), (1, slice(7, 13)), (2, slice(6, 14)))

        for aID in aNX.centerExpression:
                aNX.centerExpression[aID] = [0.0, 0.0, 0.0]
                chrom, strand, tccStart, tccEnd = bioLibCG.tccSplit(aNX.tTcc[aID])
                offset = aNX.tStart[aID]
                sLen = aNX.sLength[aID]

                if strand == '1':
                        scanStart = tccStart - extend + offset
                        scanEnd = scanStart + sLen
                else:
                        scanEnd = tccEnd + extend - offset
                        scanStart = scanEnd - sLen

                scanRange = bioLibCG.makeTcc(chrom, strand, scanStart, scanEnd)
                stretch = cgWig.getExpressionProfile(scanRange, wigDict)

                # the peak has to show up inside this small range
                levels = stretch.values()
                peakInRange = aNX.tELevel[aID] in levels
                expressionSum = sum(levels)

                if expressionSum == 0 or not peakInRange:
                        continue

                orderedPositions = sorted(stretch, reverse = (strand == '-1'))

                for slot, window in centerWindows:
                        windowTotal = 0.0
                        for pos in orderedPositions[window]:
                                windowTotal += stretch[pos]
                        aNX.centerExpression[aID][slot] = windowTotal/expressionSum

        aNX.save()
Пример #3
0
def alignSeqs(seqsFN, dbName, wordSize, outFN, maxNumMismatches, sendExitSignal = False):
        '''Align every sequence listed in seqsFN against a prebuilt database.

        seqsFN           -- tab-separated file; the first two fields of each
                            line are passed to cgAlign.cgSeq
        dbName           -- database prefix; dbName + '.sDB' and
                            dbName + '.wDB' are loaded
        wordSize         -- converted with int() and passed to alignQuery
        outFN            -- file that cgAlign.alignQuery writes to
        maxNumMismatches -- converted with int() and passed to alignQuery
        sendExitSignal   -- when truthy, cgExit.sendExitSignal(seqsFN) is
                            called after the alignments are written

        Timer splits are printed after each database load and at the end.
        Nothing is returned.
        '''
        maxNumMismatches = int(maxNumMismatches)
        # NOTE(review): bool() of any non-empty string, including 'False', is
        # True -- confirm how callers pass this flag.
        sendExitSignal = bool(sendExitSignal)
        wordSize = int(wordSize)

        timer = bioLibCG.cgTimer()
        timer.start()

        #load dbs
        sDB = cgAlign.loadSequenceDatabase(dbName + '.sDB')
        print(timer.split())
        wDB = cgAlign.loadWordDatabase(dbName + '.wDB')
        print(timer.split())

        #align each seq; the with-blocks close both files even if alignment fails
        with open(seqsFN, 'r') as f:
                with open(outFN, 'w') as fOut:
                        for line in f:
                                fields = line.strip().split('\t')
                                qSeq = cgAlign.cgSeq(fields[0], fields[1])

                                #write out the alignments
                                cgAlign.alignQuery(qSeq, wDB, sDB, wordSize, maxNumMismatches, fOut)

        print(timer.split())
        if sendExitSignal:
                cgExit.sendExitSignal(seqsFN)
Пример #4
0
def scanVectorsFile(fN, tccList):
	'''Read wig values from fN for every region in tccList.

	For each tcc a new cgIndex.lineIndex is opened on fN and positioned with
	binarySearch(tcc).  Lines are read from there, each clipped to the tcc's
	start and end, until a line reaches past the tcc end.

	Returns one dict for all tccs mapping coordinate -> int value (the text
	before the first '.' in the fourth field of the line).
	'''
	timer = cg.cgTimer()
	timer.start()
	coordDict = {} # coordinate: value
	for tcc in tccList:
		chrom, strand, tccStart, tccEnd = cg.tccSplit(tcc)

		# position a fresh index reader at the first line for this tcc
		# NOTE(review): the reader is never closed here -- confirm whether
		# lineIndex needs an explicit close.
		fIndex = cgIndex.lineIndex(fN, header = True) #!!!there actually is a header...have to deal with this...
		fIndex.passCheckFunction(cgIndex.wigCheckFunction)
		fIndex.binarySearch(tcc)

		for line in fIndex.file:
			fields = cg.ss(line)
			segStart = int(fields[1])
			segEnd = int(fields[2])
			segValue = int(fields[3].split('.')[0])

			# clip the segment to the requested region
			segStart = max(segStart, tccStart)
			reachedEnd = tccEnd < segEnd
			if reachedEnd:
				segEnd = tccEnd

			coordDict.update(dict.fromkeys(range(segStart, segEnd), segValue))

			if reachedEnd:
				break

	return coordDict
Пример #5
0
def updatePolySeqs(mFN, readsFN, alignFN):
    '''Print reads that are a microRNA sequence plus a single-letter tail.

    Loads micro sequences from mFN, reads from readsFN and alignments from
    alignFN.  For every alignment the read is compared with its target
    sequence:

    * a read contained in the sequence is skipped;
    * a read containing the full sequence is reported when what follows the
      first occurrence is 1-19 copies of one letter (A, G, T or C);
    * otherwise the sequence is trimmed by 1..7 bases from its end and the
      first trimmed form found in the read is checked the same way, with an
      extra "TRIMMED" line printed for each report.

    Reports are the result of tabIt(sequence, read, basesTrimmed,
    tailLength, letter).  Nothing is returned.
    '''
    tim = bioLibCG.cgTimer()
    tim.start()

    # (letter, [letter*1 ... letter*19]) in the order A, G, T, C
    letter_variousLetters = [(base, [base * n for n in range(1,20)])
                             for base in ("A", "G", "T", "C")]

    checkRange = range(1,8)

    NX = cgNexusFlat.Nexus(mFN, miR)
    NX.load(['sequence', 'polySeqs'])

    reads = cgNexusFlat.quickTable(('read','string', '.', 1))
    rNX = cgNexusFlat.Nexus(readsFN, reads)
    rNX.load(['read'])

    aNX = cgNexusFlat.Nexus(alignFN, cgAlignment)
    aNX.load(['sID', 'tID'])

    for alignID in aNX.ids:

        theRead = rNX.read[aNX.sID[alignID]]
        microSeq = NX.sequence[aNX.tID[alignID]]

        #may be a read for expression, but wont count...
        if theRead in microSeq:
            continue

        # NOTE(review): an exact match is already skipped by the containment
        # test above, so this branch cannot run -- confirm which was intended.
        if microSeq == theRead:
            print(tabIt(microSeq, theRead, 0, 0, "N"))
            continue

        #first check the full sequence
        if microSeq in theRead and len(theRead) != len(microSeq):
            tail = theRead.split(microSeq)[1]
            for let, variousLetters in letter_variousLetters:
                if tail in variousLetters:
                    print(tabIt(microSeq, theRead, 0, len(tail), let))
            continue

        #now check trimmed forms (cant do [:-0])
        for i in checkRange:
            trimmed = microSeq[:-i]
            if trimmed not in theRead or len(theRead) == len(trimmed):
                continue
            tail = theRead.split(trimmed)[1]
            for let, variousLetters in letter_variousLetters:
                if tail in variousLetters:
                    print(tabIt(microSeq, theRead, i, len(tail), let))
                    print("TRIMMED")
            break #dont trim further once one trimmed form is found
Пример #6
0
def scanVectorsFile(fN, tccList):
	'''Read wig values from fN for every region in tccList.

	For each tcc a new cgIndex.lineIndex is opened on fN and positioned with
	binarySearch(tcc).  Lines are read from there, each clipped to the tcc's
	start and end, until a line reaches past the tcc end.

	Returns one dict for all tccs mapping coordinate -> int value (the text
	before the first '.' in the fourth field of the line).
	'''
	timer = cg.cgTimer()
	timer.start()
	coordDict = {} # coordinate: value
	for tcc in tccList:
		chrom, strand, tccStart, tccEnd = cg.tccSplit(tcc)

		# position a fresh index reader at the first line for this tcc
		# NOTE(review): the reader is never closed here -- confirm whether
		# lineIndex needs an explicit close.
		fIndex = cgIndex.lineIndex(fN, header = True) #!!!there actually is a header...have to deal with this...
		fIndex.passCheckFunction(cgIndex.wigCheckFunction)
		fIndex.binarySearch(tcc)

		for line in fIndex.file:
			fields = cg.ss(line)
			segStart = int(fields[1])
			segEnd = int(fields[2])
			segValue = int(fields[3].split('.')[0])

			# clip the segment to the requested region
			segStart = max(segStart, tccStart)
			reachedEnd = tccEnd < segEnd
			if reachedEnd:
				segEnd = tccEnd

			coordDict.update(dict.fromkeys(range(segStart, segEnd), segValue))

			if reachedEnd:
				break

	return coordDict
Пример #7
0
def markCenterExpression(aFN, wigDir, rn=None, tn=None):
    '''Record how much of a window's expression sits at its centre.

    For every alignment loaded from aFN a window of sLength positions is
    placed on the target tcc (moved by `extend` and the alignment's tStart;
    the direction depends on the strand) and its expression profile is
    fetched from the wig data loaded from wigDir.

    centerExpression[aID] is set to three fractions of the window's total
    expression, summed over the sorted positions [8:12], [7:13] and [6:14]
    (positions are reversed for strand '-1').  The entry stays
    [0.0, 0.0, 0.0] when the total is zero or no value in the window equals
    the alignment's tELevel.

    rn and tn are forwarded unchanged to aNX.load.  The Nexus is saved at the
    end; nothing is returned.
    '''
    extend = 25

    timer = bioLibCG.cgTimer()
    timer.start()

    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['centerExpression', 'tTcc', 'tStart', 'sLength', 'tELevel'],
             [rn, tn])

    # expression of the degradome
    wigDict = cgWig.loadWigDict(wigDir)

    # (slot in centerExpression, slice of ordered positions to sum)
    centerWindows = ((0, slice(8, 12)), (1, slice(7, 13)), (2, slice(6, 14)))

    for aID in aNX.centerExpression:
        aNX.centerExpression[aID] = [0.0, 0.0, 0.0]
        chrom, strand, tccStart, tccEnd = bioLibCG.tccSplit(aNX.tTcc[aID])
        offset = aNX.tStart[aID]
        sLen = aNX.sLength[aID]

        if strand == '1':
            scanStart = tccStart - extend + offset
            scanEnd = scanStart + sLen
        else:
            scanEnd = tccEnd + extend - offset
            scanStart = scanEnd - sLen

        scanRange = bioLibCG.makeTcc(chrom, strand, scanStart, scanEnd)
        stretch = cgWig.getExpressionProfile(scanRange, wigDict)

        # the peak has to show up inside this small range
        levels = stretch.values()
        peakInRange = aNX.tELevel[aID] in levels
        expressionSum = sum(levels)

        if expressionSum == 0 or not peakInRange:
            continue

        orderedPositions = sorted(stretch, reverse=(strand == '-1'))

        for slot, window in centerWindows:
            windowTotal = 0.0
            for pos in orderedPositions[window]:
                windowTotal += stretch[pos]
            aNX.centerExpression[aID][slot] = windowTotal / expressionSum

    aNX.save()
Пример #8
0
def alignSeqs(seqsFN,
              dbName,
              wordSize,
              outFN,
              maxNumMismatches,
              sendExitSignal=False):
    '''Align every sequence listed in seqsFN against a prebuilt database.

    seqsFN           -- tab-separated file; the first two fields of each line
                        are passed to cgAlign.cgSeq
    dbName           -- database prefix; dbName + '.sDB' and dbName + '.wDB'
                        are loaded
    wordSize         -- converted with int() and passed to alignQuery
    outFN            -- file that cgAlign.alignQuery writes to
    maxNumMismatches -- converted with int() and passed to alignQuery
    sendExitSignal   -- when truthy, cgExit.sendExitSignal(seqsFN) is called
                        after the alignments are written

    Timer splits are printed after each database load and at the end.
    Nothing is returned.
    '''
    maxNumMismatches = int(maxNumMismatches)
    # NOTE(review): bool() of any non-empty string, including 'False', is
    # True -- confirm how callers pass this flag.
    sendExitSignal = bool(sendExitSignal)
    wordSize = int(wordSize)

    timer = bioLibCG.cgTimer()
    timer.start()

    #load dbs
    sDB = cgAlign.loadSequenceDatabase(dbName + '.sDB')
    print(timer.split())
    wDB = cgAlign.loadWordDatabase(dbName + '.wDB')
    print(timer.split())

    #align each seq; the with-blocks close both files even if alignment fails
    with open(seqsFN, 'r') as f:
        with open(outFN, 'w') as fOut:
            for line in f:
                fields = line.strip().split('\t')
                qSeq = cgAlign.cgSeq(fields[0], fields[1])

                #write out the alignments
                cgAlign.alignQuery(qSeq, wDB, sDB, wordSize, maxNumMismatches,
                                   fOut)

    print(timer.split())
    if sendExitSignal:
        cgExit.sendExitSignal(seqsFN)
Пример #9
0
def smallAnalyze(inFile=None):
    '''Print, for every id in inFile, how many lines each library contributed.

    Each usable line of inFile looks like "library:id[:...]": the first
    colon-separated field is the library and the second is the id.  Lines
    with fewer than two fields (for example blank lines) are skipped.

    Ids are printed in sorted order, each followed by one
    "library<TAB>count" line per library, libraries sorted by name.

    Returns 1 (after printing a message) when inFile is missing, otherwise
    None.
    '''
    if not inFile:
        print('need inFile')
        return 1

    # id -> {library: number of lines seen for that id}
    countDict = {}
    with open(inFile, 'r') as countFile:
        for line in countFile:
            fields = line.strip().split(':')
            if len(fields) < 2:
                continue
            library, hitID = fields[0], fields[1]
            libCounts = countDict.setdefault(hitID, {})
            # every line counts, including the first one seen for an id
            libCounts[library] = libCounts.get(library, 0) + 1

    for hitID in sorted(countDict):
        print('%s' % hitID)
        for lib in sorted(countDict[hitID]):
            print('%s\t%s' % (lib, countDict[hitID][lib]))
Пример #10
0
def scanVectorsSingleCoord(tccList, cName):
    '''Given a tcc list, scan the per-chromosome wig files and return
    coordinate -> value.

    The organism comes from the config named cName and the wig directory from
    the 'wig<organism>' entry of Main.conf.  For each tcc
    ("chrom:strand:start:end") the matching '.index' file is searched for the
    line whose [begin, end) range (fields 1 and 2) contains the tcc start;
    field 0 of that line is used as the byte offset to seek to in the wig
    file.  Wig lines are then read, clipped to the tcc, until one reaches
    past the tcc end.

    Returns one dict for all tccs mapping coordinate -> int value (the text
    before the first '.' in the fourth field of the wig line).

    Raises ValueError when no index line covers a tcc start.
    '''
    conf = c.getConfig(cName)
    org = conf.conf['organism']
    mConf = c.getConfig('Main.conf')
    wigDir = mConf.conf['wig%s' % org]

    coordDict = {}  # coordinate: value
    for tcc in tccList:
        theSplit = ss(tcc, ':')
        chrom, strand = theSplit[0], theSplit[1]
        tccStart, tccEnd = int(theSplit[2]), int(theSplit[3])

        #goto correct file, correct line in index
        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(), strand, chrom)
        fNindex = fN + '.index'

        #get byte offset from the index file
        startByte = None
        with open(fNindex, 'r') as iFile:
            for line in iFile:
                fields = cg.ss(line)
                if int(fields[1]) <= tccStart < int(fields[2]):
                    startByte = int(fields[0])
                    break

        if startByte is None:
            # previously a placeholder string reached f.seek() and failed
            # with an unrelated-looking TypeError
            raise ValueError('no index entry in %s covers position %s' %
                             (fNindex, tccStart))

        #grab values
        with open(fN, 'r') as f:
            f.seek(startByte, 0)

            for line in f:
                fields = cg.ss(line)
                lBeg = int(fields[1])
                lEnd = int(fields[2])
                lValue = int(fields[3].split('.')[0])

                # clip the wig segment to the requested region
                if tccStart > lBeg:
                    lBeg = tccStart
                stop = tccEnd < lEnd
                if stop:
                    lEnd = tccEnd

                for i in range(lBeg, lEnd):
                    coordDict[i] = lValue

                if stop:
                    break

    return coordDict
Пример #11
0
def scanVectorsHist(tccList, cName):
    '''Given a tcc list, scan the per-chromosome wig files and return the
    list of values covering each tcc.

    The organism comes from the config named cName and the wig directory from
    the 'wig<organism>' entry of Main.conf.  For each tcc
    ("chrom:strand:start:end") the matching '.index' file is searched for the
    line whose [begin, end) range (fields 1 and 2) contains the tcc start;
    field 0 of that line is used as the byte offset to seek to in the wig
    file.  Wig lines are then read, clipped to the tcc, until one reaches
    past the tcc end.

    Returns a dict mapping tcc -> list holding one int value per covered
    position.  A tcc with no covered position gets no entry.

    Raises ValueError when no index line covers a tcc start.

    The original author flagged the index-based lookup as undesirable.
    '''
    conf = c.getConfig(cName)
    org = conf.conf['organism']
    mConf = c.getConfig('Main.conf')
    wigDir = mConf.conf['wig%s' % org]

    histDict = {}  # tcc: [list values]
    for tcc in tccList:
        theSplit = ss(tcc, ':')
        chrom, strand = theSplit[0], theSplit[1]
        tccStart, tccEnd = int(theSplit[2]), int(theSplit[3])

        #goto correct file, correct line in index
        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(), strand, chrom)
        fNindex = fN + '.index'

        #get byte offset from the index file
        startByte = None
        with open(fNindex, 'r') as iFile:
            for line in iFile:
                fields = cg.ss(line)
                if int(fields[1]) <= tccStart < int(fields[2]):
                    startByte = int(fields[0])
                    break

        if startByte is None:
            # previously a placeholder string reached f.seek() and failed
            # with an unrelated-looking TypeError
            raise ValueError('no index entry in %s covers position %s' %
                             (fNindex, tccStart))

        #grab values
        with open(fN, 'r') as f:
            f.seek(startByte, 0)

            for line in f:
                fields = cg.ss(line)
                lBeg = int(fields[1])
                lEnd = int(fields[2])
                lValue = int(fields[3].split('.')[0])

                # clip the wig segment to the requested region
                if tccStart > lBeg:
                    lBeg = tccStart
                stop = tccEnd < lEnd
                if stop:
                    lEnd = tccEnd

                # one copy of the value per covered position; the entry is
                # created only when there is at least one position
                span = lEnd - lBeg
                if span > 0:
                    histDict.setdefault(tcc, []).extend([lValue] * span)

                if stop:
                    break

    return histDict
Пример #12
0
def findPeaks(pType, cName = None):
	'''Choose the best expression peak for each hairpin and store it in the
	sorted results file.

	pType selects the results file: 'E' reads conf['resultsExonsSorted'],
	anything else reads conf['resultsIntronsSorted'].  cName is passed to
	c.getConfig and on to the peak/profile helpers.

	For every hairpin returned by getHairpins.getHairpins, each peak inside
	the hairpin gets a "roof" (the longest run of profile values above 0.80
	in the +/-30 window around the peak) and a drop value taken 4 positions
	beyond each end of the roof.  The peak whose roof length is between 14
	and 26 (exclusive), whose larger drop value is between 0.0 and 0.2
	(exclusive) and whose roof length is nearest 22 is kept.

	The results file is then rewritten in place: each line gets the peak
	info (or 'None') added through cg.appendToLine(line, peakInfo, 13).
	Progress and timer splits are printed; nothing is returned.
	'''
	
	#init
	# NOTE(review): mConf is never used below.
	mConf = c.cgConfig('Main.conf')
	conf = c.getConfig(cName)

	if pType == 'E':
		predName = conf.conf['resultsExonsSorted']
	else:
		predName = conf.conf['resultsIntronsSorted']
	
	print predName
	#make CID:hairpin:peak dictionary
	# peakDict[CID] = [hairpin tcc, best combo or the string 'None']
	cHairs = getHairpins.getHairpins(predName)
	peakDict = {}
	for CID in cHairs:
		peakDict[CID] = [cHairs[CID],'None']
		

	timer = cg.cgTimer()
	timer.start()

	#put peaks in memory
	print 'Creating peak data'
	peaks = {} # chr:peak:value
	for CID in cHairs:
		chrom, strand, start, end = cg.tccSplit(cHairs[CID])
		tcc = cHairs[CID]
		
		#init dictionary
		if chrom not in peaks:
			peaks[chrom] = {}
		
		if strand not in peaks[chrom]:
			peaks[chrom][strand] = {}
		
		#create peaks for tcc and add to peak dictionary
		# only the keys matter; the stored value is always 0
		stretch = cgPeaks.stretch(tcc, cName)
		stretch.createPeaks()
		for peakCoord in stretch.peaks:
			peaks[chrom][strand][peakCoord] = 0
	print timer.split()

	print 'finding best combos'
	bestCombos = []
	# NOTE(review): aPass, bPass, cPass, numT, cgFlag and pairStrings are
	# assigned but never read in this function.
	aPass = 0
	bPass = 0
	cPass = 0
	numT = 0
	for CID in peakDict:
		cgFlag = False
		if CID == '538':cgFlag = True
		
		tcc = peakDict[CID][0]
		#print tcc
		tccPeaks = []
		chrom = cg.ss(tcc, ':')[0]
		strand = cg.ss(tcc, ':')[1]
		start = int(cg.ss(tcc, ':')[2])
		end = int(cg.ss(tcc, ':')[3])
		
		#get all peaks
		# peaks on the same chrom/strand that fall inside [start, end]
		for i in range(start, end + 1):
			if i in peaks[chrom][strand]:
				#print '  peak added', i
				tccPeaks.append(i)
		
		#Calculate parameters...
		pairStrings = [] #used to check if pair already added
		peakCombos = []
		for x in tccPeaks:
				
								
				#scan a 30 bp range around this point and find the best roof...
				pRange = 30
				rTcc = cg.makeTcc(chrom, strand, x, x + 1)
				
				#quickly get max value...kinda a long way to do it but whatever
				# NOTE: the local name max shadows the builtin inside this function
				cProfile = stepVectorScan.profileAroundPoint(rTcc, 1, cName, ratio = False)
				xval = cProfile[0]
				max = xval
				highestValueCoord = x
				
				#now make profile for roof...
				# indexed below by offsets 1 - pRange .. pRange - 1
				cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName, ratio = True)
				
				
				
				#now get highest stretch length and the rNext coord.
				# NOTE(review): a run is recorded only when a value <= minVal
				# ends it, so a run reaching the last offset is never recorded.
				minVal = .80
				highest = 0
				stretch = 0
				startCurrent = None
				startFinal = None
				endFinal = None
				for i in range(1 - pRange, pRange):
					if cProfile[i] > minVal:
						stretch += 1
						if startCurrent == None:
							startCurrent = i
					else:
						if stretch > 0:
							if stretch > highest: #stretch ended and was higher than previous
								highest = stretch
								endFinal = i - 1
								startFinal = startCurrent
								startCurrent = None
							else:
								startCurrent = None
						stretch = 0
				
				#get +/- 4 value...
				# NOTE(review): the truthiness test also rejects a roof that
				# starts or ends at offset 0 -- confirm that is intended.
				val = [1.0, 1.0]
				if (startFinal) and (endFinal):
					low = startFinal - 4
					high = endFinal + 4
					if low > (1 - pRange):
						if high < pRange:
							val[0] = float(cProfile[startFinal - 4])
							val[1] = float(cProfile[endFinal + 4])
				
				#fill in other details...
				# 'S' placeholders fill the y, dist and ratio slots
				y = 'S'
				dist = 'S'
				ratio = 'S'
				
				# combo layout: [tcc, x, y, dist, ratio, max, roof length, val]
				peakCombos.append([tcc,x,y,dist,ratio,max,highest,val])
				#print '  ', peakCombos[-1]
		
		#find best combo...
		topCombo = None
		for combo in peakCombos:
			roofLength = combo[6]
			# dropValue is the larger of the two drop values
			dropValue = combo[7][0]
			if combo[7][1] > dropValue:
				dropValue = combo[7][1]
			
			#print roofLength, dropValue
			if 14 < roofLength < 26:
				if 0.0 < dropValue < 0.2:
					#pick the one with roof length nearest 22
					if topCombo:
						if (math.fabs(22 - roofLength)) < (math.fabs(22 - topCombo[6])):
							topCombo = combo
					else:
						topCombo = combo
		
		if topCombo:
			peakDict[CID][1] = topCombo
			bestCombos.append(topCombo)
			print bestCombos[-1]
		else:
			#print 'None'
			pass

	print timer.split()


	#now update predFile (SLOT 13)
	# read every line first, then rewrite the same file with the new lines
	predFile = open(predName, 'r')
	newLines = []
	for line in predFile:
		# field 7 of the line is used as the CID
		CID = cg.ss(line)[7]
		if peakDict[CID][1] == 'None':
			peakInfo = 'None'
		else:
			# last 3 characters of the peak coordinate : 'S' : ratio slot : max : roof length : val
			peakInfo = '%s:%s:%s:%s:%s:%s' % (str(peakDict[CID][1][1])[-3:], 'S', str(peakDict[CID][1][4]).split('.')[0], peakDict[CID][1][5],peakDict[CID][1][6], peakDict[CID][1][7])
		newLines.append(cg.appendToLine(line, peakInfo, 13))
	predFile.close()

	predFile = open(predName, 'w')
	predFile.writelines(newLines)
	predFile.close()
Пример #13
0
def scanVectorsSingleCoord(tccList, cName):
	'''Given a tcc list, scan the per-chromosome wig files and return
	coordinate -> value.

	The organism comes from the config named cName and the wig directory from
	the 'wig<organism>' entry of Main.conf.  For each tcc
	("chrom:strand:start:end") the matching '.index' file is searched for the
	line whose [begin, end) range (fields 1 and 2) contains the tcc start;
	field 0 of that line is used as the byte offset to seek to in the wig
	file.  Wig lines are then read, clipped to the tcc, until one reaches
	past the tcc end.

	Returns one dict for all tccs mapping coordinate -> int value (the text
	before the first '.' in the fourth field of the wig line).

	Raises ValueError when no index line covers a tcc start.
	'''
	conf = c.getConfig(cName)
	org = conf.conf['organism']
	mConf = c.getConfig('Main.conf')
	wigDir = mConf.conf['wig%s' % org]

	coordDict = {} # coordinate: value
	for tcc in tccList:
		theSplit = ss(tcc, ':')
		chrom, strand = theSplit[0], theSplit[1]
		tccStart, tccEnd = int(theSplit[2]), int(theSplit[3])

		#goto correct file, correct line in index
		fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(),strand,chrom)
		fNindex = fN + '.index'

		#get byte offset from the index file
		startByte = None
		with open(fNindex, 'r') as iFile:
			for line in iFile:
				fields = cg.ss(line)
				if int(fields[1]) <= tccStart < int(fields[2]):
					startByte = int(fields[0])
					break

		if startByte is None:
			# previously a placeholder string reached f.seek() and failed
			# with an unrelated-looking TypeError
			raise ValueError('no index entry in %s covers position %s' % (fNindex, tccStart))

		#grab values
		with open(fN, 'r') as f:
			f.seek(startByte, 0)

			for line in f:
				fields = cg.ss(line)
				lBeg = int(fields[1])
				lEnd = int(fields[2])
				lValue = int(fields[3].split('.')[0])

				# clip the wig segment to the requested region
				if tccStart > lBeg:
					lBeg = tccStart
				stop = tccEnd < lEnd
				if stop:
					lEnd = tccEnd

				for i in range(lBeg, lEnd):
					coordDict[i] = lValue

				if stop:
					break

	return coordDict
Пример #14
0
#Puts final hits into clusters.
##Clusters are based on overlapping neighbors: if a hit overlaps a member of a cluster, then it is part of that cluster.
import bioLibCG as cg
import subprocess
import compareData as compare
import cgConfig


#Start Timer
timer = cg.cgTimer()
timer.start()

#Get list of mature tccs
conf = cgConfig.returnConfDict()
# NOTE(review): this hard-coded path is overwritten on the next line and is
# therefore never used.
finalMirFileName = '/u/home8/gxxiao/chrisgre/projects/PipeRuns/LanderHuman/out/LanderHuman-s3k8b17.ALL.FINAL.mirs.tsv'
finalMirFileName = conf['resultsRaw']
matureTccs = compare.tccFileToList(finalMirFileName, 1) # list of all mature micro in tcc
print 'List getting', timer.split()


#make connections dict
# indexed by tcc elsewhere in this code; presumably tcc -> overlapping tccs (verify in compareData)
matureConnections = compare.makeConnectionsDict(matureTccs)
print 'Make connections:', timer.split()

#Now have to define Clusters...
# module-level accumulators; the comment above createClusters below says that
# function relies on a global variable
clusters = []
addedList = []

#I don't think python passes by reference? also I think this function is in the middle because it uses a global variable :P
def createClusters(item = None, mode = None):
		
Пример #15
0
def defineClusters(cName = None):
	'''Group overlapping mature hits into clusters and write two reports.

	The tcc list is read from the file named by conf['resultsRaw'] and
	linked with compare.makeConnectionsDict.  Every group of hits reachable
	from one another through those connections becomes one cluster.

	Files written:
	  conf['sortedClusters'] -- one line per cluster; hits are ordered with
	      cg.sortTccList and each one is followed by a comma.
	  conf['hitsPerFrame']   -- for every cluster and every coordinate from
	      the third field of its first hit up to (not including) the fourth
	      field of its last hit: "rangeTcc<TAB>count", where rangeTcc spans
	      coordinate-100 .. coordinate+100 and count is the length of
	      compare.compareTwoTcc([rangeTcc], cluster, 2).

	cName is passed to cgConfig.getConfig.  Timer splits are printed after
	each stage; nothing is returned.
	'''
	#Start Timer
	timer = cg.cgTimer()
	timer.start()

	#Get list of mature tccs
	conf = cgConfig.getConfig(cName) #passed or default
	finalMirFileName = conf.conf['resultsRaw']
	matureTccs = compare.tccFileToList(finalMirFileName, 1) # list of all mature micro in tcc
	print('List getting %s' % (timer.split(),))

	#make connections dict
	matureConnections = compare.makeConnectionsDict(matureTccs)
	print('Make connections: %s' % (timer.split(),))

	#Now define clusters.  An explicit stack replaces recursion so a long
	#chain of overlapping hits cannot exceed the interpreter's recursion
	#limit, and a set gives constant-time "already added" checks.
	clusters = []
	added = set()
	for tcc in matureTccs:
		if tcc in added:
			continue
		cluster = []
		clusters.append(cluster)
		pending = [tcc]
		while pending:
			item = pending.pop()
			if item in added:
				continue
			added.add(item)
			cluster.append(item)
			# pushed in reverse so neighbours are visited in their listed
			# order, matching a depth-first recursive walk
			pending.extend(reversed(list(matureConnections[item])))

	print('Make Clusters %s' % (timer.split(),))

	#Sort Clusters.
	sortedClusters = [cg.sortTccList(cluster) for cluster in clusters]
	print('Sort Clusters: %s' % (timer.split(),))

	#Output sorted cluster file
	with open(conf.conf['sortedClusters'], 'w') as clusterFile:
		for cluster in sortedClusters:
			for hit in cluster:
				clusterFile.write('%s,' % hit)
			clusterFile.write('\n')

	print('Store intermediate data: %s' % (timer.split(),))

	#output hitsAround file
	frameLength = 200
	frameShift = 1
	halfFrame = frameLength // 2 # floor division keeps coordinates integral
	with open(conf.conf['hitsPerFrame'], 'w') as outputFile:
		for cluster in sortedClusters:
			#grab first and last coordinate from cluster
			firstFields = cluster[0].split(":")
			clusterChrom = firstFields[0]
			clusterStrand = firstFields[1]
			firstCoord = int(firstFields[2])
			lastCoord = int(cluster[-1].split(":")[3])

			startCoord = firstCoord
			while startCoord < lastCoord:
				#count how many hits overlap the frame around this coordinate
				rangeStart = startCoord - halfFrame
				rangeEnd = startCoord + halfFrame
				rangeTcc = '%s:%s:%s:%s' % (clusterChrom, clusterStrand, rangeStart, rangeEnd)
				overlappedList = compare.compareTwoTcc([rangeTcc], cluster, 2)
				hitCount = len(overlappedList)

				outputFile.write('%s\t%s\n' % (rangeTcc, hitCount))
				startCoord = startCoord + frameShift

	print('Output Hits per Frame: %s' % (timer.split(),))
	print('Overall Time: %s' % (timer.report(),))
Пример #16
0
def findPeaks(pType, cName=None):
    '''Choose the best expression peak for each hairpin and store it in the
    sorted results file.

    pType selects the results file: 'E' reads conf['resultsExonsSorted'],
    anything else reads conf['resultsIntronsSorted'].  cName is passed to
    c.getConfig and on to the peak/profile helpers.

    For every hairpin returned by getHairpins.getHairpins, each peak inside
    the hairpin gets a "roof" (the longest run of profile values above 0.80
    in the +/-30 window around the peak) and a drop value taken 4 positions
    beyond each end of the roof.  The peak whose roof length is between 14
    and 26 (exclusive), whose larger drop value is between 0.0 and 0.2
    (exclusive) and whose roof length is nearest 22 is kept.

    The results file is then rewritten in place: each line gets the peak
    info (or 'None') added through cg.appendToLine(line, peakInfo, 13).
    Progress and timer splits are printed; nothing is returned.
    '''

    #init
    # NOTE(review): mConf is never used below.
    mConf = c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    if pType == 'E':
        predName = conf.conf['resultsExonsSorted']
    else:
        predName = conf.conf['resultsIntronsSorted']

    print predName
    #make CID:hairpin:peak dictionary
    # peakDict[CID] = [hairpin tcc, best combo or the string 'None']
    cHairs = getHairpins.getHairpins(predName)
    peakDict = {}
    for CID in cHairs:
        peakDict[CID] = [cHairs[CID], 'None']

    timer = cg.cgTimer()
    timer.start()

    #put peaks in memory
    print 'Creating peak data'
    peaks = {}  # chr:peak:value
    for CID in cHairs:
        chrom, strand, start, end = cg.tccSplit(cHairs[CID])
        tcc = cHairs[CID]

        #init dictionary
        if chrom not in peaks:
            peaks[chrom] = {}

        if strand not in peaks[chrom]:
            peaks[chrom][strand] = {}

        #create peaks for tcc and add to peak dictionary
        # only the keys matter; the stored value is always 0
        stretch = cgPeaks.stretch(tcc, cName)
        stretch.createPeaks()
        for peakCoord in stretch.peaks:
            peaks[chrom][strand][peakCoord] = 0
    print timer.split()

    print 'finding best combos'
    bestCombos = []
    # NOTE(review): aPass, bPass, cPass, numT, cgFlag and pairStrings are
    # assigned but never read in this function.
    aPass = 0
    bPass = 0
    cPass = 0
    numT = 0
    for CID in peakDict:
        cgFlag = False
        if CID == '538': cgFlag = True

        tcc = peakDict[CID][0]
        #print tcc
        tccPeaks = []
        chrom = cg.ss(tcc, ':')[0]
        strand = cg.ss(tcc, ':')[1]
        start = int(cg.ss(tcc, ':')[2])
        end = int(cg.ss(tcc, ':')[3])

        #get all peaks
        # peaks on the same chrom/strand that fall inside [start, end]
        for i in range(start, end + 1):
            if i in peaks[chrom][strand]:
                #print '  peak added', i
                tccPeaks.append(i)

        #Calculate parameters...
        pairStrings = []  #used to check if pair already added
        peakCombos = []
        for x in tccPeaks:

            #scan a 30 bp range around this point and find the best roof...
            pRange = 30
            rTcc = cg.makeTcc(chrom, strand, x, x + 1)

            #quickly get max value...kinda a long way to do it but whatever
            # NOTE: the local name max shadows the builtin inside this function
            cProfile = stepVectorScan.profileAroundPoint(rTcc,
                                                         1,
                                                         cName,
                                                         ratio=False)
            xval = cProfile[0]
            max = xval
            highestValueCoord = x

            #now make profile for roof...
            # indexed below by offsets 1 - pRange .. pRange - 1
            cProfile = stepVectorScan.profileAroundPoint(rTcc,
                                                         pRange,
                                                         cName,
                                                         ratio=True)

            #now get highest stretch length and the rNext coord.
            # NOTE(review): a run is recorded only when a value <= minVal
            # ends it, so a run reaching the last offset is never recorded.
            minVal = .80
            highest = 0
            stretch = 0
            startCurrent = None
            startFinal = None
            endFinal = None
            for i in range(1 - pRange, pRange):
                if cProfile[i] > minVal:
                    stretch += 1
                    if startCurrent == None:
                        startCurrent = i
                else:
                    if stretch > 0:
                        if stretch > highest:  #stretch ended and was higher than previous
                            highest = stretch
                            endFinal = i - 1
                            startFinal = startCurrent
                            startCurrent = None
                        else:
                            startCurrent = None
                    stretch = 0

            #get +/- 4 value...
            # NOTE(review): the truthiness test also rejects a roof that
            # starts or ends at offset 0 -- confirm that is intended.
            val = [1.0, 1.0]
            if (startFinal) and (endFinal):
                low = startFinal - 4
                high = endFinal + 4
                if low > (1 - pRange):
                    if high < pRange:
                        val[0] = float(cProfile[startFinal - 4])
                        val[1] = float(cProfile[endFinal + 4])

            #fill in other details...
            # 'S' placeholders fill the y, dist and ratio slots
            y = 'S'
            dist = 'S'
            ratio = 'S'

            # combo layout: [tcc, x, y, dist, ratio, max, roof length, val]
            peakCombos.append([tcc, x, y, dist, ratio, max, highest, val])
            #print '  ', peakCombos[-1]

        #find best combo...
        topCombo = None
        for combo in peakCombos:
            roofLength = combo[6]
            # dropValue is the larger of the two drop values
            dropValue = combo[7][0]
            if combo[7][1] > dropValue:
                dropValue = combo[7][1]

            #print roofLength, dropValue
            if 14 < roofLength < 26:
                if 0.0 < dropValue < 0.2:
                    #pick the one with roof length nearest 22
                    if topCombo:
                        if (math.fabs(22 - roofLength)) < (
                                math.fabs(22 - topCombo[6])):
                            topCombo = combo
                    else:
                        topCombo = combo

        if topCombo:
            peakDict[CID][1] = topCombo
            bestCombos.append(topCombo)
            print bestCombos[-1]
        else:
            #print 'None'
            pass

    print timer.split()

    #now update predFile (SLOT 13)
    # read every line first, then rewrite the same file with the new lines
    predFile = open(predName, 'r')
    newLines = []
    for line in predFile:
        # field 7 of the line is used as the CID
        CID = cg.ss(line)[7]
        if peakDict[CID][1] == 'None':
            peakInfo = 'None'
        else:
            # last 3 characters of the peak coordinate : 'S' : ratio slot : max : roof length : val
            peakInfo = '%s:%s:%s:%s:%s:%s' % (
                str(peakDict[CID][1][1])[-3:], 'S', str(
                    peakDict[CID][1][4]).split('.')[0], peakDict[CID][1][5],
                peakDict[CID][1][6], peakDict[CID][1][7])
        newLines.append(cg.appendToLine(line, peakInfo, 13))
    predFile.close()

    predFile = open(predName, 'w')
    predFile.writelines(newLines)
    predFile.close()
Пример #17
0
def defineClusters(cName=None):
    #Start Timer
    timer = cg.cgTimer()
    timer.start()

    #Get list of mature tccs
    conf = cgConfig.getConfig(cName)  #passed or default
    finalMirFileName = conf.conf['resultsRaw']
    matureTccs = compare.tccFileToList(finalMirFileName,
                                       1)  # list of all mature micro in tcc
    print 'List getting', timer.split()

    #make connections dict
    matureConnections = compare.makeConnectionsDict(matureTccs)
    print 'Make connections:', timer.split()

    #Now have to define Clusters...
    clusters = []
    addedList = []

    #I don't think python passes by reference? also I think this function is in the middle because it uses a global variable :P
    def createClusters(item=None, mode=None):

        if item in addedList:
            return 0
        elif mode == "top":
            clusters.append([item])
            addedList.append(
                item)  ##creates new cluster with the item already stored in it
            for connectedItem in matureConnections[item]:
                createClusters(connectedItem, "neighbor")
        elif mode == "neighbor":
            clusters[-1].append(
                item)  #add this item to the last cluster created
            addedList.append(item)
            for connectedItem in matureConnections[item]:
                createClusters(connectedItem, "neighbor")

    for tcc in matureTccs:
        createClusters(tcc, "top")

    print 'Make Clusters', timer.split()

    #Sort Clusters.
    sortedClusters = []

    for cluster in clusters:
        sortedClusters.append(cg.sortTccList(cluster))

    print 'Sort Clusters:', timer.split()

    #Output sorted cluster file
    clusterFileName = conf.conf['sortedClusters']
    clusterFile = open(clusterFileName, 'w')
    for cluster in sortedClusters:
        for hit in cluster:
            clusterFile.write('%s,' % hit)
        clusterFile.write('\n')
    clusterFile.close()
    '''
	#re-create sortedClusters list:
	clusterFileName = 'sortedClusters.data'
	clusterFile = open(clusterFileName, 'r')
	sortedClusters = []
	
	
	for line in clusterFile:
		sortedClusters.append([])
		line = line.strip()[0:-1] #take off last comma ;P
		for hit in (line.strip().split(',')):
			sortedClusters[-1].append(hit)
	'''

    print 'Store intermediate data:', timer.split()

    #output hitsAround file
    outputFile = open(conf.conf['hitsPerFrame'], 'w')

    frameLength = 200
    frameShift = 1
    for cluster in sortedClusters:
        #grab first and last coordinate from cluster, for each cluster deduce how many theoretical microRNAs were in hitScope
        clusterChrom = cluster[0].split(":")[0]
        clusterStrand = cluster[0].split(":")[1]
        firstCoord = int(cluster[0].split(":")[2])
        #print cluster[-1]
        lastCoord = int(cluster[-1].split(":")[3])

        startCoord = firstCoord
        while startCoord < lastCoord:
            #count how many hits there are in this range
            rangeStart = startCoord - (frameLength / 2)
            rangeEnd = startCoord + (frameLength / 2)
            rangeTcc = '%s:%s:%s:%s' % (clusterChrom, clusterStrand,
                                        rangeStart, rangeEnd)
            overlappedList = compare.compareTwoTcc([rangeTcc], cluster, 2)
            hitCount = len(overlappedList)

            #output
            outputFile.write('%s\t%s\n' % (rangeTcc, hitCount))
            startCoord = startCoord + frameShift  #check overlap with range
    outputFile.close()

    print 'Output Hits per Frame:', timer.split()
    print 'Overall Time:', timer.report()
Пример #18
0
import bioLibCG

import sys
fN = sys.argv[1]

timer = bioLibCG.cgTimer()
timer.start()

loadTime = 0.0
splitTime = 0.0
f = open(fN, 'r')
for line in f:
        loadTime += timer.split()

        a = line.strip().split('\t')
        b = int(a[0])
        splitTime += timer.split()

print loadTime
print splitTime
Пример #19
0
def scanVectorsHist(tccList, cName):
	'''Given tcc list --> scan wig files and get histogram values.

	For every tcc ('chrom:strand:start:end') the matching per-chromosome wig
	file is opened, positioned with the help of its '.index' file, and the
	integer part of each wig value is recorded once per position from the
	tcc start up to (not including) the tcc end, as far as the wig lines
	cover that span.

	tccList -- iterable of tcc strings
	cName   -- configuration name passed to c.getConfig; its 'organism'
	           entry selects the wig directory ('wig<organism>') from
	           'Main.conf'

	Returns a dict {tcc: [int, ...]}.  A tcc for which no position was
	covered has no entry in the dict.

	Raises ValueError when the index file has no entry whose range contains
	the tcc start (previously this surfaced as a TypeError from seek()).

	NOTE: this relies on the pre-built '.index' files being present and in
	sync with the wig files.
	'''

	conf = c.getConfig(cName)
	org = conf.conf['organism']
	mConf = c.getConfig('Main.conf')
	wigDir = mConf.conf['wig%s' % org]

	histDict = {} # tcc: [list values]
	for tcc in tccList:
		theSplit = ss(tcc, ':')
		chrom, strand = theSplit[0], theSplit[1]
		tccStart, tccEnd = int(theSplit[2]), int(theSplit[3])

		#goto correct file, correct line in index
		fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(), strand, chrom)
		fNindex = fN + '.index'

		#get line in index file
		# index fields as used here: [0] byte offset into the wig file,
		# [1] begin and [2] end of the coordinate range stored at that offset
		startByte = None
		with open(fNindex, 'r') as iFile:
			for line in iFile:
				fields = cg.ss(line)
				beg, end = int(fields[1]), int(fields[2])
				if beg <= tccStart < end:
					startByte = int(fields[0])
					break

		if startByte is None:
			# fail clearly instead of handing a placeholder to seek()
			raise ValueError('no index entry in %s covers position %s (tcc %s)'
					% (fNindex, tccStart, tcc))

		#grab value
		with open(fN, 'r') as f:
			f.seek(startByte, 0)

			for line in f:
				# wig fields as used here: [1] begin, [2] end, [3] value
				fields = cg.ss(line)
				lBeg = max(int(fields[1]), tccStart)  # clip to tcc start
				lEnd = int(fields[2])
				lValue = int(fields[3].split('.')[0])  # integer part only

				# a line reaching past the tcc end is the last one needed
				stop = tccEnd < lEnd
				if stop:
					lEnd = tccEnd

				# one value per covered position; create the entry only
				# when there is at least one position to record
				nPositions = lEnd - lBeg
				if nPositions > 0:
					histDict.setdefault(tcc, []).extend([lValue] * nPositions)
				if stop:
					break

	return histDict