def splitExonsIntrons(cName=None):
    """Split the sorted prediction file into exon and intron predictions.

    A hairpin is treated as exonic when compare.compareTwoTcc reports an
    overlap of at least ``minOverlap`` (50) with an entry of
    '<organism>Exons.tcc'.  Lines of the configured 'resultsSorted' file whose
    8th tab-separated field is the CID of such a hairpin are copied to
    '<resultsSorted>.exons'; all remaining lines go to
    '<resultsSorted>.introns'.

    cName -- configuration name handed to c.getConfig.
    """
    # NOTE(review): the returned main config is not used in this function; the
    # call is kept in case loading it has side effects -- confirm and drop.
    c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    organism = conf.conf['organism']
    minOverlap = 50
    cHairs = getHairpins.getHairpins()  # CID: HAIRPIN
    exonList = compare.tccFileToList('%sExons.tcc' % organism, 0)
    hairpins = [cHairs[CID] for CID in cHairs]

    print('checking overlaps')
    # (tcc, overlap amount) pairs for hairpins that overlap an exon
    exonOverlapped = compare.compareTwoTcc(hairpins, exonList, 1, amount=True)
    print('  %s' % len(exonOverlapped))

    print('removing partial introns')
    # Build a filtered list instead of calling list.remove() with re-created
    # [tcc, amount] items: this works whether the pairs are lists or tuples
    # and avoids a quadratic removal pass.
    exonOverlapped = [(tcc, oAmount) for tcc, oAmount in exonOverlapped
                      if oAmount >= minOverlap]
    print('  %s out of %s' % (len(exonOverlapped), len(cHairs)))

    # CIDs (as strings) of every hairpin whose tcc survived the filter.
    # assumes tcc values are hashable (strings) -- TODO confirm
    exonTccs = set(tcc for tcc, oAmount in exonOverlapped)
    exonCIDs = set(str(CID) for CID in cHairs if cHairs[CID] in exonTccs)

    # Route each prediction line by its CID (8th tab-separated field); the
    # with-blocks guarantee all three files are closed even on error.
    sortedName = conf.conf['resultsSorted']
    with open(sortedName, 'r') as predFile:
        with open(sortedName + '.exons', 'w') as exonFile:
            with open(sortedName + '.introns', 'w') as intronFile:
                for line in predFile:
                    if line.split('\t')[7] in exonCIDs:
                        exonFile.write(line)
                    else:
                        intronFile.write(line)
def splitExonsIntrons(cName=None):
    """Split the sorted prediction file into exon and intron predictions.

    A hairpin is treated as exonic when compare.compareTwoTcc reports an
    overlap of at least ``minOverlap`` (50) with an entry of
    '<organism>Exons.tcc'.  Lines of the configured 'resultsSorted' file whose
    8th tab-separated field is the CID of such a hairpin are copied to
    '<resultsSorted>.exons'; all remaining lines go to
    '<resultsSorted>.introns'.

    cName -- configuration name handed to c.getConfig.
    """
    # NOTE(review): the returned main config is not used in this function; the
    # call is kept in case loading it has side effects -- confirm and drop.
    c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    organism = conf.conf['organism']
    minOverlap = 50
    cHairs = getHairpins.getHairpins()  # CID: HAIRPIN
    exonList = compare.tccFileToList('%sExons.tcc' % organism, 0)
    hairpins = [cHairs[CID] for CID in cHairs]

    print('checking overlaps')
    # (tcc, overlap amount) pairs for hairpins that overlap an exon
    exonOverlapped = compare.compareTwoTcc(hairpins, exonList, 1, amount=True)
    print('  %s' % len(exonOverlapped))

    print('removing partial introns')
    # Build a filtered list instead of calling list.remove() with re-created
    # [tcc, amount] items: this works whether the pairs are lists or tuples
    # and avoids a quadratic removal pass.
    exonOverlapped = [(tcc, oAmount) for tcc, oAmount in exonOverlapped
                      if oAmount >= minOverlap]
    print('  %s out of %s' % (len(exonOverlapped), len(cHairs)))

    # CIDs (as strings) of every hairpin whose tcc survived the filter.
    # assumes tcc values are hashable (strings) -- TODO confirm
    exonTccs = set(tcc for tcc, oAmount in exonOverlapped)
    exonCIDs = set(str(CID) for CID in cHairs if cHairs[CID] in exonTccs)

    # Route each prediction line by its CID (8th tab-separated field); the
    # with-blocks guarantee all three files are closed even on error.
    sortedName = conf.conf['resultsSorted']
    with open(sortedName, 'r') as predFile:
        with open(sortedName + '.exons', 'w') as exonFile:
            with open(sortedName + '.introns', 'w') as intronFile:
                for line in predFile:
                    if line.split('\t')[7] in exonCIDs:
                        exonFile.write(line)
                    else:
                        intronFile.write(line)
def plotResults(fN, cName=None):
    """Call cgPlot.plotASProfile for every hairpin listed in ``fN``.

    The directory returned by cg.getBaseFileName(fN) is cleared with
    cg.clearDirectory and made the working directory while plotting; the
    previous working directory is restored afterwards, even if a plot fails.

    fN    -- prediction file read by getHairpins.getHairpins.
    cName -- configuration name passed through to cgPlot.plotASProfile.
    """
    cHairs = getHairpins.getHairpins(fN)  # CID: HAIRPIN
    directory = cg.getBaseFileName(fN)
    cg.clearDirectory(directory)

    # change the directory before plotting
    cwd = os.getcwd()
    os.chdir(directory)
    try:
        for CID in cHairs:
            print('plotting: %s' % (CID,))
            cgPlot.plotASProfile(cHairs[CID], cName)
    finally:
        # never leave the process stranded in the plot directory
        os.chdir(cwd)
def plotResults(fN, cName=None):
    """Call cgPlot.plotASProfile for every hairpin listed in ``fN``.

    The directory returned by cg.getBaseFileName(fN) is cleared with
    cg.clearDirectory and made the working directory while plotting; the
    previous working directory is restored afterwards, even if a plot fails.

    fN    -- prediction file read by getHairpins.getHairpins.
    cName -- configuration name passed through to cgPlot.plotASProfile.
    """
    cHairs = getHairpins.getHairpins(fN)  # CID: HAIRPIN
    directory = cg.getBaseFileName(fN)
    cg.clearDirectory(directory)

    # change the directory before plotting
    cwd = os.getcwd()
    os.chdir(directory)
    try:
        for CID in cHairs:
            print('plotting: %s' % (CID,))
            cgPlot.plotASProfile(cHairs[CID], cName)
    finally:
        # never leave the process stranded in the plot directory
        os.chdir(cwd)
import getHairpins
import cgGenes
import cgConfig as c
import bioLibCG as cg

# Script: tag every hairpin of the prediction file with the ``type`` of the
# first gene it overlaps ("NONE" when nothing overlaps) and build the updated
# prediction lines.
mConf = c.getConfig('Main.conf')
geneSetFolder = mConf.conf['geneSetsHuman']
fN = '/home/chrisgre/projects/NoncodingHuman/results/NChuman-s3k8b17.results.sorted.introns.sorted'
cHairs = getHairpins.getHairpins(fN)
ensGenes = cgGenes.createGeneSetFromFile(geneSetFolder + '/ensemblAllTranscripts.tsv')

# CID -> gene description
cDesc = {}
for CID in cHairs:
    tcc = cHairs[CID]
    cDesc[CID] = "NONE"
    overlappingGenes = ensGenes.geneOverlaps([tcc])
    if not len(overlappingGenes) > 0:
        continue
    # only the first overlapping gene is reported
    print(overlappingGenes[0].type)
    cDesc[CID] = overlappingGenes[0].type

# The CID is the 8th tab-separated field of each prediction line; the
# description is handed to cg.appendToLine with position argument 16.
# NOTE(review): newLines is not written anywhere in the visible code.
newLines = []
f = open(fN, 'r')
for line in f:
    CID = line.strip().split('\t')[7]
    newLines.append(cg.appendToLine(line, cDesc[CID], 16))
f.close()
#given tcc, return best peak combo import bioLibCG as cg import cgConfig as c import wigValue import compareData as compare import getHairpins import math #init mConf = c.cgConfig('Main.conf') conf = c.cgConfig() predName = conf.conf['resultsExonsSorted'] #make CID:hairpin:peak dictionary cHairs = getHairpins.getHairpins(predName) peakDict = {} for CID in cHairs: peakDict[CID] = [cHairs[CID],'None'] timer = cg.cgTimer() timer.start() #put peaks in memory print 'loading peak data' peakFilesNames = cg.recurseDir(mConf.conf['wigMouse'], end = '.peaks') peaks = {} # chr:peak:value for pN in peakFilesNames: chrom = pN.strip().split('.')[4] strand = pN.strip().split('.')[2]
def findPeaks(pType, cName=None):
    """Pick the best peak for each predicted hairpin and record it in place.

    For every hairpin of the selected prediction file, candidate peak
    coordinates come from cgPeaks.stretch.  Around each candidate a ratio
    profile of +/- ``pRange`` (30) offsets is scanned for the longest run of
    values above ``minVal`` (0.80), called the roof.  A candidate qualifies
    when its roof length is strictly between 14 and 26 and the larger of the
    two values found 4 offsets outside the roof is strictly between 0.0 and
    0.2; among qualifying candidates the one whose roof length is closest to
    22 wins (first one wins ties).

    The prediction file is then rewritten: each line gets either 'None' or a
    ':'-joined summary of the winning combo passed to cg.appendToLine with
    position argument 13.

    pType -- 'E' selects conf 'resultsExonsSorted'; any other value selects
             conf 'resultsIntronsSorted'.
    cName -- configuration name passed to c.getConfig, cgPeaks.stretch and
             stepVectorScan.profileAroundPoint.
    """
    # NOTE(review): the returned main config is not used in this function; the
    # call is kept in case loading it has side effects -- confirm and drop.
    c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    if pType == 'E':
        predName = conf.conf['resultsExonsSorted']
    else:
        predName = conf.conf['resultsIntronsSorted']
    print(predName)

    # CID -> [hairpin tcc, best combo or the string 'None']
    cHairs = getHairpins.getHairpins(predName)
    peakDict = {}
    for CID in cHairs:
        peakDict[CID] = [cHairs[CID], 'None']

    timer = cg.cgTimer()
    timer.start()

    # put peaks in memory: chrom -> strand -> {peak coordinate: 0}
    print('Creating peak data')
    peaks = {}
    for CID in cHairs:
        tcc = cHairs[CID]
        chrom, strand, start, end = cg.tccSplit(tcc)
        strandPeaks = peaks.setdefault(chrom, {}).setdefault(strand, {})

        # create peaks for tcc and add to peak dictionary
        peakFinder = cgPeaks.stretch(tcc, cName)
        peakFinder.createPeaks()
        for peakCoord in peakFinder.peaks:
            strandPeaks[peakCoord] = 0
    print(timer.split())

    print('finding best combos')
    pRange = 30   # offsets scanned on each side of a peak
    minVal = .80  # ratio a position must exceed to belong to the roof
    for CID in peakDict:
        tcc = peakDict[CID][0]
        tccParts = cg.ss(tcc, ':')
        chrom = tccParts[0]
        strand = tccParts[1]
        start = int(tccParts[2])
        end = int(tccParts[3])

        # all known peak coordinates inside this hairpin (inclusive)
        tccPeaks = [i for i in range(start, end + 1)
                    if i in peaks[chrom][strand]]

        # combo layout:
        # [tcc, peak coord, y, dist, ratio, value at peak, roof length, flanks]
        peakCombos = []
        for x in tccPeaks:
            rTcc = cg.makeTcc(chrom, strand, x, x + 1)

            # value at the peak itself (non-ratio profile, range 1)
            cProfile = stepVectorScan.profileAroundPoint(rTcc, 1, cName,
                                                         ratio=False)
            peakValue = cProfile[0]

            # ratio profile used to measure the roof
            cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName,
                                                         ratio=True)

            # Longest run of offsets whose ratio exceeds minVal.  A run is
            # only recorded when a value at or below minVal ends it, so a run
            # still open at the last offset is ignored (as before).
            highest = 0
            runLength = 0
            startCurrent = None
            startFinal = None
            endFinal = None
            for i in range(1 - pRange, pRange):
                if cProfile[i] > minVal:
                    runLength += 1
                    if startCurrent is None:
                        startCurrent = i
                else:
                    if runLength > highest:
                        # run ended and beat the previous best
                        highest = runLength
                        endFinal = i - 1
                        startFinal = startCurrent
                    startCurrent = None
                    runLength = 0

            # Flank values 4 offsets outside the roof; they stay at 1.0 when
            # no roof was found or a flank would leave the scanned window.
            val = [1.0, 1.0]
            # Compare with None explicitly: offset 0 is a legitimate roof
            # boundary and must not be mistaken for "no roof found".
            if startFinal is not None and endFinal is not None:
                low = startFinal - 4
                high = endFinal + 4
                if low > (1 - pRange) and high < pRange:
                    val[0] = float(cProfile[low])
                    val[1] = float(cProfile[high])

            # y, dist and ratio are fixed 'S' placeholders
            peakCombos.append([tcc, x, 'S', 'S', 'S', peakValue, highest, val])

        # find best combo: qualifying roof whose length is nearest to 22
        topCombo = None
        for combo in peakCombos:
            roofLength = combo[6]
            dropValue = max(combo[7])
            if 14 < roofLength < 26 and 0.0 < dropValue < 0.2:
                if (topCombo is None or
                        math.fabs(22 - roofLength) < math.fabs(22 - topCombo[6])):
                    topCombo = combo

        if topCombo:
            peakDict[CID][1] = topCombo
            print(topCombo)

    print(timer.split())

    # now update predFile (SLOT 13): read everything first, then overwrite
    newLines = []
    with open(predName, 'r') as predFile:
        for line in predFile:
            CID = cg.ss(line)[7]
            best = peakDict[CID][1]
            if best == 'None':
                peakInfo = 'None'
            else:
                # last three characters of the peak coordinate, 'S', the
                # ratio placeholder, value at peak, roof length, flank values
                peakInfo = '%s:%s:%s:%s:%s:%s' % (
                    str(best[1])[-3:], 'S', str(best[4]).split('.')[0],
                    best[5], best[6], best[7])
            newLines.append(cg.appendToLine(line, peakInfo, 13))

    with open(predName, 'w') as predFile:
        predFile.writelines(newLines)
#given tcc, return best peak combo import bioLibCG as cg import cgConfig as c import wigValue import compareData as compare import getHairpins import math #init mConf = c.cgConfig('Main.conf') conf = c.cgConfig() predName = conf.conf['resultsExonsSorted'] #make CID:hairpin:peak dictionary cHairs = getHairpins.getHairpins(predName) peakDict = {} for CID in cHairs: peakDict[CID] = [cHairs[CID], 'None'] timer = cg.cgTimer() timer.start() #put peaks in memory print 'loading peak data' peakFilesNames = cg.recurseDir(mConf.conf['wigMouse'], end='.peaks') peaks = {} # chr:peak:value for pN in peakFilesNames: chrom = pN.strip().split('.')[4] strand = pN.strip().split('.')[2] #init dictionary
def intronNoisy(cName=None):
    """Write the levels found around each intronic hairpin to a table.

    For every hairpin of conf 'resultsIntrons', two flanking ranges of
    ``slide`` (1000) coordinates are built, one ending at the hairpin start
    and one beginning at its end.  Everything covered by a (collapsed)
    prediction or by a (collapsed) exon of '<organism>Exons.tcc' is
    subtracted, and the values returned by stepVectorScan.scanVectorsHist for
    what remains are collected per CID.

    Output goes to conf 'intronNoiseData' as tab-separated text: a header row
    of the sorted CIDs followed by one column of levels per CID, padded with
    'NA' where a column is shorter than the longest one.

    cName -- configuration name passed to c.getConfig and scanVectorsHist.
    """
    # NOTE(review): the returned main config is not used in this function; the
    # call is kept in case loading it has side effects -- confirm and drop.
    c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    cHairs = getHairpins.getHairpins(conf.conf['resultsIntrons'])  # CID: HAIRPIN
    organism = conf.conf['organism']
    exonList = compare.tccFileToList('%sExons.tcc' % organism, 0)
    slide = 1000

    predList = [cHairs[CID] for CID in cHairs]

    # collapse overlaps
    print(' collapsing predictions')
    predList = compare.collapseOverlaps(predList)
    print(' collapsing exons')
    exonList = compare.collapseOverlaps(exonList)

    # collect levels for each hairpin region
    cidLevels = {}
    for CID in cHairs:
        print(CID)
        hPin = cHairs[CID]
        # Split once through cg.ss, the helper the rest of this file uses,
        # instead of a bare ``ss`` name called four times.
        hPinParts = cg.ss(hPin, ':')
        chrom = hPinParts[0]
        strand = hPinParts[1]
        start = int(hPinParts[2])
        end = int(hPinParts[3])

        # ranges immediately before and after the hairpin
        scanRange = [
            '%s:%s:%s:%s' % (chrom, strand, start - slide, start),
            '%s:%s:%s:%s' % (chrom, strand, end, end + slide),
        ]
        print(scanRange)

        scanRange = compare.subtractTwoTccLists(scanRange, predList)
        scanRange = compare.subtractTwoTccLists(scanRange, exonList)

        print(' Retrieving Expression levels: %s'
              % (cg.getTccListTotalLength(scanRange),))
        levels = []
        hPinLevels = stepVectorScan.scanVectorsHist(scanRange, cName)
        for region in hPinLevels:
            levels.extend(hPinLevels[region])
        cidLevels[CID] = levels

    # output levels to file: one column per CID, rows padded with 'NA'
    longest = max([len(cidLevels[CID]) for CID in cidLevels] or [0])
    sortedKeys = sorted(cidLevels)

    newLines = []
    for j in range(longest):
        newLine = []
        for CID in sortedKeys:
            if len(cidLevels[CID]) > j:
                newLine.append(str(cidLevels[CID][j]))
            else:
                newLine.append('NA')
        newLines.append('\t'.join(newLine) + '\n')

    with open(conf.conf['intronNoiseData'], 'w') as outFile:
        outFile.write('\t'.join(sortedKeys) + '\n')
        outFile.writelines(newLines)
def findPeaks(pType, cName=None):
    """Pick the best peak for each predicted hairpin and record it in place.

    For every hairpin of the selected prediction file, candidate peak
    coordinates come from cgPeaks.stretch.  Around each candidate a ratio
    profile of +/- ``pRange`` (30) offsets is scanned for the longest run of
    values above ``minVal`` (0.80), called the roof.  A candidate qualifies
    when its roof length is strictly between 14 and 26 and the larger of the
    two values found 4 offsets outside the roof is strictly between 0.0 and
    0.2; among qualifying candidates the one whose roof length is closest to
    22 wins (first one wins ties).

    The prediction file is then rewritten: each line gets either 'None' or a
    ':'-joined summary of the winning combo passed to cg.appendToLine with
    position argument 13.

    pType -- 'E' selects conf 'resultsExonsSorted'; any other value selects
             conf 'resultsIntronsSorted'.
    cName -- configuration name passed to c.getConfig, cgPeaks.stretch and
             stepVectorScan.profileAroundPoint.
    """
    # NOTE(review): the returned main config is not used in this function; the
    # call is kept in case loading it has side effects -- confirm and drop.
    c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    if pType == 'E':
        predName = conf.conf['resultsExonsSorted']
    else:
        predName = conf.conf['resultsIntronsSorted']
    print(predName)

    # CID -> [hairpin tcc, best combo or the string 'None']
    cHairs = getHairpins.getHairpins(predName)
    peakDict = {}
    for CID in cHairs:
        peakDict[CID] = [cHairs[CID], 'None']

    timer = cg.cgTimer()
    timer.start()

    # put peaks in memory: chrom -> strand -> {peak coordinate: 0}
    print('Creating peak data')
    peaks = {}
    for CID in cHairs:
        tcc = cHairs[CID]
        chrom, strand, start, end = cg.tccSplit(tcc)
        strandPeaks = peaks.setdefault(chrom, {}).setdefault(strand, {})

        # create peaks for tcc and add to peak dictionary
        peakFinder = cgPeaks.stretch(tcc, cName)
        peakFinder.createPeaks()
        for peakCoord in peakFinder.peaks:
            strandPeaks[peakCoord] = 0
    print(timer.split())

    print('finding best combos')
    pRange = 30   # offsets scanned on each side of a peak
    minVal = .80  # ratio a position must exceed to belong to the roof
    for CID in peakDict:
        tcc = peakDict[CID][0]
        tccParts = cg.ss(tcc, ':')
        chrom = tccParts[0]
        strand = tccParts[1]
        start = int(tccParts[2])
        end = int(tccParts[3])

        # all known peak coordinates inside this hairpin (inclusive)
        tccPeaks = [i for i in range(start, end + 1)
                    if i in peaks[chrom][strand]]

        # combo layout:
        # [tcc, peak coord, y, dist, ratio, value at peak, roof length, flanks]
        peakCombos = []
        for x in tccPeaks:
            rTcc = cg.makeTcc(chrom, strand, x, x + 1)

            # value at the peak itself (non-ratio profile, range 1)
            cProfile = stepVectorScan.profileAroundPoint(rTcc, 1, cName,
                                                         ratio=False)
            peakValue = cProfile[0]

            # ratio profile used to measure the roof
            cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName,
                                                         ratio=True)

            # Longest run of offsets whose ratio exceeds minVal.  A run is
            # only recorded when a value at or below minVal ends it, so a run
            # still open at the last offset is ignored (as before).
            highest = 0
            runLength = 0
            startCurrent = None
            startFinal = None
            endFinal = None
            for i in range(1 - pRange, pRange):
                if cProfile[i] > minVal:
                    runLength += 1
                    if startCurrent is None:
                        startCurrent = i
                else:
                    if runLength > highest:
                        # run ended and beat the previous best
                        highest = runLength
                        endFinal = i - 1
                        startFinal = startCurrent
                    startCurrent = None
                    runLength = 0

            # Flank values 4 offsets outside the roof; they stay at 1.0 when
            # no roof was found or a flank would leave the scanned window.
            val = [1.0, 1.0]
            # Compare with None explicitly: offset 0 is a legitimate roof
            # boundary and must not be mistaken for "no roof found".
            if startFinal is not None and endFinal is not None:
                low = startFinal - 4
                high = endFinal + 4
                if low > (1 - pRange) and high < pRange:
                    val[0] = float(cProfile[low])
                    val[1] = float(cProfile[high])

            # y, dist and ratio are fixed 'S' placeholders
            peakCombos.append([tcc, x, 'S', 'S', 'S', peakValue, highest, val])

        # find best combo: qualifying roof whose length is nearest to 22
        topCombo = None
        for combo in peakCombos:
            roofLength = combo[6]
            dropValue = max(combo[7])
            if 14 < roofLength < 26 and 0.0 < dropValue < 0.2:
                if (topCombo is None or
                        math.fabs(22 - roofLength) < math.fabs(22 - topCombo[6])):
                    topCombo = combo

        if topCombo:
            peakDict[CID][1] = topCombo
            print(topCombo)

    print(timer.split())

    # now update predFile (SLOT 13): read everything first, then overwrite
    newLines = []
    with open(predName, 'r') as predFile:
        for line in predFile:
            CID = cg.ss(line)[7]
            best = peakDict[CID][1]
            if best == 'None':
                peakInfo = 'None'
            else:
                # last three characters of the peak coordinate, 'S', the
                # ratio placeholder, value at peak, roof length, flank values
                peakInfo = '%s:%s:%s:%s:%s:%s' % (
                    str(best[1])[-3:], 'S', str(best[4]).split('.')[0],
                    best[5], best[6], best[7])
            newLines.append(cg.appendToLine(line, peakInfo, 13))

    with open(predName, 'w') as predFile:
        predFile.writelines(newLines)
def exonNoisy(cName=None):
    """Write the levels of the genes overlapping each exonic hairpin.

    For every hairpin of conf 'resultsExons', the genes of
    '<geneSets<organism>>/ensemblAllExons.tsv' that overlap it are looked up,
    their tccs are fetched (and collapsed when they overlap each other), all
    predictions are subtracted, and the values returned by
    stepVectorScan.scanVectorsHist for what remains are collected per CID.
    A hairpin with no overlapping gene gets an empty list of levels.

    Output goes to conf 'exonNoiseData' as tab-separated text: a header row
    of the sorted CIDs followed by one column of levels per CID, padded with
    'NA' where a column is shorter than the longest one.

    cName -- configuration name passed to c.getConfig and scanVectorsHist.
    """
    mConf = c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    cHairs = getHairpins.getHairpins(conf.conf['resultsExons'])  # CID: HAIRPIN
    organism = conf.conf['organism']
    geneSetFolder = mConf.conf['geneSets%s' % organism]

    # make prediction overlap hitmap
    print('Making prediction list')
    predList = [cHairs[CID] for CID in cHairs]
    if compare.checkIfOverlaps(predList):
        predList = compare.collapseOverlaps(predList)

    # make genes for Ensembl / make list of tccs for exons
    print('Creating gene set')
    ensGenes = cgGenes.createGeneSetFromFile(geneSetFolder + '/ensemblAllExons.tsv')
    print(' loaded # genes: %s' % len(ensGenes.set))

    # collect levels for each hairpin region
    print('[Checking all levels]')
    cidLevels = {}
    for CID in cHairs:
        print(CID)
        hPin = cHairs[CID]

        # for each hairpin, find overlapping transcripts in the same gene
        overlappingGenes = ensGenes.geneOverlaps([hPin])
        if len(overlappingGenes) == 0:
            print('NO GENE OVERLAPS!!!!! %s %s' % (CID, hPin))
            # Previously execution fell through and used allTccs from the
            # prior hairpin (or raised NameError on the first one).  Record
            # an empty column so the CID still appears in the output.
            cidLevels[CID] = []
            continue

        gIDs = [gene.id for gene in overlappingGenes]
        allTccs = ensGenes.getTccsFromGIDs(gIDs)
        # Call the check: the bare function object used before was always
        # true, so every list was collapsed and the message always printed.
        if compare.checkIfOverlaps(allTccs):
            print(' Overlaps...collapsing')
            allTccs = compare.collapseOverlaps(allTccs)

        # filter out my predictions
        print(' Filtering out predictions')
        checkList = compare.subtractTwoTccLists(allTccs, predList)

        # get expression level for gene
        print(' Retrieving Expression levels: %s'
              % (cg.getTccListTotalLength(checkList),))
        levels = []
        hPinLevels = stepVectorScan.scanVectorsHist(checkList, cName)
        for region in hPinLevels:
            levels.extend(hPinLevels[region])
        cidLevels[CID] = levels

    # output levels to file: one column per CID, rows padded with 'NA'
    print('Outputting to file')
    longest = max([len(cidLevels[CID]) for CID in cidLevels] or [0])
    sortedKeys = sorted(cidLevels)

    newLines = []
    for j in range(longest):
        newLine = []
        for CID in sortedKeys:
            if len(cidLevels[CID]) > j:
                newLine.append(str(cidLevels[CID][j]))
            else:
                newLine.append('NA')
        newLines.append('\t'.join(newLine) + '\n')

    with open(conf.conf['exonNoiseData'], 'w') as outFile:
        outFile.write('\t'.join(sortedKeys) + '\n')
        outFile.writelines(newLines)
def intronNoisy(cName=None):
    """Write the levels found around each intronic hairpin to a table.

    For every hairpin of conf 'resultsIntrons', two flanking ranges of
    ``slide`` (1000) coordinates are built, one ending at the hairpin start
    and one beginning at its end.  Everything covered by a (collapsed)
    prediction or by a (collapsed) exon of '<organism>Exons.tcc' is
    subtracted, and the values returned by stepVectorScan.scanVectorsHist for
    what remains are collected per CID.

    Output goes to conf 'intronNoiseData' as tab-separated text: a header row
    of the sorted CIDs followed by one column of levels per CID, padded with
    'NA' where a column is shorter than the longest one.

    cName -- configuration name passed to c.getConfig and scanVectorsHist.
    """
    # NOTE(review): the returned main config is not used in this function; the
    # call is kept in case loading it has side effects -- confirm and drop.
    c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    cHairs = getHairpins.getHairpins(conf.conf['resultsIntrons'])  # CID: HAIRPIN
    organism = conf.conf['organism']
    exonList = compare.tccFileToList('%sExons.tcc' % organism, 0)
    slide = 1000

    predList = [cHairs[CID] for CID in cHairs]

    # collapse overlaps
    print(' collapsing predictions')
    predList = compare.collapseOverlaps(predList)
    print(' collapsing exons')
    exonList = compare.collapseOverlaps(exonList)

    # collect levels for each hairpin region
    cidLevels = {}
    for CID in cHairs:
        print(CID)
        hPin = cHairs[CID]
        # Split once through cg.ss, the helper the rest of this file uses,
        # instead of a bare ``ss`` name called four times.
        hPinParts = cg.ss(hPin, ':')
        chrom = hPinParts[0]
        strand = hPinParts[1]
        start = int(hPinParts[2])
        end = int(hPinParts[3])

        # ranges immediately before and after the hairpin
        scanRange = [
            '%s:%s:%s:%s' % (chrom, strand, start - slide, start),
            '%s:%s:%s:%s' % (chrom, strand, end, end + slide),
        ]
        print(scanRange)

        scanRange = compare.subtractTwoTccLists(scanRange, predList)
        scanRange = compare.subtractTwoTccLists(scanRange, exonList)

        print(' Retrieving Expression levels: %s'
              % (cg.getTccListTotalLength(scanRange),))
        levels = []
        hPinLevels = stepVectorScan.scanVectorsHist(scanRange, cName)
        for region in hPinLevels:
            levels.extend(hPinLevels[region])
        cidLevels[CID] = levels

    # output levels to file: one column per CID, rows padded with 'NA'
    longest = max([len(cidLevels[CID]) for CID in cidLevels] or [0])
    sortedKeys = sorted(cidLevels)

    newLines = []
    for j in range(longest):
        newLine = []
        for CID in sortedKeys:
            if len(cidLevels[CID]) > j:
                newLine.append(str(cidLevels[CID][j]))
            else:
                newLine.append('NA')
        newLines.append('\t'.join(newLine) + '\n')

    with open(conf.conf['intronNoiseData'], 'w') as outFile:
        outFile.write('\t'.join(sortedKeys) + '\n')
        outFile.writelines(newLines)