def scanVectorsFile(fN, tccList):
    '''Build a coordinate -> value dictionary from one wig file.

    fN      -- path of the wig file handed to cgIndex.lineIndex
    tccList -- iterable of tcc strings; each is split with cg.tccSplit

    For every tcc the indexed file is searched, then lines are read
    sequentially.  Fields 1, 2 and 3 of each line are taken as begin, end
    and value (value truncated at the first '.').  Each line's span is
    clipped to the tcc and every position in range(begin, end) is stored.
    Reading stops on the first line whose end lies past the tcc end.
    '''
    stopwatch = cg.cgTimer()
    stopwatch.start()

    valueByCoord = {}
    for region in tccList:
        _chrom, _strand, regionStart, regionEnd = cg.tccSplit(region)

        # per the original comments: the file has a header, and binarySearch
        # leaves the file pointer at the beginning of the region
        index = cgIndex.lineIndex(fN, header=True)
        index.passCheckFunction(cgIndex.wigCheckFunction)
        index.binarySearch(region)

        for wigLine in index.file:
            fields = cg.ss(wigLine)
            spanStart = int(fields[1])
            spanEnd = int(fields[2])
            level = int(fields[3].split('.')[0])

            spanStart = max(spanStart, regionStart)
            pastRegion = regionEnd < spanEnd
            if pastRegion:
                spanEnd = regionEnd

            valueByCoord.update(dict.fromkeys(range(spanStart, spanEnd), level))

            if pastRegion:
                break
        # NOTE(review): the index is not closed here (the original close call
        # is commented out), whereas svCoord calls fIndex.close() -- confirm
        # whether that was deliberate.

    return valueByCoord
def svCoord(tccList, config = None):
    '''Build a coordinate -> value dictionary from the configured wig set.

    tccList -- iterable of tcc strings; each is split with cg.tccSplit
    config  -- passed through c.getConfig

    The config keys 'organism', 'wigSetDir', 'wigSetName' and 'wigChromSplit'
    are read.  When 'wigChromSplit' is the string 'True' a per-chromosome
    file is used, otherwise the merged per-organism file.  Line begins are
    shifted by +1 and the end is inclusive (range(begin, end + 1)).
    '''
    config = c.getConfig(config)
    settings = config.conf
    org = settings['organism']
    wigDir = settings['wigSetDir']
    wigSetName = settings['wigSetName']
    perChromFiles = settings['wigChromSplit'] == 'True'

    valueByCoord = {}
    for region in tccList:
        chrom, strand, regionStart, regionEnd = cg.tccSplit(region)

        if perChromFiles:
            wigPath = wigDir + '/%s.%s.%s.wig' % (wigSetName, chrom, strand)
        else:
            wigPath = wigDir + '/Merge.%s.%s.wig' % (org.lower(), strand)

        index = cgIndex.lineIndex(wigPath, header = True)
        index.passCheckFunction(cgIndex.wigCheckFunction)
        index.binarySearch(region)  # original comment: places file pointer at beginning of tcc

        for wigLine in index.file:
            fields = cg.ss(wigLine)
            spanStart = int(fields[1]) + 1
            spanEnd = int(fields[2])
            level = int(fields[3].split('.')[0])

            spanStart = max(spanStart, regionStart)
            pastRegion = regionEnd < spanEnd
            if pastRegion:
                spanEnd = regionEnd

            for position in range(spanStart, spanEnd + 1):
                valueByCoord[position] = level

            if pastRegion:
                break

        # close the file and the index after use
        index.close()

    return valueByCoord
def getHairpins(fN):
    '''Collect one merged hairpin range per cluster ID from a prediction file.

    fN -- path of the prediction file to read

    Every line is split with ss(); field 7 is used as the cluster ID (CID)
    and field 2 as the hairpin string.  The hairpin's ':'-separated fields
    2 and 3 are parsed as integer start and end.  When a CID occurs more
    than once the stored range is stretched to cover all of its hairpins.

    Returns a dict mapping CID -> hairpin string.
    '''
    cHairs = {}
    predFile = open(fN, 'r')
    try:
        for line in predFile:
            fields = ss(line)
            CID = fields[7]
            hairpin = fields[2]

            if CID not in cHairs:
                cHairs[CID] = hairpin
                continue

            # stretch the stored range so it also covers this hairpin
            stored = ss(cHairs[CID], ':')
            current = ss(hairpin, ':')
            hStart = min(int(stored[2]), int(current[2]))
            hEnd = max(int(stored[3]), int(current[3]))

            # The first two fields used to be taken from ss(hairpin, 1); the
            # integer 1 was passed where every other split of this string
            # passes ':', so the ':' split is used here as well.
            cHairs[CID] = '%s:%s:%s:%s' % (current[0], current[1], hStart, hEnd)
    finally:
        # close the file even when a malformed line raises
        predFile.close()

    return cHairs
def getHairpins():
    '''Collect one merged hairpin range per cluster ID.

    The file named by conf.conf['resultsSorted'] is read.  Every line is
    split with ss(); field 7 is used as the cluster ID (CID) and field 2 as
    the hairpin string, whose ':'-separated fields 2 and 3 are parsed as
    integer start and end.  Repeated CIDs have their range stretched to
    cover every hairpin seen for them.

    Returns a dict mapping CID -> hairpin string.
    '''
    cHairs = {}
    predFile = open(conf.conf['resultsSorted'], 'r')
    try:
        for line in predFile:
            columns = ss(line)
            CID = columns[7]
            hairpin = columns[2]

            if CID in cHairs:
                previous = ss(cHairs[CID], ':')
                incoming = ss(hairpin, ':')

                # widen the stored range where the new hairpin reaches further
                hStart = min(int(previous[2]), int(incoming[2]))
                hEnd = max(int(previous[3]), int(incoming[3]))

                # Previously ss(hairpin, 1) supplied the first two fields; the
                # integer 1 sat where ':' is the delimiter everywhere else in
                # this function, so ':' is used for these fields too.
                cHairs[CID] = '%s:%s:%s:%s' % (incoming[0], incoming[1], hStart, hEnd)
            else:
                cHairs[CID] = hairpin
    finally:
        # do not leak the handle if a line fails to parse
        predFile.close()

    return cHairs
def getHairpins(fN):
    """Return a dict of cluster ID -> hairpin range read from file fN.

    Each line is split with ss(): field 7 is the cluster ID (CID), field 2
    the hairpin string.  The hairpin is split on ":" and its fields 2 and 3
    are parsed as integer start and end.  If a CID appears on several lines
    the stored entry is rewritten so its start/end span all of them.
    """
    cHairs = {}
    predFile = open(fN, "r")
    try:
        for line in predFile:
            parts = ss(line)
            CID = parts[7]
            hairpin = parts[2]

            if CID not in cHairs:
                cHairs[CID] = hairpin
                continue

            seen = ss(cHairs[CID], ":")
            fresh = ss(hairpin, ":")

            # keep the smallest start and the largest end seen so far
            hStart = min(int(seen[2]), int(fresh[2]))
            hEnd = max(int(seen[3]), int(fresh[3]))

            # Was ss(hairpin, 1)[0] / [1]: an integer in the delimiter slot,
            # while all neighbouring calls split this string on ":".
            cHairs[CID] = "%s:%s:%s:%s" % (fresh[0], fresh[1], hStart, hEnd)
    finally:
        # always release the file handle, including on parse errors
        predFile.close()

    return cHairs
def scanVectorsOrganism(tccList, config=None):
    '''Build a coordinate -> value dictionary from per-chromosome wig files.

    tccList -- iterable of tcc strings; each is split with cg.tccSplit
    config  -- passed through c.getConfig; its 'organism' entry selects the
               'wig<organism>' directory entry of Main.conf

    For each tcc the matching file is opened through cgIndex.lineIndex and
    read line by line: fields 1, 2, 3 are begin, end, value (value cut at
    the first '.').  Positions in range(begin, end), clipped to the tcc,
    receive the value.  Reading stops once a line ends past the tcc end.
    '''
    config = c.getConfig(config)

    valueByCoord = {}
    for region in tccList:
        chrom, strand, regionStart, regionEnd = cg.tccSplit(region)

        # locate the wig file for this organism / strand / chromosome
        org = config.conf['organism']
        mainConf = c.getConfig('Main.conf')
        wigDir = mainConf.conf['wig%s' % org]
        wigPath = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(), strand, chrom)

        # original comments: the file has a header; binarySearch places the
        # file pointer at the beginning of the tcc
        index = cgIndex.lineIndex(wigPath, header=True)
        index.passCheckFunction(cgIndex.wigCheckFunction)
        index.binarySearch(region)

        for wigLine in index.file:
            fields = cg.ss(wigLine)
            spanStart = int(fields[1])
            spanEnd = int(fields[2])
            level = int(fields[3].split('.')[0])

            spanStart = max(spanStart, regionStart)
            pastRegion = regionEnd < spanEnd
            if pastRegion:
                spanEnd = regionEnd

            valueByCoord.update(dict.fromkeys(range(spanStart, spanEnd), level))

            if pastRegion:
                break
        # NOTE(review): the index is never closed in this function, while
        # svCoord calls fIndex.close() after each tcc -- confirm intent.

    return valueByCoord
def scanVectorsOrganism(tccList, config = None):
    '''Scan the organism's wig files and return {coordinate: value}.

    tccList -- iterable of tcc strings (split with cg.tccSplit)
    config  -- handed to c.getConfig; supplies the 'organism' entry

    The wig directory comes from the 'wig<organism>' entry of Main.conf.
    Each line read after the index search contributes its value (field 3,
    truncated at '.') to every position of range(begin, end), clipped to
    the tcc.  The scan of a tcc ends with the first line whose end exceeds
    the tcc end.
    '''
    config = c.getConfig(config)
    coordValues = {}

    for currentTcc in tccList:
        chrom, strand, wantStart, wantEnd = cg.tccSplit(currentTcc)

        org = config.conf['organism']
        mainConf = c.getConfig('Main.conf')
        wigDir = mainConf.conf['wig%s' % org]
        wigFile = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(),strand,chrom)

        # jump to the right line through the index (the file has a header)
        lineIdx = cgIndex.lineIndex(wigFile, header = True)
        lineIdx.passCheckFunction(cgIndex.wigCheckFunction)
        lineIdx.binarySearch(currentTcc)

        for record in lineIdx.file:
            cols = cg.ss(record)
            first = int(cols[1])
            last = int(cols[2])
            amount = int(cols[3].split('.')[0])

            if wantStart > first:
                first = wantStart

            if wantEnd < last:
                # final line for this tcc: clip, fill, and stop reading
                for pos in range(first, wantEnd):
                    coordValues[pos] = amount
                break

            for pos in range(first, last):
                coordValues[pos] = amount
        # NOTE(review): no close() on the index here, unlike svCoord.

    return coordValues
def getAll(chrom, strand, point):
    '''Print every small-library wig file with a positive value at a point.

    chrom, point -- passed on to getWigValue together with the file name
    strand       -- compared (as a string) with the second-to-last
                    '.'-separated piece of each file name

    Files are found with cg.recurseDir under mConf.conf['smallPath'].
    Paths containing 'WIG' and the two merged wig files are skipped.
    For each remaining file on the requested strand, the file name and the
    value are printed when the value is greater than zero.
    '''
    mergedWigs = ('/home/chrisgre/smallLibs/WIGS/Merge.mouse.1.wig',
                  '/home/chrisgre/smallLibs/WIGS/Merge.human.1.wig')

    # Build a filtered copy instead of removing from the list while iterating
    # it (which skipped entries), and test membership explicitly: the former
    # `file == A or B` was always true because B is a non-empty string.
    candidates = [fN for fN in cg.recurseDir(mConf.conf['smallPath'], end='.wig')
                  if 'WIG' not in fN and fN not in mergedWigs]

    for fN in candidates:
        fStrand = cg.ss(fN, '.')[-2]
        if str(fStrand) != str(strand):
            continue

        # NOTE(review): this file also holds a getWigValue(chrom, strand, point)
        # definition; confirm the three-argument (chrom, point, fN) one is bound.
        val = getWigValue(chrom, point, fN)
        if val > 0:
            print('%s %s' % (fN, val))
def getWigValue(chrom, point, fN):
    '''Return the wig value covering point on chrom in file fN.

    chrom -- compared with field 0 of each line
    point -- converted with int()
    fN    -- path of the wig file; its first line is skipped

    Lines are scanned in order.  Fields 1 and 2 are begin and end; the first
    line with begin <= point < end on the requested chromosome supplies the
    result (field 3, truncated at the first '.', as a float).  Returns 0
    when no line matches.  No strand is taken into account.
    '''
    point = int(point)
    wigValue = 0

    wigFile = open(fN, 'r')
    wigFile.readline()  # skip the first line
    for record in wigFile:
        cols = cg.ss(record)
        beg = int(cols[1])
        end = int(cols[2])
        fChrom = cols[0]

        if beg <= point < end and chrom == fChrom:
            wigValue += float(cols[3].split('.')[0])
            break
    wigFile.close()

    return wigValue
def getAll(chrom, strand, point):
    '''Report all small-library wig files that have signal at one point.

    chrom, point -- forwarded to getWigValue along with the file name
    strand       -- matched, as a string, against the second-to-last
                    '.'-separated component of the file name

    The '.wig' files below mConf.conf['smallPath'] are listed with
    cg.recurseDir; any path containing 'WIG' and the two merged wig files
    are left out.  For each remaining file of the requested strand the file
    name and value are printed if the value is above zero.
    '''
    excluded = ('/home/chrisgre/smallLibs/WIGS/Merge.mouse.1.wig',
                '/home/chrisgre/smallLibs/WIGS/Merge.human.1.wig')

    # The old loop removed items from the list it was iterating over, so
    # neighbouring entries were skipped, and its `file == A or B` test was
    # always true (B is a non-empty string).  Filter into a new list instead.
    wigFiles = []
    for path in cg.recurseDir(mConf.conf['smallPath'], end = '.wig'):
        if 'WIG' in path or path in excluded:
            continue
        wigFiles.append(path)

    for fN in wigFiles:
        fStrand = cg.ss(fN, '.')[-2]
        if str(fStrand) == str(strand):
            # NOTE(review): a getWigValue(chrom, strand, point) definition also
            # appears in this file; verify this call reaches the
            # (chrom, point, fN) variant.
            val = getWigValue(chrom, point, fN)
            if val > 0:
                print('%s %s' % (fN, val))
def getWigValue(chrom, strand, point):
    '''Uses Byte Indexes

    Return the value covering point in the per-chromosome mouse wig file.

    chrom  -- inserted into the wig and index file names
    strand -- int(strand) == 1 selects the '1' files, anything else '-1'
    point  -- converted with int()

    The index file is scanned for the first line with begin <= point < end
    (fields 1 and 2); its field 0 is used as the offset passed to seek() on
    the wig file.  From there the first wig line containing the point gives
    the result (field 3, truncated at '.', as a float); 0 if none does.
    '''
    point = int(point)

    if int(strand) == 1:
        fN = wigDir + '/Merge.mouse.1.wig.%s.wig' % chrom
        fNindex = wigDir + '/Merge.mouse.1.wig.%s.wig.index' % chrom
    else:
        fN = wigDir + '/Merge.mouse.-1.wig.%s.wig' % chrom
        fNindex = wigDir + '/Merge.mouse.-1.wig.%s.wig.index' % chrom

    # look up the byte offset of the index entry that spans the point
    startByte = 'None'
    indexFile = open(fNindex, 'r')
    for entry in indexFile:
        cols = cg.ss(entry)
        beg = int(cols[1])
        end = int(cols[2])
        if beg <= point < end:
            startByte = int(cols[0])
            break
    indexFile.close()

    # NOTE(review): when no index entry matches, startByte is still the
    # string 'None' and is passed to seek() unchanged.
    wigFile = open(fN, 'r')
    wigFile.seek(startByte, 0)

    wigValue = 0
    for record in wigFile:
        cols = cg.ss(record)
        beg = int(cols[1])
        end = int(cols[2])
        if beg <= point < end:
            wigValue += float(cols[3].split('.')[0])
            break
    wigFile.close()

    return wigValue
def getWigValueLINE(chrom, strand, point):
    '''Old ONE -> Use "Byte" one

    Line-index variant of the wig lookup.

    chrom  -- inserted into the wig and index file names
    strand -- int(strand) == 1 selects the '1' files, anything else '-1'
    point  -- converted with int()

    The index is scanned for the first entry with begin <= point <= end
    (fields 1 and 2); its field 0 is the number of lines to skip in the wig
    file (0 if no entry matches).  After skipping, the first line with
    begin <= point < end yields the result (field 3, truncated at '.', as a
    float); 0 if no such line exists.
    '''
    point = int(point)

    if int(strand) == 1:
        fN = wigDir + '/Merge.mouse.1.wig.%s.wig' % chrom
        fNindex = wigDir + '/Merge.mouse.1.wig.%s.wig.index' % chrom
    else:
        fN = wigDir + '/Merge.mouse.-1.wig.%s.wig' % chrom
        fNindex = wigDir + '/Merge.mouse.-1.wig.%s.wig.index' % chrom

    # find how many lines to skip; note the inclusive end in this comparison
    startLine = 0
    indexFile = open(fNindex, 'r')
    for entry in indexFile:
        cols = cg.ss(entry)
        beg = int(cols[1])
        end = int(cols[2])
        if beg <= point <= end:
            startLine = int(cols[0])
            break
    indexFile.close()

    wigFile = open(fN, 'r')
    for _ in range(startLine):
        wigFile.readline()  # skip header and lines till indexed line

    wigValue = 0
    for record in wigFile:
        cols = cg.ss(record)
        beg = int(cols[1])
        end = int(cols[2])
        if beg <= point < end:
            wigValue += float(cols[3].split('.')[0])
            break
    wigFile.close()

    return wigValue
def scanVectorsHist(tccList, cName):
    '''Return, for each tcc, the list of wig values over its positions.

    tccList -- iterable of 'chrom:strand:start:end' strings (split on ':')
    cName   -- config name passed to c.getConfig; supplies 'organism'

    THIS USES INDEXES (flagged as bad in the original docstring): a
    '<wig>.index' file is scanned for the entry containing the tcc start and
    its field 0 is used as the seek offset into the wig file.  Each wig line
    read from there contributes its value (field 3, truncated at '.') once
    per position of range(begin, end), clipped to the tcc.  A tcc with no
    positions gets no entry in the result.
    '''
    conf = c.getConfig(cName)
    org = conf.conf['organism']
    mConf = c.getConfig('Main.conf')
    wigDir = mConf.conf['wig%s' % org]

    stopwatch = cg.cgTimer()
    stopwatch.start()

    histDict = {}
    for tcc in tccList:
        pieces = ss(tcc, ':')
        chrom, strand = pieces[0], pieces[1]
        tccStart, tccEnd = int(pieces[2]), int(pieces[3])

        # wig file and its byte index for this organism / strand / chromosome
        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(), strand, chrom)
        fNindex = wigDir + '/Merge.%s.%s.wig.%s.wig.index' % (org.lower(),
                                                             strand, chrom)

        startByte = 'None'
        indexFile = open(fNindex, 'r')
        for entry in indexFile:
            cols = cg.ss(entry)
            beg = int(cols[1])
            end = int(cols[2])
            if beg <= tccStart < end:
                startByte = int(cols[0])
                break
        indexFile.close()

        # NOTE(review): startByte stays 'None' (a string) if no entry matched.
        wigFile = open(fN, 'r')
        wigFile.seek(startByte, 0)
        for record in wigFile:
            cols = cg.ss(record)
            lBeg = int(cols[1])
            lEnd = int(cols[2])
            lValue = int(cols[3].split('.')[0])

            lBeg = max(lBeg, tccStart)
            lastLine = tccEnd < lEnd
            if lastLine:
                lEnd = tccEnd

            # one copy of the value per covered position
            for _ in range(lBeg, lEnd):
                histDict.setdefault(tcc, []).append(lValue)

            if lastLine:
                break
        wigFile.close()

    return histDict
def intronNoisy(cName=None):
    '''Write the expression levels flanking each intronic hairpin to a table.

    cName -- config name passed to c.getConfig

    Hairpins are loaded from conf.conf['resultsIntrons'].  For every cluster
    ID, the 1000-position windows before the hairpin start and after its end
    are taken, the collapsed predictions and collapsed exons are subtracted,
    and the remaining ranges are scanned with stepVectorScan.scanVectorsHist.
    The output file (conf.conf['intronNoiseData']) is tab separated: a header
    of sorted cluster IDs, then one row per level index, padded with 'NA'.
    '''
    mConf = c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    cHairs = getHairpins.getHairpins(conf.conf['resultsIntrons'])  # CID: hairpin
    organism = conf.conf['organism']
    exonList = compare.tccFileToList('%sExons.tcc' % organism, 0)
    slide = 1000

    # all predicted hairpins, later subtracted from the flanking windows
    predList = [cHairs[CID] for CID in cHairs]

    print(' collapsing predictions')
    predList = compare.collapseOverlaps(predList)
    print(' collapsing exons')
    exonList = compare.collapseOverlaps(exonList)

    # collect levels for each hairpin region
    cidLevels = {}
    for CID in cHairs:
        print(CID)
        pieces = ss(cHairs[CID], ':')
        chrom = pieces[0]
        strand = pieces[1]
        start = int(pieces[2])
        end = int(pieces[3])

        scanRange = [
            '%s:%s:%s:%s' % (chrom, strand, start - slide, start),
            '%s:%s:%s:%s' % (chrom, strand, end, end + slide),
        ]
        print(scanRange)

        scanRange = compare.subtractTwoTccLists(scanRange, predList)
        scanRange = compare.subtractTwoTccLists(scanRange, exonList)

        print('%s %s' % (' Retrieving Expression levels:',
                         cg.getTccListTotalLength(scanRange)))

        levels = []
        hPinLevels = stepVectorScan.scanVectorsHist(scanRange, cName)
        for scanned in hPinLevels:
            levels.extend(hPinLevels[scanned])
        cidLevels[CID] = levels

    # the longest level list decides how many rows are written
    longest = max([len(cidLevels[CID]) for CID in cidLevels] + [0])

    sortedKeys = sorted(cidLevels.keys())

    newLines = []
    for row in range(0, longest):
        cells = []
        for CID in sortedKeys:
            column = cidLevels[CID]
            if len(column) > row:
                cells.append(str(column[row]))
            else:
                cells.append('NA')
        newLines.append('\t'.join(cells) + '\n')

    outFile = open(conf.conf['intronNoiseData'], 'w')
    outFile.write('\t'.join(sortedKeys) + '\n')
    outFile.writelines(newLines)
    outFile.close()
import bioLibCG as cg import cgConfig as c mConf = c.cgConfig('Main.conf') fileNames = cg.recurseDir(mConf.conf['wigMouse'], end='.wig') for fN in fileNames: file = open(fN, 'r') file.readline() #header #get all points in midpoint form pointsDict = {} for line in file: start = int(cg.ss(line)[1]) end = int(cg.ss(line)[2]) point = start + (end - start) / 2 #midpoint pointsDict[point] = int(cg.ss(line)[3].split('.')[0]) file.close() #determine peaks based off of neighbors of each point lowest = pointsDict.keys() lowest.sort() peaks = [] span = 2 #must be > 0 for i in range(span + 1, len(lowest) - span - 1): val = pointsDict[lowest[i]] if val < 5: #minimum continue
if strand not in peaks[chrom]: peaks[chrom][strand] = {} #get peaks and values and put in dictionary pFile = open(pN, 'r') for line in pFile: peaks[chrom][strand][int(line.strip().split('\t')[0])] = int( line.strip().split('\t')[1].split('.')[0]) print timer.split() print 'finding best combos' bestCombos = [] for tcc in tccList: print tcc tccPeaks = [] chrom = cg.ss(tcc, ':')[0] strand = cg.ss(tcc, ':')[1] start = int(cg.ss(tcc, ':')[2]) end = int(cg.ss(tcc, ':')[3]) #get all peaks for i in range(start, end + 1): if i in peaks[chrom][strand]: print ' peak added', i tccPeaks.append(i) #get all combos pairStrings = [] #used to check if pair already added peakCombos = [] for x in tccPeaks: for y in tccPeaks:
import bioLibCG as cg import cgConfig as c mConf = c.cgConfig('Main.conf') fileNames = cg.recurseDir(mConf.conf['wigMouse'], end = '.wig') for fN in fileNames: file = open(fN, 'r') file.readline() #header #get all points in midpoint form pointsDict = {} for line in file: start = int(cg.ss(line)[1]) end = int(cg.ss(line)[2]) point = start + (end-start)/2 #midpoint pointsDict[point] = int(cg.ss(line)[3].split('.')[0]) file.close() #determine peaks based off of neighbors of each point lowest = pointsDict.keys() lowest.sort() peaks = [] span = 2 #must be > 0 for i in range(span + 1,len(lowest) - span - 1): val = pointsDict[lowest[i]] if val < 5: #minimum
if strand not in peaks[chrom]: peaks[chrom][strand] = {} #get peaks and values and put in dictionary pFile = open(pN, 'r') for line in pFile: peaks[chrom][strand][int(line.strip().split('\t')[0])] = int(line.strip().split('\t')[1].split('.')[0]) print timer.split() print 'finding best combos' bestCombos = [] for tcc in tccList: print tcc tccPeaks = [] chrom = cg.ss(tcc, ':')[0] strand = cg.ss(tcc, ':')[1] start = int(cg.ss(tcc, ':')[2]) end = int(cg.ss(tcc, ':')[3]) #get all peaks for i in range(start, end + 1): if i in peaks[chrom][strand]: print ' peak added', i tccPeaks.append(i) #get all combos pairStrings = [] #used to check if pair already added peakCombos = [] for x in tccPeaks: for y in tccPeaks:
def findPeaks(pType, cName = None):
    '''Pick the best peak for each predicted hairpin and write it to slot 13.

    pType -- 'E' selects conf.conf['resultsExonsSorted'] as the prediction
             file, anything else conf.conf['resultsIntronsSorted']
    cName -- config name passed to c.getConfig, cgPeaks.stretch and
             stepVectorScan.profileAroundPoint

    The prediction file is read, per-hairpin peak info is computed, and the
    same file is rewritten in place with cg.appendToLine(line, info, 13).

    NOTE(review): block structure was reconstructed from a whitespace-
    collapsed source; the nesting of the "roof" scan below is a best reading
    and should be confirmed against the original file.
    '''

    #init
    mConf = c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    if pType == 'E':
        predName = conf.conf['resultsExonsSorted']
    else:
        predName = conf.conf['resultsIntronsSorted']

    print predName

    #make CID:hairpin:peak dictionary
    cHairs = getHairpins.getHairpins(predName)
    peakDict = {}
    for CID in cHairs:
        # second slot holds the chosen combo; the string 'None' means "no peak"
        peakDict[CID] = [cHairs[CID],'None']

    timer = cg.cgTimer()
    timer.start()

    #put peaks in memory
    print 'Creating peak data'
    peaks = {} # chr:peak:value
    for CID in cHairs:
        chrom, strand, start, end = cg.tccSplit(cHairs[CID])
        tcc = cHairs[CID]

        #init dictionary
        if chrom not in peaks:
            peaks[chrom] = {}

        if strand not in peaks[chrom]:
            peaks[chrom][strand] = {}

        #create peaks for tcc and add to peak dictionary
        stretch = cgPeaks.stretch(tcc, cName)
        stretch.createPeaks()
        for peakCoord in stretch.peaks:
            # only membership is used later; the stored value is always 0
            peaks[chrom][strand][peakCoord] = 0
    print timer.split()

    print 'finding best combos'
    bestCombos = []
    # the counters and cgFlag below are assigned but not read in this function
    aPass = 0
    bPass = 0
    cPass = 0
    numT = 0
    for CID in peakDict:
        cgFlag = False
        if CID == '538':cgFlag = True

        tcc = peakDict[CID][0]
        #print tcc
        tccPeaks = []
        chrom = cg.ss(tcc, ':')[0]
        strand = cg.ss(tcc, ':')[1]
        start = int(cg.ss(tcc, ':')[2])
        end = int(cg.ss(tcc, ':')[3])

        #get all peaks
        for i in range(start, end + 1):
            if i in peaks[chrom][strand]:
                #print ' peak added', i
                tccPeaks.append(i)

        #Calculate parameters...
        pairStrings = [] #used to check if pair already added
        peakCombos = []
        for x in tccPeaks:

            #scan a 30 bp range around this point and find the best roof...
            pRange = 30
            rTcc = cg.makeTcc(chrom, strand, x, x + 1)

            #quickly get max value...kinda a long way to do it but whatever
            cProfile = stepVectorScan.profileAroundPoint(rTcc, 1, cName, ratio = False)
            xval = cProfile[0]
            max = xval  # NOTE: shadows the builtin max inside this function
            highestValueCoord = x

            #now make profile for roof...
            cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName, ratio = True)

            #now get highest stretch length and the rNext coord.
            # A "stretch" is a run of consecutive offsets whose profile value
            # exceeds minVal; the longest finished run is kept.
            minVal = .80
            highest = 0
            stretch = 0
            startCurrent = None
            startFinal = None
            endFinal = None
            for i in range(1 - pRange, pRange):
                if cProfile[i] > minVal:
                    stretch += 1
                    if startCurrent == None:
                        startCurrent = i
                else:
                    if stretch > 0:
                        if stretch > highest: #stretch ended and was higher than previous
                            highest = stretch
                            endFinal = i - 1
                            startFinal = startCurrent
                            startCurrent = None
                        else:
                            startCurrent = None
                    stretch = 0
            # NOTE(review): a run still open when the loop ends is never
            # recorded -- confirm this is intended.

            #get +/- 4 value...
            val = [1.0, 1.0]
            # NOTE(review): truthiness test -- an offset of 0 for either bound
            # is treated like "not found".
            if (startFinal) and (endFinal):
                low = startFinal - 4
                high = endFinal + 4
                if low > (1 - pRange):
                    if high < pRange:
                        val[0] = float(cProfile[startFinal - 4])
                        val[1] = float(cProfile[endFinal + 4])

            #fill in other details...
            y = 'S'
            dist = 'S'
            ratio = 'S'

            # combo layout: [tcc, x, y, dist, ratio, max, roof length, drop values]
            peakCombos.append([tcc,x,y,dist,ratio,max,highest,val])
            #print ' ', peakCombos[-1]

        #find best combo...
        topCombo = None
        for combo in peakCombos:
            roofLength = combo[6]
            # use the larger of the two flank values
            dropValue = combo[7][0]
            if combo[7][1] > dropValue:
                dropValue = combo[7][1]

            #print roofLength, dropValue
            if 14 < roofLength < 26:
                if 0.0 < dropValue < 0.2:
                    #pick one with rooflength nearest 20:
                    # (the comparison below actually measures distance from 22)
                    if topCombo:
                        if (math.fabs(22 - roofLength)) < (math.fabs(22 - topCombo[6])):
                            topCombo = combo
                    else:
                        topCombo = combo

        if topCombo:
            peakDict[CID][1] = topCombo
            bestCombos.append(topCombo)
            print bestCombos[-1]
        else:
            #print 'None'
            pass
    print timer.split()

    #now update predFile (SLOT 13)
    predFile = open(predName, 'r')
    newLines = []
    for line in predFile:
        CID = cg.ss(line)[7]
        if peakDict[CID][1] == 'None':
            peakInfo = 'None'
        else:
            # last three characters of the peak coordinate, 'S', ratio, max,
            # roof length, drop values
            peakInfo = '%s:%s:%s:%s:%s:%s' % (str(peakDict[CID][1][1])[-3:], 'S', str(peakDict[CID][1][4]).split('.')[0], peakDict[CID][1][5],peakDict[CID][1][6], peakDict[CID][1][7])
        newLines.append(cg.appendToLine(line, peakInfo, 13))
    predFile.close()

    # overwrite the prediction file with the annotated lines
    predFile = open(predName, 'w')
    predFile.writelines(newLines)
    predFile.close()
def scanVectorsSingleCoord(tccList, cName):
    '''Return {coordinate: value} for all positions covered by tccList.

    tccList -- iterable of 'chrom:strand:start:end' strings (split on ':')
    cName   -- config name passed to c.getConfig; supplies 'organism'

    For each tcc the '<wig>.index' file is scanned for the entry whose
    range contains the tcc start; its field 0 is the offset given to seek()
    on the wig file.  Lines read from there assign their value (field 3,
    truncated at '.') to every position of range(begin, end), clipped to
    the tcc; reading stops at the first line ending past the tcc end.
    '''
    conf = c.getConfig(cName)
    org = conf.conf['organism']
    mConf = c.getConfig('Main.conf')
    wigDir = mConf.conf['wig%s' % org]

    stopwatch = cg.cgTimer()
    stopwatch.start()

    valueByCoord = {}
    for region in tccList:
        pieces = ss(region, ':')
        chrom, strand = pieces[0], pieces[1]
        regionStart, regionEnd = int(pieces[2]), int(pieces[3])

        wigPath = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(),strand,chrom)
        indexPath = wigDir + '/Merge.%s.%s.wig.%s.wig.index' % (org.lower(),strand,chrom)

        # find the byte offset of the index entry containing the region start
        startByte = 'None'
        indexFile = open(indexPath, 'r')
        for entry in indexFile:
            cols = cg.ss(entry)
            beg = int(cols[1])
            end = int(cols[2])
            if beg <= regionStart < end:
                startByte = int(cols[0])
                break
        indexFile.close()

        # NOTE(review): startByte is still the string 'None' if nothing matched.
        wigFile = open(wigPath, 'r')
        wigFile.seek(startByte, 0)
        for record in wigFile:
            cols = cg.ss(record)
            spanStart = int(cols[1])
            spanEnd = int(cols[2])
            level = int(cols[3].split('.')[0])

            spanStart = max(spanStart, regionStart)
            pastRegion = regionEnd < spanEnd
            if pastRegion:
                spanEnd = regionEnd

            valueByCoord.update(dict.fromkeys(range(spanStart, spanEnd), level))

            if pastRegion:
                break
        wigFile.close()

    return valueByCoord
def findPeaks(pType, cName=None):
    '''Choose the best peak per predicted hairpin and store it in slot 13.

    pType -- 'E' reads conf.conf['resultsExonsSorted'], any other value
             conf.conf['resultsIntronsSorted']
    cName -- config name handed to c.getConfig, cgPeaks.stretch and
             stepVectorScan.profileAroundPoint

    Side effect: the prediction file is rewritten in place, each line passed
    through cg.appendToLine(line, peakInfo, 13).

    NOTE(review): indentation was reconstructed from a whitespace-collapsed
    source; verify the nesting of the roof scan against the original.
    '''
    #init
    mConf = c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    if pType == 'E':
        predName = conf.conf['resultsExonsSorted']
    else:
        predName = conf.conf['resultsIntronsSorted']

    print predName

    #make CID:hairpin:peak dictionary
    cHairs = getHairpins.getHairpins(predName)
    peakDict = {}
    for CID in cHairs:
        # [hairpin tcc, chosen combo]; the string 'None' marks "no combo yet"
        peakDict[CID] = [cHairs[CID], 'None']

    timer = cg.cgTimer()
    timer.start()

    #put peaks in memory
    print 'Creating peak data'
    peaks = {}  # chr:peak:value
    for CID in cHairs:
        chrom, strand, start, end = cg.tccSplit(cHairs[CID])
        tcc = cHairs[CID]

        #init dictionary
        if chrom not in peaks:
            peaks[chrom] = {}

        if strand not in peaks[chrom]:
            peaks[chrom][strand] = {}

        #create peaks for tcc and add to peak dictionary
        stretch = cgPeaks.stretch(tcc, cName)
        stretch.createPeaks()
        for peakCoord in stretch.peaks:
            # value is a placeholder; only key membership is tested below
            peaks[chrom][strand][peakCoord] = 0
    print timer.split()

    print 'finding best combos'
    bestCombos = []
    # aPass/bPass/cPass/numT and cgFlag are set but never read in this function
    aPass = 0
    bPass = 0
    cPass = 0
    numT = 0
    for CID in peakDict:
        cgFlag = False
        if CID == '538': cgFlag = True

        tcc = peakDict[CID][0]
        #print tcc
        tccPeaks = []
        chrom = cg.ss(tcc, ':')[0]
        strand = cg.ss(tcc, ':')[1]
        start = int(cg.ss(tcc, ':')[2])
        end = int(cg.ss(tcc, ':')[3])

        #get all peaks
        for i in range(start, end + 1):
            if i in peaks[chrom][strand]:
                #print ' peak added', i
                tccPeaks.append(i)

        #Calculate parameters...
        pairStrings = []  #used to check if pair already added
        peakCombos = []
        for x in tccPeaks:

            #scan a 30 bp range around this point and find the best roof...
            pRange = 30
            rTcc = cg.makeTcc(chrom, strand, x, x + 1)

            #quickly get max value...kinda a long way to do it but whatever
            cProfile = stepVectorScan.profileAroundPoint(rTcc, 1, cName, ratio=False)
            xval = cProfile[0]
            max = xval  # NOTE: hides the builtin max within this function
            highestValueCoord = x

            #now make profile for roof...
            cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName, ratio=True)

            #now get highest stretch length and the rNext coord.
            # Track runs of consecutive offsets with profile value above
            # minVal and remember the longest run that has ended.
            minVal = .80
            highest = 0
            stretch = 0
            startCurrent = None
            startFinal = None
            endFinal = None
            for i in range(1 - pRange, pRange):
                if cProfile[i] > minVal:
                    stretch += 1
                    if startCurrent == None:
                        startCurrent = i
                else:
                    if stretch > 0:
                        if stretch > highest:  #stretch ended and was higher than previous
                            highest = stretch
                            endFinal = i - 1
                            startFinal = startCurrent
                            startCurrent = None
                        else:
                            startCurrent = None
                    stretch = 0
            # NOTE(review): a run that reaches the last offset is never closed
            # and therefore never counted -- confirm.

            #get +/- 4 value...
            val = [1.0, 1.0]
            # NOTE(review): a bound equal to 0 is falsy and skips this branch.
            if (startFinal) and (endFinal):
                low = startFinal - 4
                high = endFinal + 4
                if low > (1 - pRange):
                    if high < pRange:
                        val[0] = float(cProfile[startFinal - 4])
                        val[1] = float(cProfile[endFinal + 4])

            #fill in other details...
            y = 'S'
            dist = 'S'
            ratio = 'S'

            # combo layout: [tcc, x, y, dist, ratio, max, roof length, drop values]
            peakCombos.append([tcc, x, y, dist, ratio, max, highest, val])
            #print ' ', peakCombos[-1]

        #find best combo...
        topCombo = None
        for combo in peakCombos:
            roofLength = combo[6]
            # the larger flank value is the one tested
            dropValue = combo[7][0]
            if combo[7][1] > dropValue:
                dropValue = combo[7][1]

            #print roofLength, dropValue
            if 14 < roofLength < 26:
                if 0.0 < dropValue < 0.2:
                    #pick one with rooflength nearest 20:
                    # (the code compares the distance from 22, not 20)
                    if topCombo:
                        if (math.fabs(22 - roofLength)) < (
                                math.fabs(22 - topCombo[6])):
                            topCombo = combo
                    else:
                        topCombo = combo

        if topCombo:
            peakDict[CID][1] = topCombo
            bestCombos.append(topCombo)
            print bestCombos[-1]
        else:
            #print 'None'
            pass
    print timer.split()

    #now update predFile (SLOT 13)
    predFile = open(predName, 'r')
    newLines = []
    for line in predFile:
        CID = cg.ss(line)[7]
        if peakDict[CID][1] == 'None':
            peakInfo = 'None'
        else:
            # last three characters of the peak coordinate, 'S', ratio, max,
            # roof length, drop values
            peakInfo = '%s:%s:%s:%s:%s:%s' % (
                str(peakDict[CID][1][1])[-3:], 'S', str(
                    peakDict[CID][1][4]).split('.')[0], peakDict[CID][1][5],
                peakDict[CID][1][6], peakDict[CID][1][7])
        newLines.append(cg.appendToLine(line, peakInfo, 13))
    predFile.close()

    # rewrite the prediction file with the annotated lines
    predFile = open(predName, 'w')
    predFile.writelines(newLines)
    predFile.close()
def intronNoisy(cName = None):
    '''Tabulate expression levels around every intronic hairpin.

    cName -- config name passed to c.getConfig

    Reads the hairpins of conf.conf['resultsIntrons'], scans the 1000
    positions on each side of every hairpin (minus collapsed predictions
    and collapsed exons) with stepVectorScan.scanVectorsHist, and writes a
    tab-separated table to conf.conf['intronNoiseData']: sorted cluster IDs
    as header, one row per level index, 'NA' where a cluster has no value.
    '''
    mConf = c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    #init
    cHairs = getHairpins.getHairpins(conf.conf['resultsIntrons']) #CID: HAIRPIN
    organism = conf.conf['organism']
    exonList = compare.tccFileToList('%sExons.tcc' % organism, 0)
    slide = 1000

    #gather every predicted hairpin
    predList = []
    for CID in cHairs:
        predList.append(cHairs[CID])

    #collapse Overlaps
    print(' collapsing predictions')
    predList = compare.collapseOverlaps(predList)
    print(' collapsing exons')
    exonList = compare.collapseOverlaps(exonList)

    #collect levels for each hairpin region
    cidLevels = {}
    for CID in cHairs:
        print(CID)
        hairpinFields = ss(cHairs[CID], ':')
        chrom, strand = hairpinFields[0], hairpinFields[1]
        start = int(hairpinFields[2])
        end = int(hairpinFields[3])

        upstream = '%s:%s:%s:%s' % (chrom, strand, start - slide, start)
        downstream = '%s:%s:%s:%s' % (chrom, strand, end, end + slide)
        scanRange = [upstream, downstream]
        print(scanRange)

        for blocked in (predList, exonList):
            scanRange = compare.subtractTwoTccLists(scanRange, blocked)

        print('%s %s' % (' Retrieving Expression levels:', cg.getTccListTotalLength(scanRange)))

        hPinLevels = stepVectorScan.scanVectorsHist(scanRange, cName)
        levels = []
        for scannedTcc in hPinLevels:
            levels.extend(hPinLevels[scannedTcc])
        cidLevels[CID] = levels

    #find longest
    longest = 0
    for CID in cidLevels:
        longest = max(longest, len(cidLevels[CID]))

    sortedKeys = sorted(cidLevels.keys())

    #one output row per level index, 'NA' where a cluster ran out of values
    newLines = []
    for j in range(0, longest):
        newLine = [str(cidLevels[CID][j]) if len(cidLevels[CID]) > j else 'NA'
                   for CID in sortedKeys]
        newLines.append('\t'.join(newLine) + '\n')

    outFileN = conf.conf['intronNoiseData']
    outFile = open(outFileN, 'w')
    outFile.write('\t'.join(sortedKeys) + '\n')
    outFile.writelines(newLines)
    outFile.close()
def scanVectorsSingleCoord(tccList, cName):
    '''Scan indexed wig files and return a {coordinate: value} dictionary.

    tccList -- iterable of 'chrom:strand:start:end' strings (split on ':')
    cName   -- config name passed to c.getConfig; its 'organism' entry
               picks the 'wig<organism>' directory from Main.conf

    The byte index ('<wig>.index') is searched for the entry containing the
    tcc start (fields 1, 2 = range, field 0 = offset for seek()).  From that
    offset each wig line stores its value (field 3 cut at '.') for all
    positions of range(begin, end) clipped to the tcc, and the scan ends
    with the first line that extends past the tcc end.
    '''
    conf = c.getConfig(cName)
    org = conf.conf['organism']
    mConf = c.getConfig('Main.conf')
    wigDir = mConf.conf['wig%s' % org]

    clock = cg.cgTimer()
    clock.start()

    coordValues = {}
    for currentTcc in tccList:
        parts = ss(currentTcc, ':')
        chrom = parts[0]
        strand = parts[1]
        wantStart = int(parts[2])
        wantEnd = int(parts[3])

        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(), strand, chrom)
        fNindex = wigDir + '/Merge.%s.%s.wig.%s.wig.index' % (org.lower(),
                                                             strand, chrom)

        # get the offset from the index file
        startByte = 'None'
        iFile = open(fNindex, 'r')
        for indexLine in iFile:
            indexCols = cg.ss(indexLine)
            beg = int(indexCols[1])
            end = int(indexCols[2])
            if beg <= wantStart < end:
                startByte = int(indexCols[0])
                break
        iFile.close()

        # NOTE(review): an unmatched start leaves the string 'None' as offset.
        f = open(fN, 'r')
        f.seek(startByte, 0)
        for wigLine in f:
            wigCols = cg.ss(wigLine)
            first = int(wigCols[1])
            last = int(wigCols[2])
            amount = int(wigCols[3].split('.')[0])

            if wantStart > first:
                first = wantStart

            if wantEnd < last:
                # last relevant line: clip to the tcc end, fill, stop
                for pos in range(first, wantEnd):
                    coordValues[pos] = amount
                break

            for pos in range(first, last):
                coordValues[pos] = amount
        f.close()

    return coordValues
def scanVectorsHist(tccList, cName):
    '''Scan indexed wig files and return {tcc: [value per position]}.

    tccList -- iterable of 'chrom:strand:start:end' strings (split on ':')
    cName   -- config name passed to c.getConfig; supplies 'organism'

    THIS USES INDEXES (called bad in the original docstring).  The byte
    index is searched for the entry containing the tcc start; its field 0
    is the offset for seek().  Every wig line from there appends its value
    (field 3 cut at '.') once for each position of range(begin, end)
    clipped to the tcc.  A tcc that covers no position is absent from the
    returned dictionary.
    '''
    conf = c.getConfig(cName)
    org = conf.conf['organism']
    mConf = c.getConfig('Main.conf')
    wigDir = mConf.conf['wig%s' % org]

    clock = cg.cgTimer()
    clock.start()

    histDict = {}
    for tcc in tccList:
        parts = ss(tcc, ':')
        chrom = parts[0]
        strand = parts[1]
        tccStart = int(parts[2])
        tccEnd = int(parts[3])

        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(),strand,chrom)
        fNindex = wigDir + '/Merge.%s.%s.wig.%s.wig.index' % (org.lower(),strand,chrom)

        # get the offset from the index file
        startByte = 'None'
        iFile = open(fNindex, 'r')
        for indexLine in iFile:
            indexCols = cg.ss(indexLine)
            beg = int(indexCols[1])
            end = int(indexCols[2])
            if beg <= tccStart < end:
                startByte = int(indexCols[0])
                break
        iFile.close()

        # NOTE(review): an unmatched start leaves the string 'None' as offset.
        f = open(fN, 'r')
        f.seek(startByte, 0)
        for wigLine in f:
            wigCols = cg.ss(wigLine)
            lBeg = int(wigCols[1])
            lEnd = int(wigCols[2])
            lValue = int(wigCols[3].split('.')[0])

            if tccStart > lBeg:
                lBeg = tccStart
            finished = tccEnd < lEnd
            if finished:
                lEnd = tccEnd

            for _ in range(lBeg, lEnd):
                if tcc in histDict:
                    histDict[tcc].append(lValue)
                else:
                    # first value for this tcc creates its list
                    histDict[tcc] = [lValue]

            if finished:
                break
        f.close()

    return histDict