def makeBins5(fN, fOut, typeFilter):
    '''Write 100 single-base bins for every sufficiently long feature.

    fN is a tab-separated file whose first three columns are id, type and
    tcc.  Rows whose span (end - start) is below 100, or whose type does not
    contain typeFilter, are skipped.  For every remaining row one line is
    written to the path fOut: the id followed by 100 one-base tccs.  Bin 0 is
    the start coordinate on strand '1' and the end coordinate on strand '-1';
    later bins move inward from there.  Any other strand value produces a
    line holding the id alone.
    '''
    binCount = 100
    # The handles used to be left for the garbage collector; closing them
    # deterministically guarantees the output is flushed to disk.
    with open(fOut, 'w') as outFile:
        with open(fN, 'r') as inFile:
            for line in inFile:
                fields = line.strip().split('\t')
                rowID = fields[0]
                rowType, tcc = fields[1:3]
                chrom, strand, st, en = bioLibCG.tccSplit(tcc)

                # only take seqs that are long enough
                if en - st < binCount:
                    continue
                if typeFilter not in rowType:
                    continue

                if strand == '1':
                    points = [st + i for i in range(binCount)]
                elif strand == '-1':
                    points = [en - i for i in range(binCount)]
                else:
                    points = []

                tccBins = [bioLibCG.makeTcc(chrom, strand, p, p) for p in points]
                outFile.write('\t'.join([str(x) for x in [rowID] + tccBins]) + '\n')
def collectIntronSeqs(fN, outFN, assembly, amount=100, prime3=True):
    '''Write the reversed terminal sequence of each C_INTRON row to outFN.

    Rows of the tab-separated fN are kept when column 1 contains 'C_INTRON'
    and the tcc in column 2 spans at least `amount`.  On strand '1' the
    window is (end - amount, end); on any other strand it is
    (start, start + amount).  The fetched sequence is written reversed, one
    per line.

    NOTE(review): nothing is written when prime3 is false -- the 5' case does
    not appear to be implemented; confirm against the original layout.
    '''
    myG = gf.GenomeFetch(assembly)
    # `with` closes both files even if a row fails to parse part-way through.
    with open(outFN, 'w') as fOut:
        with open(fN, 'r') as f:
            for line in f:
                ls = line.strip().split('\t')
                if 'C_INTRON' not in ls[1]:
                    continue
                chrom, strand, start, end = bioLibCG.tccSplit(ls[2])
                if end - start < amount:
                    continue
                if not prime3:
                    continue
                if strand == '1':
                    tcc = bioLibCG.makeTcc(chrom, strand, end - amount, end)
                else:
                    tcc = bioLibCG.makeTcc(chrom, strand, start, start + amount)
                seq = myG.getSequence(tcc)
                fOut.write('%s\n' % seq[::-1])
def collectIntronSeqs(fN, outFN, assembly, amount = 100, prime3 = True):
    '''Write the reversed terminal sequence of each C_INTRON row to outFN.

    Rows of the tab-separated fN are kept when column 1 contains 'C_INTRON'
    and the tcc in column 2 spans at least `amount`.  On strand '1' the
    window is (end - amount, end); on any other strand it is
    (start, start + amount).  The fetched sequence is written reversed, one
    per line.

    NOTE(review): nothing is written when prime3 is false -- the 5' case does
    not appear to be implemented; confirm against the original layout.
    '''
    myG = gf.GenomeFetch(assembly)
    # `with` closes both files even if a row fails to parse part-way through.
    with open(outFN, 'w') as fOut:
        with open(fN, 'r') as f:
            for line in f:
                ls = line.strip().split('\t')
                if 'C_INTRON' not in ls[1]:
                    continue
                chrom, strand, start, end = bioLibCG.tccSplit(ls[2])
                if end - start < amount:
                    continue
                if not prime3:
                    continue
                if strand == '1':
                    tcc = bioLibCG.makeTcc(chrom, strand, end - amount, end)
                else:
                    tcc = bioLibCG.makeTcc(chrom, strand, start, start + amount)
                seq = myG.getSequence(tcc)
                fOut.write('%s\n' % seq[::-1])
def get3UTRFromTranscriptome(tranFN, outFN, wholeGene = False ):
    '''Write one tcc per transcript row: its 3' UTR, or the whole transcript.

    Each row of the tab-separated tranFN is read as chromosome (column 1),
    strand (column 2, converted with bioLibCG.switchStrandFormat), transcript
    start/end (columns 3-4) and coding start/end (columns 5-6).  Both end
    columns are decremented by one before use.

    With wholeGene set, the transcript span itself is written.  Otherwise the
    span is (codingEnd + 1, transcriptEnd) on strand '1' and
    (transcriptStart, codingStart - 1) on any other strand.
    '''
    # `with` closes both files even when a malformed row raises.
    with open(outFN, 'w') as fOut:
        with open(tranFN, 'r') as f:
            for line in f:
                ls = line.strip().split('\t')
                tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
                tStart, tEnd = int(ls[3]), int(ls[4]) - 1
                # Coding columns are parsed even for wholeGene so that a
                # malformed row still fails the same way it always did.
                cStart, cEnd = int(ls[5]), int(ls[6]) - 1

                if wholeGene:
                    span = (tStart, tEnd)
                elif tStrand == '1':
                    span = (cEnd + 1, tEnd)
                else:
                    span = (tStart, cStart - 1)

                utrTcc = bioLibCG.makeTcc(tChrom, tStrand, span[0], span[1])
                fOut.write('%s\n' % utrTcc)
def getTccs(fN):
    '''Print one tcc for every row of the tab-separated file fN.

    Column 0 supplies the chromosome, columns 1 and 2 the start and end
    (passed on as read, without numeric conversion) and column 5 the strand,
    which goes through bioLibCG.switchStrandFormat first.
    '''
    # The handle was previously never closed.
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            chrom, start, end = ls[0], ls[1], ls[2]
            strand = bioLibCG.switchStrandFormat(ls[5])
            print(bioLibCG.makeTcc(chrom, strand, start, end))
def transcriptSetOverlapTargets(aDir):
    '''Flag every alignment whose targeted site overlaps a known transcript.

    Loads the alignments stored under aDir, derives the targeted span of each
    one, converts those spans to the opposite strand with cg.convertToAS and
    compares them against the exons of the Ensembl transcript set.  Each
    alignment's transcriptOverlap attribute is then set to True or False and
    the collection is committed back.
    '''
    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    def targetSiteTcc(aln):
        # Span of the targeted site, derived from the alignment's tTcc,
        # tStart offset and sLength.
        chrom, strand, start, end = cg.tccSplit(aln.tTcc)
        offset = aln.tStart
        sLen = aln.sLength
        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen
        return cg.makeTcc(chrom, strand, start, end)

    #get degradome TCCS
    #note that you need to test the AS peaks, this is the location of the targetted transcript
    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    #create list of unique tccs (first-seen order kept; the set makes the
    #membership test constant time instead of a list scan)
    uniqTccs = []
    seenTccs = set()
    for alignment in id_alignment.values():
        tcc = targetSiteTcc(alignment)
        if tcc not in seenTccs:
            seenTccs.add(tcc)
            uniqTccs.append(tcc)

    degTccs = [cg.convertToAS(x) for x in uniqTccs]

    #find all overlapping exons/transcripts, then all results sequences that overlap exons
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    overlappingDegTccs = compare.compareTwoTcc(degTccs, overlappingExonTccs, 1)

    #update
    # Bug fix: this loop used to read the leftover `alignment` variable from
    # the loop above, so every record received the flag of the last
    # alignment.  The span is now computed from the record being updated.
    for obj in id_alignment.values():
        degTcc = cg.convertToAS(targetSiteTcc(obj))
        obj.transcriptOverlap = degTcc in overlappingDegTccs

    aDC.commit(id_alignment)
def getTccs(fN):
    '''Print one tcc for every row of the tab-separated file fN.

    Column 0 supplies the chromosome, columns 1 and 2 the start and end
    (passed on as read, without numeric conversion) and column 5 the strand,
    which goes through bioLibCG.switchStrandFormat first.
    '''
    # The handle was previously never closed.
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            chrom, start, end = ls[0], ls[1], ls[2]
            strand = bioLibCG.switchStrandFormat(ls[5])
            print(bioLibCG.makeTcc(chrom, strand, start, end))
def addOne(fN):
    '''Print the tcc in column 0 of every row with start and end raised by one.'''
    # The handle was previously never closed.
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            chrom, strand, start, end = bioLibCG.tccSplit(ls[0])
            print(bioLibCG.makeTcc(chrom, strand, start + 1, end + 1))
def loadEditingSites(fN, nt = 'A'):
    '''Using our labs format, load the editing sites into a list.

    The first line of fN is skipped as a header.  Every following
    tab-separated row becomes an EditingSite with chromosome (column 0),
    coordinate (column 1, int), gene (column 3), eRatio (column 6, kept as
    text) and ID (column 13, int).  The strand is '-1' when the reference
    base in column 2 equals the complement of nt, otherwise '1'; tcc is the
    single-base coordinate on that strand.

    Raises KeyError if nt is not one of A, T, G or C.
    '''
    cBases = {'A':'T', 'T':'A', 'G':'C', 'C':'G'}
    # The complement depends only on nt, so look it up once.
    cBase = cBases[nt]

    eList = []
    # `with` closes the file even when a row fails to parse.
    with open(fN, 'r') as f:
        f.readline() #header
        for line in f:
            ls = line.strip().split('\t')

            e = EditingSite()
            e.chromosome = ls[0]
            e.coordinate = int(ls[1])
            e.gene = ls[3]
            e.eRatio = ls[6]

            refBase = ls[2]
            if refBase == cBase:
                e.strand = '-1'
            else:
                e.strand = '1'

            e.tcc = bioLibCG.makeTcc(e.chromosome, e.strand, e.coordinate, e.coordinate)
            e.ID = int(ls[13])
            eList.append(e)

    return eList
def markCenterExpression(aFN, wigDir, rn = None, tn = None):
    '''Store, per alignment, the share of expression found in three centre windows.

    For each alignment the targeted span is rebuilt from tTcc, tStart and
    sLength (with a 25-base extension), its expression profile is read from
    the wig data under wigDir, and centerExpression is set to the fractions
    of total expression that fall in sorted positions [8:12], [7:13] and
    [6:14].  Positions are taken in descending order on strand '-1'.  The
    fractions stay at 0.0 when total expression is zero or when tELevel does
    not occur among the profile values.
    '''
    extend = 25
    centerWindows = ((8, 12), (7, 13), (6, 14))

    timer = bioLibCG.cgTimer()
    timer.start()

    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['centerExpression', 'tTcc', 'tStart', 'sLength', 'tELevel'], [rn, tn])

    #load expression of degradome
    wigDict = cgWig.loadWigDict(wigDir)

    for aID in aNX.centerExpression:
        aNX.centerExpression[aID] = [0.0, 0.0, 0.0]

        chrom, strand, start, end = bioLibCG.tccSplit(aNX.tTcc[aID])
        offset = aNX.tStart[aID]
        sLen = aNX.sLength[aID]
        if strand == '1':
            start = start - extend + offset
            end = start + sLen
        else:
            end = end + extend - offset
            start = end - sLen

        scanRange = bioLibCG.makeTcc(chrom, strand, start, end)
        stretch = cgWig.getExpressionProfile(scanRange, wigDict)

        #make sure peak is in the small range
        peakLevel = aNX.tELevel[aID]
        peakInRange = peakLevel in stretch.values()

        expressionSum = sum(stretch.values())
        orderedCoords = sorted(stretch.keys(), reverse=(strand == '-1'))

        if expressionSum == 0 or not peakInRange:
            continue

        for slot, (lo, hi) in enumerate(centerWindows):
            # Start from 0.0 so the division below is always a float division.
            windowTotal = sum((stretch[coord] for coord in orderedCoords[lo:hi]), 0.0)
            aNX.centerExpression[aID][slot] = windowTotal / expressionSum

    aNX.save()
def truncate5(fN):
    '''Write fN + '.trun5' with 50 bases removed from one end of each tcc.

    Rows of the tab-separated fN carry the tcc in column 2.  Rows spanning
    fewer than 50 are counted and skipped.  On strand '1' the start is raised
    by 50; on strand '-1' the end is lowered by 50.  The first three columns
    are written back with the adjusted tcc.  The skipped and total row counts
    are printed at the end.

    Returns 1 (after printing 'error') as soon as a row has any other strand
    value; otherwise returns None.
    '''
    tCount = 0
    shortCount = 0

    # Both handles used to be left open, including on the early error return.
    with open(fN + '.trun5', 'w') as fOut:
        with open(fN, 'r') as f:
            for line in f:
                ls = line.strip().split('\t')
                tcc = ls[2]
                tCount += 1
                c, s, st, en = bioLibCG.tccSplit(tcc)

                if en - st < 50:
                    shortCount += 1
                    continue

                if s == '1':
                    st = st + 50
                elif s == '-1':
                    en = en - 50
                else:
                    print('error')
                    return 1

                ls[2] = bioLibCG.makeTcc(c, s, st, en)
                fOut.write('\t'.join(str(x) for x in [ls[0], ls[1], ls[2]]) + '\n')

    print('%s %s' % (shortCount, tCount))
def makeTranscriptome(tranFN, outFN, assembly='hg19'):
    '''Write the spliced sequence of every transcript row as a FASTA-like record.

    Each row of the tab-separated tranFN supplies the transcript id
    (column 0), chromosome (column 1), strand (column 2, converted with
    bioLibCG.switchStrandFormat), comma-terminated exon start and end lists
    (columns 8 and 9) and the gene id (column 10).  Exon starts are raised by
    one before fetching.  The exon sequences are concatenated in listed order
    and the result is reversed when the strand is '-1'.

    Each record is '> id:gene:length', the sequence, then a blank line.
    assembly names the genome to fetch from and defaults to the previously
    hard-coded 'hg19'.
    '''
    p = bioLibCG.cgPrint()
    p.show = False
    genome = GenomeFetch.GenomeFetch(assembly)

    # `with` closes both files even when a row fails to parse.
    with open(outFN, 'w') as fOut:
        with open(tranFN, 'r') as f:
            for line in f:
                ls = line.strip().split('\t')
                tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
                # [:-1] drops the trailing comma of each list column
                exonStarts = [int(x) + 1 for x in ls[8][:-1].split(',')]
                exonEnds = [int(x) for x in ls[9][:-1].split(',')]
                tID = ls[0]
                gID = ls[10]

                seqList = []
                for eStart, eEnd in zip(exonStarts, exonEnds):
                    tcc = bioLibCG.makeTcc(tChrom, tStrand, eStart, eEnd)
                    seqList.append(genome.getSequence(tcc))
                mRNA = ''.join(seqList)

                #reverse direction if negative strand
                if tStrand == '-1':
                    mRNA = mRNA[::-1]

                fOut.write('> %s:%s:%s\n' % (tID, gID, len(mRNA)))
                fOut.write(mRNA + '\n\n')
def truncate(fN):
    '''Write fN + '.trun' with 50 bases removed from one end of each tcc.

    Rows of the tab-separated fN carry the tcc in column 2.  Rows spanning
    fewer than 50 are counted and skipped.  On strand '1' the end is lowered
    by 50; on strand '-1' the start is raised by 50.  The first three columns
    are written back with the adjusted tcc.  The skipped and total row counts
    are printed at the end.

    Returns 1 (after printing 'error') as soon as a row has any other strand
    value; otherwise returns None.
    '''
    tCount = 0
    shortCount = 0

    # Both handles used to be left open, including on the early error return.
    with open(fN + '.trun', 'w') as fOut:
        with open(fN, 'r') as f:
            for line in f:
                ls = line.strip().split('\t')
                tcc = ls[2]
                tCount += 1
                c, s, st, en = bioLibCG.tccSplit(tcc)

                if en - st < 50:
                    shortCount += 1
                    continue

                if s == '1':
                    en = en - 50
                elif s == '-1':
                    st = st + 50
                else:
                    print('error')
                    return 1

                ls[2] = bioLibCG.makeTcc(c, s, st, en)
                fOut.write('\t'.join(str(x) for x in [ls[0], ls[1], ls[2]]) + '\n')

    print('%s %s' % (shortCount, tCount))
def makePeakInputQ(cName, minExpression = 2000):
    '''Uses shell script and qsub to get peaks quickly

    Every acceptable chromosome of the configured assembly is cut at the
    points returned by rangePoints(1, length, 30) and, for both strands, each
    consecutive pair of points is submitted as one qsub job running q.sh with
    the tcc, cName and minExpression.  Each submission is waited on before
    the next one is made.

    NOTE(review): the stdout log name only contains the start coordinate, so
    jobs for different chromosomes/strands share log paths -- confirm that
    this is intended.
    '''
    mConf = c.getConfig('Main.conf')
    conf = c.getConfig(cName)
    assembly = conf.conf['assembly']

    chromLens = cg.returnChromLengthDict(assembly)

    for chrom in chromLens:
        if chrom not in cg.acceptableChroms:
            continue
        for strand in ['1','-1']:
            print('%s %s %s' % ('Getting Peaks for ', chrom, strand))
            start = 0
            for point in rangePoints(1, chromLens[chrom], 30):
                # the first point (1) only seeds the start of the first window
                if point != 1:
                    end = point
                    tcc = cg.makeTcc(chrom, strand, start, end)
                    log = 'logs/o-' + str(start)
                    elog = 'logs/e-%s-%s-%s-%s' % (chrom, strand, start, end)
                    command = ['qsub', '-V', '-cwd', '-e', elog, '-o', log,
                               '-l', 'mem=3G', '-l', 'rt=3600',
                               'q.sh', tcc, cName, str(minExpression)]
                    subprocess.Popen(command).wait()
                start = point
def profileAroundPoint(zeroPoint, span, cName, ratio=False, ratioCoord=None):
    '''span is +/- that number... if you put in 30, you'll get -29 to 29 around 0
    zeroPoint must be in tcc format with point at start
    ratio will return all points around zero point as a ratio of zero point
    so zeroPoint = 1.0 and the rest will be fractions of that...

    Returns a dict keyed by offset from the zero point.  With ratio set, the
    values are divided by the level at ratioCoord (when truthy) or at the
    zero point; a reference level of 0 is replaced by 1.
    '''
    chrom, strand, zPoint, end = cg.tccSplit(zeroPoint)
    window = cg.makeTcc(chrom, strand, zPoint - span, zPoint + span)
    scanDict = svCoord([window], cName)

    #reorient so that zero is at zero
    offsets = range(1 - span, span)

    if not ratio:
        return dict((i, scanDict[zPoint + i]) for i in offsets)

    reference = scanDict[ratioCoord] if ratioCoord else scanDict[zPoint]
    if reference == 0:
        reference = 1

    return dict((i, float(scanDict[zPoint + i]) / float(reference)) for i in offsets)
def makePeakInputQ(cName, minExpression=2000):
    '''Uses shell script and qsub to get peaks quickly

    Every acceptable chromosome of the configured assembly is cut at the
    points returned by rangePoints(1, length, 30) and, for both strands, each
    consecutive pair of points is submitted as one qsub job running q.sh with
    the tcc, cName and minExpression.  Each submission is waited on before
    the next one is made.

    NOTE(review): the stdout log name only contains the start coordinate, so
    jobs for different chromosomes/strands share log paths -- confirm that
    this is intended.
    '''
    mConf = c.getConfig('Main.conf')
    conf = c.getConfig(cName)
    assembly = conf.conf['assembly']

    chromLens = cg.returnChromLengthDict(assembly)

    for chrom in chromLens:
        if chrom not in cg.acceptableChroms:
            continue
        for strand in ['1', '-1']:
            print('%s %s %s' % ('Getting Peaks for ', chrom, strand))
            start = 0
            for point in rangePoints(1, chromLens[chrom], 30):
                # the first point (1) only seeds the start of the first window
                if point != 1:
                    end = point
                    tcc = cg.makeTcc(chrom, strand, start, end)
                    log = 'logs/o-' + str(start)
                    elog = 'logs/e-%s-%s-%s-%s' % (chrom, strand, start, end)
                    command = ['qsub', '-V', '-cwd', '-e', elog, '-o', log,
                               '-l', 'mem=3G', '-l', 'rt=3600',
                               'q.sh', tcc, cName, str(minExpression)]
                    subprocess.Popen(command).wait()
                start = point
def profileAroundPoint(zeroPoint, span, cName, ratio = False, ratioCoord = None):
    '''span is +/- that number... if you put in 30, you'll get -29 to 29 around 0
    zeroPoint must be in tcc format with point at start
    ratio will return all points around zero point as a ratio of zero point
    so zeroPoint = 1.0 and the rest will be fractions of that...

    Returns a dict keyed by offset from the zero point.  With ratio set, the
    values are divided by the level at ratioCoord (when truthy) or at the
    zero point; a reference level of 0 is replaced by 1.
    '''
    chrom, strand, zPoint, end = cg.tccSplit(zeroPoint)
    window = cg.makeTcc(chrom, strand, zPoint - span, zPoint + span)
    scanDict = svCoord([window], cName)

    #reorient so that zero is at zero
    offsets = range(1 - span, span)

    if not ratio:
        return dict((i, scanDict[zPoint + i]) for i in offsets)

    reference = scanDict[ratioCoord] if ratioCoord else scanDict[zPoint]
    if reference == 0:
        reference = 1

    return dict((i, float(scanDict[zPoint + i]) / float(reference)) for i in offsets)
def markCenterExpression(aFN, wigDir, rn=None, tn=None):
    '''Store, per alignment, the share of expression found in three centre windows.

    For each alignment the targeted span is rebuilt from tTcc, tStart and
    sLength (with a 25-base extension), its expression profile is read from
    the wig data under wigDir, and centerExpression is set to the fractions
    of total expression that fall in sorted positions [8:12], [7:13] and
    [6:14].  Positions are taken in descending order on strand '-1'.  The
    fractions stay at 0.0 when total expression is zero or when tELevel does
    not occur among the profile values.
    '''
    extend = 25
    centerWindows = ((8, 12), (7, 13), (6, 14))

    timer = bioLibCG.cgTimer()
    timer.start()

    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['centerExpression', 'tTcc', 'tStart', 'sLength', 'tELevel'], [rn, tn])

    #load expression of degradome
    wigDict = cgWig.loadWigDict(wigDir)

    for aID in aNX.centerExpression:
        aNX.centerExpression[aID] = [0.0, 0.0, 0.0]

        chrom, strand, start, end = bioLibCG.tccSplit(aNX.tTcc[aID])
        offset = aNX.tStart[aID]
        sLen = aNX.sLength[aID]
        if strand == '1':
            start = start - extend + offset
            end = start + sLen
        else:
            end = end + extend - offset
            start = end - sLen

        scanRange = bioLibCG.makeTcc(chrom, strand, start, end)
        stretch = cgWig.getExpressionProfile(scanRange, wigDict)

        #make sure peak is in the small range
        peakLevel = aNX.tELevel[aID]
        peakInRange = peakLevel in stretch.values()

        expressionSum = sum(stretch.values())
        orderedCoords = sorted(stretch.keys(), reverse=(strand == '-1'))

        if expressionSum == 0 or not peakInRange:
            continue

        for slot, (lo, hi) in enumerate(centerWindows):
            # Start from 0.0 so the division below is always a float division.
            windowTotal = sum((stretch[coord] for coord in orderedCoords[lo:hi]), 0.0)
            aNX.centerExpression[aID][slot] = windowTotal / expressionSum

    aNX.save()
def extendPeakTest(tcc, pRange, minVal, maxAvgNoise, minPeakLength, maxPeakLength, cName):
    '''Extend a peak outward and accept it if its shape passes the filters.

    The ratio profile around the peak (offsets 1 - pRange .. pRange - 1) is
    walked left and right from the centre for as long as the value stays
    above minVal.  Everything in the profile outside the extended stretch is
    treated as noise.

    Returns the tcc of the extended stretch when its length lies strictly
    between minPeakLength and maxPeakLength and the average noise is below
    maxAvgNoise.  Returns False otherwise, including when no positions are
    left outside the stretch to estimate noise from.
    '''
    chrom, strand, peakPosition, end = cg.tccSplit(tcc)
    cProfile = stepVectorScan.profileAroundPoint(tcc, pRange, cName, ratio = True)

    #extend this peak left and right, going from the middle outward
    leftRange = list(range(-1, -pRange, -1))
    rightRange = list(range(1, pRange))

    #left
    startFinal = leftRange[-1]
    for i in leftRange:
        if cProfile[i] > minVal:
            print(' extending stretch')
        else:
            print(' end of stretch L')
            startFinal = i + 1
            break

    #right
    endFinal = rightRange[-1]
    for i in rightRange:
        if cProfile[i] > minVal:
            print(' extending stretch')
        else:
            print(' end of stretch R')
            endFinal = i - 1
            break

    peakLength = endFinal - startFinal + 1

    #avg expression around peak check...
    low = startFinal
    high = endFinal
    lowRange = list(range(1 - pRange, low))
    highRange = list(range(high + 1, pRange))
    totalLength = len(lowRange) + len(highRange)
    print('%s %s %s %s %s %s' % (totalLength, pRange, low, high, lowRange, highRange))

    # An explicit guard replaces the former bare `except:` around the
    # division, which would also have hidden unrelated errors.
    if totalLength == 0:
        return False

    noiseExpression = 0
    for i in lowRange + highRange:
        noiseExpression += cProfile[i]
    avgNoise = noiseExpression / float(totalLength)

    #filter out peaks that look a certain way.
    if (minPeakLength < peakLength < maxPeakLength) and (avgNoise < maxAvgNoise):
        goodTcc = cg.makeTcc(chrom, strand, peakPosition + startFinal, peakPosition + endFinal)
        print('*KEEPER')
        return goodTcc
    return False
def getOverlappingElements(self, tcc):
    '''Given region, Which element (INTRON, EXON, 5UTR, 3UTR)

    Returns a list of [segment, label] pairs, one for every 5' UTR segment,
    exon, intron and 3' UTR segment of this gene (checked in that order)
    whose tcc overlaps the given tcc.  An IndexError raised while walking
    either UTR list ends that walk quietly; pairs collected before it are
    kept.
    '''
    overlappingElements = []

    def collect(segments, label):
        # Append [segment, label] for every segment overlapping the region.
        for segment in segments:
            segmentTcc = bioLibCG.makeTcc(self.chromosome, self.strand, segment[0], segment[1])
            if bioLibCG.tccOverlap(segmentTcc, tcc):
                overlappingElements.append([segment, label])

    try:
        collect(self.utr5, '5UTR')
    except IndexError:
        pass

    collect(self.exonList, 'EXON')
    collect(self.intronList, 'INTRON')

    try:
        collect(self.utr3, '3UTR')
    except IndexError:
        pass

    #!!!Eventually add a way to find if overlapping EXON_UTR as well
    # NOTE: an earlier attempt to collapse EXON + INTRON hits into a single
    # 'EXON_INTRON' label tested for the bare strings 'EXON' / 'INTRON' in
    # this list.  The entries are [segment, label] pairs, so that test could
    # never succeed; the unreachable branch has been dropped and the
    # returned pairs are unchanged.
    return overlappingElements
def getTccFromBowtieLine(line):
    '''Build a tcc from one tab-separated Bowtie output line.

    Column 2 is the chromosome, column 1 the strand ('+' becomes '1',
    anything else '-1'), column 3 the start; the end is the start plus the
    length of column 4.
    '''
    fields = line.strip().split('\t')
    strand = '1' if fields[1] == '+' else '-1'
    start = int(fields[3])
    end = start + len(fields[4])
    return cg.makeTcc(fields[2], strand, start, end)
def getTccFromBedLine(line):
    '''Build a tcc from one tab-separated BED line.

    Column 0 is the chromosome, columns 1 and 2 the integer start and end,
    column 5 the strand ('+' becomes '1', anything else '-1').
    '''
    fields = line.strip().split('\t')
    strand = '1' if fields[5] == '+' else '-1'
    start = int(fields[1])
    end = int(fields[2])
    return cg.makeTcc(fields[0], strand, start, end)
def toTcc(fN, strand):
    '''Print a tcc for every 'chrom:start-end' entry in column 0 of fN.

    The given strand is used for every row; start and end are passed on as
    the text read from the file.
    '''
    # The handle was previously never closed.
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            locus = ls[0].split(':')
            chrom = locus[0]
            bounds = locus[1].split('-')
            start, end = bounds[0], bounds[1]
            print(bioLibCG.makeTcc(chrom, strand, start, end))
def getTccFromUCSCLine(line):
    '''format may change...

    Builds a tcc from one tab-separated line: column 1 is the chromosome,
    columns 2 and 3 the integer start and end, column 6 the strand ('+'
    becomes '1', anything else '-1').
    '''
    fields = line.strip().split('\t')
    strand = '1' if fields[6] == '+' else '-1'
    start = int(fields[2])
    end = int(fields[3])
    return cg.makeTcc(fields[1], strand, start, end)
def peakToSeq(peakFN, extend, outFN, assembly):
    '''Write the genomic sequence of every peak, widened by `extend`, to outFN.

    Column 0 of each tab-separated row of peakFN is a tcc.  Its start is
    lowered and its end raised by int(extend) (a negative value shrinks the
    span) before the sequence is fetched from the given assembly.  One
    sequence is written per line.
    '''
    #extend is +25 for degradome and -6/-4 for oRNA
    extend = int(extend)
    gf = GenomeFetch.GenomeFetch(assembly)

    # Neither file used to be closed, so the output was not reliably flushed.
    with open(outFN, 'w') as outF:
        with open(peakFN, 'r') as f:
            for line in f:
                ls = line.strip().split('\t')
                chrom, strand, start, end = bioLibCG.tccSplit(ls[0])
                newTcc = bioLibCG.makeTcc(chrom, strand, start - extend, end + extend)
                outF.write(gf.getSequence(newTcc) + '\n')
def plotResults(rFN, smallCName, degCName):
    '''Plot every result row of rFN with 30 bases of flank on each side.

    Column 0 of each tab-separated row is a tcc.  The widened tcc is passed
    to cgPlot.plotSmallDeg together with both cNames, the literal
    'newResults', the stripped row text and the 1-based row number as a
    string.
    '''
    # The handle was previously never closed.
    with open(rFN, 'r') as f:
        for i, line in enumerate(f, 1):
            row = line.strip()
            chrom, strand, start, end = bioLibCG.tccSplit(row.split('\t')[0])
            newTcc = bioLibCG.makeTcc(chrom, strand, start - 30, end + 30)
            cgPlot.plotSmallDeg(newTcc, smallCName, degCName, 'newResults', row, str(i))
def peakToSeq(peakFN, extend, outFN, assembly='hg19'):
    '''Write the genomic sequence of every peak, widened by `extend`, to outFN.

    Column 0 of each tab-separated row of peakFN is a tcc.  Its start is
    lowered and its end raised by int(extend) (a negative value shrinks the
    span) before the sequence is fetched.  One sequence is written per line.
    assembly names the genome to fetch from and defaults to the previously
    hard-coded 'hg19'.
    '''
    #extend is +25 for degradome and -6/-4 for oRNA
    extend = int(extend)
    gf = GenomeFetch.GenomeFetch(assembly)

    # Neither file used to be closed, so the output was not reliably flushed.
    with open(outFN, 'w') as outF:
        with open(peakFN, 'r') as f:
            for line in f:
                ls = line.strip().split('\t')
                chrom, strand, start, end = bioLibCG.tccSplit(ls[0])
                newTcc = bioLibCG.makeTcc(chrom, strand, start - extend, end + extend)
                outF.write(gf.getSequence(newTcc) + '\n')
def plotPairs(oDir, aDir, cName):
    '''Plot the expression profile across the targeted site of an oRNA/target pair.

    Loads the origin RNAs from oDir and the alignments from aDir, then walks
    the filteredTargets of every oRNA whose passedFilter is set.  For a pair
    it prints some diagnostics, rebuilds the targeted span from the
    alignment, and plots the profile levels against their coordinates.

    NOTE(review): this source was recovered without indentation.  The
    plt.plot / plt.show / return 0 statements are placed inside the inner
    loop here, which means the function returns after showing the first
    pair -- confirm that this matches the intended layout.
    '''
    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    for oID, oRNA in id_oRNA.items():
        # only oRNAs that passed the filter are plotted
        if not oRNA.passedFilter: continue

        for aID in oRNA.filteredTargets:
            alignment = id_alignment[aID]
            chrom, strand, start, end = bioLibCG.tccSplit(alignment.tTcc)
            offset = alignment.tStart
            sLen = alignment.sLength

            # diagnostics for the pair being plotted
            print sLen
            print oRNA.sequence
            print oRNA.tcc
            print alignment.tTcc

            # rebuild the targeted span from the alignment offset and length
            if strand == '1':
                start = start - 19 + offset
                end = start + sLen
            else:
                end = end + 19 - offset
                start = end - sLen

            print chrom, strand, start, end
            scanRange = bioLibCG.makeTcc(chrom, strand, start, end)

            stretch = cgPeaks.stretch(scanRange, cName)
            sortedKeys = stretch.profile.keys()
            sortedKeys.sort()

            # coordinates are taken in descending order on the minus strand
            if strand == '-1':
                sortedKeys.reverse()

            # the 1..sLen+1 axis is immediately replaced by the coordinates
            xVals = range(1, sLen + 2)
            xVals = sortedKeys
            yVals = [stretch.profile[x] for x in sortedKeys]
            print xVals, len(xVals)
            print yVals, len(yVals)

            plt.plot(xVals, yVals)
            plt.show()
            return 0
def getOverlappingElements(self, tcc):
    """Given region, Which element (INTRON, EXON, 5UTR, 3UTR)

    Returns a list of [segment, label] pairs, one for every 5' UTR segment,
    exon, intron and 3' UTR segment of this gene (checked in that order)
    whose tcc overlaps the given tcc.  An IndexError raised while walking
    either UTR list ends that walk quietly; pairs collected before it are
    kept.
    """
    overlappingElements = []

    def collect(segments, label):
        # Append [segment, label] for every segment overlapping the region.
        for segment in segments:
            segmentTcc = bioLibCG.makeTcc(self.chromosome, self.strand, segment[0], segment[1])
            if bioLibCG.tccOverlap(segmentTcc, tcc):
                overlappingElements.append([segment, label])

    try:
        collect(self.utr5, "5UTR")
    except IndexError:
        pass

    collect(self.exonList, "EXON")
    collect(self.intronList, "INTRON")

    try:
        collect(self.utr3, "3UTR")
    except IndexError:
        pass

    #!!!Eventually add a way to find if overlapping EXON_UTR as well
    # NOTE: an earlier attempt to collapse EXON + INTRON hits into a single
    # "EXON_INTRON" label tested for the bare strings "EXON" / "INTRON" in
    # this list.  The entries are [segment, label] pairs, so that test could
    # never succeed; the unreachable branch has been dropped and the
    # returned pairs are unchanged.
    return overlappingElements
def plotASProfile(tcc, cName, directory = None, min = 0, extra = "0"):
    '''Draw the sense and antisense expression profiles of a tcc into one PNG.

    The image is named '<extra>.<tcc>.png' and placed in `directory` when one
    is given.  The antisense profile is read from the same coordinates on the
    flipped strand.  Returns 0 without drawing when the highest level of
    either profile is below `min`.
    '''
    pngName = extra + '.' + tcc + '.png'
    if directory:
        fN = directory + '/' + pngName
    else:
        fN = pngName

    #Get S Profile
    senseStretch = cgPeaks.stretch(tcc, cName)
    if senseStretch.getHighestLevel() < min:
        return 0
    sortedX = sorted(senseStretch.profile.keys())
    sortedY = [senseStretch.profile[coord] for coord in sortedX]

    #Get AS Profile
    chrom, strand, start, end = tcc.strip().split(':')
    flipped = '-1' if strand == '1' else '1'
    antiStretch = cgPeaks.stretch(cg.makeTcc(chrom, flipped, start, end), cName)
    if antiStretch.getHighestLevel() < min:
        return 0 #AS can have minimum I guess...
    sortedXAS = sorted(antiStretch.profile.keys())
    sortedYAS = [antiStretch.profile[coord] for coord in sortedXAS]

    #Plot them
    gDevice = importr('grDevices')
    gDevice.png(file=fN, width=1680, height=1050)
    r('split.screen(c(2,1))')

    r('screen(1)')
    r.plot(sortedX, sortedY, xlab = "Coordinates", ylab = "(Syn) Expression Level")
    r.lines(sortedX, sortedY, type = "b")

    r('screen(2)')
    r.plot(sortedXAS, sortedYAS, xlab = "Coordinates", ylab = "(Anti) Expression Level")
    r.lines(sortedXAS, sortedYAS, type = "b")

    gDevice.dev_off()
def updateSequence(oFN, oFF, extend, assembly):
    '''Refresh the stored sequence of every record from its widened tcc.

    For each record, the start of its tcc is lowered and the end raised by
    `extend`, the sequence of that span is fetched from the given assembly
    and stored as the record's sequence.  The data set is saved once all
    records have been visited.
    '''
    NX = Nexus(oFN, oFF)
    NX.load(['sequence', 'tcc'])

    fetcher = GenomeFetch.GenomeFetch(assembly)

    while NX.nextID():
        chrom, strand, start, end = bioLibCG.tccSplit(NX.tcc)
        widened = bioLibCG.makeTcc(chrom, strand, start - extend, end + extend)
        NX.sequence = fetcher.getSequence(widened)

    NX.save()
def plotResults(rFN, smallCName, degCName):
    '''Plot every result row of rFN with 30 bases of flank on each side.

    Column 0 of each tab-separated row is a tcc.  The widened tcc is passed
    to cgPlot.plotSmallDeg together with both cNames, the literal
    'newResults', the stripped row text and the 1-based row number as a
    string.
    '''
    # The handle was previously never closed.
    with open(rFN, 'r') as f:
        for i, line in enumerate(f, 1):
            row = line.strip()
            chrom, strand, start, end = bioLibCG.tccSplit(row.split('\t')[0])
            newTcc = bioLibCG.makeTcc(chrom, strand, start - 30, end + 30)
            cgPlot.plotSmallDeg(newTcc, smallCName, degCName, 'newResults', row, str(i))
def countWithBinsSet(dFN, binDir, type = 'INTRON'):
    '''Count how many peak positions fall into each pre-computed bin.

    For every chromosome in bioLibCG.humanChromosomes and both strands the
    file '<binDir>/<type>.<chrom>.<strand>.bins' is read; columns 1..numBins
    of each row are tccs whose covered coordinates are pooled per bin.  Each
    peak tcc from dFN is flipped to the opposite strand and collapsed to a
    single coordinate (its start when the peak strand is '1', its end
    otherwise) before being looked up.  One 'bin<TAB>count' line is printed
    per bin.

    NOTE(review): the per-bin print is placed after all peaks have been
    counted for that bin; the original layout was recovered without
    indentation -- confirm.
    '''
    dNX = cgNexusFlat.Nexus(dFN, cgDegPeak.Peak)
    dNX.load(['tcc'])

    numBins = 1

    c_s_bin_set = {}
    for chrom in bioLibCG.humanChromosomes:
        for strand in ('1', '-1'):
            #initialize data structure
            binSets = c_s_bin_set.setdefault(chrom, {}).setdefault(strand, {})
            for i in range(numBins):
                binSets[i] = set()

            binFN = binDir + '/%s.%s.%s.bins' % (type, chrom, strand)
            # One file is opened per chromosome/strand; none were ever
            # closed before, so close each as soon as it has been read.
            with open(binFN, 'r') as f:
                for line in f:
                    ls = line.strip().split('\t')
                    tccs = ls[1:numBins + 1]
                    for i in range(numBins):
                        ch, st, sta, end = bioLibCG.tccSplit(tccs[i])
                        binSets[i].update(range(sta, end + 1))

    #collect dTtcs in list
    dTccs = []
    for dID in dNX.tcc:
        c, s, st, en = bioLibCG.tccSplit(dNX.tcc[dID])
        if s == '1':
            s = '-1'
            en = st
        else:
            s = '1'
            st = en
        dTccs.append(bioLibCG.makeTcc(c, s, st, en))

    #make bCounts
    binCounts = [0] * numBins

    #count for each bin
    for i in range(numBins):
        for dTcc in dTccs:
            c, s, st, en = bioLibCG.tccSplit(dTcc)
            positions = c_s_bin_set[c][s][i]
            binCounts[i] += sum(1 for j in range(st, en + 1) if j in positions)

        print('%s\t%s' % (i, binCounts[i]))
def countWithBinsSet(dFN, binDir, type='INTRON'):
    '''Count how many peak positions fall into each pre-computed bin.

    For every chromosome in bioLibCG.humanChromosomes and both strands the
    file '<binDir>/<type>.<chrom>.<strand>.bins' is read; columns 1..numBins
    of each row are tccs whose covered coordinates are pooled per bin.  Each
    peak tcc from dFN is flipped to the opposite strand and collapsed to a
    single coordinate (its start when the peak strand is '1', its end
    otherwise) before being looked up.  One 'bin<TAB>count' line is printed
    per bin.

    NOTE(review): the per-bin print is placed after all peaks have been
    counted for that bin; the original layout was recovered without
    indentation -- confirm.
    '''
    dNX = cgNexusFlat.Nexus(dFN, cgDegPeak.Peak)
    dNX.load(['tcc'])

    numBins = 1

    c_s_bin_set = {}
    for chrom in bioLibCG.humanChromosomes:
        for strand in ('1', '-1'):
            #initialize data structure
            binSets = c_s_bin_set.setdefault(chrom, {}).setdefault(strand, {})
            for i in range(numBins):
                binSets[i] = set()

            binFN = binDir + '/%s.%s.%s.bins' % (type, chrom, strand)
            # One file is opened per chromosome/strand; none were ever
            # closed before, so close each as soon as it has been read.
            with open(binFN, 'r') as f:
                for line in f:
                    ls = line.strip().split('\t')
                    tccs = ls[1:numBins + 1]
                    for i in range(numBins):
                        ch, st, sta, end = bioLibCG.tccSplit(tccs[i])
                        binSets[i].update(range(sta, end + 1))

    #collect dTtcs in list
    dTccs = []
    for dID in dNX.tcc:
        c, s, st, en = bioLibCG.tccSplit(dNX.tcc[dID])
        if s == '1':
            s = '-1'
            en = st
        else:
            s = '1'
            st = en
        dTccs.append(bioLibCG.makeTcc(c, s, st, en))

    #make bCounts
    binCounts = [0] * numBins

    #count for each bin
    for i in range(numBins):
        for dTcc in dTccs:
            c, s, st, en = bioLibCG.tccSplit(dTcc)
            positions = c_s_bin_set[c][s][i]
            binCounts[i] += sum(1 for j in range(st, en + 1) if j in positions)

        print('%s\t%s' % (i, binCounts[i]))
def markCenterExpression(aDir, cName):
    '''Store, per alignment, the share of expression found in three centre windows.

    For every alignment under aDir the targeted span is rebuilt from tTcc,
    tStart and sLength, its profile is read through cgPeaks.stretch, and
    centerExpression is set to the fractions of total expression that fall
    in sorted positions [8:12], [7:13] and [6:14].  Positions are taken in
    descending order on strand '-1'.  The fractions stay at 0.0 when the
    total expression is zero.  The alignments are committed at the end.
    '''
    centerWindows = ((8, 12), (7, 13), (6, 14))

    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    for alignment in id_alignment.values():
        alignment.centerExpression = [0.0, 0.0, 0.0]

        chrom, strand, start, end = bioLibCG.tccSplit(alignment.tTcc)
        offset = alignment.tStart
        sLen = alignment.sLength
        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen

        stretch = cgPeaks.stretch(bioLibCG.makeTcc(chrom, strand, start, end), cName)
        expressionSum = stretch.getSumOfLevels()
        orderedCoords = sorted(stretch.profile.keys(), reverse=(strand == '-1'))

        if expressionSum == 0:
            continue

        for slot, (lo, hi) in enumerate(centerWindows):
            # Start from 0.0 so the division below is always a float division.
            windowTotal = sum((stretch.profile[coord] for coord in orderedCoords[lo:hi]), 0.0)
            alignment.centerExpression[slot] = windowTotal / expressionSum

    aDC.commit(id_alignment)
def markCenterExpression(aDir, cName):
    '''Store, per alignment, the share of expression found in three centre windows.

    For every alignment under aDir the targeted span is rebuilt from tTcc,
    tStart and sLength, its profile is read through cgPeaks.stretch, and
    centerExpression is set to the fractions of total expression that fall
    in sorted positions [8:12], [7:13] and [6:14].  Positions are taken in
    descending order on strand '-1'.  The fractions stay at 0.0 when the
    total expression is zero.  The alignments are committed at the end.
    '''
    centerWindows = ((8, 12), (7, 13), (6, 14))

    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    for alignment in id_alignment.values():
        alignment.centerExpression = [0.0, 0.0, 0.0]

        chrom, strand, start, end = bioLibCG.tccSplit(alignment.tTcc)
        offset = alignment.tStart
        sLen = alignment.sLength
        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen

        stretch = cgPeaks.stretch(bioLibCG.makeTcc(chrom, strand, start, end), cName)
        expressionSum = stretch.getSumOfLevels()
        orderedCoords = sorted(stretch.profile.keys(), reverse=(strand == '-1'))

        if expressionSum == 0:
            continue

        for slot, (lo, hi) in enumerate(centerWindows):
            # Start from 0.0 so the division below is always a float division.
            windowTotal = sum((stretch.profile[coord] for coord in orderedCoords[lo:hi]), 0.0)
            alignment.centerExpression[slot] = windowTotal / expressionSum

    aDC.commit(id_alignment)
def markCenterExpression(aFN, cName, rn=None, tn=None):
    '''Store, per alignment, the share of expression found in three centre windows.

    For each alignment the targeted span is rebuilt from tTcc, tStart and
    sLength, its profile is read through cgPeaks.stretch, and
    centerExpression is set to the fractions of total expression that fall
    in sorted positions [8:12], [7:13] and [6:14].  Positions are taken in
    descending order on strand '-1'.  The fractions stay at 0.0 when the
    total expression is zero.  The data set is saved at the end.
    '''
    centerWindows = ((8, 12), (7, 13), (6, 14))

    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['centerExpression', 'tTcc', 'tStart', 'sLength'], [rn, tn])

    for aID in aNX.centerExpression:
        aNX.centerExpression[aID] = [0.0, 0.0, 0.0]

        chrom, strand, start, end = bioLibCG.tccSplit(aNX.tTcc[aID])
        offset = aNX.tStart[aID]
        sLen = aNX.sLength[aID]
        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen

        stretch = cgPeaks.stretch(bioLibCG.makeTcc(chrom, strand, start, end), cName)
        expressionSum = stretch.getSumOfLevels()
        orderedCoords = sorted(stretch.profile.keys(), reverse=(strand == '-1'))

        if expressionSum == 0:
            continue

        for slot, (lo, hi) in enumerate(centerWindows):
            # Start from 0.0 so the division below is always a float division.
            windowTotal = sum((stretch.profile[coord] for coord in orderedCoords[lo:hi]), 0.0)
            aNX.centerExpression[aID][slot] = windowTotal / expressionSum

    aNX.save()
def profileTargetsHistoAS(tccList, cName, name='boxplot'):
    '''Box-plot sense and antisense ratio profiles around each target's peak.

    For every tcc the highest peak is located on the sense strand and on the
    antisense strand; targets lacking either peak are skipped.  A +/-50
    ratio profile is then taken around the sense peak coordinate on both
    strands (the antisense one using the antisense peak as its ratio
    reference) and the values are pooled per offset.  Both pools are handed
    to plot.boxPlotHistoAS under the given name.
    '''
    span = 50  # renamed from `range`, which shadowed the builtin
    histDict = {} # {coord: []}
    histDictAS = {}

    for tcc in tccList:
        chrom, strand, start, end = cg.tccSplit(tcc)

        #Get highest peak (sense)
        tccStretch = cgPeaks.stretch(tcc, cName)
        tccStretch.createPeaks(span=2)
        highestCoord = tccStretch.getHighestPeak()
        if highestCoord is None:
            continue

        #AS
        tccStretchAS = cgPeaks.stretch(cg.convertToAS(tcc), cName)
        tccStretchAS.createPeaks(span=2)
        highestCoordAS = tccStretchAS.getHighestPeak()
        if highestCoordAS is None:
            continue

        #profile around point (Sense)
        zPoint = cg.makeTcc(chrom, strand, highestCoord, end)
        cProfile = svs.profileAroundPoint(zPoint, span, cName, ratio=True)
        for coord in cProfile:
            # setdefault replaces a bare `except:` that doubled as the
            # initialiser and would have hidden any other error as well
            histDict.setdefault(coord, []).append(cProfile[coord])

        #profile around point (AS)
        zPoint = cg.convertToAS(zPoint)
        cProfile = svs.profileAroundPoint(zPoint, span, cName, ratio=True, ratioCoord=highestCoordAS)
        for coord in cProfile:
            histDictAS.setdefault(coord, []).append(cProfile[coord])

    plot.boxPlotHistoAS(histDict, histDictAS, name=name)
def markCenterExpression(aFN, cName, rn = None, tn = None):
    '''Store, per alignment, the share of expression found in three centre windows.

    For each alignment the targeted span is rebuilt from tTcc, tStart and
    sLength, its profile is read through cgPeaks.stretch, and
    centerExpression is set to the fractions of total expression that fall
    in sorted positions [8:12], [7:13] and [6:14].  Positions are taken in
    descending order on strand '-1'.  The fractions stay at 0.0 when the
    total expression is zero.  The data set is saved at the end.
    '''
    centerWindows = ((8, 12), (7, 13), (6, 14))

    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['centerExpression', 'tTcc', 'tStart', 'sLength'], [rn, tn])

    for aID in aNX.centerExpression:
        aNX.centerExpression[aID] = [0.0, 0.0, 0.0]

        chrom, strand, start, end = bioLibCG.tccSplit(aNX.tTcc[aID])
        offset = aNX.tStart[aID]
        sLen = aNX.sLength[aID]
        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen

        stretch = cgPeaks.stretch(bioLibCG.makeTcc(chrom, strand, start, end), cName)
        expressionSum = stretch.getSumOfLevels()
        orderedCoords = sorted(stretch.profile.keys(), reverse=(strand == '-1'))

        if expressionSum == 0:
            continue

        for slot, (lo, hi) in enumerate(centerWindows):
            # Start from 0.0 so the division below is always a float division.
            windowTotal = sum((stretch.profile[coord] for coord in orderedCoords[lo:hi]), 0.0)
            aNX.centerExpression[aID][slot] = windowTotal / expressionSum

    aNX.save()
def getTccFromSamLine(line):
    '''SAM has odd formatting at top

    Builds a tcc from one tab-separated SAM line: column 2 is the
    chromosome, column 1 the FLAG, column 3 the start; the end is the start
    plus the length of the sequence in column 9.  The strand is '-1' when
    FLAG bit 0x10 is set and '1' otherwise.

    Lines that cannot be parsed (header lines, for example) produce a
    printed warning and a return value of None.
    '''
    try:
        fields = line.strip().split('\t')
        chrom = fields[2]
        flag = int(fields[1])
        start = int(fields[3])
        end = start + len(fields[9])
    except (IndexError, ValueError):
        print('Warning: line failed parsing')
        print(line.strip())
        return None

    # FLAG is a bit field: 0x10 marks a reverse-strand alignment and may be
    # combined with other bits (272, for instance), so comparing the whole
    # field with '16' mislabelled such reads as forward.
    strand = '-1' if flag & 16 else '1'
    return cg.makeTcc(chrom, strand, start, end)
def locateASignals(dataFN, outFN, rn = None, tn = None):
    '''Write the tcc of every AATAAA hexamer found in the stored sequences.

    Records whose sequence is shorter than 10 are skipped.  Each remaining
    sequence is printed, then scanned frame by frame
    (bioLibCG.returnFrames with size 6); a hit at frame index i is written
    as the span (start + i, start + i + 5) on the record's chromosome and
    strand.
    '''
    #load data
    NX = cgNexusFlat.Nexus(dataFN, ASite)
    NX.load(['coord', 'sequence'], [rn, tn])

    # `with` closes the output even if a record fails part-way through.
    with open(outFN, 'w') as f:
        for siteID in NX.ids:
            chrom, strand, start, end = bioLibCG.tccSplit(NX.coord[siteID])
            sequence = NX.sequence[siteID]
            if len(sequence) < 10:
                continue
            print('%s %s' % (sequence, '\n'))

            for i, frame in enumerate(bioLibCG.returnFrames(sequence, 6)):
                if frame == 'AATAAA':
                    #assume 0-based...?
                    siteStart, siteEnd = start + i, start + i + 5
                    f.write('%s\n' % bioLibCG.makeTcc(chrom, strand, siteStart, siteEnd))
def overlapWithDegradome(dFN, eFN):
    '''Print how many editing sites overlap the widened degradome peaks.

    Editing sites are loaded from eFN.  Column 1 of each tab-separated row
    of dFN is a peak tcc, widened by 3 on each side.  The first five widened
    peaks are printed, followed by the number of results returned by
    compareData.compareTwoTcc for the editing-site tccs against the peaks.
    '''
    eSites = cgEdit.loadEditingSites(eFN)

    degTccs = []
    # The handle was previously never closed.
    with open(dFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            chrom, strand, start, end = bioLibCG.tccSplit(ls[1])
            degTccs.append(bioLibCG.makeTcc(chrom, strand, start - 3, end + 3))

    print(degTccs[0:5])

    eTccs = [eSite.tcc for eSite in eSites]
    overlaps = compareData.compareTwoTcc(eTccs, degTccs, 1)
    print(len(overlaps))
def overlapWithDegradome(dFN, eFN):
    """Print how many editing sites overlap the widened degradome peaks.

    Editing sites are loaded from eFN.  Column 1 of each tab-separated row
    of dFN is a peak tcc, widened by 3 on each side.  The first five widened
    peaks are printed, followed by the number of results returned by
    compareData.compareTwoTcc for the editing-site tccs against the peaks.
    """
    eSites = cgEdit.loadEditingSites(eFN)

    degTccs = []
    # The handle was previously never closed.
    with open(dFN, "r") as f:
        for line in f:
            ls = line.strip().split("\t")
            chrom, strand, start, end = bioLibCG.tccSplit(ls[1])
            degTccs.append(bioLibCG.makeTcc(chrom, strand, start - 3, end + 3))

    print(degTccs[0:5])

    eTccs = [eSite.tcc for eSite in eSites]
    overlaps = compareData.compareTwoTcc(eTccs, degTccs, 1)
    print(len(overlaps))
def profileTargetsHistoAS(tccList, cName, name = 'boxplot'):
    '''Box-plot sense and antisense ratio profiles around each target's peak.

    For every tcc the highest peak is located on the sense strand and on the
    antisense strand; targets lacking either peak are skipped.  A +/-50
    ratio profile is then taken around the sense peak coordinate on both
    strands (the antisense one using the antisense peak as its ratio
    reference) and the values are pooled per offset.  Both pools are handed
    to plot.boxPlotHistoAS under the given name.
    '''
    span = 50  # renamed from `range`, which shadowed the builtin
    histDict = {} # {coord: []}
    histDictAS = {}

    for tcc in tccList:
        chrom, strand, start, end = cg.tccSplit(tcc)

        #Get highest peak (sense)
        tccStretch = cgPeaks.stretch(tcc, cName)
        tccStretch.createPeaks(span = 2)
        highestCoord = tccStretch.getHighestPeak()
        if highestCoord is None:
            continue

        #AS
        tccStretchAS = cgPeaks.stretch(cg.convertToAS(tcc), cName)
        tccStretchAS.createPeaks(span = 2)
        highestCoordAS = tccStretchAS.getHighestPeak()
        if highestCoordAS is None:
            continue

        #profile around point (Sense)
        zPoint = cg.makeTcc(chrom, strand, highestCoord, end)
        cProfile = svs.profileAroundPoint(zPoint, span, cName, ratio = True)
        for coord in cProfile:
            # setdefault replaces a bare `except:` that doubled as the
            # initialiser and would have hidden any other error as well
            histDict.setdefault(coord, []).append(cProfile[coord])

        #profile around point (AS)
        zPoint = cg.convertToAS(zPoint)
        cProfile = svs.profileAroundPoint(zPoint, span, cName, ratio = True, ratioCoord = highestCoordAS)
        for coord in cProfile:
            histDictAS.setdefault(coord, []).append(cProfile[coord])

    plot.boxPlotHistoAS(histDict, histDictAS, name = name)