def _degTargetSiteTcc(alignment):
    '''Return the tcc of the targeted site described by one alignment record.

    The site is computed from the record's tTcc, tStart (offset) and sLength
    attributes with the fixed 19 nt adjustment used by this module.
    '''
    chrom, strand, start, end = cg.tccSplit(alignment.tTcc)
    offset = alignment.tStart
    sLen = alignment.sLength
    if strand == '1':
        start = start - 19 + offset
        end = start + sLen
    else:
        end = end + 19 - offset
        start = end - sLen
    return cg.makeTcc(chrom, strand, start, end)


def transcriptSetOverlapTargets(aDir):
    '''Set transcriptOverlap on every alignment stored under aDir.

    Loads the alignments through cgDB.dataController, converts each target
    site to its antisense tcc (cg.convertToAS), finds which of those overlap a
    transcript of the hard-coded Ensembl gene set, stores True/False in each
    record's transcriptOverlap attribute and commits the records back.
    '''
    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    #get degradome TCCS
    #note that you need to test the AS peaks, this is the location of the targetted transcript
    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    #create list of unique tccs (order kept; the set only speeds up the membership test)
    uniqTccs = []
    seenTccs = set()
    for alignment in id_alignment.values():
        tcc = _degTargetSiteTcc(alignment)
        if tcc not in seenTccs:
            seenTccs.add(tcc)
            uniqTccs.append(tcc)

    degTccs = [cg.convertToAS(x) for x in uniqTccs]

    #find all overlapping exons/transcripts, then all results sequences that overlap exons
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    overlappingDegTccs = set(compare.compareTwoTcc(degTccs, overlappingExonTccs, 1))

    #update: the site must be recomputed from each record itself; reusing the
    #loop variable left over from the pass above would give every record the
    #flag of the last alignment.
    for obj in id_alignment.values():
        degTcc = cg.convertToAS(_degTargetSiteTcc(obj))
        obj.transcriptOverlap = degTcc in overlappingDegTccs

    aDC.commit(id_alignment)
def transcriptSetOverlapDegFileHitmap(degFile, runningChrom, runningStrand):
    '''Tag degradome lines on one chrom/strand with a transcript-overlap flag.

    Builds the set of every coordinate covered by a transcript of the
    hard-coded Ensembl gene set on runningChrom/runningStrand.  Each line of
    degFile whose antisense tcc (second tab-separated column) lies on that
    chrom/strand is passed to cg.appendToLine with '1' if any of its
    coordinates is covered, else '0'.  The resulting lines are written to
    degFile + '.<runningChrom>.<runningStrand>'; other lines are dropped.
    '''
    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    geneSet = cgGenes.createGeneSetFromFile(geneSetFN)

    transcriptTccs = [transcript.tcc
                      for gene in geneSet.set.values()
                      for transcript in gene.transcripts]

    #hitmap of covered coordinates for the running chrom/strand
    coveredCoords = set()
    for transcriptTcc in transcriptTccs:
        tChrom, tStrand, tStart, tEnd = cg.tccSplit(transcriptTcc)
        if tChrom == runningChrom and tStrand == runningStrand:
            coveredCoords.update(range(tStart, tEnd + 1))

    print('done creating hitmap')

    #flag each degradome line that falls on the running chrom/strand
    taggedLines = []
    degIn = open(degFile, 'r')
    for rawLine in degIn:
        fields = rawLine.strip().split('\t')
        dChrom, dStrand, dStart, dEnd = cg.tccSplit(cg.convertToAS(fields[1]))
        if dChrom != runningChrom or dStrand != runningStrand:
            continue
        isCovered = any(pos in coveredCoords for pos in xrange(dStart, dEnd + 1))
        flag = '1' if isCovered else '0'
        taggedLines.append(cg.appendToLine(rawLine, flag, 3))
    degIn.close()

    degOut = open(degFile + '.%s.%s' % (runningChrom, runningStrand), 'w')
    degOut.writelines(taggedLines)
    degOut.close()
def countWithBinsSet(dFN, binDir, type = 'INTRON'):
    '''Count degradome peak coordinates that fall into each position bin.

    dFN    -- Nexus file of cgDegPeak.Peak records; only 'tcc' is loaded.
    binDir -- directory holding '<type>.<chrom>.<strand>.bins' files; columns
              after the first hold one tcc per bin.
    type   -- prefix of the bin files to read.

    Each peak is reduced to a single coordinate on the opposite strand (its
    start for '1' peaks, its end otherwise) before counting.  Prints
    '<bin>\\t<count>' per bin; nothing is returned.
    '''
    dNX = cgNexusFlat.Nexus(dFN, cgDegPeak.Peak)
    dNX.load(['tcc'])

    numBins = 1
    c_s_bin_set = {}
    for chrom in bioLibCG.humanChromosomes:
        for strand in ('1', '-1'):
            #initialize data structure
            bin_set = dict((i, set()) for i in range(numBins))
            c_s_bin_set.setdefault(chrom, {})[strand] = bin_set

            #the with-block closes each bin file; up to two files per
            #chromosome are opened here
            binFN = binDir + '/%s.%s.%s.bins' % (type, chrom, strand)
            with open(binFN, 'r') as f:
                for line in f:
                    ls = line.strip().split('\t')
                    tccs = ls[1:numBins + 1]
                    for i in range(numBins):
                        ch, st, sta, end = bioLibCG.tccSplit(tccs[i])
                        bin_set[i].update(range(sta, end + 1))

    #collect dTccs in list
    dTccs = []
    for dID in dNX.tcc:
        c, s, st, en = bioLibCG.tccSplit(dNX.tcc[dID])
        if s == '1':
            s = '-1'
            en = st
        else:
            s = '1'
            st = en
        dTccs.append(bioLibCG.makeTcc(c, s, st, en))

    #count for each bin
    binCounts = [0] * numBins
    for i in range(numBins):
        for dTcc in dTccs:
            c, s, st, en = bioLibCG.tccSplit(dTcc)
            for j in range(st, en + 1):
                if j in c_s_bin_set[c][s][i]:
                    binCounts[i] += 1
        print('%s\t%s' % (i, binCounts[i]))
def countWithBinsSet(dFN, binDir, type='INTRON'):
    '''Count degradome peak coordinates that fall into each position bin.

    dFN    -- Nexus file of cgDegPeak.Peak records; only 'tcc' is loaded.
    binDir -- directory holding '<type>.<chrom>.<strand>.bins' files; columns
              after the first hold one tcc per bin.
    type   -- prefix of the bin files to read.

    Each peak is reduced to a single coordinate on the opposite strand (its
    start for '1' peaks, its end otherwise) before counting.  Prints
    '<bin>\\t<count>' per bin; nothing is returned.
    '''
    dNX = cgNexusFlat.Nexus(dFN, cgDegPeak.Peak)
    dNX.load(['tcc'])

    numBins = 1
    c_s_bin_set = {}
    for chrom in bioLibCG.humanChromosomes:
        for strand in ('1', '-1'):
            #initialize data structure
            bin_set = dict((i, set()) for i in range(numBins))
            c_s_bin_set.setdefault(chrom, {})[strand] = bin_set

            #context manager guarantees the bin file is closed before the
            #next chrom/strand file is opened
            binFN = binDir + '/%s.%s.%s.bins' % (type, chrom, strand)
            with open(binFN, 'r') as binFile:
                for line in binFile:
                    ls = line.strip().split('\t')
                    tccs = ls[1:numBins + 1]
                    for i in range(numBins):
                        ch, st, sta, end = bioLibCG.tccSplit(tccs[i])
                        bin_set[i].update(range(sta, end + 1))

    #collect dTccs in list
    dTccs = []
    for dID in dNX.tcc:
        c, s, st, en = bioLibCG.tccSplit(dNX.tcc[dID])
        if s == '1':
            s = '-1'
            en = st
        else:
            s = '1'
            st = en
        dTccs.append(bioLibCG.makeTcc(c, s, st, en))

    #count for each bin
    binCounts = [0] * numBins
    for i in range(numBins):
        for dTcc in dTccs:
            c, s, st, en = bioLibCG.tccSplit(dTcc)
            for j in range(st, en + 1):
                if j in c_s_bin_set[c][s][i]:
                    binCounts[i] += 1
        print('%s\t%s' % (i, binCounts[i]))
def testOverlaps(dataFN, oFF):
    '''Print the IDs of records whose tcc overlaps an earlier accepted record.

    Records are visited in Nexus order.  A record whose coordinates are all
    unclaimed claims them for its chrom/strand; a record touching any claimed
    coordinate is reported and does not claim anything itself.
    '''
    dataNX = Nexus(dataFN, oFF)
    dataNX.load(['tcc'])

    overlappingIDs = set()
    chrom_strand_range = {}
    while dataNX.nextID():
        chrom, strand, start, end = bioLibCG.tccSplit(dataNX.tcc)
        claimed = chrom_strand_range.setdefault(chrom, {}).setdefault(strand, set())
        span = range(start, end + 1)

        #tag the record, or let it claim its coordinates
        if any(pos in claimed for pos in span):
            overlappingIDs.add(dataNX.id)
        else:
            claimed.update(span)

    print('%s %s' % ("THESE OVERLAP", overlappingIDs))
def updateContext(oFN, wigDir, chrom, strand, switchStrand = False):
    '''Set the dominant genomic context of every peak on one chrom/strand.

    oFN          -- Nexus file of cgDegPeak.Peak records ('tcc', 'context').
    wigDir       -- directory passed to cgWig.loadSingleWigContext.
    chrom/strand -- chromosome and strand to process; strand may be an int or
                    a string.
    switchStrand -- when true, both the requested strand and each peak's
                    strand are negated before they are compared.

    For matching peaks the comma-separated contexts found at the peak start
    ('INTER' when absent) are reduced with bioLibCG.dominantSpotter and stored
    in the record's context field.  The Nexus file is saved at the end.
    '''
    oNX = cgNexusFlat.Nexus(oFN, cgDegPeak.Peak)
    oNX.load(['tcc', 'context'])

    if switchStrand:
        strand = str(-int(strand))
    else:
        strand = str(strand)

    print('loading wig')
    coord_contexts = cgWig.loadSingleWigContext(wigDir, chrom, strand, 'context')
    print('done loading')

    ds = bioLibCG.dominantSpotter(['C_EXON', 'C_3UTR', 'C_5UTR', 'NC_EXON', 'NC_3UTR', 'NC_5UTR', 'C_INTRON', 'NC_INTRON', 'INTER'])

    for oID in oNX.tcc:
        oChrom, oStrand, start, end = bioLibCG.tccSplit(oNX.tcc[oID])

        #deg wigs is AS to actual clipping site
        #Negate the peak's own strand.  Negating `strand` here (which was
        #already flipped above) yields the opposite of `strand`, so the
        #comparison below could never succeed with switchStrand set.
        if switchStrand:
            oStrand = str(-int(oStrand))
        else:
            oStrand = str(oStrand)

        if oChrom == chrom and oStrand == strand:
            contexts = coord_contexts.get(start, 'INTER').split(',')
            oNX.context[oID] = ds.spotItem(contexts)

    oNX.save()
def updateContext(oFN, wigDir, chrom, strand, rn=None, tn=None):
    '''Set the dominant genomic context of degradome peaks on one chrom/strand.

    Loads 'context' and 'tcc' from oFN (degPeak.degPeak records; rn/tn are
    forwarded to Nexus.load), loads the 'context' wig for chrom/strand, and
    for every peak whose chromosome and flipped strand match, stores the
    item chosen by bioLibCG.dominantSpotter from the comma-separated contexts
    at the peak start ('INTER' when the coordinate is absent).
    '''
    oNX = cgNexusFlat.Nexus(oFN, degPeak.degPeak)
    oNX.load(['context', 'tcc'], [rn, tn])

    print('loading wig')
    coord_contexts = cgWig.loadSingleWigContext(wigDir, chrom, strand, 'context')
    print('done loading')

    contextOrder = [
        'C_EXON', 'C_3UTR', 'C_5UTR',
        'NC_EXON', 'NC_3UTR', 'NC_5UTR',
        'C_INTRON', 'NC_INTRON', 'INTER',
    ]
    ds = bioLibCG.dominantSpotter(contextOrder)

    for oID in oNX.tcc:
        oChrom, oStrand, start, end = bioLibCG.tccSplit(oNX.tcc[oID])

        #deg wigs is AS to actual clipping site
        asStrand = '-1' if oStrand == '1' else '1'
        if oChrom != chrom or asStrand != strand:
            continue

        contexts = coord_contexts.get(start, 'INTER').split(',')
        oNX.context[oID] = ds.spotItem(contexts)

    oNX.save()
def newCoords(fN):
    '''Print the tcc in the first column of each line of fN as 'chrom:start-end'.

    fN is tab-separated; the strand of each tcc is discarded.
    '''
    #with-block so the input file is closed (it was left open before)
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            chrom, strand, start, end = bioLibCG.tccSplit(ls[0])
            print('%s:%s-%s' % (chrom, start, end))
def updateIContext(oFN, wigDir, chrom, strand, rn = None, tn = None):
    '''Store the integer iContext values found at each peak start.

    The requested strand and every peak strand are both flipped ('1' <-> '-1')
    before use.  For peaks on the requested chrom/strand, the comma-separated
    value at the peak start in the 'iContext' wig ('-1' when absent) is parsed
    into a list of ints and written to the record's iContexts field.
    '''
    oNX = cgNexusFlat.Nexus(oFN, cgDegPeak.Peak)
    oNX.load(['tcc', 'iContexts'], [rn, tn])

    strand = '-1' if strand == '1' else '1'

    print('loading wig')
    coord_contexts = cgWig.loadSingleWigContext(wigDir, chrom, strand, 'iContext')
    print('done loading')

    for oID in oNX.tcc:
        oChrom, oStrand, start, end = bioLibCG.tccSplit(oNX.tcc[oID])

        #deg wigs is AS to actual clipping site
        oStrand = '-1' if oStrand == '1' else '1'
        if oChrom != chrom or oStrand != strand:
            continue

        rawContexts = coord_contexts.get(start, '-1').split(',')
        oNX.iContexts[oID] = [int(x) for x in rawContexts]

    oNX.save()
def getPlotData(aSites, wigDir, outFN):
    '''Collect degradome values around each site for a box plot.

    For every human chromosome and strand the 'ALL' wig is loaded, and for
    each site of aSites (tcc in the first tab-separated column) on that
    chrom/strand the wig value at every offset from -200 to +200 relative to
    the site's end coordinate is recorded (0 where the wig has no value).
    outFN receives one tab-separated row per offset, first row is -200.
    '''
    #200 +/- ... might want to check distance each AAUAAA is from each other
    offsets = range(-200, 201)
    relCoord_degVals = dict((off, []) for off in offsets)

    for chrom in bioLibCG.humanChromosomes:
        for strand in ('1', '-1'):
            print('%s %s' % (chrom, strand))
            coord_value = cgWig.loadSingleWig(wigDir, chrom, strand, 'ALL')

            siteFile = open(aSites, 'r')
            for siteLine in siteFile:
                siteTcc = siteLine.strip().split('\t')[0]
                sChrom, sStrand, sStart, sEnd = bioLibCG.tccSplit(siteTcc)
                if sChrom == chrom and sStrand == strand:
                    for off in offsets:
                        relCoord_degVals[off].append(coord_value.get(sEnd + off, 0))
            siteFile.close()

    #each row holds the values seen at one relative position
    outFile = open(outFN, 'w')
    rows = ['\t'.join([str(v) for v in relCoord_degVals[off]]) + '\n'
            for off in offsets]
    outFile.writelines(rows)
    outFile.close()
def returnContBlocks(profile, tcc, minLevel = 5):
    '''Return the runs of profile coordinates whose level exceeds minLevel.

    profile  -- dict mapping coordinate -> expression level (int-convertible).
    tcc      -- only its chromosome and strand are used, to label the blocks.
    minLevel -- a coordinate belongs to a block when int(level) > minLevel.

    Returns a list of 'chrom:strand:start:end' strings in coordinate order.
    A block ends at the coordinate before the first below-threshold key; a
    block that is still open at the last key ends at that last key.
    '''
    chrom, strand, start, end = cg.tccSplit(tcc)

    blocks = []
    bStart = None      #start of the block currently open, None when outside
    lastCoord = None
    for pCoord in sorted(profile):
        if int(profile[pCoord]) > minLevel:
            #expression's high enough
            if bStart is None:
                bStart = pCoord
        elif bStart is not None:
            #not high enough: end the block
            blocks.append('%s:%s:%s:%s' % (chrom, strand, bStart, pCoord - 1))
            bStart = None
        lastCoord = pCoord

    #a block reaching the end of the profile has no below-threshold key to
    #close it, so close it here instead of silently dropping it
    if bStart is not None:
        blocks.append('%s:%s:%s:%s' % (chrom, strand, bStart, lastCoord))

    return blocks
def markCenterExpression(aFN, wigDir, rn = None, tn = None):
    '''Record what fraction of local expression sits in the centre of each site.

    For every alignment the scan range is derived from tTcc, tStart and
    sLength (extended by 25), its expression profile is read from the wigs in
    wigDir, and centerExpression is set to three fractions of the summed
    expression found in the coordinate slices [8:12], [7:13] and [6:14]
    (coordinates taken in descending order on the '-1' strand).  Entries stay
    [0.0, 0.0, 0.0] when the summed expression is zero or the record's tELevel
    does not occur in the profile.
    '''
    extend = 25
    timer = bioLibCG.cgTimer()
    timer.start()

    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['centerExpression', 'tTcc', 'tStart', 'sLength', 'tELevel'], [rn, tn])

    #load expression of degradome
    wigDict = cgWig.loadWigDict(wigDir)

    #slice bounds of the three nested centre windows, narrowest first
    centerWindows = ((8, 12), (7, 13), (6, 14))

    for aID in aNX.centerExpression:
        aNX.centerExpression[aID] = [0.0, 0.0, 0.0]

        chrom, strand, start, end = bioLibCG.tccSplit(aNX.tTcc[aID])
        offset = aNX.tStart[aID]
        sLen = aNX.sLength[aID]
        if strand == '1':
            start = start - extend + offset
            end = start + sLen
        else:
            end = end + extend - offset
            start = end - sLen

        scanRange = bioLibCG.makeTcc(chrom, strand, start, end)
        stretch = cgWig.getExpressionProfile(scanRange, wigDict)

        #make sure peak is in the small range
        peakInRange = aNX.tELevel[aID] in stretch.values()
        expressionSum = sum(stretch.values())

        orderedCoords = sorted(stretch)
        if strand == '-1':
            orderedCoords.reverse()

        if expressionSum == 0 or not peakInRange:
            continue

        for slot, (first, last) in enumerate(centerWindows):
            windowSum = 0.0
            for coord in orderedCoords[first:last]:
                windowSum += stretch[coord]
            aNX.centerExpression[aID][slot] = windowSum / expressionSum

    aNX.save()
def collectIntronSeqs(fN, outFN, assembly, amount=100, prime3=True):
    '''Write the reversed 3' end sequence of every long-enough C_INTRON.

    fN is tab-separated: column 2 holds the context type, column 3 the tcc.
    Lines whose type lacks 'C_INTRON' or whose span (end - start) is shorter
    than amount are skipped.  For the rest, the window [end - amount, end]
    ('1' strand) or [start, start + amount] (other strand) is fetched with
    gf.GenomeFetch and written reversed, one sequence per line, to outFN.

    NOTE(review): only the prime3 case is implemented; with prime3 false no
    sequence is written -- confirm this matches the intended behaviour.
    '''
    myG = gf.GenomeFetch(assembly)

    fOut = open(outFN, 'w')
    f = open(fN, 'r')
    for line in f:
        ls = line.strip().split('\t')
        if 'C_INTRON' not in ls[1]:
            continue

        chrom, strand, start, end = bioLibCG.tccSplit(ls[2])
        if end - start < amount:
            continue
        if not prime3:
            continue

        if strand == '1':
            winStart, winEnd = end - amount, end
        else:
            winStart, winEnd = start, start + amount

        tcc = bioLibCG.makeTcc(chrom, strand, winStart, winEnd)
        seq = myG.getSequence(tcc)
        fOut.write('%s\n' % seq[::-1])

    f.close()
    fOut.close()
def updateHitMap(chrom_strand_coord, tcc):
    '''Add every coordinate of tcc to the nested hit map, in place.

    chrom_strand_coord -- dict of chrom -> strand -> set of coordinates;
                          missing levels are created on demand.
    tcc                -- region whose start..end (inclusive) is added.
    '''
    chrom, strand, start, end = bioLibCG.tccSplit(tcc)
    if end < start:
        #empty span: leave the map untouched (no empty entries are created)
        return

    #resolve the target set once instead of once per coordinate
    coords = chrom_strand_coord.setdefault(chrom, {}).setdefault(strand, set())
    coords.update(xrange(start, end + 1))
def probe(tcc, conf = None):
    '''Print, per small-RNA library, the highest expression level inside tcc.

    tcc  -- region to scan; its strand selects the 'mapped.<strand>.wig' files.
    conf -- optional configuration object; when omitted 'Main.conf' is loaded
            with c.cgConfig.
            NOTE(review): conf is assumed to expose the same .conf mapping as
            the object returned by c.cgConfig -- confirm against callers.

    Libraries are found with cg.recurseDir under the configured 'smallPath'.
    Libraries with a positive maximum are printed as '<lib> <level>'; the sum
    of those maxima is printed last.  A library that cannot be scanned is
    reported as '<lib> index failed' and skipped.
    '''
    #bind mConf on both paths; it used to be left unbound when conf was given
    if conf:
        mConf = conf
    else:
        mConf = c.cgConfig('Main.conf')
    smallPath = mConf.conf['smallPath']

    chrom, strand, start, end = cg.tccSplit(tcc)

    total = 0
    for lib in cg.recurseDir(smallPath, end = 'mapped.%s.wig' % strand):
        try:
            eLevels = stepVectorScan.scanVectorsFile(lib, [tcc])
        except Exception:
            #best effort: skip unreadable libraries, but let
            #KeyboardInterrupt/SystemExit propagate
            print('%s %s' % (lib, 'index failed'))
            continue

        #find highest expression level
        highest = 0
        for coord in eLevels:
            if eLevels[coord] > highest:
                highest = eLevels[coord]

        if highest > 0:
            print('%s %s' % (lib, highest))
            total += highest

    print(total)
def getNumberContextPosition(iContextDir, outFN, cType = 'INTRON', prime5 = True, ):
    '''Write, for each relative position, how many contexts are long enough to cover it.

    Loads '<chrom>.<strand>.ids' (cgIContext.IContext records) from
    iContextDir for every human chromosome and strand.  Each record whose
    type contains cType contributes one count to every position
    0 .. (end - start - 1).  outFN gets '<position>\\t<count>' lines in the
    dictionary's iteration order.  prime5 is accepted but not used.
    '''
    #load all context NXs
    print('loading NXs')
    chrom_strand_NX = {}
    for chrom in bioLibCG.humanChromosomes:
        for strand in ('1', '-1'):
            contextFN = iContextDir + '/%s.%s.ids' % (chrom, strand)
            contextNX = cgNexusFlat.Nexus(contextFN, cgIContext.IContext)
            contextNX.load(['type', 'tcc'])
            chrom_strand_NX.setdefault(chrom, {})[strand] = contextNX
    print('done...')

    print('getting context occupancy')
    #get the amount of introns/etc that occupy X
    pos_numIntrons = {}
    for chrom in chrom_strand_NX:
        for strand in chrom_strand_NX[chrom]:
            contextNX = chrom_strand_NX[chrom][strand]
            for contextID in contextNX.ids:
                if cType not in contextNX.type[contextID]:
                    continue
                c, s, st, en = bioLibCG.tccSplit(contextNX.tcc[contextID])
                for pos in xrange(0, en - st):
                    pos_numIntrons[pos] = pos_numIntrons.get(pos, 0) + 1

    fOut = open(outFN, 'w')
    for pos, numIntrons in pos_numIntrons.iteritems():
        fOut.write('%s\t%s\n' % (pos, numIntrons))
    fOut.close()
def collectIntronSeqs(fN, outFN, assembly, amount = 100, prime3 = True):
    '''Write the reversed 3' end sequence of every long-enough C_INTRON.

    fN is tab-separated: column 2 holds the context type, column 3 the tcc.
    Lines whose type lacks 'C_INTRON' or whose span (end - start) is shorter
    than amount are skipped.  For the rest, the window [end - amount, end]
    ('1' strand) or [start, start + amount] (other strand) is fetched with
    gf.GenomeFetch and written reversed, one sequence per line, to outFN.

    NOTE(review): only the prime3 case is implemented; with prime3 false no
    sequence is written -- confirm this matches the intended behaviour.
    '''
    myG = gf.GenomeFetch(assembly)

    fOut = open(outFN, 'w')
    f = open(fN, 'r')
    for line in f:
        ls = line.strip().split('\t')
        if 'C_INTRON' not in ls[1]:
            continue

        chrom, strand, start, end = bioLibCG.tccSplit(ls[2])
        if end - start < amount:
            continue
        if not prime3:
            continue

        #3' window: last `amount` nt on '1', first `amount` nt otherwise
        if strand == '1':
            window = (end - amount, end)
        else:
            window = (start, start + amount)

        tcc = bioLibCG.makeTcc(chrom, strand, window[0], window[1])
        seq = myG.getSequence(tcc)
        fOut.write('%s\n' % seq[::-1])

    f.close()
    fOut.close()
def updateTranscriptOverlap(oFN, wigDir, chrom, strand, rn=None, tn=None):
    '''Flag peaks on one chrom/strand that touch any transcript coordinate.

    The requested strand and each peak strand are flipped ('1' <-> '-1')
    before use.  For every peak on the requested chrom/strand, tOverlap is
    set to True when any coordinate start..end is a key of the 'transcript'
    wig loaded for the flipped strand, otherwise False.  Other peaks are left
    untouched.  The Nexus file is saved at the end.
    '''
    oNX = cgNexusFlat.Nexus(oFN, cgDegPeak.Peak)
    oNX.load(['tOverlap', 'tcc'], [rn, tn])

    #load the AS wig file for this degradome strand
    strand = '-1' if strand == '1' else '1'
    coord_transcripts = cgWig.loadSingleWigTranscript(wigDir, chrom, strand, 'transcript')

    for oID in oNX.tOverlap:
        tChrom, tStrand, start, end = bioLibCG.tccSplit(oNX.tcc[oID])
        tStrand = '-1' if tStrand == '1' else '1'

        if tChrom == chrom and tStrand == strand:
            oNX.tOverlap[oID] = any(pos in coord_transcripts
                                    for pos in xrange(start, end + 1))

    oNX.save()
def probe(tcc, conf=None):
    '''Print, per small-RNA library, the highest expression level inside tcc.

    tcc  -- region to scan; its strand selects the 'mapped.<strand>.wig' files.
    conf -- optional configuration object; when omitted 'Main.conf' is loaded
            with c.cgConfig.
            NOTE(review): conf is assumed to expose the same .conf mapping as
            the object returned by c.cgConfig -- confirm against callers.

    Libraries are found with cg.recurseDir under the configured 'smallPath'.
    Libraries with a positive maximum are printed as '<lib> <level>'; the sum
    of those maxima is printed last.  A library that cannot be scanned is
    reported as '<lib> index failed' and skipped.
    '''
    #mConf must be bound whether or not a conf was supplied
    mConf = conf if conf else c.cgConfig('Main.conf')
    smallPath = mConf.conf['smallPath']

    chrom, strand, start, end = cg.tccSplit(tcc)

    total = 0
    for lib in cg.recurseDir(smallPath, end='mapped.%s.wig' % strand):
        try:
            eLevels = stepVectorScan.scanVectorsFile(lib, [tcc])
        except Exception:
            #skip libraries that fail to scan without swallowing
            #KeyboardInterrupt/SystemExit
            print('%s %s' % (lib, 'index failed'))
            continue

        #find highest expression level
        highest = 0
        for coord in eLevels:
            if eLevels[coord] > highest:
                highest = eLevels[coord]

        if highest > 0:
            print('%s %s' % (lib, highest))
            total += highest

    print(total)
def updateGeneName(dFN, fFN, wigDir, chrom, strand, prefix, switchStrand=False):
    '''Fill geneNames for each record from the gene-name wig of one chrom/strand.

    dFN/fFN      -- data file and format passed to Nexus.
    wigDir       -- directory passed to cgWig.loadSingleWigTranscript.
    chrom/strand -- wig to load; strand may be an int (1/-1) or a string.
    prefix       -- wig name prefix passed to loadSingleWigTranscript.
    switchStrand -- when true the opposite strand's wig is loaded.

    For every record the value found at the tcc start is looked up ('.' when
    absent): 'NONE' yields an empty list, anything else is split on commas.
    '''
    NX = Nexus(dFN, fFN)
    NX.load(['geneNames', 'tcc'])

    #int() first so the flip also works for '1'/'-1' strings, matching the
    #str(-int(strand)) idiom used by the other update functions
    if switchStrand:
        strand = -int(strand)
    strand = str(strand)

    coord_gName = cgWig.loadSingleWigTranscript(wigDir, chrom, strand, prefix)

    while NX.nextID():
        #NOTE(review): records are not filtered by chrom/strand here, unlike
        #the updateContext variants -- every record is looked up in this one
        #wig.  Confirm whether the input is already split per chrom/strand.
        rChrom, rStrand, start, end = bioLibCG.tccSplit(NX.tcc)

        overlappingGenes = coord_gName.get(start, ".")
        if overlappingGenes == "NONE":
            NX.geneNames = []
        else:
            NX.geneNames = overlappingGenes.split(',')

    NX.save()
def updateTranscriptOverlap(oFN, wigDir, chrom, strand, rn = None, tn = None):
    '''Flag peaks on one chrom/strand that touch any transcript coordinate.

    The requested strand and each peak strand are flipped ('1' <-> '-1')
    before use.  For every peak on the requested chrom/strand, tOverlap is
    set to True when any coordinate start..end is a key of the 'transcript'
    wig loaded for the flipped strand, otherwise False.  Other peaks are left
    untouched.  The Nexus file is saved at the end.
    '''
    oNX = cgNexusFlat.Nexus(oFN, cgDegPeak.Peak)
    oNX.load(['tOverlap', 'tcc'], [rn, tn])

    #load the AS wig file for this degradome strand
    flipped = {'1': '-1'}
    strand = flipped.get(strand, '1')
    coord_transcripts = cgWig.loadSingleWigTranscript(wigDir, chrom, strand, 'transcript')

    for oID in oNX.tOverlap:
        tChrom, tStrand, start, end = bioLibCG.tccSplit(oNX.tcc[oID])
        if tChrom != chrom or flipped.get(tStrand, '1') != strand:
            continue

        overlaps = False
        for pos in xrange(start, end + 1):
            if pos in coord_transcripts:
                overlaps = True
                break
        oNX.tOverlap[oID] = overlaps

    oNX.save()
def updateContext(oFN, wigDir, chrom, strand, rn = None, tn = None):
    '''Set the dominant genomic context of degradome peaks on one chrom/strand.

    Loads 'context' and 'tcc' from oFN (degPeak.degPeak records; rn/tn are
    forwarded to Nexus.load), loads the 'context' wig for chrom/strand, and
    for every peak whose chromosome and flipped strand match, stores the
    item chosen by bioLibCG.dominantSpotter from the comma-separated contexts
    at the peak start ('INTER' when the coordinate is absent).
    '''
    oNX = cgNexusFlat.Nexus(oFN, degPeak.degPeak)
    oNX.load(['context', 'tcc'], [rn, tn])

    print('loading wig')
    coord_contexts = cgWig.loadSingleWigContext(wigDir, chrom, strand, 'context')
    print('done loading')

    ds = bioLibCG.dominantSpotter(['C_EXON', 'C_3UTR', 'C_5UTR',
                                   'NC_EXON', 'NC_3UTR', 'NC_5UTR',
                                   'C_INTRON', 'NC_INTRON', 'INTER'])

    for oID in oNX.tcc:
        oChrom, oStrand, start, end = bioLibCG.tccSplit(oNX.tcc[oID])

        #deg wigs is AS to actual clipping site
        oStrand = '-1' if oStrand == '1' else '1'

        onTarget = (oChrom == chrom) and (oStrand == strand)
        if onTarget:
            rawContexts = coord_contexts.get(start, 'INTER')
            oNX.context[oID] = ds.spotItem(rawContexts.split(','))

    oNX.save()
def scanVectorsFile(fN, tccList):
    '''Scan an indexed wig file and return {coordinate: value} for the tccs.

    For each tcc a cgIndex.lineIndex is opened on fN and positioned with a
    binary search; the following lines are read (fields 1, 2 and 3 being
    begin, end and value, the value truncated at its first '.') and every
    coordinate from max(begin, tccStart) up to but excluding min(end, tccEnd)
    is assigned the line's value.  Reading stops after the first line whose
    end lies beyond tccEnd.
    '''
    timer = cg.cgTimer()
    timer.start()

    coordDict = {}
    for tcc in tccList:
        chrom, strand, tccStart, tccEnd = cg.tccSplit(tcc)

        #goto correct line in index
        #!!!there actually is a header...have to deal with this...
        fIndex = cgIndex.lineIndex(fN, header = True)
        fIndex.passCheckFunction(cgIndex.wigCheckFunction)
        fIndex.binarySearch(tcc)  #places file pointer at beginning of tcc

        for line in fIndex.file:
            #cg.ss is presumed to be a pure field splitter, so one call is reused
            fields = cg.ss(line)
            segStart = int(fields[1])
            segEnd = int(fields[2])
            level = int(fields[3].split('.')[0])

            #clip the segment to the requested region
            if tccStart > segStart:
                segStart = tccStart
            pastRegion = tccEnd < segEnd
            if pastRegion:
                segEnd = tccEnd

            for coord in range(segStart, segEnd):
                coordDict[coord] = level

            if pastRegion:
                break

    return coordDict
def makeWig(fN, assembly, format=None, name=None):
    '''Build a per-coordinate hit map for each chrom/strand and write it as wig.

    format assumes bowtie; suitable for medium mapped files.  Takes longer
    because fN is re-read once per chromosome and strand.

    fN       -- mapped read file; each line is turned into a tcc by the parser
                returned from returnParserFunction(format).
    assembly -- used to obtain the chromosomes and forwarded to the writer.
    format   -- parser name, 'Bowtie' when omitted.
    name     -- base name, derived from fN when omitted.
    '''
    #assume bowtie
    if not format:
        format = 'Bowtie'
    parserFunction = returnParserFunction(format)
    if not name:
        name = cg.getBaseFileName(fN, naked=True)

    lDict = cg.returnChromLengthDict(assembly)
    for chrom in lDict:
        if chrom not in cg.acceptableChroms:
            continue
        for strand in ['1', '-1']:
            #create hitmap of chrom and strand; the with-block closes the
            #handle so repeated passes do not accumulate open files
            hitDict = {}
            with open(fN, 'r') as f:
                print('%s %s %s' % (chrom, strand, 'hitmap'))
                for line in f:
                    lChrom, lStrand, start, end = cg.tccSplit(parserFunction(line))
                    lStrand = str(lStrand)
                    start = int(start)
                    end = int(end)
                    if chrom == lChrom and strand == lStrand:
                        for i in range(start, end + 1):
                            hitDict[i] = hitDict.get(i, 0) + 1

            #write results to wig file
            #NOTE(review): `name` is resolved above but not passed on, and the
            #map here is flat {coord: count} while makeWigMem passes a
            #chrom/strand-nested map plus name and directory -- confirm this
            #call still matches writeWigFromHitDict.
            writeWigFromHitDict(hitDict, assembly)
def updateContext(fN, fF, wigDir, chrom, strand, switchStrand = False):
    '''Set the dominant genomic context of every record on one chrom/strand.

    fN/fF        -- data file and format passed to Nexus ('tcc', 'context').
    wigDir       -- directory passed to cgWig.loadSingleWigContext.
    chrom/strand -- chromosome and strand to process; strand may be an int or
                    a string.
    switchStrand -- when true, both the requested strand and each record's
                    strand are negated before they are compared.

    For matching records the comma-separated contexts found at the tcc start
    ('INTER' when absent) are reduced with bioLibCG.dominantSpotter and stored
    in the record's context field.  The Nexus file is saved at the end.
    '''
    NX = Nexus(fN, fF)
    NX.load(['tcc', 'context'])

    if switchStrand:
        strand = str(-int(strand))
    else:
        strand = str(strand)

    print('loading wig')
    coord_contexts = cgWig.loadSingleWigContext(wigDir, chrom, strand, 'context')
    print('done loading')

    ds = bioLibCG.dominantSpotter(['C_EXON', 'C_3UTR', 'C_5UTR', 'NC_EXON', 'NC_3UTR', 'NC_5UTR', 'C_INTRON', 'NC_INTRON', 'INTER'])

    while NX.nextID():
        oChrom, oStrand, start, end = bioLibCG.tccSplit(NX.tcc)

        #deg wigs is AS to actual clipping site
        #Negate the record's own strand.  Negating `strand` (already flipped
        #above) gives the opposite of `strand`, so no record could ever match
        #when switchStrand is set.
        if switchStrand:
            oStrand = str(-int(oStrand))
        else:
            oStrand = str(oStrand)

        if oChrom == chrom and oStrand == strand:
            contexts = coord_contexts.get(start, 'INTER').split(',')
            NX.context = ds.spotItem(contexts)

    NX.save()
def profileAroundPoint(zeroPoint, span, cName, ratio=False, ratioCoord=None):
    '''Return the expression profile around a point, keyed by relative offset.

    zeroPoint  -- tcc whose start coordinate is the centre point.
    span       -- half width; offsets run from 1 - span to span - 1
                  (e.g. span 30 gives -29 .. 29).
    cName      -- forwarded to svCoord together with the scanned region.
    ratio      -- when true every value is divided by the reference value.
    ratioCoord -- absolute coordinate supplying the reference value; the
                  centre point is used when it is not given.  A reference of
                  0 is replaced by 1 to avoid dividing by zero.

    Returns a dict {offset: value}; values are floats when ratio is true.
    '''
    chrom, strand, zPoint, end = cg.tccSplit(zeroPoint)
    scanTcc = cg.makeTcc(chrom, strand, zPoint - span, zPoint + span)
    scanDict = svCoord([scanTcc], cName)

    #reorient so that zero is at zero
    offsets = range(1 - span, span)

    if not ratio:
        return dict((off, scanDict[zPoint + off]) for off in offsets)

    refCoord = ratioCoord if ratioCoord else zPoint
    refVal = scanDict[refCoord]
    if refVal == 0:
        refVal = 1

    return dict((off, float(scanDict[zPoint + off]) / float(refVal))
                for off in offsets)
def profileAroundPoint(zeroPoint, span, cName, ratio = False, ratioCoord = None):
    '''Return the expression profile around a point, keyed by relative offset.

    zeroPoint  -- tcc whose start coordinate is the centre point.
    span       -- half width; offsets run from 1 - span to span - 1
                  (e.g. span 30 gives -29 .. 29).
    cName      -- forwarded to svCoord together with the scanned region.
    ratio      -- when true every value is divided by the reference value.
    ratioCoord -- absolute coordinate supplying the reference value; the
                  centre point is used when it is not given.  A reference of
                  0 is replaced by 1 to avoid dividing by zero.

    Returns a dict {offset: value}; values are floats when ratio is true.
    '''
    chrom, strand, zPoint, end = cg.tccSplit(zeroPoint)
    regionTcc = cg.makeTcc(chrom, strand, zPoint - span, zPoint + span)
    scanDict = svCoord([regionTcc], cName)

    #reorient so that zero is at zero
    returnDict = {}
    if ratio:
        if ratioCoord:
            divisor = scanDict[ratioCoord]
        else:
            divisor = scanDict[zPoint]
        if divisor == 0:
            divisor = 1
        for rel in range(1 - span, span):
            returnDict[rel] = float(scanDict[zPoint + rel]) / float(divisor)
    else:
        for rel in range(1 - span, span):
            returnDict[rel] = scanDict[zPoint + rel]

    return returnDict
def returnContBlocks(profile, tcc, minLevel=5):
    '''Return the runs of profile coordinates whose level exceeds minLevel.

    profile  -- dict mapping coordinate -> expression level (int-convertible).
    tcc      -- only its chromosome and strand are used, to label the blocks.
    minLevel -- a coordinate belongs to a block when int(level) > minLevel.

    Returns a list of 'chrom:strand:start:end' strings in coordinate order.
    A block ends at the coordinate before the first below-threshold key; a
    block that is still open at the last key ends at that last key.
    '''
    chrom, strand, start, end = cg.tccSplit(tcc)
    pCoords = sorted(profile)

    inBlock = False
    bStart = None
    blocks = []
    for pCoord in pCoords:
        highEnough = int(profile[pCoord]) > minLevel
        if highEnough and not inBlock:
            #block start
            bStart = pCoord
            inBlock = True
        elif inBlock and not highEnough:
            #end the block
            blocks.append('%s:%s:%s:%s' % (chrom, strand, bStart, pCoord - 1))
            inBlock = False

    #a block running to the end of the profile never meets a below-threshold
    #key, so emit it here rather than dropping it
    if inBlock:
        blocks.append('%s:%s:%s:%s' % (chrom, strand, bStart, pCoords[-1]))

    return blocks
def makeBins5(fN, fOut, typeFilter):
    '''Write 100 single-nucleotide bins from the 5' side of each matching context.

    fN         -- tab-separated input: id, type, tcc in the first three columns.
    fOut       -- path of the output file (overwritten).
    typeFilter -- substring that the type column must contain.

    Contexts with end - start < 100 are skipped.  Each output line is the id
    followed by 100 single-coordinate tccs: start, start+1, ... on the '1'
    strand and end, end-1, ... on the '-1' strand.  Any other strand value
    yields a line holding the id only.
    '''
    #both handles are closed by the with-block (they were left open before)
    with open(fOut, 'w') as outFile, open(fN, 'r') as inFile:
        for line in inFile:
            ls = line.strip().split('\t')
            binID = ls[0]
            cType, tcc = ls[1:3]
            chrom, strand, st, en = bioLibCG.tccSplit(tcc)

            #only take seqs that are long enough
            if en - st < 100:
                continue
            if typeFilter not in cType:
                continue

            #bin 0 is the lowest coordinate on '1' and the highest on '-1'
            if strand == '1':
                coords = [st + i for i in range(0, 100)]
            elif strand == '-1':
                coords = [en - i for i in range(0, 100)]
            else:
                coords = []
            tccBins = [bioLibCG.makeTcc(chrom, strand, pos, pos) for pos in coords]

            pString = [binID] + tccBins
            outFile.write('\t'.join([str(x) for x in pString]) + '\n')
def makeWig(fN, assembly, format = None, name = None):
    '''Build a per-coordinate hit map for each chrom/strand and write it as wig.

    format assumes bowtie; suitable for medium mapped files.  Takes longer
    because fN is re-read once per chromosome and strand.

    fN       -- mapped read file; each line is turned into a tcc by the parser
                returned from returnParserFunction(format).
    assembly -- used to obtain the chromosomes and forwarded to the writer.
    format   -- parser name, 'Bowtie' when omitted.
    name     -- base name, derived from fN when omitted.
    '''
    #assume bowtie
    if not format:
        format = 'Bowtie'
    parserFunction = returnParserFunction(format)
    if not name:
        name = cg.getBaseFileName(fN, naked = True)

    lDict = cg.returnChromLengthDict(assembly)
    for chrom in lDict:
        if chrom not in cg.acceptableChroms:
            continue
        for strand in ['1', '-1']:
            #create hitmap of chrom and strand; the file is closed as soon as
            #this pass over it is done
            hitDict = {}
            with open(fN, 'r') as readFile:
                print('%s %s %s' % (chrom, strand, 'hitmap'))
                for line in readFile:
                    lChrom, lStrand, start, end = cg.tccSplit(parserFunction(line))
                    lStrand = str(lStrand)
                    start = int(start)
                    end = int(end)
                    if chrom == lChrom and strand == lStrand:
                        for i in range(start, end + 1):
                            hitDict[i] = hitDict.get(i, 0) + 1

            #write results to wig file
            #NOTE(review): `name` is resolved above but not passed on, and the
            #map here is flat {coord: count} while makeWigMem passes a
            #chrom/strand-nested map plus name and directory -- confirm this
            #call still matches writeWigFromHitDict.
            writeWigFromHitDict(hitDict, assembly)
def truncate(fN):
    '''Write fN + '.trun' with 50 nt removed from the 3' side of every tcc.

    fN is tab-separated with the tcc in the third column; only the first
    three columns are written back.  Records with end - start < 50 are
    skipped and counted.  On the '1' strand the end is lowered by 50, on the
    '-1' strand the start is raised by 50.

    Prints '<skipped> <total>' when done.  Returns 1 after printing 'error'
    if a strand other than '1'/'-1' is met; otherwise returns None.
    '''
    tCount = 0
    shortCount = 0

    #the with-block closes both files, including on the early error return
    with open(fN + '.trun', 'w') as fOut, open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tcc = ls[2]
            tCount += 1

            c, s, st, en = bioLibCG.tccSplit(tcc)
            if en - st < 50:
                shortCount += 1
                continue

            if s == '1':
                en = en - 50
            elif s == '-1':
                st = st + 50
            else:
                print('error')
                return 1

            ls[2] = bioLibCG.makeTcc(c, s, st, en)
            fOut.write('\t'.join(str(x) for x in ls[:3]) + '\n')

    print('%s %s' % (shortCount, tCount))
def truncate5(fN):
    '''Write fN + '.trun5' with 50 nt removed from the 5' side of every tcc.

    fN is tab-separated with the tcc in the third column; only the first
    three columns are written back.  Records with end - start < 50 are
    skipped and counted.  On the '1' strand the start is raised by 50, on the
    '-1' strand the end is lowered by 50.

    Prints '<skipped> <total>' when done.  Returns 1 after printing 'error'
    if a strand other than '1'/'-1' is met; otherwise returns None.
    '''
    tCount = 0
    shortCount = 0

    #the with-block closes both files, including on the early error return
    with open(fN + '.trun5', 'w') as fOut, open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tcc = ls[2]
            tCount += 1

            c, s, st, en = bioLibCG.tccSplit(tcc)
            if en - st < 50:
                shortCount += 1
                continue

            if s == '1':
                st = st + 50
            elif s == '-1':
                en = en - 50
            else:
                print('error')
                return 1

            ls[2] = bioLibCG.makeTcc(c, s, st, en)
            fOut.write('\t'.join(str(x) for x in ls[:3]) + '\n')

    print('%s %s' % (shortCount, tCount))
def markCenterExpression(aFN, wigDir, rn=None, tn=None):
    '''Record what fraction of local expression sits in the centre of each site.

    For every alignment the scan range is derived from tTcc, tStart and
    sLength (extended by 25), its expression profile is read from the wigs in
    wigDir, and centerExpression is set to three fractions of the summed
    expression found in the coordinate slices [8:12], [7:13] and [6:14]
    (coordinates taken in descending order on the '-1' strand).  Entries stay
    [0.0, 0.0, 0.0] when the summed expression is zero or the record's tELevel
    does not occur in the profile.
    '''
    extend = 25
    timer = bioLibCG.cgTimer()
    timer.start()

    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['centerExpression', 'tTcc', 'tStart', 'sLength', 'tELevel'], [rn, tn])

    #load expression of degradome
    wigDict = cgWig.loadWigDict(wigDir)

    for aID in aNX.centerExpression:
        aNX.centerExpression[aID] = [0.0, 0.0, 0.0]

        chrom, strand, start, end = bioLibCG.tccSplit(aNX.tTcc[aID])
        offset = aNX.tStart[aID]
        sLen = aNX.sLength[aID]
        if strand == '1':
            start = start - extend + offset
            end = start + sLen
        else:
            end = end + extend - offset
            start = end - sLen

        stretch = cgWig.getExpressionProfile(
            bioLibCG.makeTcc(chrom, strand, start, end), wigDict)

        #make sure peak is in the small range
        peakLevel = aNX.tELevel[aID]
        peakInRange = peakLevel in stretch.values()
        expressionSum = sum(stretch.values())

        sortedKeys = sorted(stretch)
        if strand == '-1':
            sortedKeys.reverse()

        if expressionSum != 0 and peakInRange:
            #index into centerExpression -> slice of the sorted coordinates
            windowSlices = [slice(8, 12), slice(7, 13), slice(6, 14)]
            for idx, window in enumerate(windowSlices):
                sumE = 0.0
                for key in sortedKeys[window]:
                    sumE += stretch[key]
                aNX.centerExpression[aID][idx] = sumE / expressionSum

    aNX.save()
def testPeaks(degFN, dForm, allGeneInfo, gForm, switchStrand = False):
    '''Assign each degradome peak a binomial survival value against its genes.

    Gene read and spot totals are read from allGeneInfo.  For each peak and
    each of its gene names (retried as '<name>_RE_<chrom>_<strand>' when the
    plain name is unknown) binom.sf(peak reads, gene reads + 1,
    1 / (gene spots + 1)) is computed.  pValBin is set to the largest of
    these, or -1.0 when none could be computed.  Unknown names are reported
    with 'FIX THIS GENE NAME'.  With switchStrand the peak strand is negated
    before it is used in the retry name.
    '''
    #load/configure gene Info
    gNX = Nexus(allGeneInfo, gForm)
    gNX.load(['geneName', 'numReads', 'numSpots'])

    gName_numReads = {}
    gName_numSpots = {}
    while gNX.nextID():
        gName_numReads[gNX.geneName] = gNX.numReads
        gName_numSpots[gNX.geneName] = gNX.numSpots

    #load degFN info
    dNX = Nexus(degFN, dForm)
    dNX.load(['tcc', 'eLevel', 'geneNames', 'pValBin'])

    while dNX.nextID():
        geneNames = dNX.geneNames
        readsForPeak = dNX.eLevel
        chrom, strand, start, end = bioLibCG.tccSplit(dNX.tcc)
        if switchStrand:
            strand = -int(strand)

        pVals = []
        for geneName in geneNames:
            #may have to change gene name cuz of multiple spans
            #(both dicts are filled together, so one membership test covers both)
            lookupName = geneName
            if lookupName not in gName_numReads:
                lookupName = geneName + '_RE_%s_%s' % (chrom, strand)
                if lookupName not in gName_numReads:
                    print('%s %s' % ("FIX THIS GENE NAME", lookupName))
                    continue

            #add pseudocount
            totGeneReads = gName_numReads[lookupName] + 1
            numSpotsForGene = gName_numSpots[lookupName] + 1  # not sure whether to do this yet...

            #check for hidden intron gene overlap
            try:
                q = 1.0 / numSpotsForGene
            except ZeroDivisionError:
                continue  #intron gene

            #add p val
            pVals.append(binom.sf(readsForPeak, totGeneReads, q))

        if pVals:
            dNX.pValBin = max(pVals)
        else:
            dNX.pValBin = -1.0

    dNX.save()
def extendPeakTest(tcc, pRange, minVal, maxAvgNoise, minPeakLength, maxPeakLength, cName):
    '''Extend a peak outward from its centre and accept or reject it.

    tcc           -- peak location; its start is the centre coordinate.
    pRange        -- half width of the ratio profile fetched around the peak.
    minVal        -- a neighbour extends the peak while its ratio > minVal.
    maxAvgNoise   -- upper bound (exclusive) on the mean ratio outside the peak.
    minPeakLength,
    maxPeakLength -- exclusive bounds on the extended peak length.
    cName         -- forwarded to stepVectorScan.profileAroundPoint.

    Returns the tcc of the extended peak when it passes both filters,
    otherwise False (also when no positions remain outside the peak).
    Progress messages are printed while extending.
    '''
    chrom, strand, peakPosition, end = cg.tccSplit(tcc)
    cProfile = stepVectorScan.profileAroundPoint(tcc, pRange, cName, ratio = True)

    #extend this peak left and right, going from the middle outward
    leftRange = list(range(-1, -pRange, -1))
    rightRange = list(range(1, pRange))

    #left; defaults to the outermost offset (also well defined for pRange <= 1)
    startFinal = 1 - pRange
    for i in leftRange:
        if cProfile[i] > minVal:
            print(' extending stretch')
        else:
            print(' end of stretch L')
            startFinal = i + 1
            break

    #right
    endFinal = pRange - 1
    for i in rightRange:
        if cProfile[i] > minVal:
            print(' extending stretch')
        else:
            print(' end of stretch R')
            endFinal = i - 1
            break

    peakLength = endFinal - startFinal + 1

    #avg expression around peak check...
    low = startFinal
    high = endFinal
    lowRange = list(range(1 - pRange, low))
    highRange = list(range(high + 1, pRange))
    totalLength = len(lowRange) + len(highRange)
    print('%s %s %s %s %s %s' % (totalLength, pRange, low, high, lowRange, highRange))

    #explicit check instead of a bare except around the division, so that
    #unrelated errors are no longer swallowed
    if totalLength == 0:
        return False

    noiseExpression = 0
    for i in lowRange:
        noiseExpression += cProfile[i]
    for i in highRange:
        noiseExpression += cProfile[i]
    avgNoise = noiseExpression / float(totalLength)

    #filter out peaks that look a certain way.
    if (minPeakLength < peakLength < maxPeakLength) and (avgNoise < maxAvgNoise):
        goodTcc = cg.makeTcc(chrom, strand, peakPosition + startFinal, peakPosition + endFinal)
        print('*KEEPER')
        return goodTcc

    return False
def updateTypeAlignment(oFN, wigDir, chrom, strand, rn = None, tn = None):
    '''This is for ALIGNMENTS...NOT DEG PEAKS!

    Sets the dominant transcript type of every alignment on one chrom/strand.
    The requested strand and each alignment's target strand are flipped
    ('1' <-> '-1') before use.  The 'tType' wig value at the target start
    ('None' when absent) is split on commas, the part after ':' of each entry
    is taken as its type, and the type chosen by bioLibCG.dominantSpotter is
    stored in the record's type field.
    '''
    oNX = cgNexusFlat.Nexus(oFN, cgAlignmentFlat.cgAlignment)
    oNX.load(['tTcc', 'type'], [rn, tn])

    strand = '-1' if strand == '1' else '1'

    print('loading wig')
    coord_types = cgWig.loadSingleWigContext(wigDir, chrom, strand, 'tType')
    print('done loading')

    domOrder = [
        'microRNA_noncoding',
        'lincRNA_noncoding',
        'longNC_noncoding',
        'miRNA_pseudogene_noncoding',
        'Mt_rRNA_noncoding',
        'Mt_tRNA_noncoding',
        'Mt_tRNA_pseudogene_noncoding',
        'rRNA_noncoding',
        'rRNA_pseudogene_noncoding',
        'scRNA_pseudogene_noncoding',
        'snoRNA_noncoding',
        'snoRNA_pseudogene_noncoding',
        'snRNA_noncoding',
        'snRNA_pseudogene_noncoding',
        'tRNA_pseudogene_noncoding',
        'pseudogene_noncoding',
        'protein_coding',
        'None',
    ]
    ds = bioLibCG.dominantSpotter(domOrder)

    for oID in oNX.tTcc:
        oChrom, oStrand, start, end = bioLibCG.tccSplit(oNX.tTcc[oID])

        #deg wigs is AS to actual clipping site
        oStrand = '-1' if oStrand == '1' else '1'
        if oChrom != chrom or oStrand != strand:
            continue

        types = []
        for entry in coord_types.get(start, 'None').split(','):
            if entry != 'None':
                types.append(entry.split(':')[1])
            else:
                types.append('None')
        types = list(set(types))
        oNX.type[oID] = ds.spotItem(types)

    oNX.save()
def countWithBinsSetReads(readFN, binDir, type='INTRON'):
    '''Count distinct read positions that fall into each of 100 position bins.

    readFN -- tab-separated reads; columns 2-4 are strand ('+' or other),
              chromosome and start.
    binDir -- directory holding '<type>.<chrom>.<strand>.bins' files; columns
              after the first hold one tcc per bin.
    type   -- prefix of the bin files to read.

    '+' reads are stored on strand '-1' at start + 20, all other reads on
    strand '1' at start; reads on chromosomes outside
    bioLibCG.humanChromosomes are ignored.  Positions are kept in sets, so a
    position is counted once per chrom/strand.  Prints '<bin>\\t<count>'.
    '''
    numBins = 100

    #create bin sets
    c_s_bin_set = {}
    for chrom in bioLibCG.humanChromosomes:
        for strand in ('1', '-1'):
            #initialize data structure
            bin_set = dict((i, set()) for i in range(numBins))
            c_s_bin_set.setdefault(chrom, {})[strand] = bin_set

            #with-block: each bin file is closed before the next is opened
            binFN = binDir + '/%s.%s.%s.bins' % (type, chrom, strand)
            with open(binFN, 'r') as f:
                for line in f:
                    ls = line.strip().split('\t')
                    tccs = ls[1:numBins + 1]
                    for i in range(numBins):
                        ch, st, sta, end = bioLibCG.tccSplit(tccs[i])
                        bin_set[i].update(range(sta, end + 1))

    print('creating read sets')
    #create read set
    c_s_set = {}
    for chrom in bioLibCG.humanChromosomes:
        c_s_set[chrom] = {'1': set(), '-1': set()}

    with open(readFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            strand, chrom, start = ls[1:4]
            if chrom not in bioLibCG.humanChromosomes:
                continue
            start = int(start)
            if strand == '+':
                strand = '-1'
                start += 20
            else:
                strand = '1'
            c_s_set[chrom][strand].add(start)

    print('counting')
    #count for each bin; a set intersection gives the same count as testing
    #every read position individually
    binCounts = [0] * numBins
    for i in range(numBins):
        for chrom in bioLibCG.humanChromosomes:
            for strand in ('1', '-1'):
                binCounts[i] += len(c_s_set[chrom][strand] & c_s_bin_set[chrom][strand][i])
        print('%s\t%s' % (i, binCounts[i]))
def countWithBinsSetReads(readFN, binDir, type = 'INTRON'):
    '''Count distinct read positions that fall into each of 100 position bins.

    readFN -- tab-separated reads; columns 2-4 are strand ('+' or other),
              chromosome and start.
    binDir -- directory holding '<type>.<chrom>.<strand>.bins' files; columns
              after the first hold one tcc per bin.
    type   -- prefix of the bin files to read.

    '+' reads are stored on strand '-1' at start + 20, all other reads on
    strand '1' at start; reads on chromosomes outside
    bioLibCG.humanChromosomes are ignored.  Positions are kept in sets, so a
    position is counted once per chrom/strand.  Prints '<bin>\\t<count>'.
    '''
    numBins = 100

    #create bin sets
    c_s_bin_set = {}
    for chrom in bioLibCG.humanChromosomes:
        for strand in ('1', '-1'):
            #initialize data structure
            bin_set = dict((i, set()) for i in range(numBins))
            c_s_bin_set.setdefault(chrom, {})[strand] = bin_set

            #context manager closes the bin file once it has been read
            binFN = binDir + '/%s.%s.%s.bins' % (type, chrom, strand)
            with open(binFN, 'r') as binFile:
                for line in binFile:
                    ls = line.strip().split('\t')
                    tccs = ls[1:numBins + 1]
                    for i in range(numBins):
                        ch, st, sta, end = bioLibCG.tccSplit(tccs[i])
                        bin_set[i].update(range(sta, end + 1))

    print('creating read sets')
    #create read set
    c_s_set = {}
    for chrom in bioLibCG.humanChromosomes:
        c_s_set[chrom] = {'1': set(), '-1': set()}

    with open(readFN, 'r') as readFile:
        for line in readFile:
            ls = line.strip().split('\t')
            strand, chrom, start = ls[1:4]
            if chrom not in bioLibCG.humanChromosomes:
                continue
            start = int(start)
            if strand == '+':
                strand = '-1'
                start += 20
            else:
                strand = '1'
            c_s_set[chrom][strand].add(start)

    print('counting')
    #count for each bin; intersecting the two sets counts exactly the read
    #positions that are members of the bin
    binCounts = [0] * numBins
    for i in range(numBins):
        for chrom in bioLibCG.humanChromosomes:
            for strand in ('1', '-1'):
                reads = c_s_set[chrom][strand]
                binCounts[i] += len(reads & c_s_bin_set[chrom][strand][i])
        print('%s\t%s' % (i, binCounts[i]))
def test(tcc, wigDir):
    '''Print the 'ALL' wig value at every coordinate of tcc.

    Prints 'chrom strand' first, then one 'coordinate value' line per
    coordinate from start to end inclusive (0 where the wig has no value).
    '''
    chrom, strand, start, end = bioLibCG.tccSplit(tcc)
    print('%s %s' % (chrom, strand))

    #the sorted key list that used to be built here was never read
    coord_eLevel = cgWig.loadSingleWig(wigDir, chrom, strand, 'ALL')
    for i in range(start, end + 1):
        print('%s %s' % (i, coord_eLevel.get(i, 0)))
def getExpressionProfile(tcc, wigDict):
    '''assume 1 based

    Return {coord: value} for every coordinate of tcc (start and end both
    included), reading values from wigDict[chrom][strand] and using 0 for
    coordinates that are not present there.
    '''
    chrom, strand, first, last = bioLibCG.tccSplit(tcc)
    # the nested lookup stays inside the loop so an empty range never
    # touches wigDict at all
    return dict(
        (pos, wigDict[chrom][strand].get(pos, 0))
        for pos in range(first, last + 1)
    )
def addOne(fN):
    '''Print every tcc of fN with start and end each increased by one.

    fN is read as tab-separated text; the tcc is taken from column 0 of
    each line, split with bioLibCG.tccSplit, shifted, and re-assembled with
    bioLibCG.makeTcc. One tcc is printed per input line.
    '''
    # 'with' closes the input file even if a line fails to parse
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            chrom, strand, start, end = bioLibCG.tccSplit(ls[0])
            print(bioLibCG.makeTcc(chrom, strand, start + 1, end + 1))
def mapStartRangeCheckFunction(val, line):
    '''Check whether the tcc val overlaps the span described by line.

    The span starts at the integer in tab-separated column 3 of line and
    extends by the length of the text in column 4. Returns 0 when
    cg.simpleOverlap reports an overlap with the start/end of val, and -1
    otherwise.
    '''
    fields = line.strip().split('\t')
    lineStart = int(fields[3])
    lineEnd = lineStart + len(fields[4])

    _chrom, _strand, tccStart, tccEnd = cg.tccSplit(val)
    tccStart = int(tccStart)
    tccEnd = int(tccEnd)

    if cg.simpleOverlap(tccStart, tccEnd, lineStart, lineEnd):
        return 0
    return -1
def mapStartCheckFunction(val, line):
    '''Three-way comparison of the start of tcc val with a line's start.

    The line's start is the integer in tab-separated column 3. Returns -1
    when val starts before it, 1 when val starts after it and 0 when both
    starts are equal.
    '''
    fields = line.strip().split('\t')
    lineStart = int(fields[3])

    _chrom, _strand, tccStart, _tccEnd = cg.tccSplit(val)
    tccStart = int(tccStart)

    # sign of (tccStart - lineStart): booleans subtract to -1, 0 or 1
    return (tccStart > lineStart) - (tccStart < lineStart)
def makeWigMem(fN, assembly, format=None, name=None, directory=None):
    '''Build an in-memory hit map from a mapped-read file and write it as wig.

    format defaults to 'Bowtie'; name defaults to the bare base name of fN.
    The whole hit map is held in memory, so this is suitable for small
    mapped files.

    The first line of fN is always skipped as a header. Each remaining line
    is turned into a tcc by the parser for format; lines on which parsing
    raises AttributeError are skipped. For chromosomes listed in
    cg.acceptableChroms one hit is counted per read, at start + 20 for
    strand '1' and at start for every other strand. The result is handed to
    writeWigFromHitDict(hitDict, assembly, name, directory).
    '''
    if not name:
        name = cg.getBaseFileName(fN, naked=True)
    if not format:
        format = 'Bowtie'
    parserFunction = returnParserFunction(format)

    # NOTE(review): the returned length dict is not used in this function;
    # the call is kept in case it is relied on to reject an unknown
    # assembly before any file work is done -- confirm and drop if not.
    cg.returnChromLengthDict(assembly)

    # create hitmap of chrom and strand
    hitDict = {}  # format = chr: { strand : { coord : value } }
    with open(fN, 'r') as f:
        f.readline()  # header...file might not have one but it's one read...
        for line in f:
            try:
                lChrom, lStrand, start, _end = cg.tccSplit(parserFunction(line))
            except AttributeError:
                # deliberate best effort: unparseable lines are ignored
                continue

            lStrand = str(lStrand)
            start = int(start)
            if lChrom not in cg.acceptableChroms:
                continue

            # wig for degradome: one position per read, not the whole span
            if lStrand == '1':
                i = start + 20
            else:
                i = start

            strandHits = hitDict.setdefault(lChrom, {}).setdefault(lStrand, {})
            strandHits[i] = strandHits.get(i, 0) + 1

    # write results to wig file
    writeWigFromHitDict(hitDict, assembly, name, directory)
def svCoord(tccList, config = None):
    '''Given tcc list --> scan organism wig files and return coord:value.

    Settings are read from c.getConfig(config): 'organism', 'wigSetDir',
    'wigSetName' and 'wigChromSplit' (the string 'True' selects one wig file
    per chromosome and strand, anything else the merged per-strand file).

    For each tcc the wig file is opened through cgIndex.lineIndex, positioned
    with a binary search, and read line by line until a line reaches past
    the end of the tcc. Every coordinate of the tcc covered by a line gets
    that line's value (the part before any '.' in field 3, as an int).

    Returns a dict mapping coordinate -> value.

    NOTE(review): the result is keyed by coordinate only, so tccs on
    different chromosomes/strands that share coordinates overwrite each
    other.
    '''
    # init
    config = c.getConfig(config)
    org = config.conf['organism']
    wigDir = config.conf['wigSetDir']
    wigSetName = config.conf['wigSetName']
    splitIntoChroms = config.conf['wigChromSplit'] == 'True'

    coordDict = {}  # coord: value
    for tcc in tccList:
        chrom, strand, tccStart, tccEnd = cg.tccSplit(tcc)

        if splitIntoChroms:
            fN = wigDir + '/%s.%s.%s.wig' % (wigSetName, chrom, strand)
        else:
            fN = wigDir + '/Merge.%s.%s.wig' % (org.lower(), strand)

        fIndex = cgIndex.lineIndex(fN, header = True)
        try:
            fIndex.passCheckFunction(cgIndex.wigCheckFunction)
            fIndex.binarySearch(tcc)  # places file pointer at beginning of tcc

            for line in fIndex.file:
                fields = cg.ss(line)
                # presumably shifts a 0-based wig start to 1-based -- confirm
                lBeg = int(fields[1]) + 1
                lEnd = int(fields[2])
                lValue = int(fields[3].split('.')[0])

                # clip the line's span to the requested tcc
                if tccStart > lBeg:
                    lBeg = tccStart
                reachedEnd = tccEnd < lEnd
                if reachedEnd:
                    lEnd = tccEnd

                for i in range(lBeg, lEnd + 1):
                    coordDict[i] = lValue

                if reachedEnd:
                    break
        finally:
            # close the file and the index after use, also when a line
            # fails to parse
            fIndex.close()

    return coordDict
def peakToSeq(peakFN, extend, outFN):
    '''Write the hg19 sequence of every (extended) peak in peakFN to outFN.

    peakFN is tab-separated with a tcc in column 0. Each tcc is widened by
    extend on both sides (start - extend, end + extend; a negative extend
    shrinks it) and its sequence, as returned by GenomeFetch for 'hg19', is
    written to outFN, one sequence per line.

    extend may be given as an int or a numeric string.
    '''
    # extend is +25 for degradome and -6/-4 for oRNA
    extend = int(extend)
    gf = GenomeFetch.GenomeFetch('hg19')

    # both files are closed on exit, which also guarantees the output is
    # flushed to disk
    with open(outFN, 'w') as outF:
        with open(peakFN, 'r') as f:
            for line in f:
                ls = line.strip().split('\t')
                chrom, strand, start, end = bioLibCG.tccSplit(ls[0])
                newTcc = bioLibCG.makeTcc(chrom, strand, start - extend, end + extend)
                outF.write(gf.getSequence(newTcc) + '\n')