def updateAvgNumSS(oFN, numSims=100,
                   simDir='/home/chrisgre/scripts/simulations/hg19.U87'):
    '''Store the per-oRNA average of numSignificantSequences over simulations.

    For every i in range(numSims) the file
    <simDir>/simulation.<i>/<basename of oFN> is loaded and its
    numSignificantSequences values are summed per oRNA id.  The sum divided
    by numSims is written to the avgNumSS attribute of oFN, which is then
    saved.  An oRNA id that appears in no simulation gets 0.0.

    oFN     -- origin RNA file to update
    numSims -- number of simulation directories to read (default 100)
    simDir  -- directory that holds the simulation.<i> sub-directories

    Raises ValueError if numSims is not positive.
    '''
    # Other functions in this file accept string arguments ('True', 'None'),
    # so tolerate a string count as well.
    numSims = int(numSims)
    if numSims <= 0:
        raise ValueError('numSims must be positive, got %s' % numSims)

    bn = os.path.basename(oFN)
    print('%s %s' % ('basename', bn))
    print('getting avg for %s simulations' % numSims)

    # sum numSignificantSequences per oRNA id across all simulations
    oID_numSS = {}
    for i in range(numSims):
        simFN = '%s/simulation.%s/%s' % (simDir, i, bn)
        osNX = cgNexusFlat.Nexus(simFN, cgOriginRNAFlat.OriginRNA)
        osNX.load(['numSignificantSequences'])
        for oID in osNX.numSignificantSequences:
            oID_numSS[oID] = (oID_numSS.get(oID, 0)
                              + osNX.numSignificantSequences[oID])

    # now save it
    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load(['avgNumSS'])
    for oID in oNX.avgNumSS:
        oNX.avgNumSS[oID] = float(oID_numSS.get(oID, 0)) / float(numSims)
    oNX.save()
def updateTargetIDsFiltered(oFN, aFN, rn=None, tn=None):
    '''CAUTION: NO SELECTION BEING MADE!!!

    Rebuild the filteredTargets lists in oFN from the sID field of every
    alignment in aFN, then save oFN.  Alignments whose sID is not among the
    loaded oRNA ids are ignored.
    '''
    originNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    originNX.load(['filteredTargets'], [rn, tn])

    # All alignments are loaded; restricting the load to the oRNA ids that
    # are actually needed (a load condition on 'sID') is currently disabled.
    alignNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    alignNX.load(['sID'])

    # start every oRNA from an empty target list
    for originID in originNX.filteredTargets:
        originNX.filteredTargets[originID] = []

    # attach each alignment to the oRNA named by its sID
    for alignID in alignNX.sID:
        originID = alignNX.sID[alignID]
        try:
            bucket = originNX.filteredTargets[originID]
        except KeyError:
            # another process is taking care of this one
            continue
        bucket.append(alignID)

    originNX.save()
def appendTInfoFlat(aFN, dFN, rn = None, tn = None):
    '''Copy peak attributes from dFN onto the alignments that reference them.

    An alignment's tID names a peak id in dFN.  For every such alignment the
    peak's tcc, eLevel, tOverlap, context, repeatStatus, gScore and sequence
    are copied to tTcc, tELevel, transcriptOverlap, context, repeat, gScore
    and targetSequence respectively, and aFN is saved.
    '''
    alignNX = cgNexusFlat.Nexus(aFN, cgAlignment)
    alignNX.load(['tID', 'tTcc', 'transcriptOverlap', 'tELevel', 'context',
                  'repeat', 'targetSequence', 'gScore'], [rn, tn])

    peakNX = cgNexusFlat.Nexus(dFN, cgDegPeak.Peak)
    peakNX.load(['tOverlap', 'eLevel', 'tcc', 'context', 'repeatStatus',
                 'sequence', 'gScore'])

    # peak id -> alignment ids that point at it
    alignsByPeak = {}
    for alignID in alignNX.tID:
        peakID = alignNX.tID[alignID]
        if peakID not in alignsByPeak:
            alignsByPeak[peakID] = []
        alignsByPeak[peakID].append(alignID)

    for peakID in peakNX.tcc:
        if peakID not in alignsByPeak:
            continue
        for alignID in alignsByPeak[peakID]:
            alignNX.tTcc[alignID] = peakNX.tcc[peakID]
            alignNX.tELevel[alignID] = peakNX.eLevel[peakID]
            alignNX.transcriptOverlap[alignID] = peakNX.tOverlap[peakID]
            alignNX.context[alignID] = peakNX.context[peakID]
            alignNX.repeat[alignID] = peakNX.repeatStatus[peakID]
            alignNX.gScore[alignID] = peakNX.gScore[peakID]
            alignNX.targetSequence[alignID] = peakNX.sequence[peakID]

    alignNX.save()
def updateAvgNumTargets(oFN, numSims=10,
                        simDir='/home/chrisgre/scripts/simulations/simsk50'):
    '''Store the per-oRNA average number of filtered targets over simulations.

    For every i in range(numSims) the file
    <simDir>/simulation.<i>/<basename of oFN> is loaded and the length of
    each oRNA's filteredTargets list is summed per oRNA id.  The sum divided
    by numSims is written to avgNumSimulationTargets of oFN, which is then
    saved.  An oRNA id that appears in no simulation gets 0.0.

    oFN     -- origin RNA file to update
    numSims -- number of simulation directories to read (default 10)
    simDir  -- directory that holds the simulation.<i> sub-directories

    Raises ValueError if numSims is not positive.
    '''
    # Other functions in this file accept string arguments ('True', 'None'),
    # so tolerate a string count as well.
    numSims = int(numSims)
    if numSims <= 0:
        raise ValueError('numSims must be positive, got %s' % numSims)

    bn = os.path.basename(oFN)
    print('%s %s' % ('basename', bn))

    # total number of filtered targets per oRNA id across all simulations
    oID_numTargets = {}
    for i in range(numSims):
        simFN = '%s/simulation.%s/%s' % (simDir, i, bn)
        print(simFN)
        osNX = cgNexusFlat.Nexus(simFN, cgOriginRNAFlat.OriginRNA)
        osNX.load(['filteredTargets'])
        for oID in osNX.filteredTargets:
            oID_numTargets[oID] = (oID_numTargets.get(oID, 0)
                                   + len(osNX.filteredTargets[oID]))

    # now save it
    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load(['avgNumSimulationTargets'])
    for oID in oNX.avgNumSimulationTargets:
        oNX.avgNumSimulationTargets[oID] = (
            float(oID_numTargets.get(oID, 0)) / float(numSims))
    oNX.save()
def eLevelHistogram(oFN, aFN, oRNA=True):
    '''Show a 50-bin histogram of log10 expression levels.

    oFN  -- origin RNA file (filteredTargets and eLevel are loaded)
    aFN  -- alignment file (tELevel is loaded)
    oRNA -- a boolean, or a string that is treated as true when it contains
            'True'.  When true the eLevel of every oRNA is plotted,
            otherwise the tELevel of every filtered target.

    Raises ValueError (from math.log) if a level is zero or negative.
    '''
    # String flags are tested by substring; real booleans, including the
    # default, are used as they are ('True' in True would raise TypeError).
    if not isinstance(oRNA, bool):
        oRNA = 'True' in oRNA

    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load(['filteredTargets', 'eLevel'])
    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['tELevel'])

    histValues = []
    for oID in oNX.eLevel:
        if oRNA:
            histValues.append(oNX.eLevel[oID])
        else:
            for aID in oNX.filteredTargets[oID]:
                histValues.append(aNX.tELevel[aID])

    histVals = [math.log(x, 10) for x in histValues]
    plt.hist(histVals, 50)

    label = 'oRNA' if oRNA else 'Targets (degradome)'
    plt.title('Expression Level for %s' % label)
    plt.xlabel('log(Expression Level)')
    plt.ylabel('Number of %s' % label)
    plt.show()
def countRepeatStatusTargets(oFN, aFN, oContext=None, oType=None, rn=None,
                             tn=None):
    '''Print, per target context, how many targets are repeats and how many not.

    Only targets of oRNAs whose snrSS is at least 2.00 are counted.  One line
    is printed per context: the context, the number of targets whose repeat
    value equals True and the number whose repeat value equals False.

    oFN, aFN        -- origin RNA file and alignment file
    oContext, oType -- accepted for interface compatibility; not used
    rn, tn          -- passed through to both Nexus.load calls
    '''
    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load(
        ['snrSS', 'context', 'transcriptType', 'filteredTargets', 'gScore'],
        [rn, tn])
    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['context', 'gScore', 'repeat'], [rn, tn])

    context_rStatuss = {}
    for oID in oNX.context:
        if oNX.snrSS[oID] < 2.00:
            continue
        # gather targets' context info if oRNA is okay
        for aID in oNX.filteredTargets[oID]:
            context_rStatuss.setdefault(aNX.context[aID], []).append(
                aNX.repeat[aID])

    for context in context_rStatuss:
        statuses = context_rStatuss[context]
        print('%s %s %s' % (context, statuses.count(True),
                            statuses.count(False)))
def gZipContextECDF(oFN, aFN, imgName, oContext=None, oType=None, rn=None,
                    tn=None):
    '''Save a cumulative, normalized step histogram of target gScore per context.

    Only targets of oRNAs whose snrSS is at least 2.00 are used.  One curve
    is drawn per target context and the figure is written to imgName.

    oFN, aFN        -- origin RNA file and alignment file
    imgName         -- output image path handed to plt.savefig
    oContext, oType -- accepted for interface compatibility; not used
    rn, tn          -- passed through to both Nexus.load calls
    '''
    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load(
        ['snrSS', 'context', 'transcriptType', 'filteredTargets', 'gScore'],
        [rn, tn])
    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['context', 'gScore'], [rn, tn])

    context_gzips = {}
    for oID in oNX.context:
        if oNX.snrSS[oID] < 2.00:
            continue
        # gather targets' context info if oRNA is okay
        for aID in oNX.filteredTargets[oID]:
            context_gzips.setdefault(aNX.context[aID], []).append(
                aNX.gScore[aID])

    # plot
    # NOTE(review): 'normed' is the keyword of older matplotlib releases;
    # newer ones call it 'density' - confirm against the installed version.
    for context in context_gzips:
        plt.hist(context_gzips[context], bins=10000, cumulative=True,
                 histtype='step', normed=True, label='%s' % context)
    plt.legend()
    plt.savefig(imgName, bbox_inches='tight', pad_inches=1)
def updateTargetIDs(oFN, aFN, rn=None, tn=None):
    '''Rebuild the targets list of every loaded oRNA from the alignment file.

    Alignments are loaded with a condition that keeps only those whose sID is
    one of the loaded oRNA ids; each one is appended to the targets list of
    the oRNA its sID names, and oFN is saved.
    '''
    originNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    originNX.load(['targets'], [rn, tn])

    # ids of the oRNAs handled here, used to select alignments
    wantedIDs = set()
    wantedIDs.update(originNX.targets)
    selection = {'sID': lambda x: x in wantedIDs}

    alignNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    alignNX.load(['sID'], conditions=selection)

    # drop whatever targets were stored before
    for originID in originNX.targets:
        originNX.targets[originID] = []

    for alignID in alignNX.sID:
        originNX.targets[alignNX.sID[alignID]].append(alignID)

    originNX.save()
def updatePolySeqs(mFN, readsFN, alignFN):
    '''Print reads that consist of a microRNA plus a single-letter tail.

    For every alignment the read (looked up through sID) is compared with
    the microRNA sequence (looked up through tID).  When the read contains
    the full sequence, or the sequence with 1-7 trailing letters removed, and
    what follows it in the read is a run of 1-19 identical A, G, T or C, a
    line built by tabIt is printed.

    Nothing is saved by this function; 'polySeqs' is loaded but not written.

    NOTE(review): the source arrived without indentation; the nesting of the
    final print "TRIMMED" / break lines is a best guess - confirm.
    '''
    tim = bioLibCG.cgTimer()
    tim.start()
    # homopolymer runs of length 1..19 for each letter
    variousAs = ["A" * x for x in range(1,20)]
    variousGs = ["G" * x for x in range(1,20)]
    variousTs = ["T" * x for x in range(1,20)]
    variousCs = ["C" * x for x in range(1,20)]
    letter_variousLetters = [ ("A", variousAs), ("G", variousGs), ("T", variousTs), ("C", variousCs)]
    # number of letters trimmed from the end of the microRNA sequence
    checkRange = range(1,8)
    NX = cgNexusFlat.Nexus(mFN, miR)
    NX.load(['sequence', 'polySeqs'])
    #print 'load micro', tim.split()
    reads = cgNexusFlat.quickTable(('read','string', '.', 1))
    rNX = cgNexusFlat.Nexus(readsFN, reads)
    rNX.load(['read'])
    #print 'load reads', tim.split()
    aNX = cgNexusFlat.Nexus(alignFN, cgAlignment)
    aNX.load(['sID', 'tID'])
    #print 'load alignments', tim.split()
    for id in aNX.ids:
        theRead = rNX.read[aNX.sID[id]]
        mID = aNX.tID[id]
        microSeq = NX.sequence[mID]
        #may be a read for expression, but wont count...
        if theRead in microSeq:
            continue
        #just for expression
        # NOTE(review): an equal read was already skipped by the containment
        # test above, so this branch looks unreachable - confirm intent.
        if microSeq == theRead:
            print tabIt(microSeq, theRead, 0, 0, "N")
        #first check full
        elif microSeq in theRead and (len(theRead) != len(microSeq)):
            # text following the first occurrence of the sequence in the read
            tail = theRead.split(microSeq)[1]
            for let, variousLetters in letter_variousLetters:
                if tail in variousLetters:
                    print tabIt(microSeq, theRead, 0, len(tail), let)
        #now check trimmed (cant do [:-0])
        else:
            for i in checkRange:
                if microSeq[:-i] in theRead and (len(theRead) != len(microSeq[:-i])):
                    tail = theRead.split(microSeq[:-i])[1]
                    for let, variousLetters in letter_variousLetters:
                        if tail in variousLetters:
                            print tabIt(microSeq, theRead, i, len(tail), let)
                            print "TRIMMED"
                    break #dont trim after the first trimmed one works
def targetContextPercentageVsExpression(oFN, aFN, oContext=None, oType=None, rn=None, tn=None):
    '''Plot, per target context, the number of targets above each cutoff.

    Cutoffs run from 50 to 450 in steps of 50.  For every cutoff the filtered
    targets whose tELevel is at least the cutoff are counted per target
    context; one line per context is drawn and the figure is shown.

    oContext, oType -- when given, only oRNAs with this context /
                       transcriptType contribute targets
    rn, tn          -- accepted but not used
    '''
    originNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    originNX.load(['context', 'transcriptType', 'filteredTargets'])
    alignNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    alignNX.load(['context', 'tELevel'])

    # target context -> {cutoff: number of targets at or above it}
    countsByContext = {}
    for cutoff in range(50, 500, 50):
        for originID in originNX.context:
            originContext = originNX.context[originID]
            originType = originNX.transcriptType[originID]
            if oContext and oContext != originContext:
                continue
            if oType and oType != originType:
                continue

            for alignID in originNX.filteredTargets[originID]:
                if alignNX.tELevel[alignID] < cutoff:
                    continue
                perCutoff = countsByContext.setdefault(
                    alignNX.context[alignID], {})
                perCutoff[cutoff] = perCutoff.get(cutoff, 0) + 1

    # one line per target context
    lineHandles = []
    lineLabels = []
    for targetContext in countsByContext:
        perCutoff = countsByContext[targetContext]
        xs = sorted(perCutoff.keys())
        ys = [perCutoff[cutoff] for cutoff in xs]
        lineHandles.append(plt.plot(xs, ys))
        lineLabels.append(targetContext)

    plt.legend(lineHandles, lineLabels)
    plt.title(
        'oRNA Targets\' Context Proportion Stability w/ Expression Increase')
    plt.xlabel('Degradome Expression Cutoff')
    plt.ylabel('Number of oRNA Targets')
    plt.show()
def phastScoreByNT(oFN, aFN, oIDFilter=None):
    '''Plot PhastCons score against nucleotide position for conserved oRNAs.

    An oRNA is used when its average phastScore is at least .90 and its snrSS
    is at least 2.  Positions that are mismatched in any filtered target are
    drawn in red, shifted right by .2; all other positions are drawn in blue.

    oIDFilter -- when given (truthy), only the oRNA with id int(oIDFilter)
                 is plotted

    oRNAs with an empty score list are skipped, and the plot is still shown
    when no position qualifies.
    '''
    wantedID = int(oIDFilter) if oIDFilter else None

    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load(['phastScores', 'snrSS', 'filteredTargets'])
    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['mismatchPositions'])

    misPositionsX = []
    misScoresY = []
    positionsX = []
    scoresY = []
    for oID in oNX.phastScores:
        scores = oNX.phastScores[oID]
        if len(scores) == 0:
            # no scores: the average is undefined
            continue
        avgScore = sum(scores) / float(len(scores))

        # filter
        if (avgScore < .90) or (oNX.snrSS[oID] < 2):
            continue
        if wantedID is not None and oID != wantedID:
            continue

        # get consolidated mismatches
        misPositions = set()
        for aID in oNX.filteredTargets[oID]:
            misPositions.update(aNX.mismatchPositions[aID])

        for i, pScore in enumerate(scores):
            if i in misPositions:
                # 1 is for 0BASE, .2 is for differentiating between mis and reg
                misPositionsX.append(i + 1.2)
                misScoresY.append(pScore)
            else:
                positionsX.append(i + 1)
                scoresY.append(pScore)

    # shade one band per position; nothing to shade when no position qualified
    highestNT = max(positionsX) if positionsX else 0
    for i in range(1, highestNT + 1):
        plt.axvspan(i - .15, i + .35, facecolor='g', alpha=.25)

    plt.plot(positionsX, scoresY, 'bo')
    plt.plot(misPositionsX, misScoresY, 'ro')
    plt.ylim(0, 1.1)
    plt.xlim(0, 24)
    plt.title('Conservation by Position of Conserved oRNA')
    plt.ylabel('PhastCons Score')
    plt.xlabel('Nucleotide Position')
    plt.show()
def filterTargets(oFN, aFN, inTranscript, misLevel, centerLevel, minCenterLevel, rn=None, tn=None):
    '''Rebuild filteredTargets from targets, keeping alignments that pass.

    An alignment is kept when
      * inTranscript is false, or its transcriptOverlap is true,
      * its mismatchStatus[misLevel] is false, and
      * its centerExpression[centerLevel] is at least minCenterLevel.

    inTranscript may be the string 'True' or 'False'; misLevel and
    centerLevel are converted with int(), minCenterLevel with float().
    '''
    inTranscript = {'True': True, 'False': False}.get(inTranscript,
                                                     inTranscript)
    misLevel = int(misLevel)
    centerLevel = int(centerLevel)
    minCenterLevel = float(minCenterLevel)

    originNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    originNX.load(['filteredTargets', 'targets'], [rn, tn])

    print('%s %s %s %s %s %s %s %s' % (inTranscript, misLevel, centerLevel,
                                       minCenterLevel, oFN, aFN, rn, tn))

    # only alignments that are targets of a loaded oRNA need to be read
    targets = set()
    for originID in originNX.targets:
        targets.update(originNX.targets[originID])
    selection = {'ID': lambda x: x in targets}

    alignNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    alignNX.load(['transcriptOverlap', 'mismatchStatus', 'centerExpression'],
                 conditions=selection)

    for originID in originNX.filteredTargets:
        originNX.filteredTargets[originID] = []
        for alignID in originNX.targets[originID]:
            # transcriptOverlap
            if inTranscript and not alignNX.transcriptOverlap[alignID]:
                continue
            # misLevel
            if alignNX.mismatchStatus[alignID][misLevel]:
                continue
            # centerLevel
            if alignNX.centerExpression[alignID][centerLevel] < minCenterLevel:
                continue
            originNX.filteredTargets[originID].append(alignID)

    originNX.save()
def conservedHisto(oFN):
    '''Show a 50-bin histogram of the average PhastCons score of each oRNA.

    The number of oRNAs is printed before the plot is shown.
    '''
    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load(['phastScores'])

    scores = []
    for oID in oNX.phastScores:
        perNT = oNX.phastScores[oID]
        scores.append(sum(perNT) / float(len(perNT)))

    print(len(scores))

    # Labels describe the per-oRNA average that is plotted.  (The
    # per-nucleotide labels were always replaced by these before display.)
    plt.title('PhastCons Scores By oRNA')
    plt.ylabel('Number of oRNA')
    plt.xlabel('PhastCons Score Average')
    plt.hist(scores, 50)
    plt.show()
def updateTranscriptOverlap(oFN, wigDir, chrom, strand, rn=None, tn=None):
    '''Set tOverlap for the peaks that lie on the given chromosome and strand.

    The transcript wig for chrom is loaded on the strand opposite to the one
    given.  A peak is handled when its chromosome is chrom and its own strand,
    flipped the same way, equals that opposite strand; tOverlap becomes True
    when any coordinate from start to end (inclusive) is in the wig, else
    False.  Other peaks are left untouched.  oFN is saved at the end.
    '''
    peakNX = cgNexusFlat.Nexus(oFN, cgDegPeak.Peak)
    peakNX.load(['tOverlap', 'tcc'], [rn, tn])

    # load the AS wig file for this degradome strand
    strand = '-1' if strand == '1' else '1'
    coord_transcripts = cgWig.loadSingleWigTranscript(wigDir, chrom, strand,
                                                      'transcript')

    for peakID in peakNX.tOverlap:
        peakChrom, peakStrand, start, end = bioLibCG.tccSplit(
            peakNX.tcc[peakID])
        flippedStrand = '-1' if peakStrand == '1' else '1'
        if peakChrom != chrom or flippedStrand != strand:
            continue

        peakNX.tOverlap[peakID] = False
        if any(pos in coord_transcripts for pos in xrange(start, end + 1)):
            peakNX.tOverlap[peakID] = True

    peakNX.save()
def markMismatchedPairs(aFN, rn=None, tn=None):
    '''Set mismatchStatus to three flags, one per position window.

    Flag 0, 1 and 2 are True when any mismatch position of the alignment
    falls in 8-11, 7-12 and 6-13 respectively.  aFN is saved afterwards.

    NOTE(review): a second function with this name is defined further down
    the file and replaces this one when the module is loaded.
    '''
    alignNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    alignNX.load(['mismatchStatus', 'mismatchPositions'], [rn, tn])

    # remember the small locations are 0-based, so 10 is 9
    windows = (range(8, 12), range(7, 13), range(6, 14))

    for alignID in alignNX.mismatchStatus:
        flags = [False, False, False]
        for level, window in enumerate(windows):
            for position in window:
                if position in alignNX.mismatchPositions[alignID]:
                    flags[level] = True
                    break
        alignNX.mismatchStatus[alignID] = flags

    alignNX.save()
def getGeneGC(genePropFN, rn=None, tn=None):
    '''Compute and store the G+C fraction of every gene's spans.

    For each gene the sequence of every (start, end) span is fetched from
    hg19, and geneGCContent is set to the number of 'G' and 'C' characters
    divided by the total sequence length.  genePropFN is saved afterwards.
    '''
    fetcher = gf.GenomeFetch('hg19')

    geneNX = cgNexusFlat.Nexus(genePropFN, geneProperty)
    geneNX.load([
        'geneName', 'geneChrom', 'geneStrand', 'geneStarts', 'geneEnds',
        'geneGCContent'
    ], [rn, tn])

    for geneID in geneNX.ids:
        spans = zip(geneNX.geneStarts[geneID], geneNX.geneEnds[geneID])
        spanTccs = []
        for spanStart, spanEnd in spans:
            spanTccs.append('%s:%s:%s:%s' % (geneNX.geneChrom[geneID],
                                             geneNX.geneStrand[geneID],
                                             spanStart, spanEnd))

        basesSeen = 0
        gcSeen = 0
        for tcc in spanTccs:
            spanSeq = fetcher.getSequence(tcc)
            basesSeen += len(spanSeq)
            gcSeen += spanSeq.count('G') + spanSeq.count('C')

        # NOTE(review): raises ZeroDivisionError for a gene without spans or
        # with only empty sequences - confirm that cannot happen.
        geneNX.geneGCContent[geneID] = float(gcSeen) / basesSeen

    geneNX.save()
def updateGeneData(geneRanges, genePropFN):
    '''Fill chromosome, strand, starts and ends of genes from a ranges file.

    geneRanges -- tab-separated file with the columns: gene name, chromosome,
                  strand, comma-separated starts, comma-separated ends
    genePropFN -- gene property file; genes are matched on geneName

    Lines naming a gene that is not in genePropFN are ignored.  genePropFN is
    saved afterwards.  Raises ValueError if a start or end is not an integer.
    '''
    NX = cgNexusFlat.Nexus(genePropFN, geneProperty)
    NX.load(['geneName', 'geneChrom', 'geneStrand', 'geneStarts', 'geneEnds'])

    # make inverse dictionary
    gName_nID = {}
    for nID in NX.ids:
        gName_nID[NX.geneName[nID]] = nID

    # 'with' closes the file even when a line fails to parse
    with open(geneRanges, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            geneName, sChrom, sStrand = ls[0], ls[1], ls[2]
            geneStarts = [int(x) for x in ls[3].split(',')]
            geneEnds = [int(x) for x in ls[4].split(',')]

            # compare with None so that an id of 0 is not mistaken for a miss
            nID = gName_nID.get(geneName)
            if nID is None:
                continue
            NX.geneChrom[nID] = sChrom
            NX.geneStrand[nID] = sStrand
            NX.geneStarts[nID] = geneStarts
            NX.geneEnds[nID] = geneEnds

    NX.save()
def filterOrigin(oFN, rn=None, tn=None):
    '''Set passedFilter for every oRNA and save oFN.

    An oRNA passes when its entropy is not below 1.15, its endContigLength
    and totalContigLength are not above 6, it is not a sequence duplicate and
    it has at least one filtered target.
    '''
    originNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    originNX.load([
        'filteredTargets', 'endContigLength', 'totalContigLength',
        'sequenceDuplicate', 'passedFilter', 'entropy'
    ], [rn, tn])

    for originID in originNX.entropy:
        originNX.passedFilter[originID] = False

        # evaluated left to right; the first true condition rejects the oRNA
        rejected = (originNX.entropy[originID] < 1.15
                    or originNX.endContigLength[originID] > 6
                    or originNX.totalContigLength[originID] > 6
                    or originNX.sequenceDuplicate[originID]
                    or not originNX.filteredTargets[originID])
        if not rejected:
            originNX.passedFilter[originID] = True

    originNX.save()
def getGeneLength(geneRanges, genePropFN):
    '''Store the summed span length of every gene listed in a ranges file.

    geneRanges -- tab-separated file with the columns: gene name, chromosome,
                  strand, comma-separated starts, comma-separated ends
    genePropFN -- gene property file; genes are matched on geneName

    geneLength is set to the sum of (end - start) over the gene's spans.
    Lines naming a gene that is not in genePropFN are ignored.  genePropFN is
    saved afterwards.  Raises ValueError if a start or end is not an integer.
    '''
    NX = cgNexusFlat.Nexus(genePropFN, geneProperty)
    NX.load(['geneName', 'geneLength'])

    # make inverse dictionary
    gName_nID = {}
    for nID in NX.ids:
        gName_nID[NX.geneName[nID]] = nID

    # 'with' closes the file even when a line fails to parse
    with open(geneRanges, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            geneName = ls[0]
            geneStarts = [int(x) for x in ls[3].split(',')]
            geneEnds = [int(x) for x in ls[4].split(',')]
            totalLength = sum(
                [end - start for start, end in zip(geneStarts, geneEnds)])

            # compare with None so that an id of 0 is not mistaken for a miss
            nID = gName_nID.get(geneName)
            if nID is not None:
                NX.geneLength[nID] = totalLength

    NX.save()
def totalSNRSS(oFN, SNRToggle=False): if SNRToggle == 'True': SNRToggle = True else: SNRToggle = False oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA) oNX.load(['snrSS', 'numSignificantSequences', 'avgNumSS']) highSNRs = [] for oID in oNX.snrSS: snrSS = oNX.snrSS[oID] if snrSS > 2: highSNRs.append(snrSS) if SNRToggle: try: avgSNR = sum(highSNRs) / len(highSNRs) print avgSNR, except: print '0.0', else: print len(highSNRs),
def totalSNR(oFN):
    '''Print target totals and their ratio for the oRNAs that passed the filter.

    Printed: the file name, the number of filtered targets, the summed
    avgNumSimulationTargets, their ratio (labelled SNR), the number of
    oRNAs counted and how many of them have snr above 2.
    '''
    originNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    originNX.load(
        ['snr', 'filteredTargets', 'avgNumSimulationTargets', 'passedFilter'])

    runTargets = 0
    simTargets = 0
    perOriginSimTotals = []
    numCounted = 0
    numHighSNR = 0

    for originID in originNX.avgNumSimulationTargets:
        if not originNX.passedFilter[originID]:
            continue

        avgSim = originNX.avgNumSimulationTargets[originID]
        runTargets += len(originNX.filteredTargets[originID])
        perOriginSimTotals.append(avgSim * 10)
        simTargets += avgSim
        if originNX.snr[originID] > 2:
            numHighSNR += 1
        numCounted += 1

    print(oFN)
    print('%s %s' % ('Total Number Targets for my run:', runTargets))
    print('%s %s' % ('Total Number Targets for Simulations:', simTargets))
    print('%s %s' % ('SNR', float(runTargets) / float(simTargets)))
    print('%s %s %s %s' % ('Total oRNA:', numCounted,
                           'Total oRNA w/ SNR > 2:', numHighSNR))
    print('\n')
def updateFiltered(oFN):
    '''Set passedFilter for every oRNA and save oFN.

    An oRNA passes when it has at least one filtered target, its
    endContigLength and totalContigLength are not above 6, its entropy is not
    below 1.2 and it is not a sequence duplicate.
    '''
    originNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    originNX.load([
        'filteredTargets', 'endContigLength', 'totalContigLength', 'entropy',
        'sequenceDuplicate', 'passedFilter'
    ])

    for originID in originNX.passedFilter:
        originNX.passedFilter[originID] = False

        # evaluated left to right; the first true condition rejects the oRNA
        rejected = (len(originNX.filteredTargets[originID]) == 0
                    or originNX.endContigLength[originID] > 6
                    or originNX.totalContigLength[originID] > 6
                    or originNX.entropy[originID] < 1.2
                    or originNX.sequenceDuplicate[originID])
        if not rejected:
            originNX.passedFilter[originID] = True

    originNX.save()
def correlationSNR(oFN):
    '''Scatter-plot snrSS against the average PhastCons score of each oRNA.

    Before plotting, the number of oRNAs whose average score is above .8 and
    the total number of oRNAs are printed.
    '''
    originNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    originNX.load(['phastScores', 'snrSS'])

    snrX = []
    scoreY = []
    for originID in originNX.phastScores:
        snr = originNX.snrSS[originID]
        perNT = originNX.phastScores[originID]
        avgScore = sum(perNT) / float(len(perNT))
        # one point per oRNA (not one per nucleotide)
        snrX.append(snr)
        scoreY.append(avgScore)

    conserved = [bool(score > .8) for score in scoreY]
    print(conserved.count(True))
    print(len(conserved))

    plt.title('SNR vs PhastCons Score')
    plt.ylabel('Avg PhastCons Score of oRNA')
    plt.xlabel('SNR')
    plt.plot(snrX, scoreY, 'ro')
    plt.show()
def oRNAContextPie(oFN, imgName):
    '''REMEMBER!!! Have to do with grouped results...

    Save a pie chart of the context of every oRNA whose snrSS is at least
    2.00 to imgName.  The context of each counted oRNA is printed.
    '''
    originNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    originNX.load(['context', 'snrSS'])

    tally = {}
    for originID in originNX.context:
        if not originNX.snrSS[originID] < 2.00:
            ctx = originNX.context[originID]
            print(ctx)
            tally[ctx] = tally.get(ctx, 0) + 1

    orderedContexts = sorted(tally.keys())
    fracs = pieFractions([tally[ctx] for ctx in orderedContexts])

    # add numbers to labels
    labels = ['%s (%s)' % (ctx, tally[ctx]) for ctx in orderedContexts]

    plt.title('Context of oRNA (results > 2.00 SNR)')
    plt.pie(fracs, labels=labels, shadow=True)
    plt.savefig(imgName, bbox_inches='tight', pad_inches=1)
def updateEndContig(oFN, rn = None, tn = None):
    '''Store the longer of the two end homopolymer runs of each sequence.

    endContigLength is set to the length of the run of identical letters at
    the start of the sequence or at its end, whichever is longer (at least
    1).  oFN is saved afterwards.
    '''
    def leadingRun(chars):
        # number of consecutive items at the front equal to their neighbour
        runLength = 1
        for previous, current in zip(chars, chars[1:]):
            if current != previous:
                break
            runLength += 1
        return runLength

    originNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    originNX.load(['sequence', 'endContigLength'], [rn, tn])

    for originID in originNX.sequence:
        seq = originNX.sequence[originID]
        run5 = leadingRun(seq)          # 5' end
        run3 = leadingRun(seq[::-1])    # 3' end
        originNX.endContigLength[originID] = max(run5, run3)

    originNX.save()
def loadSeqs(seqFN):
    '''Load length and sequence from seqFN and print the entry with id 100000.'''
    probeID = 100000
    seqNX = cgNexusFlat.Nexus(seqFN, Seq)
    seqNX.load(['length', 'sequence'])
    print('%s %s' % (seqNX.length[probeID], seqNX.sequence[probeID]))
    print('done loading')
def markMismatchedPairs(aFN, rn=None, tn=None):
    '''Set mismatchStatus to three flags, one per position window.

    Flag 0, 1 and 2 are True when any mismatch position of the alignment
    falls in 9-12, 8-13 and 7-14 respectively.  aFN is saved afterwards.

    NOTE(review): an earlier function with this name in the file uses windows
    one position lower (8-11, 7-12, 6-13) and calls them 0-based; confirm
    which indexing mismatchPositions uses.
    '''
    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['mismatchStatus', 'mismatchPositions'], [rn, tn])

    # low, mid and high windows; the same for every alignment
    windows = (range(9, 13), range(8, 14), range(7, 15))

    for aID in aNX.mismatchStatus:
        # every window is tested against this alignment's own positions
        positions = aNX.mismatchPositions[aID]
        aNX.mismatchStatus[aID] = [
            any(i in positions for i in window) for window in windows
        ]

    aNX.save()
def updateSharedTargets(oFN, rn=None, tn=None):
    '''Give every oRNA the union of the targets of all oRNAs with its sequence.

    Just because there are duplicate sequences does not mean that the genomic
    position of each result is the correct one, so oRNAs that share a
    sequence should share their targets.  The filtered targets are pooled per
    sequence and the pooled list is written back to every oRNA with that
    sequence.  oFN is saved afterwards.
    '''
    originNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    originNX.load(['sequence', 'filteredTargets'], [rn, tn])

    # sequence -> set of every target seen for it
    targetsBySeq = {}
    for originID in originNX.sequence:
        seq = originNX.sequence[originID]
        for targetID in originNX.filteredTargets[originID]:
            if seq not in targetsBySeq:
                targetsBySeq[seq] = set()
            targetsBySeq[seq].add(targetID)

    for originID in originNX.sequence:
        pooled = targetsBySeq.get(originNX.sequence[originID])
        if pooled is None:
            originNX.filteredTargets[originID] = []
        else:
            originNX.filteredTargets[originID] = list(pooled)

    originNX.save()
def oRNATypePie(oFN, imgName, rn=None, tn=None):
    '''Save a pie chart of the transcript types of oRNAs with snrSS >= 2.00.

    Every entry of an oRNA's transcriptTypes is counted, so one oRNA can add
    to several slices.  The chart is written to imgName.

    rn, tn -- passed through to Nexus.load
    '''
    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load(['snrSS', 'transcriptType', 'transcriptTypes'], [rn, tn])

    context_count = {}
    for oID in oNX.transcriptType:
        if oNX.snrSS[oID] < 2.00:
            continue
        for con in oNX.transcriptTypes[oID]:
            context_count[con] = context_count.get(con, 0) + 1

    labels = sorted(context_count.keys())
    counts = [context_count[x] for x in labels]
    fracs = pieFractions(counts)

    # add numbers to labels
    labels = ['%s (%s)' % (x, context_count[x]) for x in labels]

    # plot
    plt.pie(fracs, labels=labels, shadow=True)
    plt.savefig(imgName, bbox_inches='tight', pad_inches=1)
def updateContext(oFN, wigDir, chrom, strand, rn=None, tn=None):
    '''Set the context of the peaks that lie on the given chromosome and strand.

    The context wig for chrom/strand is loaded.  For every peak whose
    chromosome is chrom and whose flipped strand equals strand, the contexts
    stored at the peak's start coordinate ('INTER' when there are none) are
    split on commas and reduced to one by the dominantSpotter.  oFN is saved
    afterwards.
    '''
    peakNX = cgNexusFlat.Nexus(oFN, degPeak.degPeak)
    peakNX.load(['context', 'tcc'], [rn, tn])

    print('loading wig')
    coord_contexts = cgWig.loadSingleWigContext(wigDir, chrom, strand,
                                                'context')
    print('done loading')

    contextOrder = [
        'C_EXON', 'C_3UTR', 'C_5UTR', 'NC_EXON', 'NC_3UTR', 'NC_5UTR',
        'C_INTRON', 'NC_INTRON', 'INTER'
    ]
    spotter = bioLibCG.dominantSpotter(contextOrder)

    for peakID in peakNX.tcc:
        peakChrom, peakStrand, start, end = bioLibCG.tccSplit(
            peakNX.tcc[peakID])

        # deg wigs is AS to actual clipping site
        wigStrand = '-1' if peakStrand == '1' else '1'
        if peakChrom != chrom or wigStrand != strand:
            continue

        rawContexts = coord_contexts.get(start, 'INTER')
        peakNX.context[peakID] = spotter.spotItem(rawContexts.split(','))

    peakNX.save()