def testOverlaps(dataFN, oFF):
    '''Print the ids of records whose tcc range overlaps an earlier record.

    Records are visited in Nexus iteration order. A record is reported when
    any coordinate of its inclusive [start, end] range was already claimed by
    a previously accepted record on the same chromosome and strand; otherwise
    its coordinates are claimed. Reported records claim nothing.
    '''
    dataNX = Nexus(dataFN, oFF)
    dataNX.load(['tcc'])

    overlappingIDs = set()
    claimed = {}  # chrom -> strand -> set of claimed coordinates
    while dataNX.nextID():
        chrom, strand, start, end = bioLibCG.tccSplit(dataNX.tcc)
        occupied = claimed.setdefault(chrom, {}).setdefault(strand, set())
        span = range(start, end + 1)

        # tag the record, or claim its coordinates
        if any(pos in occupied for pos in span):
            overlappingIDs.add(dataNX.id)
        else:
            occupied.update(span)

    print('%s %s' % ("THESE OVERLAP", overlappingIDs))
def check_ORNA_in_ago(oFN, oFF, agoFN, clippingAmount = 1):
    '''Count, per oRNA, how many reads in a 4-line-packet file contain it.

    oFN, oFF       -- data file and format file handed to Nexus
    agoFN          -- read file consumed in packets of 4 lines via
                      nextFilePacket; the second line of each packet is kept
    clippingAmount -- when > 0, this many characters are trimmed from both
                      ends of every oRNA sequence before matching

    Prints one "id<TAB>count<TAB>geneNames" line per oRNA with at least one
    hit, followed by the sum of all counts. Returns None.
    '''
    NX = Nexus(oFN, oFF)
    NX.load(['sequence', 'geneNames'])

    #make truncated sequences
    id_sequence = NX.createMap('id', 'sequence')
    if clippingAmount > 0:
        id_sequence = dict((oID, seq[clippingAmount:-clippingAmount])
                           for oID, seq in id_sequence.items())

    #get fastq sequences; "with" closes the file even if a packet read fails
    agoSeqs = []
    with open(agoFN, 'r') as agoF:
        while True:
            fPacket = nextFilePacket(agoF, 4)
            if not fPacket:
                break
            agoSeqs.append(fPacket[1])

    #count for each oRNA (substring match against every read)
    id_count = {}
    for oID, seq in id_sequence.items():
        hits = sum(1 for agoSeq in agoSeqs if seq in agoSeq)
        if hits:
            id_count[oID] = hits

    #out
    totalCount = 0
    for oID, count in id_count.items():
        NX.id = oID  # position Nexus on this record so geneNames can be read
        print('%s\t%s\t%s' % (oID, count, NX.geneNames))
        totalCount += count
    print(totalCount)
def testMap(fN, fF):
    '''Print every key of the otherIDs -> geneName map with up to five values.'''
    nexus = Nexus(fN, fF)
    nexus.load(['geneName', 'numReads', 'otherIDs'])

    # third argument False: the map is not one-to-one
    otherID_geneNames = nexus.createMap('otherIDs', 'geneName', False)
    for key, values in otherID_geneNames.iteritems():
        print('%s %s' % (key, values[:5]))
def updateGeneName(dFN, fFN, wigDir, chrom, strand, prefix, switchStrand=False):
    '''Set geneNames on every record from a transcript wig lookup.

    dFN, fFN     -- data file and format file handed to Nexus
    wigDir, chrom, strand, prefix
                 -- forwarded to cgWig.loadSingleWigTranscript
    switchStrand -- when true the strand is negated before loading the wig

    For each record the start coordinate of its tcc is looked up in the
    loaded wig. A value of "NONE" yields an empty list; anything else
    (including the "." default for a missing coordinate) is split on commas.
    '''
    NX = Nexus(dFN, fFN)
    NX.load(['geneNames', 'tcc'])

    if switchStrand:
        try:
            strand = -strand
        except TypeError:
            # strand was passed as a string such as '1'; negate its int value
            strand = -int(strand)
    strand = str(strand)

    coord_gName = cgWig.loadSingleWigTranscript(wigDir, chrom, strand, prefix)

    while NX.nextID():
        # NOTE(review): records are not filtered by chrom/strand here, so the
        # input presumably holds a single chrom/strand already -- confirm.
        # Separate names keep the chrom/strand parameters intact.
        recChrom, recStrand, start, end = bioLibCG.tccSplit(NX.tcc)
        overlappingGenes = coord_gName.get(start, ".")
        if overlappingGenes == "NONE":
            NX.geneNames = []
        else:
            NX.geneNames = overlappingGenes.split(',')

    NX.save()
def testNX(fN, fF):
    '''Iterate a Nexus built with an attribute string, overwrite four fields, save.'''
    nexus = Nexus(fN, fF, 'geneName numReads isCoding otherIDs')
    print('START LOOPING')

    for record in nexus:
        record.isCoding = True
        record.otherIDs = range(10)
        record.geneName = "testAuto"
        record.numReads = 300

    nexus.save()
def getTotalSpots(allFN, formatFN):
    '''Print the sums of numNSpots and numReads over all records.'''
    NX = Nexus(allFN, formatFN)
    NX.load(['numNSpots', 'numReads'])

    spotSum = 0
    readSum = 0
    while NX.nextID():
        spotSum += NX.numNSpots
        readSum += NX.numReads

    print('%s %s' % ('spotsNSpots', spotSum))
    print('%s %s' % ('numReads', readSum))
def updateContext(fN, fF, wigDir, chrom, strand, switchStrand = False):
    '''Set the context of records on one chrom/strand from a context wig.

    fN, fF       -- data file and format file handed to Nexus
    wigDir       -- directory forwarded to cgWig.loadSingleWigContext
    chrom        -- chromosome whose records are updated
    strand       -- strand (int or numeric string); negated when switchStrand
    switchStrand -- when true, both the wig strand and each record's strand
                    are negated before they are compared

    For every matching record the start coordinate is looked up in the wig
    (default 'INTER'), the comma-separated contexts are split, and
    dominantSpotter.spotItem picks the one stored in NX.context.
    Records on other chromosomes/strands are left untouched.
    '''
    NX = Nexus(fN, fF)
    NX.load(['tcc', 'context'])

    strand = str(-int(strand)) if switchStrand else str(strand)

    print('loading wig')
    coord_contexts = cgWig.loadSingleWigContext(wigDir, chrom, strand, 'context')
    print('done loading')

    ds = bioLibCG.dominantSpotter(['C_EXON', 'C_3UTR', 'C_5UTR', 'NC_EXON', 'NC_3UTR', 'NC_5UTR', 'C_INTRON', 'NC_INTRON', 'INTER'])

    while NX.nextID():
        oChrom, oStrand, start, end = bioLibCG.tccSplit(NX.tcc)

        #deg wigs is AS to actual clipping site
        # Flip the record's own strand. Flipping the already-negated `strand`
        # parameter here gave a value that could never equal it, so no record
        # was ever updated when switchStrand was set.
        if switchStrand:
            oStrand = str(-int(oStrand))
        else:
            oStrand = str(oStrand)

        if oChrom == chrom and oStrand == strand:
            contexts = coord_contexts.get(start, 'INTER').split(',')
            NX.context = ds.spotItem(contexts)

    NX.save()
def updateSNR(oFN, oFF):
    '''Store numUFBS / avgNumSimSS in snr for every record.

    A zero denominator is replaced by 0.01.
    '''
    NX = Nexus(oFN, oFF)
    NX.load(['avgNumSimSS', 'numUFBS', 'snr'])

    while NX.nextID():
        try:
            ratio = NX.numUFBS / NX.avgNumSimSS
        except ZeroDivisionError:
            ratio = NX.numUFBS / 0.01
        NX.snr = ratio

    NX.save()
def updateSimilarSiblings(oFN, oFF, frameLength):
    '''Write each record's sibling set as computed by getSimilarORNASets.

    Every id in a set returned by getSimilarORNASets receives its own list
    copy of that whole set in siblingSet.
    '''
    dataNX = Nexus(oFN, oFF)
    dataNX.load(['sequence', 'siblingSet'])

    oID_sequence = dataNX.createMap('id', 'sequence')
    for group in getSimilarORNASets(oID_sequence, frameLength):
        for member in group:
            dataNX.id = member
            dataNX.siblingSet = list(group)

    dataNX.save()
def updateSNR(oFN, oFF):
    '''Compute snr = numUFBS / avgNumSimSS per record and save.

    When avgNumSimSS is zero the division is redone with 0.01 instead.
    '''
    nexus = Nexus(oFN, oFF)
    nexus.load(['avgNumSimSS', 'numUFBS', 'snr'])

    zeroNoiseFloor = 0.01  # stand-in denominator for a zero average
    while nexus.nextID():
        try:
            nexus.snr = nexus.numUFBS / nexus.avgNumSimSS
        except ZeroDivisionError:
            nexus.snr = nexus.numUFBS / zeroNoiseFloor

    nexus.save()
def updateScores(fN, fFN):
    '''Recompute score for every record with calculateAlignmentScore.'''
    # fields passed, in this order, to calculateAlignmentScore
    scoreInputs = ['numNormMatches', 'numGUs', 'numMismatches', 'numQGaps',
                   'numRGaps', 'numExtensionsQ', 'numExtensionsR']

    NX = Nexus(fN, fFN)
    NX.load(scoreInputs + ['score'])

    while NX.nextID():
        inputValues = [getattr(NX, field) for field in scoreInputs]
        NX.score = calculateAlignmentScore(*inputValues)

    NX.save()
def filterCenterProperties(fN, fFN):
    '''Fill mismatchPass and centerPass for every record.

    mismatchPass comes from checkMismatchCenter (which also receives the
    sigMask); centerPass comes from checkPeakCenter. Both get the query,
    reference, their lengths and the [start, end] ranges.
    '''
    NX = Nexus(fN, fFN)
    NX.load(['query', 'reference', 'qStart', 'qEnd', 'rStart', 'rEnd', 'qLen', 'rLen', 'sigMask', 'centerPass', 'mismatchPass'])

    while NX.nextID():
        # arguments common to both checks
        shared = (NX.query, NX.reference, NX.qLen, NX.rLen,
                  [NX.qStart, NX.qEnd], [NX.rStart, NX.rEnd])
        NX.mismatchPass = checkMismatchCenter(*(shared + (NX.sigMask,)))
        NX.centerPass = checkPeakCenter(*shared)

    NX.save()
def testAutoLoad(fN, ff):
    '''Overwrite four fields on every record without an explicit load() call.'''
    nexus = Nexus(fN, ff)
    print('START LOOPING')

    while nexus.nextID():
        nexus.isCoding = True
        nexus.otherIDs = range(10)
        nexus.geneName = "testAuto"
        nexus.numReads = 300

    nexus.save()
def check_ORNA_in_ago(oFN, oFF, agoFN, clippingAmount=1):
    '''Count, per oRNA, how many reads in a 4-line-packet file contain it.

    oFN, oFF       -- data file and format file handed to Nexus
    agoFN          -- read file consumed in packets of 4 lines via
                      nextFilePacket; the second line of each packet is kept
    clippingAmount -- when > 0, this many characters are trimmed from both
                      ends of every oRNA sequence before matching

    Prints one "id<TAB>count<TAB>geneNames" line per oRNA with at least one
    hit, followed by the sum of all counts. Returns None.
    '''
    NX = Nexus(oFN, oFF)
    NX.load(['sequence', 'geneNames'])

    #make truncated sequences
    id_sequence = NX.createMap('id', 'sequence')
    if clippingAmount > 0:
        id_sequence = dict((oID, seq[clippingAmount:-clippingAmount])
                           for oID, seq in id_sequence.items())

    #get fastq sequences; "with" closes the file even if a packet read fails
    agoSeqs = []
    with open(agoFN, 'r') as agoF:
        while True:
            fPacket = nextFilePacket(agoF, 4)
            if not fPacket:
                break
            agoSeqs.append(fPacket[1])

    #count for each oRNA (substring match against every read)
    id_count = {}
    for oID, seq in id_sequence.items():
        hits = sum(1 for agoSeq in agoSeqs if seq in agoSeq)
        if hits:
            id_count[oID] = hits

    #out
    totalCount = 0
    for oID, count in id_count.items():
        NX.id = oID  # position Nexus on this record so geneNames can be read
        print('%s\t%s\t%s' % (oID, count, NX.geneNames))
        totalCount += count
    print(totalCount)
def collectData(fN, fFN, outFN, logHeight = False):
    '''x(pHeight) v. y(-log10(pval)). 0.0 is 1.0e-100

    Writes one "x<TAB>y" line to outFN per record with a non-negative
    pValBin. x is eLevel, or log10(eLevel) when logHeight is true; y is
    -log10(pValBin) with a pValBin of exactly 0.0 replaced by 1.0e-100.
    If the logarithm of pValBin raises ValueError, x and the p-value are
    printed and the function returns early.
    '''
    NX = Nexus(fN, fFN)
    NX.load(['eLevel', 'pValBin'])

    # "with" guarantees the output is flushed and closed on the early return
    # below and on any exception, which the bare open()/close() pair did not.
    with open(outFN, 'w') as f:
        while NX.nextID():
            x = math.log(NX.eLevel, 10) if logHeight else NX.eLevel

            pVal = NX.pValBin
            if pVal < 0:
                continue
            if pVal == 0.0:
                pVal = float("1.0e-100")

            try:
                y = -math.log(pVal, 10)
            except ValueError:
                print('%s %s' % (x, pVal))
                return

            f.write('%s\t%s\n' % (x, y))
def testConsolidation(oFN, oFF, frameLength):
    '''Print a sanity report for getSimilarORNASets.

    First prints the symmetric difference between all oRNA ids and the ids
    that appear in any consolidated set, then prints every set followed by
    the id and sequence of each of its members.
    '''
    dataNX = Nexus(oFN, oFF)
    dataNX.load(['sequence'])

    oID_sequence = dataNX.createMap('id', 'sequence')
    consolidatedSets = getSimilarORNASets(oID_sequence, frameLength)

    #check if all oIDs are in set
    # built directly as a set rather than through a side-effect comprehension
    allConsolidatedIDs = set(oID for theSet in consolidatedSets for oID in theSet)
    oIDsSet = set(oID_sequence.keys())
    print("DIFFERENCE")
    print(oIDsSet.symmetric_difference(allConsolidatedIDs))

    #print out sets to verify that they work
    for oIDSet in consolidatedSets:
        print('')
        print(oIDSet)
        for oID in oIDSet:
            print('%s %s' % (oID, oID_sequence[oID]))
def cleanForSNR(dataFN, oFF):
    '''Flag each record as snrClean or not.

    Within every sibling set of more than one member, only the member with
    the largest (numUFBS, id) pair is kept; all others are "unused".
    A record is snrClean unless it is an unused sibling or has fewer than
    10 unique simulations (numUniqueSims < 10).
    '''
    dataNX = Nexus(dataFN, oFF)
    dataNX.load(['numUniqueSims', 'numUFBS', 'snrClean', 'siblingSet'])

    id_numUFBS = dataNX.createMap('id', 'numUFBS')
    id_siblingSet = dataNX.createMap('id', 'siblingSet')

    # A set gives O(1) membership tests in the tagging loop below and absorbs
    # the duplicates produced when the same sibling set is seen once per member.
    unusedSiblings = set()
    for oID, siblingSet in id_siblingSet.items():
        #NOTE: oRNA IDs are in their own sibling set
        # <= 1 also skips an empty sibling set, which would otherwise have
        # nothing to keep (pop() on an empty list raised IndexError).
        if len(siblingSet) <= 1:
            continue

        ranked = sorted((id_numUFBS[sibID], sibID) for sibID in siblingSet)
        # the last (highest) entry is the one we keep; the rest are unused
        unusedSiblings.update(pair[1] for pair in ranked[:-1])

    #tag unclean oRNA
    while dataNX.nextID():
        if (dataNX.id in unusedSiblings) or (dataNX.numUniqueSims < 10):
            dataNX.snrClean = False
        else:
            dataNX.snrClean = True

    dataNX.save()
def updateSequence(oFN, oFF, extend, assembly):
    '''Replace each record's sequence with the one fetched for its widened tcc.

    The tcc range is widened by `extend` on both sides before it is handed
    to GenomeFetch for the given assembly.
    '''
    NX = Nexus(oFN, oFF)
    NX.load(['sequence', 'tcc'])

    fetcher = GenomeFetch.GenomeFetch(assembly)
    while NX.nextID():
        chrom, strand, start, end = bioLibCG.tccSplit(NX.tcc)
        widenedTcc = bioLibCG.makeTcc(chrom, strand, start - extend, end + extend)
        NX.sequence = fetcher.getSequence(widenedTcc)

    NX.save()
def updateELevel2(dFN, dForm, wigDir):
    '''Set eLevel to the maximum value of each record's expression profile.

    Dont need to do it by chromosome because it is small enough.
    Also dont need to flip the strand because the wig is opposite as well.
    '''
    NX = Nexus(dFN, dForm)
    NX.load(['tcc', 'eLevel'])

    wigDict = cgWig.loadWigDictFloat(wigDir)
    while NX.nextID():
        profile = cgWig.getExpressionProfile(NX.tcc, wigDict)
        NX.eLevel = max(profile.values())

    NX.save()
def updateScores(fN, fFN):
    '''Store calculateAlignmentScore(...) in score for every record.'''
    nexus = Nexus(fN, fFN)
    nexus.load(['numNormMatches', 'numGUs', 'numMismatches', 'numQGaps',
                'numRGaps', 'numExtensionsQ', 'numExtensionsR', 'score'])

    while nexus.nextID():
        matches = nexus.numNormMatches
        gus = nexus.numGUs
        mismatches = nexus.numMismatches
        qGaps = nexus.numQGaps
        rGaps = nexus.numRGaps
        qExtensions = nexus.numExtensionsQ
        rExtensions = nexus.numExtensionsR
        nexus.score = calculateAlignmentScore(matches, gus, mismatches,
                                              qGaps, rGaps,
                                              qExtensions, rExtensions)

    nexus.save()
def filterCenterProperties(fN, fFN):
    '''Compute mismatchPass and centerPass for every alignment record.'''
    nexus = Nexus(fN, fFN)
    nexus.load([
        'query', 'reference', 'qStart', 'qEnd', 'rStart', 'rEnd', 'qLen',
        'rLen', 'sigMask', 'centerPass', 'mismatchPass'
    ])

    while nexus.nextID():
        queryRange = [nexus.qStart, nexus.qEnd]
        referenceRange = [nexus.rStart, nexus.rEnd]

        nexus.mismatchPass = checkMismatchCenter(
            nexus.query, nexus.reference, nexus.qLen, nexus.rLen,
            queryRange, referenceRange, nexus.sigMask)
        nexus.centerPass = checkPeakCenter(
            nexus.query, nexus.reference, nexus.qLen, nexus.rLen,
            queryRange, referenceRange)

    nexus.save()
def updateSimSeqsForUnique(oFN, oFF, seqFN):
    '''Overwrite each record's sequence from a tab-separated id/sequence file.

    seqFN holds one "id<TAB>sequence" pair per line; the id is parsed as an
    int. Records whose id is absent from the file get '.' as sequence.
    '''
    NX = Nexus(oFN, oFF)
    NX.load(['sequence'])

    id_seq = {}
    # "with" closes the file even when a malformed line raises
    with open(seqFN, 'r') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # tolerate blank lines (e.g. a trailing newline at EOF)
                continue
            ls = stripped.split('\t')
            id_seq[int(ls[0])] = ls[1]

    while NX.nextID():
        NX.sequence = id_seq.get(NX.id, '.')

    NX.save()
def calculateTotalSNR(dataFN, oFF, mm, iSNRCutoff):
    '''Print one tab-separated summary line of the pooled signal-to-noise.

    Only records with snr >= iSNRCutoff are counted. The printed columns are:
    mm, iSNRCutoff, total data numUFBS, average simulated UFBS
    (totalNumUFBSSim sum / numUniqueSims sum), total SNR (average data UFBS
    divided by average simulated UFBS), number of passing oRNA, and average
    data UFBS per passing oRNA. When any of the divisions hits a zero
    denominator the two ratio columns are printed as "NA".
    '''
    mm = str(mm)
    dataNX = Nexus(dataFN, oFF)
    dataNX.load(['snr', 'numUFBS', 'numUniqueSims', 'totalNumUFBSSim'])

    #NOTE sum of avgs != total avg
    #get total numUFBS for data and simulation total,num unique sims
    totalUFBSData = 0.0
    totalUFBSSim = 0.0
    totalPassingORNA = 0
    totalUniqueSims = 0
    # The SNR cutoff is applied in the same pass that accumulates the totals,
    # so no separate set of low-SNR ids has to be built first.
    while dataNX.nextID():
        if dataNX.snr < iSNRCutoff:
            continue
        totalUFBSData += dataNX.numUFBS
        totalPassingORNA += 1
        totalUFBSSim += dataNX.totalNumUFBSSim
        totalUniqueSims += dataNX.numUniqueSims

    totalAvgSimUFBS, totalAvgDataUFBS = 0.0, 0.0
    try:
        totalAvgSimUFBS = totalUFBSSim / float(totalUniqueSims)
        totalAvgDataUFBS = totalUFBSData / float(totalPassingORNA)
        totalSNR = totalAvgDataUFBS / totalAvgSimUFBS
        columns = [mm, iSNRCutoff, totalUFBSData, totalAvgSimUFBS, totalSNR,
                   totalPassingORNA, totalAvgDataUFBS]
    except ZeroDivisionError:
        # totalAvgSimUFBS keeps whatever was computed before the failure
        columns = [mm, iSNRCutoff, totalUFBSData, totalAvgSimUFBS, "NA",
                   totalPassingORNA, "NA"]

    print('\t'.join([str(x) for x in columns]))
def updateRepeatStatus(fN, fF, wigDir, chrom, strand):
    '''Mark records on chrom/strand as repeat when they touch the REPEAT wig.

    A record whose tcc chromosome or strand differs from the arguments is
    skipped. For the others, repeat becomes True when any coordinate of the
    inclusive [start, end] range is a key of the loaded wig, else False.
    '''
    #load oRNAs
    NX = Nexus(fN, fF)
    NX.load(['repeat', 'tcc'])

    #load wig file for chrom, strand
    coord_value = cgWig.loadSingleWig(wigDir, chrom, strand, 'REPEAT')

    while NX.nextID():
        oChrom, oStrand, start, end = bioLibCG.tccSplit(NX.tcc)
        if not (oChrom == chrom and oStrand == strand):
            continue
        NX.repeat = any(pos in coord_value for pos in range(start, end + 1))

    NX.save()
def updateAdjustedMismatches(fN, fF, guValue = .5, otherValue = 1.0):
    '''Store a weighted mismatch count derived from sigMask.

    Each 'G' in the mask contributes guValue and each 'X' contributes
    otherValue to adjustedNumMismatches.
    '''
    NX = Nexus(fN, fF)
    NX.load(['sigMask', 'adjustedNumMismatches'])

    while NX.nextID():
        sigMask = NX.sigMask
        guPenalty = sigMask.count('G') * guValue
        otherPenalty = sigMask.count('X') * otherValue
        NX.adjustedNumMismatches = guPenalty + otherPenalty

    NX.save()
def testQuickLoad(fN):
    '''Build a Nexus from an in-line format list, overwrite three fields, save.'''
    formatLines = [
        '1 geneName string .',
        '3 otherIDs intList .',
        '4 isCoding bool F',
    ]
    nexus = Nexus(fN, formatLines)

    while nexus.nextID():
        nexus.isCoding = True
        nexus.otherIDs = range(18)
        nexus.geneName = "testAuto"

    nexus.save()
def pickBestAlignment(fN, fFN):
    '''Set best = True on the highest-scoring record of each (sID, dID) pair.

    Only scores strictly greater than 0.0 can win, and on ties the first
    record seen keeps the title. Records that do not win are not modified.
    '''
    NX = Nexus(fN, fFN)
    NX.load(['sID', 'dID', 'score', 'best'])

    #find best id
    topScoreByPair = {}
    topIDByPair = {}
    while NX.nextID():
        pairKey = '%s_%s' % (NX.sID, NX.dID)
        candidate = NX.score
        if candidate > topScoreByPair.get(pairKey, 0.0):
            topScoreByPair[pairKey] = candidate
            topIDByPair[pairKey] = NX.id

    # update best id
    winners = set(topIDByPair.values())
    while NX.nextID():
        if NX.id in winners:
            NX.best = True

    NX.save()
def updateGeneName(dFN, fFN, wigDir, chrom, strand, prefix, switchStrand = False):
    '''Set geneNames on every record from a transcript wig lookup.

    dFN, fFN     -- data file and format file handed to Nexus
    wigDir, chrom, strand, prefix
                 -- forwarded to cgWig.loadSingleWigTranscript
    switchStrand -- when true the strand is negated before loading the wig

    For each record the start coordinate of its tcc is looked up in the
    loaded wig. A value of "NONE" yields an empty list; anything else
    (including the "." default for a missing coordinate) is split on commas.
    '''
    NX = Nexus(dFN, fFN)
    NX.load(['geneNames', 'tcc'])

    if switchStrand:
        try:
            strand = -strand
        except TypeError:
            # strand was passed as a string such as '1'; negate its int value
            strand = -int(strand)
    strand = str(strand)

    coord_gName = cgWig.loadSingleWigTranscript(wigDir, chrom, strand, prefix)

    while NX.nextID():
        # NOTE(review): records are not filtered by chrom/strand here, so the
        # input presumably holds a single chrom/strand already -- confirm.
        # Separate names keep the chrom/strand parameters intact.
        recChrom, recStrand, start, end = bioLibCG.tccSplit(NX.tcc)
        overlappingGenes = coord_gName.get(start, ".")
        if overlappingGenes == "NONE":
            NX.geneNames = []
        else:
            NX.geneNames = overlappingGenes.split(',')

    NX.save()
def calculateTotalSNR(dataFN, oFF, mm, iSNRCutoff):
    '''Print one tab-separated summary line of the pooled signal-to-noise.

    Only records with snr >= iSNRCutoff are counted. The printed columns are:
    mm, iSNRCutoff, total data numUFBS, average simulated UFBS
    (totalNumUFBSSim sum / numUniqueSims sum), total SNR (average data UFBS
    divided by average simulated UFBS), number of passing oRNA, and average
    data UFBS per passing oRNA. When any of the divisions hits a zero
    denominator the two ratio columns are printed as "NA".
    '''
    mm = str(mm)
    dataNX = Nexus(dataFN, oFF)
    dataNX.load(['snr', 'numUFBS', 'numUniqueSims', 'totalNumUFBSSim'])

    #NOTE sum of avgs != total avg
    #get total numUFBS for data and simulation total,num unique sims
    totalUFBSData = 0.0
    totalUFBSSim = 0.0
    totalPassingORNA = 0
    totalUniqueSims = 0
    # The SNR cutoff is applied in the same pass that accumulates the totals,
    # so no separate set of low-SNR ids has to be built first.
    while dataNX.nextID():
        if dataNX.snr < iSNRCutoff:
            continue
        totalUFBSData += dataNX.numUFBS
        totalPassingORNA += 1
        totalUFBSSim += dataNX.totalNumUFBSSim
        totalUniqueSims += dataNX.numUniqueSims

    totalAvgSimUFBS, totalAvgDataUFBS = 0.0, 0.0
    try:
        totalAvgSimUFBS = totalUFBSSim / float(totalUniqueSims)
        totalAvgDataUFBS = totalUFBSData / float(totalPassingORNA)
        totalSNR = totalAvgDataUFBS / totalAvgSimUFBS
        columns = [mm, iSNRCutoff, totalUFBSData, totalAvgSimUFBS, totalSNR,
                   totalPassingORNA, totalAvgDataUFBS]
    except ZeroDivisionError:
        # totalAvgSimUFBS keeps whatever was computed before the failure
        columns = [mm, iSNRCutoff, totalUFBSData, totalAvgSimUFBS, "NA",
                   totalPassingORNA, "NA"]

    print('\t'.join([str(x) for x in columns]))
def updateTargetIDs(oFN, oFF, aFN, aFF):
    '''Append every alignment id to the filteredTargets of its sID record.'''
    oNX = Nexus(oFN, oFF)
    oNX.load(['filteredTargets'])

    alignments = Nexus(aFN, aFF)
    alignments.load(['sID'])

    while alignments.nextID():
        # select the record named by the alignment's sID, then extend its list
        oNX.id = alignments.sID
        oNX.filteredTargets.append(alignments.id)

    oNX.save()
def updateNumUFBS(oFN, oFF, aFN, aFF):
    '''Set numUFBS to the number of distinct sigMasks among filteredTargets.

    When the alignment file aFN is empty (size 0) every record gets 0.
    '''
    oNX = Nexus(oFN, oFF)
    oNX.load(['filteredTargets', 'numUFBS'])

    if os.path.getsize(aFN) == 0:
        #just give it some blanks
        while oNX.nextID():
            oNX.numUFBS = 0
    else:
        aNX = Nexus(aFN, aFF)
        aNX.load(['sigMask'])

        while oNX.nextID():
            distinctMasks = set()
            for targetID in oNX.filteredTargets:
                aNX.id = targetID
                distinctMasks.add(aNX.sigMask)
            oNX.numUFBS = len(distinctMasks)

    oNX.save()
def linkTargetIDs(oFN, oFF, aFN, aFF):
    '''Assign to each sID record the list of alignment ids that reference it.

    When the alignment file aFN is empty (size 0) every record gets an
    empty filteredTargets list instead.
    '''
    oNX = Nexus(oFN, oFF)
    oNX.load(['filteredTargets'])

    if os.path.getsize(aFN) == 0:
        #just give it some blanks
        while oNX.nextID():
            oNX.filteredTargets = []
    else:
        aNX = Nexus(aFN, aFF)
        aNX.load(['sID'])

        # third argument False: one sID maps to several alignment ids
        sID_aIDs = aNX.createMap('sID', 'id', False)
        for sourceID, alignmentIDs in sID_aIDs.iteritems():
            oNX.id = sourceID
            oNX.filteredTargets = alignmentIDs

    oNX.save()
def testPeaks(degFN, dForm, allGeneInfo, gForm, switchStrand = False):
    '''Store a binomial survival p-value (pValBin) for every peak record.

    For each gene named on a peak, binom.sf(eLevel, numReads + 1,
    1.0 / (numSpots + 1)) is evaluated; the largest value over all genes is
    stored, or -1.0 when no gene produced one. A gene name missing from the
    gene info is retried with a '_RE_<chrom>_<strand>' suffix and reported
    on stdout when that is missing too.
    '''
    #load/configure gene Info
    gNX = Nexus(allGeneInfo, gForm)
    gNX.load(['geneName', 'numReads', 'numSpots'])

    gName_numReads = {}
    gName_numSpots = {}
    while gNX.nextID():
        gName_numReads[gNX.geneName] = gNX.numReads
        gName_numSpots[gNX.geneName] = gNX.numSpots

    #load degFN info
    dNX = Nexus(degFN, dForm)
    dNX.load(['tcc', 'eLevel', 'geneNames', 'pValBin'])

    while dNX.nextID():
        readsForPeak = dNX.eLevel
        chrom, strand, start, end = bioLibCG.tccSplit(dNX.tcc)
        if switchStrand:
            strand = -int(strand)

        pVals = []
        for gName in dNX.geneNames:
            #may have to change gene name cuz of multiple spans
            # both dicts are filled with the same keys, so one check suffices
            if gName not in gName_numReads:
                gName = gName + '_RE_%s_%s' % (chrom, strand)
                if gName not in gName_numReads:
                    print('%s %s' % ("FIX THIS GENE NAME", gName))
                    continue

            #add psuedocount
            totGeneReads = gName_numReads[gName] + 1
            numSpotsForGene = gName_numSpots[gName] + 1  # not sure whether to do this yet...

            #check for hidden intron gene overlap
            try:
                q = 1.0 / numSpotsForGene
            except ZeroDivisionError:
                continue  #intron gene

            #add p val
            pVals.append(binom.sf(readsForPeak, totGeneReads, q))

        if pVals:
            dNX.pValBin = max(pVals)
        else:
            dNX.pValBin = -1.0

    dNX.save()
def updateAvgSS(dataFN, oFF, simDir, simBase, mm, numSims=100):
    '''Write per-record simulation statistics gathered from numSims runs.

    Simulation files are read from
    '<simDir>/simulation.<i>/<simBase>.<mm>' for i in range(numSims). For
    each id a simulated sequence is counted only the first time it is seen.
    Each data record then receives numUniqueSims (count of counted sims),
    totalNumUFBSSim (sum of their numUFBS) and avgNumSimSS (their mean, or
    -1 when there were none).
    '''
    #get simulation information (# unique sims, num UFBS)
    sID_numSimUFBS = {}
    sID_simSeqs = {}
    for simIndex in range(numSims):
        simFN = '%s/simulation.%s/%s.%s' % (simDir, simIndex, simBase, mm)
        simNX = Nexus(simFN, oFF)
        simNX.load(['numUFBS', 'sequence'])

        while simNX.nextID():
            seenSeqs = sID_simSeqs.setdefault(simNX.id, set())
            if simNX.sequence not in seenSeqs:
                sID_numSimUFBS.setdefault(simNX.id, []).append(simNX.numUFBS)
                seenSeqs.add(simNX.sequence)
            #else: dont count again

    #update data based on sim info
    dataNX = Nexus(dataFN, oFF)
    dataNX.load(['avgNumSimSS', 'numUniqueSims', 'totalNumUFBSSim'])

    while dataNX.nextID():
        simUFBS = sID_numSimUFBS.get(dataNX.id, [])
        numUniqueSims = len(simUFBS)
        totalSimUFBS = sum(simUFBS)

        if numUniqueSims != 0:
            avgSimUFBS = totalSimUFBS / float(numUniqueSims)
        else:
            avgSimUFBS = -1

        dataNX.avgNumSimSS = avgSimUFBS
        dataNX.numUniqueSims = numUniqueSims
        dataNX.totalNumUFBSSim = totalSimUFBS

    dataNX.save()
def updateAvgSS(dataFN, oFF, simDir, simBase, mm, numSims = 100):
    '''Summarise simulation runs into avgNumSimSS, numUniqueSims, totalNumUFBSSim.

    Reads '<simDir>/simulation.<i>/<simBase>.<mm>' for every i below numSims
    and keeps, per id, the numUFBS of each simulated sequence the first time
    that sequence occurs. The data file then gets the count, the sum and the
    mean of those values per record; the mean is -1 when the count is zero.
    '''
    #get simulation information (# unique sims, num UFBS)
    simPaths = ['%s/simulation.%s/%s.%s' % (simDir, i, simBase, mm) for i in range(numSims)]

    ufbsByID = {}
    sequencesByID = {}
    for simPath in simPaths:
        oNX = Nexus(simPath, oFF)
        oNX.load(['numUFBS', 'sequence'])
        while oNX.nextID():
            alreadySeen = oNX.sequence in sequencesByID.get(oNX.id, set())
            if alreadySeen:
                continue  #dont count again
            ufbsByID.setdefault(oNX.id, []).append(oNX.numUFBS)
            sequencesByID.setdefault(oNX.id, set()).add(oNX.sequence)

    #update data based on sim info
    dataNX = Nexus(dataFN, oFF)
    dataNX.load(['avgNumSimSS', 'numUniqueSims', 'totalNumUFBSSim'])
    while dataNX.nextID():
        counted = ufbsByID.get(dataNX.id, [])
        uniqueCount = len(counted)
        ufbsTotal = sum(counted)

        dataNX.avgNumSimSS = ufbsTotal / float(uniqueCount) if uniqueCount != 0 else -1
        dataNX.numUniqueSims = uniqueCount
        dataNX.totalNumUFBSSim = ufbsTotal

    dataNX.save()