def check_ORNA_in_ago(oFN, oFF, agoFN, clippingAmount = 1): NX = Nexus(oFN, oFF) NX.load(['sequence', 'geneNames']) #make truncated sequences id_sequence = NX.createMap('id', 'sequence') if clippingAmount > 0: id_sequence = dict( (i, j[clippingAmount:-clippingAmount]) for i,j in id_sequence.items()) #get fastq sequences agoF = open(agoFN, 'r') agoSeqs = [] while True: fPacket = nextFilePacket(agoF, 4) if not fPacket: break agoSeqs.append(fPacket[1]) agoF.close() #count for each oRNA id_count = {} for id, seq in id_sequence.items(): for agoSeq in agoSeqs: if seq in agoSeq: id_count[id] = id_count.get(id, 0) + 1 #out totalCount = 0 for id, count in id_count.items(): NX.id = id print '%s\t%s\t%s' % (id, count, NX.geneNames) totalCount += count print totalCount
def check_ORNA_in_ago(oFN, oFF, agoFN, clippingAmount=1): NX = Nexus(oFN, oFF) NX.load(['sequence', 'geneNames']) #make truncated sequences id_sequence = NX.createMap('id', 'sequence') if clippingAmount > 0: id_sequence = dict((i, j[clippingAmount:-clippingAmount]) for i, j in id_sequence.items()) #get fastq sequences agoF = open(agoFN, 'r') agoSeqs = [] while True: fPacket = nextFilePacket(agoF, 4) if not fPacket: break agoSeqs.append(fPacket[1]) agoF.close() #count for each oRNA id_count = {} for id, seq in id_sequence.items(): for agoSeq in agoSeqs: if seq in agoSeq: id_count[id] = id_count.get(id, 0) + 1 #out totalCount = 0 for id, count in id_count.items(): NX.id = id print '%s\t%s\t%s' % (id, count, NX.geneNames) totalCount += count print totalCount
def testMap(fN, fF): NX = Nexus(fN, fF) NX.load(['geneName', 'numReads', 'otherIDs']) geneName_numReads = NX.createMap('otherIDs', 'geneName', False) #not 1to1 for k,v in geneName_numReads.iteritems(): print k, v[:5] return
def cleanForSNR(dataFN, oFF):
    """Tag each oRNA record's snrClean flag.

    A record is marked unclean (snrClean = False) when it is either a
    discarded sibling — i.e. not the member of its sibling set with the
    highest numUFBS — or has fewer than 10 unique sims; otherwise it is
    marked clean.  Saves the dataset when done.
    """
    dataNX = Nexus(dataFN, oFF)
    dataNX.load(['numUniqueSims', 'numUFBS', 'snrClean', 'siblingSet'])
    id_numUFBS = dataNX.createMap('id', 'numUFBS')
    id_siblingSet = dataNX.createMap('id', 'siblingSet')

    # PERF FIX: membership is tested once per record below, so collect
    # the discarded siblings in a set (O(1) lookup) instead of a list.
    unusedSiblings = set()
    for oID, siblingSet in id_siblingSet.iteritems():
        if len(siblingSet) == 1:
            continue  # NOTE: oRNA IDs are in their own sibling set
        # Sort siblings by numUFBS; keep only the highest, discard the rest.
        numUFBS__id = [(id_numUFBS[x], x) for x in siblingSet]
        numUFBS__id.sort()
        numUFBS__id.pop()  # drop the one we're keeping from the discard list
        unusedSiblings.update(x[1] for x in numUFBS__id)

    # Tag unclean oRNA.
    while dataNX.nextID():
        if (dataNX.id in unusedSiblings) or (dataNX.numUniqueSims < 10):
            dataNX.snrClean = False
        else:
            dataNX.snrClean = True
    # NOTE(review): source formatting was collapsed; save() reconstructed
    # as a single call after the loop — confirm against Nexus semantics.
    dataNX.save()
def updateSimilarSiblings(oFN, oFF, frameLength):
    """Recompute the sibling sets of similar oRNAs (via
    getSimilarORNASets) and store each set on all of its members, then
    save the dataset."""
    dataNX = Nexus(oFN, oFF)
    dataNX.load(['sequence', 'siblingSet'])
    seqByID = dataNX.createMap('id', 'sequence')
    for siblingGroup in getSimilarORNASets(seqByID, frameLength):
        for memberID in siblingGroup:
            dataNX.id = memberID
            # Fresh list per member, mirroring the original assignment.
            dataNX.siblingSet = list(siblingGroup)
    dataNX.save()
def testConsolidation(oFN, oFF, frameLength): dataNX = Nexus(oFN, oFF) dataNX.load(['sequence']) oID_sequence = dataNX.createMap('id', 'sequence') consolidatedSets = getSimilarORNASets(oID_sequence, frameLength) #check if all oIDs are in set allConsolidatedIDs = set() [allConsolidatedIDs.add(x) for theSet in consolidatedSets for x in theSet] oIDsSet = set(oID_sequence.keys()) print "DIFFERENCE" print oIDsSet.symmetric_difference(allConsolidatedIDs) #check Duplicates #print out sets to verify that they work for oIDSet in consolidatedSets: print print oIDSet for oID in oIDSet: print oID, oID_sequence[oID]
def linkTargetIDs(oFN, oFF, aFN, aFF):
    """Attach filtered target IDs from the aNX dataset onto the matching
    oRNA records (matched by sID) and save.  An empty aFN file means no
    targets anywhere, so every record gets an empty list."""
    oNX = Nexus(oFN, oFF)
    oNX.load(['filteredTargets'])

    # Empty alignment file: just give every record a blank target list.
    if os.path.getsize(aFN) == 0:
        while oNX.nextID():
            oNX.filteredTargets = []
        oNX.save()
        return

    aNX = Nexus(aFN, aFF)
    aNX.load(['sID'])
    # Final False => one sID may collect several target ids.
    sID_aIDs = aNX.createMap('sID', 'id', False)
    for sID, aIDs in sID_aIDs.iteritems():
        oNX.id = sID
        oNX.filteredTargets = aIDs
    oNX.save()