def countBySeed(editFN, microFN, flankAmount, outFN):
        '''Count microRNA seed matches around each editing site, before and after editing.

        For every editing site loaded from editFN, the genomic sequence from
        coordinate - flankAmount to coordinate + flankAmount is fetched (hg19).
        An "edited" copy is built by replacing the character at index
        flankAmount (the centre of the window) with 'G'.  Each microRNA whose
        reverse-complemented seed occurs in the unedited window is appended to
        eSite.microTargets and has numBefore incremented; each one occurring in
        the edited window is appended to eSite.eMicroTargets and has numAfter
        incremented.

        editFN      -- editing sites file, read by cgEdit.loadEditingSites
        microFN     -- microRNA file, read by cgMicroRNA.loadMicroRNAFromTargetScan ('hsa')
        flankAmount -- window half-width; converted with int().  The original
                       note says the flank amount should be +/- 6.
        outFN       -- output path; one tab-separated line per site:
                       ID, chrom:coord, strand, targets, edited targets
                       ('None' when a target list is empty)

        Sites matched by 'hsa-miR-330-5p' are also printed to stdout.
        '''
        flankAmount = int(flankAmount)

        eSites = cgEdit.loadEditingSites(editFN)
        micros = cgMicroRNA.loadMicroRNAFromTargetScan(microFN, 'hsa')
        gf = GenomeFetch.GenomeFetch('hg19')

        # microRNA whose hits are echoed to stdout for manual inspection.
        checkID = 'hsa-miR-330-5p'

        # The complemented seed depends only on the microRNA, so compute it once
        # instead of once per (site, microRNA) pair.
        seedPairs = [(microRNA, cgSeqMod.reverseComplementSequence(microRNA.seed, True))
                     for microRNA in micros]

        for eSite in eSites:

                chrom = eSite.chromosome
                coord = eSite.coordinate
                strand = eSite.strand

                flankingSeq = gf.get_seq_from_to(chrom, coord - flankAmount, coord + flankAmount, strand)
                eFlankingSeq = flankingSeq[:flankAmount] + 'G' + flankingSeq[flankAmount + 1:]

                # str.replace returns a new string; the original discarded the
                # result, so the T -> U conversion silently never happened.
                # NOTE(review): this assumes reverseComplementSequence(seed, True)
                # yields an RNA-alphabet seed -- confirm against cgSeqMod.
                flankingSeq = flankingSeq.replace('T', 'U')
                eFlankingSeq = eFlankingSeq.replace('T', 'U')

                for microRNA, comSeed in seedPairs:

                        if comSeed in flankingSeq:
                                eSite.microTargets.append(microRNA.id)
                                if microRNA.id == checkID:
                                        # Single-argument print: same output on Python 2, valid on Python 3.
                                        print('%s:%s %s %s' % (eSite.chromosome, eSite.coordinate, eSite.strand, eSite.gene))
                                microRNA.numBefore += 1

                        if comSeed in eFlankingSeq:
                                microRNA.numAfter += 1
                                if microRNA.id == checkID:
                                        print('%s:%s %s %s' % (eSite.chromosome, eSite.coordinate, eSite.strand, eSite.gene))
                                eSite.eMicroTargets.append(microRNA.id)

        # Write contents to file; the with-block guarantees the handle is
        # flushed and closed even if a write fails.
        with open(outFN, 'w') as outF:
                for eSite in eSites:
                        targets = ','.join(eSite.microTargets) if len(eSite.microTargets) > 0 else 'None'
                        eTargets = ','.join(eSite.eMicroTargets) if len(eSite.eMicroTargets) > 0 else 'None'

                        outF.write('%s\t%s:%s\t%s\t%s\t%s\n' % (eSite.ID, eSite.chromosome, eSite.coordinate, eSite.strand, targets, eTargets))
def updateMicroTargets(editFN, microFN, flankAmount, outFN):
        '''Record which microRNA seeds match around each editing site and write them out.

        For every editing site loaded from editFN, the genomic sequence from
        coordinate - flankAmount to coordinate + flankAmount is fetched (hg19).
        An "edited" copy is built by replacing the character at index
        flankAmount (the centre of the window) with 'G'.  microRNAs whose
        reverse-complemented seed occurs in the unedited window are appended to
        eSite.microTargets; those occurring in the edited window are appended
        to eSite.eMicroTargets.  Every hit is also printed to stdout, prefixed
        with '@ flank' or '@ eFlank'.

        editFN      -- editing sites file, read by cgEdit.loadEditingSites
        microFN     -- microRNA file, read by cgMicroRNA.loadMicroRNAFromTargetScan ('hsa')
        flankAmount -- window half-width; converted with int().  The original
                       note says the flank amount should be +/- 6.
        outFN       -- output path; one tab-separated line per site:
                       ID, chrom:coord, strand, targets, edited targets
                       ('None' when a target list is empty)
        '''
        flankAmount = int(flankAmount)

        eSites = cgEdit.loadEditingSites(editFN)
        micros = cgMicroRNA.loadMicroRNAFromTargetScan(microFN, 'hsa')
        gf = GenomeFetch.GenomeFetch('hg19')

        # The complemented seed depends only on the microRNA, so compute it once
        # instead of once per (site, microRNA) pair.
        seedPairs = [(microRNA, cgSeqMod.reverseComplementSequence(microRNA.seed, True))
                     for microRNA in micros]

        for eSite in eSites:

                chrom = eSite.chromosome
                coord = eSite.coordinate
                strand = eSite.strand

                # NOTE(review): the fetched sequence is compared to the seed as-is;
                # no T/U alphabet conversion is applied here -- confirm that the
                # seed and the genome sequence use the same alphabet.
                flankingSeq = gf.get_seq_from_to(chrom, coord - flankAmount, coord + flankAmount, strand)
                eFlankingSeq = flankingSeq[:flankAmount] + 'G' + flankingSeq[flankAmount + 1:]

                for microRNA, comSeed in seedPairs:

                        if comSeed in flankingSeq:
                                eSite.microTargets.append(microRNA.id)
                                # Single-argument print: same output on Python 2, valid on Python 3.
                                print('@ flank %s %s %s %s' % (eSite.ID, microRNA.id, flankingSeq, comSeed))

                        if comSeed in eFlankingSeq:
                                eSite.eMicroTargets.append(microRNA.id)
                                print('@ eFlank %s %s %s %s' % (eSite.ID, microRNA.id, eFlankingSeq, comSeed))

        # Write contents to file; the with-block guarantees the handle is
        # flushed and closed even if a write fails.
        with open(outFN, 'w') as outF:
                for eSite in eSites:
                        targets = ','.join(eSite.microTargets) if len(eSite.microTargets) > 0 else 'None'
                        eTargets = ','.join(eSite.eMicroTargets) if len(eSite.eMicroTargets) > 0 else 'None'

                        outF.write('%s\t%s:%s\t%s\t%s\t%s\n' % (eSite.ID, eSite.chromosome, eSite.coordinate, eSite.strand, targets, eTargets))
示例#3
0
    def getMRNA(self, coding=False):
        '''Return this transcript's spliced sequence as an RNA string (T replaced by U).

        coding -- when False (default) the sequence spans the start/end given
                  by self.tcc; when True it is trimmed to the region between
                  the last utr5 and last utr3 entries, falling back to the tcc
                  start/end when a UTR list has no usable entry (IndexError).

        The bounds plus every exon coordinate lying strictly between them are
        sorted and paired off (1st with 2nd, 3rd with 4th, ...); each pair is
        fetched from hg19 and the pieces are concatenated.  For strand '-1'
        the result is reverse-complemented.  Returns None when self.strand is
        neither '1' nor '-1'.
        '''
        if self.strand not in ('1', '-1'):
            # Same result as the original fall-through, but without building a
            # genome fetcher that would never be used.
            return None
        isForward = self.strand == '1'

        def tccBounds():
            # int() makes both paths agree: the original cast only in the
            # IndexError fallbacks and used the raw tccSplit values otherwise.
            fields = bioLibCG.tccSplit(self.tcc)
            return int(fields[2]), int(fields[3])

        if coding:
            # In ascending genomic order the lower bound sits next to utr5 on
            # the '1' strand and next to utr3 on the '-1' strand.
            # NOTE(review): assumes each UTR entry is a (start, end) pair; the
            # original comments flag this layout as unverified.
            if isForward:
                lowUtr, highUtr = self.utr5, self.utr3
            else:
                lowUtr, highUtr = self.utr3, self.utr5
            try:
                lowBound = lowUtr[-1][1] + 1  #region starts right after the UTR
            except IndexError:
                lowBound = tccBounds()[0]
            try:
                highBound = highUtr[-1][0] - 1  #region ends right before the UTR
            except IndexError:
                highBound = tccBounds()[1]
        else:
            lowBound, highBound = tccBounds()

        coordinates = [lowBound, highBound]
        for exon in self.exonList:
            for exonCoord in exon:
                if lowBound < exonCoord < highBound:
                    coordinates.append(exonCoord)

        #walk the sorted coordinates two at a time; an unpaired trailing
        #coordinate is dropped, as in the original index arithmetic
        coordinates.sort()
        segments = zip(coordinates[0::2], coordinates[1::2])

        gf = GenomeFetch.GenomeFetch('hg19')
        mRNAList = [
            gf.get_seq_from_to(self.chromosome, segStart + 1, segEnd + 1)  #gf is 1-based
            for segStart, segEnd in segments
        ]

        mRNA = ''.join(mRNAList)
        if not isForward:
            mRNA = cgSeqMod.reverseComplementSequence(mRNA, False)
        return mRNA.replace('T', 'U')
示例#4
0
    def getMRNA(self, coding=False):
        """Return this transcript's spliced sequence as an RNA string (T replaced by U).

        coding -- when False (default) the sequence spans the start/end given
                  by self.tcc; when True it is trimmed to the region between
                  the last utr5 and last utr3 entries, falling back to the tcc
                  start/end when a UTR list has no usable entry (IndexError).

        The bounds plus every exon coordinate lying strictly between them are
        sorted and paired off (1st with 2nd, 3rd with 4th, ...); each pair is
        fetched from hg19 and the pieces are concatenated.  For strand "-1"
        the result is reverse-complemented.  Returns None when self.strand is
        neither "1" nor "-1".
        """
        if self.strand not in ("1", "-1"):
            # Same result as the original fall-through, but without building a
            # genome fetcher that would never be used.
            return None
        isForward = self.strand == "1"

        def tccBounds():
            # int() makes both paths agree: the original cast only in the
            # IndexError fallbacks and used the raw tccSplit values otherwise.
            fields = bioLibCG.tccSplit(self.tcc)
            return int(fields[2]), int(fields[3])

        if coding:
            # In ascending genomic order the lower bound sits next to utr5 on
            # the "1" strand and next to utr3 on the "-1" strand.
            # NOTE(review): assumes each UTR entry is a (start, end) pair; the
            # original comments flag this layout as unverified.
            if isForward:
                lowUtr, highUtr = self.utr5, self.utr3
            else:
                lowUtr, highUtr = self.utr3, self.utr5
            try:
                lowBound = lowUtr[-1][1] + 1  # region starts right after the UTR
            except IndexError:
                lowBound = tccBounds()[0]
            try:
                highBound = highUtr[-1][0] - 1  # region ends right before the UTR
            except IndexError:
                highBound = tccBounds()[1]
        else:
            lowBound, highBound = tccBounds()

        coordinates = [lowBound, highBound]
        for exon in self.exonList:
            for exonCoord in exon:
                if lowBound < exonCoord < highBound:
                    coordinates.append(exonCoord)

        # Walk the sorted coordinates two at a time; an unpaired trailing
        # coordinate is dropped, as in the original index arithmetic.
        coordinates.sort()
        segments = zip(coordinates[0::2], coordinates[1::2])

        gf = GenomeFetch.GenomeFetch("hg19")
        mRNAList = [
            gf.get_seq_from_to(self.chromosome, segStart + 1, segEnd + 1)  # gf is 1-based
            for segStart, segEnd in segments
        ]

        mRNA = "".join(mRNAList)
        if not isForward:
            mRNA = cgSeqMod.reverseComplementSequence(mRNA, False)
        return mRNA.replace("T", "U")
def checkSeeds(editFN, contextFN, miLocationFN, miSequenceFN, gFN):
        '''Check whether listed microRNA sites have a seed match in the transcript's 3'UTR.

        editFN, contextFN -- editing sites and their context; loaded through
                             cgEdit but not otherwise used by this function
        miLocationFN      -- tab-separated: transcript name, microRNA name,
                             integer location
        miSequenceFN      -- tab-separated: microRNA name (stored with an
                             'hsa-' prefix), sequence
        gFN               -- gene set file for cgGenes3.createGeneSetEditing

        For each listed (transcript, microRNA, location) the location is
        shifted by len(mRNA) - len(3'UTR sequence) so that it indexes into the
        sequence from get3UTRSeq, and the reverse complement of
        sequence[1:8] is searched for starting 25 characters before that
        position.  A hit with 0 < newLoc - hit < 30 is reported as found; no
        hit at all is reported as not found; a hit outside that window is
        reported in neither list.  Transcripts missing from the gene set,
        transcripts whose getMRNA() fails, and microRNAs without a sequence
        are skipped.

        Prints the two counts, then the distinct found and not-found records
        (microRNA, transcript, hit index, shifted location, original location;
        tab-separated), each list preceded by an empty line.
        '''
        # Kept from the original: these loads still run (and still fail loudly
        # on unreadable input) even though eSites is not consulted below.
        eSites = cgEdit.loadEditingSites(editFN)
        cgEdit.updateContextEditingSites(eSites, contextFN) #puts the UTR, EXON in eSite.context

        geneSet = cgGenes3.createGeneSetEditing(gFN)

        tName_t = {}
        for t in geneSet.transcripts:
                tName_t[t.id] = t

        miName_miSequence = {}
        with open(miSequenceFN, 'r') as f:
                for line in f:
                        ls = line.strip().split('\t')
                        miName_miSequence['hsa-' + ls[0]] = ls[1]

        tName_miInfo = {}
        with open(miLocationFN, 'r') as f:
                for line in f:
                        ls = line.strip().split('\t')
                        tName, miName, loc = ls[0], ls[1], int(ls[2])
                        tName_miInfo.setdefault(tName, []).append([miName, loc])

        # Lists keep first-seen order for printing; the sets make the
        # duplicate check O(1) instead of a scan of the whole list.
        foundIt = []
        notFoundIt = []
        seenFound = set()
        seenNotFound = set()

        for tName in tName_miInfo:

                if tName not in tName_t:
                        continue
                t = tName_t[tName]

                checkSeq = get3UTRSeq(t)
                try:
                        mRNA = t.getMRNA()
                except Exception:
                        # Best effort, as before: a transcript whose mRNA cannot be
                        # built is skipped.  Unlike a bare except, this no longer
                        # swallows KeyboardInterrupt/SystemExit.
                        continue

                for miName, loc in tName_miInfo[tName]:

                        if miName not in miName_miSequence:
                                continue
                        miSeed = miName_miSequence[miName][1:8]

                        rcMiSeed = cgSeqMod.reverseComplementSequence(miSeed, True)

                        newLoc = loc - (len(mRNA) - len(checkSeq))
                        # A negative start makes str.find count from the END of the
                        # string, so clamp the 25-character look-back at index 0.
                        finding = checkSeq.find(rcMiSeed, max(newLoc - 25, 0))
                        newResult = '%s\t%s\t%s\t%s\t%s' % (miName, tName, finding, newLoc, loc)

                        if finding != -1:
                                if 0 < newLoc - finding < 30 and newResult not in seenFound:
                                        seenFound.add(newResult)
                                        foundIt.append(newResult)
                        else:
                                if miName == 'hsa-miR-21':
                                        # Single-argument prints: same output on Python 2,
                                        # valid on Python 3.
                                        print('%s %s %s' % (loc, len(checkSeq), len(mRNA)))
                                        print(mRNA)
                                        print(checkSeq)

                                if newResult not in seenNotFound:
                                        seenNotFound.add(newResult)
                                        notFoundIt.append(newResult)

        print(len(foundIt))
        print(len(notFoundIt))
        print('')
        for i in foundIt:
                print(i)
        print('')
        for i in notFoundIt:
                print(i)