Пример #1
0
def test(gFN):

    map = cgSeqMod.loadCodonMap("hg19")
    geneSet = cgGenes3.createGeneSetEditing(gFN)

    for transcript in geneSet.transcripts:
        if "_coding" in transcript.tType:
            try:
                print transcript.id
                print transcript.getMRNA(coding=True)
                print cgSeqMod.translateRNA(transcript.getMRNA(coding=True), map)
            except:
                print "fail"
Пример #2
0
def test(gFN):

    map = cgSeqMod.loadCodonMap('hg19')
    geneSet = cgGenes3.createGeneSetEditing(gFN)

    for transcript in geneSet.transcripts:
        if '_coding' in transcript.tType:
            try:
                print transcript.id
                print transcript.getMRNA(coding=True)
                print cgSeqMod.translateRNA(transcript.getMRNA(coding=True),
                                            map)
            except:
                print 'fail'
Пример #3
0
def testit(gFN):

        geneSet = cgGenes3.createGeneSetEditing(gFN)

        map = cgSeqMod.loadCodonMap('hg19')
        for gene in geneSet.genes:
                for transcript in gene.transcripts:
                        try:
                                print ''
                                mRNA = transcript.getMRNA(coding = True)
                                i = transcript.getRelativePositionMRNA(35872409)
                                if i == -1:
                                        continue
                                print transcript.id
                                print i
                                print mRNA[:i], mRNA[i], mRNA[i + 1:]
                                print cgSeqMod.translateRNA(mRNA, map)
                        except:
                                pass
Пример #4
0
def updateLocationBasedTargets(editFN, contextFN, miLocationFN, gFN):

        eSites = cgEdit.loadEditingSites(editFN)
        cgEdit.updateContextEditingSites(eSites, contextFN) #puts the UTR, EXON in eSite.context
        
        geneSet = cgGenes3.createGeneSetEditing(gFN)

        tName_t = {}
        for t in geneSet.transcripts:
                tName_t[t.id] = t


        tName_miInfo = {}
        f = open(miLocationFN, 'r')
        for line in f:
                ls = line.strip().split('\t')
                tName = ls[0]
                miName = ls[1]
                loc = int(ls[2])
                tName_miInfo.setdefault(tName, []).append([miName, loc])


        for eSite in eSites:
                if '3UTR' not in eSite.context:
                        continue
                for tName in eSite.transcripts:
                        if tName in tName_miInfo:
                                
                                t = tName_t[tName]
                                
                                for info in tName_miInfo[tName]:
                                        
                                        
                                        miName = info[0]
                                        loc = info[1]

                                        #get the position of e site in mrna for this transcript
                                        ePosition = t.getRelativePositionMRNA(eSite.coordinate, coding = False)
                                        
                                        print tName, miName, loc, ePosition
                                        if loc - 22 <= ePosition <= loc:
                                                print tName, miName, '%s:%s' % (eSite.chromosome, eSite.coordinate)
                                                pass
Пример #5
0
def getCoords(gFN):

    geneSet = cgGenes3.createGeneSetEditing(gFN)

    for gName in geneSet.set:
        gene = geneSet.set[gName]

        # find the longest utr...
        longestUTR = None
        longestT = None
        longest = 0
        for transcript in gene.transcripts:
            l = 0
            if transcript.utr3 is None:
                continue
            for utrPair in transcript.utr3:
                l += utrPair[1] - utrPair[0] + 1
            if l > longest:
                longestUTR = transcript.utr3
                longestT = transcript

        # Get the coordinate ends/etc
        starts, ends = [], []
        if longestUTR is None:
            continue
        for utrPair in longestUTR:
            starts.append(utrPair[0])
            ends.append(utrPair[1])

        starts.sort()
        ends.sort()

        startS = ",".join([str(x) for x in starts])
        endS = ",".join([str(x) for x in ends])

        print "%s\t%s\t%s\t%s\t%s\t%s" % (
            transcript.id,
            transcript.parent,
            transcript.chromosome,
            transcript.strand,
            startS,
            endS,
        )
Пример #6
0
def checkIfSeedPresent(gName_micros, gFN):

        gf = GenomeFetch.GenomeFetch('hg19')

        print 'loading gene set'
        geneSet = cgGenes3.createGeneSetEditing(gFN)
        print '....done loading'

        outF = open('utrSeeds', 'w')
        done = {}
        for gName in gName_micros:
                micros = gName_micros[gName]
                for transcript in geneSet.set[gName].transcripts:
                        
                        utrCoords = []
                        if len(transcript.utr3) == 0:
                                continue
                        for utrPair in transcript.utr3:
                                utrCoords.extend(utrPair)
                        utrCoords.sort()
                        start, end = utrCoords[0], utrCoords[1]
                        chrom = transcript.chromosome
                        strand = transcript.strand

                        checkSeq = gf.get_seq_from_to(chrom, start, end, strand)
                        print ''
                        print transcript.id
                        print checkSeq
                        checkSeq = checkSeq.replace('T', 'U')

                        for micro in micros:
                                
                                findings = checkSeq.find(micro.comSeed)
                                if findings != -1:
                                        outF.write('%s\t%s\t%s\n' % ( transcript.id, micro.name, transcript.parent))
                                        found = findings
Пример #7
0
def getCoords(gFN):

    geneSet = cgGenes3.createGeneSetEditing(gFN)

    for gName in geneSet.set:
        gene = geneSet.set[gName]

        #find the longest utr...
        longestUTR = None
        longestT = None
        longest = 0
        for transcript in gene.transcripts:
            l = 0
            if transcript.utr3 is None: continue
            for utrPair in transcript.utr3:
                l += utrPair[1] - utrPair[0] + 1
            if l > longest:
                longestUTR = transcript.utr3
                longestT = transcript

        #Get the coordinate ends/etc
        starts, ends = [], []
        if longestUTR is None: continue
        for utrPair in longestUTR:
            starts.append(utrPair[0])
            ends.append(utrPair[1])

        starts.sort()
        ends.sort()

        startS = ','.join([str(x) for x in starts])
        endS = ','.join([str(x) for x in ends])

        print '%s\t%s\t%s\t%s\t%s\t%s' % (transcript.id, transcript.parent,
                                          transcript.chromosome,
                                          transcript.strand, startS, endS)
Пример #8
0
def newExpression(eFN, gFN, contextFN, targetFN, rpkmFN):
        """docstring for newExpression"""

        #print 'loading editing/genes'        
        eSites = cgEdit.loadEditingSites(eFN)
        geneSet = cgGenes3.createGeneSetEditing(gFN)


        #print 'making joints'
        #joints
        eID_eSite = {}
        for eSite in eSites:
                eID_eSite[eSite.ID] = eSite



        gID_eIDs = {}
        eID_tTypes = {}
        f = open(contextFN, 'r')
        for line in f:
                ls = line.strip().split('\t')
                gID = ls[1]
                eID = int(ls[0])
                tType = ls[3]

                if gID in gID_eIDs:
                        if eID not in gID_eIDs[gID]:
                                gID_eIDs[gID].append(eID)
                else:
                        gID_eIDs[gID] = [eID]

                if eID in eID_tTypes:
                        eID_tTypes[eID].append(tType)
                else:
                        eID_tTypes[eID] = [tType]

        #print 'updating target sites'
        #update targetting for eSites:
        f = open(targetFN, 'r')
        for line in f:
                ls = line.strip().split('\t')
                eID = int(ls[0])
                before = ls[3].split(',')
                if before == ['None']: before = []
                after = ls[4].split(',')
                if after == ['None']: after = []
                eID_eSite[eID].before = before
                eID_eSite[eID].after = after

        #scrolldict
        gene_eSites = {}
        for gID in gID_eIDs:
                try:
                        gene = geneSet.set[gID]
                except KeyError:
                        #print gID, 'not in geneSet'
                        pass

                gene_eSites[gene] = []
                #get eSites
                for eID in gID_eIDs[gID]:
                        eSite = eID_eSite[eID]
                        gene_eSites[gene].append(eSite)

        #print 'updating gene target site info'
        #update before/after editing target sites for GENES
        createdGenes = []
        destroyedGenes = []
        histoVals = []

        for gene in gene_eSites:
                gene.before = []
                gene.after = []
                for eSite in gene_eSites[gene]:
                        if '3UTR' in eID_tTypes[eSite.ID]:
                                for micro in eSite.before:
                                        gene.before.append('%s %s:%s' % (micro, eSite.ID, eSite.coordinate))
                                for micro in eSite.after:
                                        gene.after.append('%s %s:%s' % (micro, eSite.ID, eSite.coordinate))

                if len(gene.before) > len(gene.after):
                        destroyedGenes.append(gene)

                if len(gene.after) > len(gene.before):
                        createdGenes.append(gene)

                change = len(gene.after) - len(gene.before)
                if (len(gene.after) != 0) or (len(gene.before) != 0):
                        histoVals.append(change)

        #print the created/destroyed sites
        for gene in createdGenes:
                print gene.before
                print gene.after

        for gene in destroyedGenes:
                print gene.before
                print gene.after

        '''
        plt.title('Target Site Changes Due To Editing')
        plt.xlabel('Change in Number of Target Sites After Editing')
        plt.ylabel('Number of Genes')
        plt.hist(histoVals, 6)
        plt.show()

        return 0
        '''

        #print out microRNA gene List
        uniqueMicros = {}
        for gene in createdGenes:
                for micro in gene.after:
                        uniqueMicros[micro] = 1
        for gene in destroyedGenes:
                for micro in gene.before:
                        uniqueMicros[micro] = 1

        for micro in uniqueMicros:
                #print micro
                pass

        #print 'loading rpkm'
        gName_ratio = getERatioDict(rpkmFN)
        eChanges = []
        
        for gene in createdGenes:
                try:
                        ratio = gName_ratio[gene.id]
                except KeyError:
                        #print gene.id, 'not in RPKM file --> not expressed'
                        pass
                eChange = math.log(ratio, 2)
                eChanges.append(eChange)
        
        eChanges2 = []
        for gene in destroyedGenes:
                try:
                        ratio = gName_ratio[gene.id]
                except KeyError:
                        #print gene.id, 'not in RPKM file --> not expressed'
                        pass

                eChange = math.log(ratio, 2)
                eChanges2.append(eChange)
        

        #Now plot the histogram
        plt.hist(eChanges, 40, cumulative = True, histtype = 'step', normed = True, label = 'Created')
        plt.hist(eChanges2, 40, cumulative = True, histtype = 'step', normed = True, label = 'Destroyed')
        plt.legend()
        plt.title('eCDF of Genes with Target Site Changes in 3UTRs')
        plt.xlabel('log2(RPKM KD/ RPKM CONTROL)')
        plt.ylabel('Fraction of Genes')
        plt.show()
Пример #9
0
def updateContext(editFN, geneSetFN, outFN, refBase='A'):
    print refBase
    #Load Transcripts and Editing Sites
    print 'loading editing sites'
    eSites = cgEdit.loadEditingSites(editFN, refBase)
    print 'loading gene set'
    geneSet = cgGenes3.createGeneSetEditing(geneSetFN)

    #make the eSites 0 based
    for eSite in eSites:
        #redo coordinate and tcc
        eSite.coordinate = eSite.coordinate - 1
        eSite.tcc = bioLibCG.makeTcc(eSite.chromosome, eSite.strand,
                                     eSite.coordinate, eSite.coordinate)

    #Create Joint dictionaries
    print 'creating joint dictionaries'
    eJoint = {}  #tcc : eSite
    for eSite in eSites:
        eJoint[eSite.tcc] = eSite

    tJoint = {}  # tcc : [transcript, ...]
    for transcript in geneSet.transcripts:
        if transcript.tcc in tJoint:
            tJoint[transcript.tcc].append(transcript)
        else:
            tJoint[transcript.tcc] = [transcript]

    #Overlap tccs
    print 'overlapping joints'
    ##make new 0-based keys
    tccOverlaps = compareData.getIndividualOverlaps(eJoint.keys(),
                                                    tJoint.keys(), 1)

    print 'creating final dictionary'
    #create final dictionary containing {edit sites : [transcript, ..]}
    eSiteTranscripts = {}  # edit site: [transcript, ..]
    for eTcc in tccOverlaps:
        eSite = eJoint[eTcc]
        eSiteTranscripts[eSite] = []
        for tTcc in tccOverlaps[eTcc]:
            eSiteTranscripts[eSite].extend(tJoint[tTcc])

    print 'get context info'
    #Go through each site and find out what it overlaps, and if it is in a coding region...
    fOut = open(outFN, 'w')
    for eSite in eSiteTranscripts:
        if len(eSiteTranscripts[eSite]) == 0:
            #label intergenic
            tType = 'INTER'
            codingFlag = 'NC'
            fOut.write('%s\t%s\t%s\t%s\t%s\t%s\n' %
                       (eSite.ID, 'NONE', 'NONE', tType, codingFlag, 'NONE'))
            continue

        for transcript in eSiteTranscripts[eSite]:

            codingTranscript = '_coding' in transcript.tType
            tType = None
            codingFlag = None
            tTypes = [
                x[1] for x in transcript.getOverlappingElements(eSite.tcc)
            ]

            if '3UTR' in tTypes:
                tType = '3UTR'
            elif '5UTR' in tTypes:
                tType = '5UTR'
            else:
                tType = tTypes[0]  #has to be one thing...exon or intron
            #This only works because UTR takes precedence over EXON in TYPE.
            if tType == 'EXON':
                if codingTranscript:
                    codingFlag = 'C'
                else:
                    codingFlag = 'NC'
            else:
                codingFlag = 'NC'

            fOut.write('%s\t%s\t%s\t%s\t%s\t%s\n' %
                       (eSite.ID, transcript.parent, transcript.id, tType,
                        codingFlag, transcript.tType))
            #fOut.write('%s:%s:%s\t%s\t%s\t%s\t%s\n' % (eSite.chromosome, eSite.strand, eSite.coordinate, transcript.parent, transcript.id, tType, codingFlag))

    fOut.close()
Пример #10
0
def doit(fN):
    """Dump every transcript with id 'NM_031422' from the gene set built from fN.

    fN is passed to cgGenes3.createGeneSetEditing; each matching transcript
    is handed to dumpObj.dumpObj.
    """
    wantedID = 'NM_031422'
    for candidate in cgGenes3.createGeneSetEditing(fN).transcripts:
        if candidate.id != wantedID:
            continue
        dumpObj.dumpObj(candidate)
Пример #11
0
def makeContextPieBetter(contextFN, gFN, eFN, passedInfo, outFN):
      
        if passedInfo[0] == passedInfo[1]:
                print passedInfo, 'no such thing'
                return 0

        print 'loading geneSet'
        geneSet = cgGenes3.createGeneSetEditing(gFN)
        typeCount = {'EXON': 0, '3UTR': 0, '5UTR': 0, 'INTRON': 0, 'NONG': 0, 'NONT': 0}

        #joint
        tID_transcript = {}
        for transcript in geneSet.transcripts:
                tID_transcript[transcript.id] = transcript

        eID_Info = {}
        f = open(eFN, 'r')
        for line in f:
                ls = line.strip().split('\t')
                eID = int(ls[13])
                chrom = ls[0]
                coord = ls[1]
                newCoord = '%s:%s' % (chrom, coord)
                nEdited = ls[4]
                nTotal = ls[5]
                eID_Info[eID] = [newCoord, nEdited, nTotal]

        eID_gName = {}
        eID_tTypes = {}
        f = open(contextFN, 'r')
        for line in f:
                ls = line.strip().split('\t')
                eID = int(ls[0])
                gName = ls[1]
                eID_gName[eID] = gName
                tType = ls[3]
                tName = ls[2]
                if tType == 'INTER':
                        continue
                transcript = tID_transcript[tName]
                tInfo = [tType, transcript]

                if eID in eID_tTypes:
                        eID_tTypes[eID].append(tInfo)
                else:
                        eID_tTypes[eID] = [tInfo]

        eID_finalType = {}

        for eID in eID_tTypes:
                highestType = None
                for tInfo in eID_tTypes[eID]:
                        tType = tInfo[0]
                        transcript = tInfo[1]
                        
                        tCoding = True
                        if '_coding' not in transcript.tType:
                                tCoding = False
                        
                        gCoding = True
                        if '_coding' not in transcript.gType:
                                gCoding = False

                        #print transcript.id, tType, tCoding, gCoding
                        
                        if tType == 'EXON':
                                if tCoding:
                                        highestType = 'EXON'
                                        break
                                else:
                                        if gCoding:
                                                if highestType not in ['EXON', 'INTRON', '3UTR', '5UTR']:
                                                        highestType = 'NONT'
                                        else:
                                                if highestType not in ['EXON', 'INTRON', '3UTR', '5UTR', 'NONT']:
                                                        highestType = 'NONG'

                        elif tType == '3UTR':
                                if tCoding:
                                        if gCoding:
                                                highestType = '3UTR'
                                        else:
                                                highestType = '3UTR'
                                else:
                                        if gCoding:
                                                if highestType not in ['EXON', 'INTRON', '3UTR', '5UTR']:     
                                                        highestType = 'NONT'
                                        else:
                                                if highestType not in ['EXON', 'INTRON', '3UTR', '5UTR', 'NONT']:
                                                        highestType = 'NONG'
                        elif tType == '5UTR':
                                if tCoding:
                                        if gCoding:
                                                if highestType not in ['3UTR']:
                                                        highestType = '5UTR'
                                        else:
                                                if highestType not in ['3UTR']:
                                                        highestType = '5UTR'
                                else:
                                        if gCoding:
                                                if highestType not in ['EXON', 'INTRON', '3UTR', '5UTR']:        
                                                        highestType = 'NONT'
                                        else:
                                                if highestType not in ['EXON', 'INTRON', '3UTR', '5UTR', 'NONT']:
                                                        highestType = 'NONG'
                        
                        elif tType == 'INTRON':

                                if tCoding:
                                        if gCoding:
                                                if highestType not in ['3UTR', '5UTR']:
                                                        highestType = 'INTRON'
                                        else:
                                                if highestType not in ['3UTR', '5UTR']:
                                                        highestType = 'INTRON'
                                else:
                                        if gCoding:
                                                if highestType not in ['EXON', 'INTRON', '3UTR', '5UTR']:     
                                                        highestType = 'NONT'
                                        else:
                                                if highestType not in ['EXON', 'INTRON', '3UTR', '5UTR', 'NONT']:
                                                        highestType = 'NONG'
                eID_finalType[eID] = highestType
                typeCount[highestType] += 1


        outF = open(outFN, 'w')
        for eID in eID_finalType:
                outF.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (eID_Info[eID][0].split(':')[0], eID_Info[eID][0].split(':')[1], passedInfo, eID_gName[eID], eID_finalType[eID], eID_Info[eID][1], eID_Info[eID][2]))

        return 0                


        outF = open(outFN, 'w')
        for type in typeCount:
                outF.write('%s\t%s\n' % (type, typeCount[type]))
        return 0 

        #get fractions of each type
        types = ['EXON', '3UTR', '5UTR', 'INTRON', 'NONG', 'NONT']
        fracs = [typeCount['EXON'], typeCount['3UTR'], typeCount['5UTR'], typeCount['INTRON'], typeCount['NONG'], typeCount['NONT']]
        #print fracs
        labels = ['Exons (%s)' % fracs[0], '3\'UTR (%s)' % fracs[1], '5\'UTR (%s)' % fracs[2], 'Introns (%s)' % fracs[3], 'Noncoding Gene (%s)' % fracs[4], 'Noncoding Transcript (%s)' % fracs[5]] 
        theSum = fracs[0] + fracs[1] + fracs[2] + fracs[3] + fracs[4] + fracs[5]
        fracs = [float(x)/theSum for x in fracs]

        #print fracs


        explode=(0.1, 0.1, 0.1, 0.1, .1, .1)
        pie(fracs, explode=explode, labels=labels, autopct='%1.1f%%', shadow=True)
        title('Editing Site Genomic Location', bbox={'facecolor':'1.0', 'pad':10})

        show()
Пример #12
0
def makeTargetExpressionHistogram(eFN, targetFN, contextFN, geneFN, eChangeFN):

    print 'loading expression ratios'
    gName_eChange = getERatioDict(eChangeFN)

    print 'loading eSites and Transcripts'
    eSites = cgEdit.loadEditingSites(eFN)
    geneSet = cgGenes3.createGeneSetEditing(geneFN)

    print 'making joint dicts and loading extra data'
    #joint
    eID_eSite = {}
    for eSite in eSites:
        eID_eSite[eSite.ID] = eSite

    #joint
    tID_gName = {}
    for transcript in geneSet.transcripts:
        tID_gName[transcript.id] = transcript.parent

    #load context data
    f = open(contextFN, 'r')
    eID_tID = {}  # eID: tID
    tID_tType = {}
    for line in f:
        ls = line.strip().split('\t')
        eID = int(ls[0])
        tID = ls[1]
        tType = ls[2]
        tID_tType[tID] = tType
        if eID in eID_tID:
            eID_tID[eID].append(tID)
        else:
            eID_tID[eID] = [tID]
    f.close()

    print 'analyzing'
    #Get created or destroyed
    f = open(targetFN, 'r')
    altered = []
    for line in f:
        ls = line.strip().split('\t')
        #created/destroyed
        if (ls[3] != 'None') and (ls[4] == 'None'):
            altered.append(int(ls[0]))
    f.close()

    print 'number of created/destroyed sites:', len(altered)

    alteredSites = []
    for id in altered:
        alteredSites.append(eID_eSite[id])

    eChanges = []
    gDone = []
    #Get gene names for each eSite
    for eSite in alteredSites:
        genes = []
        for tID in eID_tID[eSite.ID]:
            if tID == 'NONE':
                continue

            gName = tID_gName[tID]
            if tID_tType[tID] != '3UTR':
                print 'Not 3UTR', tID
                continue
            if gName not in genes:
                genes.append(gName)

        if len(genes) > 1:
            print 'more than one gene for eSite...', genes
            continue

        if gName in gDone:
            continue
        else:
            gDone.append(gName)

        #Now add expression to HistoGram List...
        if gName in gName_eChange:
            eChange = gName_eChange[gName]
        else:
            print 'gene not in expression list', gName
            continue
        eChange = gName_eChange[gName]
        eChange = math.log(eChange, 2)
        eChanges.append(eChange)

    #Now plot the histogram
    plt.hist(eChanges, 40)
    plt.xlabel('log2(RPKM KD/ RPKM CONTROL)')
    plt.ylabel('# Genes')
    plt.show()
Пример #13
0
def updateContext(editFN, geneSetFN, outFN, refBase = 'A'):
        print refBase
        #Load Transcripts and Editing Sites
        print 'loading editing sites'
        eSites = cgEdit.loadEditingSites(editFN, refBase)
        print 'loading gene set'
        geneSet = cgGenes3.createGeneSetEditing(geneSetFN)

        
        #make the eSites 0 based
        for eSite in eSites:
                #redo coordinate and tcc
                eSite.coordinate = eSite.coordinate - 1
                eSite.tcc = bioLibCG.makeTcc(eSite.chromosome, eSite.strand, eSite.coordinate, eSite.coordinate)

        #Create Joint dictionaries
        print 'creating joint dictionaries'
        eJoint = {} #tcc : eSite
        for eSite in eSites:
                eJoint[eSite.tcc] = eSite

        tJoint = {} # tcc : [transcript, ...]
        for transcript in geneSet.transcripts:
                if transcript.tcc in tJoint:
                        tJoint[transcript.tcc].append(transcript)
                else:
                        tJoint[transcript.tcc] = [transcript]

        #Overlap tccs
        print 'overlapping joints'
        ##make new 0-based keys
        tccOverlaps = compareData.getIndividualOverlaps(eJoint.keys(), tJoint.keys(), 1)


        print 'creating final dictionary'
        #create final dictionary containing {edit sites : [transcript, ..]}
        eSiteTranscripts = {} # edit site: [transcript, ..]
        for eTcc in tccOverlaps:
                eSite = eJoint[eTcc]
                eSiteTranscripts[eSite] = []
                for tTcc in tccOverlaps[eTcc]:
                        eSiteTranscripts[eSite].extend(tJoint[tTcc])

        print 'get context info'
        #Go through each site and find out what it overlaps, and if it is in a coding region...
        fOut = open(outFN, 'w')
        for eSite in eSiteTranscripts:
                if len(eSiteTranscripts[eSite]) == 0:
                        #label intergenic
                        tType = 'INTER'
                        codingFlag = 'NC'
                        fOut.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (eSite.ID, 'NONE', 'NONE', tType, codingFlag, 'NONE'))
                        continue

                for transcript in eSiteTranscripts[eSite]:
                       
                        codingTranscript =  '_coding' in transcript.tType
                        tType = None
                        codingFlag = None
                        tTypes = [ x[1] for x in transcript.getOverlappingElements(eSite.tcc)]
                        

                        if '3UTR' in tTypes:
                                tType = '3UTR'
                        elif '5UTR' in tTypes:
                                tType = '5UTR'
                        else:
                                tType = tTypes[0] #has to be one thing...exon or intron
                        #This only works because UTR takes precedence over EXON in TYPE.
                        if tType == 'EXON':
                                if codingTranscript:
                                        codingFlag = 'C'
                                else:
                                        codingFlag = 'NC'
                        else:
                                codingFlag = 'NC'


                        fOut.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (eSite.ID, transcript.parent, transcript.id, tType, codingFlag, transcript.tType))
                        #fOut.write('%s:%s:%s\t%s\t%s\t%s\t%s\n' % (eSite.chromosome, eSite.strand, eSite.coordinate, transcript.parent, transcript.id, tType, codingFlag))

        fOut.close()
Пример #14
0
def updateContext(oDir, geneSetFN):
       
        print 'loading oRNA'
        oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
        id_oRNA = oDC.load()
        print 'loading gene set'
        geneSet = cgGenes3.createGeneSetEditing(geneSetFN)

        
        #Get in terms of tccs
        print 'Joining'
        oTcc_oRNA = oneToOne(id_oRNA.values(), 'tcc')
        tTcc_transcripts = oneToMany(geneSet.transcripts, 'tcc')

        #Overlap tccs
        print 'overlapping'
        oTcc_tTccs = compareData.getIndividualOverlaps(oTcc_oRNA.keys(), tTcc_transcripts.keys(), 1)


        #create final dictionary containing {oRNA : [transcript, ..]}
        oRNA_transcripts = {}
        for oTcc in oTcc_tTccs:
                oRNA = oTcc_oRNA[oTcc]
                oRNA_transcripts[oRNA] = []
                for tTcc in oTcc_tTccs[oTcc]:
                        oRNA_transcripts[oRNA].extend(tTcc_transcripts[tTcc])

        print 'get context info'
        #Go through each site and find out what it overlaps, and if it is in a coding region...
        
        ds = bioLibCG.dominantSpotter(['EXON_INTRON', '3UTR', '5UTR', 'EXON', 'INTRON'])
        for oRNA in oRNA_transcripts:

                oRNA.transcriptIDs = []
                oRNA.transcriptContexts = []
                oRNA.transcriptTypes = []
                oRNA.transcriptCodingTypes = []
                
                if len(oRNA_transcripts[oRNA]) == 0:
                        continue

                for transcript in oRNA_transcripts[oRNA]:
                       
                        codingTranscript =  '_coding' in transcript.tType
                        tType = None
                        codingFlag = None

                        tTypes = [x[1] for x in transcript.getOverlappingElements(oRNA.tcc)]
                        
                        #categorize border types
                       
                        tType = ds.spotItem(tTypes)
                          
                        if tType == 'EXON' or 'EXON_INTRON':
                                if codingTranscript:
                                        codingFlag = 'C'
                                else:
                                        codingFlag = 'NC'
                        else:
                                codingFlag = 'NC'

                        
                        oRNA.transcriptIDs.append(transcript.id)
                        oRNA.transcriptContexts.append(tType)
                        oRNA.transcriptTypes.append(transcript.tType)
                        oRNA.transcriptCodingTypes.append(codingFlag)

        oDC.commit(id_oRNA)
Пример #15
0
def checkSeeds(editFN, contextFN, miLocationFN, miSequenceFN, gFN):

        eSites = cgEdit.loadEditingSites(editFN)
        cgEdit.updateContextEditingSites(eSites, contextFN) #puts the UTR, EXON in eSite.context
        
        geneSet = cgGenes3.createGeneSetEditing(gFN)

        tName_t = {}
        for t in geneSet.transcripts:
                tName_t[t.id] = t

        
        miName_miSequence = {}
        f = open(miSequenceFN, 'r')
        for line in f:
                ls = line.strip().split('\t')
                name = ls[0]
                seq = ls[1]
                name = 'hsa-' + name
                miName_miSequence[name] = seq

        tName_miInfo = {}
        f = open(miLocationFN, 'r')
        for line in f:
                ls = line.strip().split('\t')
                tName = ls[0]
                miName = ls[1]
                loc = int(ls[2])
                tName_miInfo.setdefault(tName, []).append([miName, loc])

        foundIt = []
        notFoundIt = []
        for tName in tName_miInfo:
                
                try:
                        t = tName_t[tName]
                except:
                        continue
                checkSeq = get3UTRSeq(t)
                try:
                        mRNA = t.getMRNA()
                except:
                        continue
                for miInfo in tName_miInfo[tName]:
                
                        miName = miInfo[0]
                        loc = miInfo[1]
                        try:
                                miSequence = miName_miSequence[miName]
                                miSeed = miSequence[1:8]
                        except:
                                continue

                        rcMiSeed = cgSeqMod.reverseComplementSequence(miSeed, True)
                        
                        newLoc = loc - (len(mRNA) - len(checkSeq)) 
                        finding = checkSeq.find(rcMiSeed, newLoc - 25)    
                        if finding != -1:
                                if (0 < newLoc - finding < 30):
                                        newResult = '%s\t%s\t%s\t%s\t%s' % (miName, tName, finding, newLoc, loc)
                                        if newResult not in foundIt: foundIt.append(newResult)
                        else:
                                        
                                        if miName == 'hsa-miR-21':
                                                print loc, len(checkSeq), len(mRNA)
                                                print mRNA
                                                print checkSeq
                                               
                                        newResult = '%s\t%s\t%s\t%s\t%s' % (miName, tName, finding, newLoc, loc)
                                        if newResult not in notFoundIt: notFoundIt.append(newResult)

        print len(foundIt)
        print len(notFoundIt)
        print ''
        for i in foundIt:
                print i
        print ''
        for i in notFoundIt:
                print i
Пример #16
0
def makeContextPieBetter(contextFN, gFN, eFN, passedInfo, outFN):

    if passedInfo[0] == passedInfo[1]:
        print passedInfo, 'no such thing'
        return 0

    print 'loading geneSet'
    geneSet = cgGenes3.createGeneSetEditing(gFN)
    typeCount = {
        'EXON': 0,
        '3UTR': 0,
        '5UTR': 0,
        'INTRON': 0,
        'NONG': 0,
        'NONT': 0
    }

    #joint
    tID_transcript = {}
    for transcript in geneSet.transcripts:
        tID_transcript[transcript.id] = transcript

    eID_Info = {}
    f = open(eFN, 'r')
    for line in f:
        ls = line.strip().split('\t')
        eID = int(ls[13])
        chrom = ls[0]
        coord = ls[1]
        newCoord = '%s:%s' % (chrom, coord)
        nEdited = ls[4]
        nTotal = ls[5]
        eID_Info[eID] = [newCoord, nEdited, nTotal]

    eID_gName = {}
    eID_tTypes = {}
    f = open(contextFN, 'r')
    for line in f:
        ls = line.strip().split('\t')
        eID = int(ls[0])
        gName = ls[1]
        eID_gName[eID] = gName
        tType = ls[3]
        tName = ls[2]
        if tType == 'INTER':
            continue
        transcript = tID_transcript[tName]
        tInfo = [tType, transcript]

        if eID in eID_tTypes:
            eID_tTypes[eID].append(tInfo)
        else:
            eID_tTypes[eID] = [tInfo]

    eID_finalType = {}

    for eID in eID_tTypes:
        highestType = None
        for tInfo in eID_tTypes[eID]:
            tType = tInfo[0]
            transcript = tInfo[1]

            tCoding = True
            if '_coding' not in transcript.tType:
                tCoding = False

            gCoding = True
            if '_coding' not in transcript.gType:
                gCoding = False

            #print transcript.id, tType, tCoding, gCoding

            if tType == 'EXON':
                if tCoding:
                    highestType = 'EXON'
                    break
                else:
                    if gCoding:
                        if highestType not in [
                                'EXON', 'INTRON', '3UTR', '5UTR'
                        ]:
                            highestType = 'NONT'
                    else:
                        if highestType not in [
                                'EXON', 'INTRON', '3UTR', '5UTR', 'NONT'
                        ]:
                            highestType = 'NONG'

            elif tType == '3UTR':
                if tCoding:
                    if gCoding:
                        highestType = '3UTR'
                    else:
                        highestType = '3UTR'
                else:
                    if gCoding:
                        if highestType not in [
                                'EXON', 'INTRON', '3UTR', '5UTR'
                        ]:
                            highestType = 'NONT'
                    else:
                        if highestType not in [
                                'EXON', 'INTRON', '3UTR', '5UTR', 'NONT'
                        ]:
                            highestType = 'NONG'
            elif tType == '5UTR':
                if tCoding:
                    if gCoding:
                        if highestType not in ['3UTR']:
                            highestType = '5UTR'
                    else:
                        if highestType not in ['3UTR']:
                            highestType = '5UTR'
                else:
                    if gCoding:
                        if highestType not in [
                                'EXON', 'INTRON', '3UTR', '5UTR'
                        ]:
                            highestType = 'NONT'
                    else:
                        if highestType not in [
                                'EXON', 'INTRON', '3UTR', '5UTR', 'NONT'
                        ]:
                            highestType = 'NONG'

            elif tType == 'INTRON':

                if tCoding:
                    if gCoding:
                        if highestType not in ['3UTR', '5UTR']:
                            highestType = 'INTRON'
                    else:
                        if highestType not in ['3UTR', '5UTR']:
                            highestType = 'INTRON'
                else:
                    if gCoding:
                        if highestType not in [
                                'EXON', 'INTRON', '3UTR', '5UTR'
                        ]:
                            highestType = 'NONT'
                    else:
                        if highestType not in [
                                'EXON', 'INTRON', '3UTR', '5UTR', 'NONT'
                        ]:
                            highestType = 'NONG'
        eID_finalType[eID] = highestType
        typeCount[highestType] += 1

    outF = open(outFN, 'w')
    for eID in eID_finalType:
        outF.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                   (eID_Info[eID][0].split(':')[0],
                    eID_Info[eID][0].split(':')[1], passedInfo, eID_gName[eID],
                    eID_finalType[eID], eID_Info[eID][1], eID_Info[eID][2]))

    return 0

    outF = open(outFN, 'w')
    for type in typeCount:
        outF.write('%s\t%s\n' % (type, typeCount[type]))
    return 0

    #get fractions of each type
    types = ['EXON', '3UTR', '5UTR', 'INTRON', 'NONG', 'NONT']
    fracs = [
        typeCount['EXON'], typeCount['3UTR'], typeCount['5UTR'],
        typeCount['INTRON'], typeCount['NONG'], typeCount['NONT']
    ]
    #print fracs
    labels = [
        'Exons (%s)' % fracs[0],
        '3\'UTR (%s)' % fracs[1],
        '5\'UTR (%s)' % fracs[2],
        'Introns (%s)' % fracs[3],
        'Noncoding Gene (%s)' % fracs[4],
        'Noncoding Transcript (%s)' % fracs[5]
    ]
    theSum = fracs[0] + fracs[1] + fracs[2] + fracs[3] + fracs[4] + fracs[5]
    fracs = [float(x) / theSum for x in fracs]

    #print fracs

    explode = (0.1, 0.1, 0.1, 0.1, .1, .1)
    pie(fracs, explode=explode, labels=labels, autopct='%1.1f%%', shadow=True)
    title('Editing Site Genomic Location',
          bbox={
              'facecolor': '1.0',
              'pad': 10
          })

    show()
Пример #17
0
def updateSynonomous(eFN, gFN, resultsFN, outFN):

    #Load Transcripts and Editing Sites
    print 'Loading editing sites'
    eSites = cgEdit.loadEditingSites(eFN)
    print 'Loading gene set'
    geneSet = cgGenes3.createGeneSetEditing(gFN)

    codingTID_eID = {}
    f = open(resultsFN, 'r')
    for line in f:
        ls = line.strip().split('\t')
        if ls[4] == 'C':
            codingTID_eID[ls[2]] = int(ls[0])

    #Get coding Transcripts
    codingTranscripts = {}  #tID : eID ! many:one always!
    f = open(resultsFN, 'r')
    for line in f:
        ls = line.strip().split('\t')
        if ls[4] == 'C':
            codingTranscripts[ls[2]] = int(ls[0])

    eID_eSite = {}
    for eSite in eSites:
        eID_eSite[eSite.ID] = eSite

    tID_transcript = {}
    for transcript in geneSet.transcripts:
        tID[transcript.id] = transcript

    codingT_eSite
    for tID in codingTID_eID:
        eID = codingTID_eID[tID]
        t = tID_transcript[tID]
        e = eID_eSite[eID]

    print 'Creating scroll dict'
    scrollDict = {}  # transcript: eSite
    for tID in codingTranscripts:
        e = eJoint[codingTranscripts[tID]]
        try:
            t = tJoint[tID]
            scrollDict[t] = e
        except KeyError:
            pass

    print 'Deducing synonomous'
    map = cgSeqMod.loadCodonMap('hg19')
    finalDict = {}  # tID: [SYN, AAA, AAB, G, A]
    #Figure out if they are synonomous
    for t in scrollDict:

        eSite = scrollDict[t]
        #dumpObj.dumpObj(t)
        #dumpObj.dumpObj(eSite)

        ePositionInMRNA = t.getRelativePositionMRNA(eSite.coordinate - 1)

        if ePositionInMRNA == -1:
            print t.id, 'should not be designated coding...'
            continue

        #grab mRNA and emRNA
        mRNA = t.getMRNA(coding=True)
        emRNA = t.getMRNA(coding=True)

        if mRNA[ePositionInMRNA] != 'A':
            print 'wrong position', t.id, '%s:%s' % (
                eSite.chromosome, eSite.coordinate), eSite.strand, mRNA[
                    ePositionInMRNA - 5:ePositionInMRNA -
                    1], mRNA[ePositionInMRNA], mRNA[ePositionInMRNA +
                                                    1:ePositionInMRNA + 5]

        #edit the site
        emRNA = list(emRNA)
        emRNA[ePositionInMRNA] = 'G'
        emRNA = ''.join(emRNA)

        #Test the protein sequences
        pRNA = cgSeqMod.translateRNA(mRNA, map)
        epRNA = cgSeqMod.translateRNA(emRNA, map)

        #print t.parent, t.id
        newString = ['%s  ' % x for x in list(pRNA)]
        newString = ''.join(newString)

        if pRNA[0] != 'M':
            print 'Non-canonical Start AA:', pRNA[0:5], mRNA[:10]
        if pRNA[-1] != '*':
            print 'Non-canonical End AA:', pRNA[-5:], mRNA[-10:]

        #compare the codons.

        mCodonList = cgSeqMod.getCodonListFromRNA(mRNA)
        emCodonList = cgSeqMod.getCodonListFromRNA(emRNA)
        compareList = zip(mCodonList, emCodonList)
        synFlag = 'SYN'

        codonNumber = ePositionInMRNA // 3

        codonPair = compareList[codonNumber]
        print t.id
        print eSite.ID
        print mCodonList[:codonNumber]
        print mRNA[:ePositionInMRNA]
        bCodon = codonPair[0]
        aCodon = codonPair[1]

        baa = cgSeqMod.translateRNA(bCodon, map)
        aaa = cgSeqMod.translateRNA(aCodon, map)
        if baa != aaa:
            synFlag = 'NON'
            bCodonList = list(bCodon)
            aCodonList = list(aCodon)
            matchedLetters = zip(bCodonList, aCodonList)
            for pair in matchedLetters:
                if pair[0] != 'A':
                    if pair[1] == 'G' and pair[0] != 'G':
                        print 'messed up codon switch', bCodonList, aCodonList
                        print t.parent, '%s:%s' % (
                            eSite.chromosome, eSite.coordinate
                        ), eSite.strand, bCodon, aCodon, baa, aaa
        else:
            synFlag = 'SYN'

        finalDict[t.id] = [synFlag, bCodon, aCodon, baa, aaa]

    print 'writing to file'
    #update line by line
    newLines = []
    f = open(resultsFN, 'r')
    for line in f:
        newLine = line.strip()
        tID = line.strip().split('\t')[2]

        if tID in finalDict:
            newLine = newLine + '\t%s\t%s\t%s\t%s\t%s\n' % (
                finalDict[tID][0], finalDict[tID][1], finalDict[tID][2],
                finalDict[tID][3], finalDict[tID][4])
        else:
            newLine = newLine + '\tNA\tNA\tNA\tNA\tNA\n'

        newLines.append(newLine)
    f.close()

    #update file
    f = open(outFN, 'w')
    f.writelines(newLines)
    f.close()
Пример #18
0
def betterSynonymous(eFN, gFN, contextFN, outFN, refBase='A', eBase='G'):

    print 'loading e sites'
    eSites = cgEdit.loadEditingSites(eFN)
    print 'loading geneSet'
    geneSet = cgGenes3.createGeneSetEditing(gFN)

    contextInfo = {}  # eID: tID : [UTR, C]
    f = open(contextFN, 'r')
    for line in f:
        ls = line.strip().split('\t')
        eID = int(ls[0])
        tID = ls[2]
        cInfo = [ls[3], ls[4]]
        if eID not in contextInfo:
            contextInfo[eID] = {}
            contextInfo[eID][tID] = cInfo
        else:
            contextInfo[eID][tID] = cInfo

    eID_tIDs = {}
    f = open(contextFN, 'r')
    for line in f:
        ls = line.strip().split('\t')
        eID = int(ls[0])
        tID = ls[2]
        if tID not in eID_tIDs.setdefault(eID, []): eID_tIDs[eID].append(tID)

    eID_eSite = {}
    for eSite in eSites:
        eID_eSite[eSite.ID] = eSite

    tID_transcript = {}
    for transcript in geneSet.transcripts:
        tID_transcript[transcript.id] = transcript

    eSite_transcripts = {}
    for eID in eID_tIDs:
        eSite = eID_eSite[eID]
        tList = []
        for tID in eID_tIDs[eID]:
            if tID == 'NONE': continue
            if tID_transcript.get(tID, None) == None: continue
            tList.append(tID_transcript[tID])
        eSite_transcripts[eSite] = tList

    outF = open(outFN, 'w')
    map = cgSeqMod.loadCodonMap('hg19')
    for eSite in eSite_transcripts:

        for transcript in eSite_transcripts[eSite]:

            siteType, codingType = contextInfo[eSite.ID][transcript.id]
            if '_noncoding' in transcript.tType:
                continue
            if codingType != 'C':
                continue

            ePositionInMRNA = transcript.getRelativePositionMRNA(
                eSite.coordinate - 1)
            mRNA = transcript.getMRNA(coding=True)
            emRNA = transcript.getMRNA(coding=True)

            if mRNA[ePositionInMRNA] != refBase:
                print 'Editing site was not an A...'

            #edit the site
            emRNA = list(emRNA)
            emRNA[ePositionInMRNA] = eBase
            emRNA = ''.join(emRNA)

            #Test the protein sequences
            pRNA = cgSeqMod.translateRNA(mRNA, map)
            epRNA = cgSeqMod.translateRNA(emRNA, map)
            if pRNA[0] != 'M':
                print 'Non-canonical Start AA:', pRNA[0:5], mRNA[:10]
            if pRNA[-1] != '*':
                print 'Non-canonical End AA:', pRNA[-5:], mRNA[-10:]

            #compare the codons.
            mCodonList = cgSeqMod.getCodonListFromRNA(mRNA)
            emCodonList = cgSeqMod.getCodonListFromRNA(emRNA)
            compareList = zip(mCodonList, emCodonList)
            codonNumber = ePositionInMRNA // 3
            codonPair = compareList[codonNumber]
            bCodon = codonPair[0]
            aCodon = codonPair[1]
            baa = cgSeqMod.translateRNA(bCodon, map)
            aaa = cgSeqMod.translateRNA(aCodon, map)
            synFlag = 'SYN'
            if baa != aaa:
                synFlag = 'NON'
                bCodonList = list(bCodon)
                aCodonList = list(aCodon)
                matchedLetters = zip(bCodonList, aCodonList)
                for pair in matchedLetters:
                    if pair[0] != 'A':
                        if pair[1] == 'G' and pair[0] != 'G':
                            print 'messed up codon switch', bCodonList, aCodonList
                            print t.parent, '%s:%s' % (
                                eSite.chromosome, eSite.coordinate
                            ), eSite.strand, bCodon, aCodon, baa, aaa

            outF.write('\t'.join([
                str(eSite.ID), transcript.parent, transcript.id, synFlag,
                bCodon, aCodon, baa, aaa
            ]) + '\n')
Пример #19
0
def doit(fN):
        """Dump every transcript with id 'NM_031422' from the gene set in fN.

        fN is handed to cgGenes3.createGeneSetEditing; each matching transcript
        is passed to dumpObj.dumpObj.
        """
        wantedID = 'NM_031422'
        loadedSet = cgGenes3.createGeneSetEditing(fN)

        for candidate in loadedSet.transcripts:
                if candidate.id != wantedID:
                        continue
                dumpObj.dumpObj(candidate)
Пример #20
0
def makeTargetExpressionHistogram(eFN, targetFN, contextFN, geneFN, eChangeFN):
       

        print 'loading expression ratios'
        gName_eChange = getERatioDict(eChangeFN)

        print 'loading eSites and Transcripts'
        eSites = cgEdit.loadEditingSites(eFN)
        geneSet = cgGenes3.createGeneSetEditing(geneFN)
        
        print 'making joint dicts and loading extra data'
        #joint
        eID_eSite = {}
        for eSite in eSites:
                eID_eSite[eSite.ID] = eSite

        
        #joint
        tID_gName = {}
        for transcript in geneSet.transcripts:
                tID_gName[transcript.id] = transcript.parent


        #load context data
        f = open(contextFN, 'r')
        eID_tID = {} # eID: tID
        tID_tType = {}
        for line in f:
                ls = line.strip().split('\t')
                eID = int(ls[0])
                tID = ls[1]
                tType = ls[2]
                tID_tType[tID] = tType
                if eID in eID_tID:
                        eID_tID[eID].append(tID)
                else:
                        eID_tID[eID] = [tID]
        f.close()
        

        print 'analyzing'
        #Get created or destroyed
        f = open(targetFN, 'r')
        altered = []
        for line in f:
                ls = line.strip().split('\t')
                #created/destroyed
                if (ls[3] != 'None') and (ls[4] == 'None'):
                        altered.append(int(ls[0]))
        f.close()
        
                                       
        print 'number of created/destroyed sites:', len(altered)

        alteredSites = []
        for id in altered:
                alteredSites.append(eID_eSite[id])


        eChanges = []
        gDone = []
        #Get gene names for each eSite
        for eSite in alteredSites:
                genes = []
                for tID in eID_tID[eSite.ID]:
                        if tID == 'NONE':
                                continue

                        gName = tID_gName[tID]
                        if tID_tType[tID] != '3UTR':
                                print 'Not 3UTR', tID
                                continue
                        if gName not in genes:
                                genes.append(gName)

                
                if len(genes) > 1:
                        print 'more than one gene for eSite...', genes
                        continue
                
                if gName in gDone:
                        continue
                else:
                        gDone.append(gName)


                #Now add expression to HistoGram List...
                if gName in gName_eChange:
                        eChange = gName_eChange[gName]
                else:
                        print 'gene not in expression list', gName
                        continue
                eChange = gName_eChange[gName]
                eChange = math.log(eChange, 2)
                eChanges.append(eChange)

     
        #Now plot the histogram
        plt.hist(eChanges, 40)
        plt.xlabel('log2(RPKM KD/ RPKM CONTROL)')
        plt.ylabel('# Genes')
        plt.show()