Пример #1
0
 def loadGFF_UTR(self,fields,line,transcriptBeginEnd,GFF,
                        transcripts,readOrder,genes):
     """Parse one UTR record and attach it to its transcript and gene.

     Parameters:
         fields: the record split into columns. Columns 0-7 are read as
             substrate, source, feature type, begin, end, score, strand
             and frame; columns 8 and beyond are kept as extra text.
         line: the raw record text, searched for transcript and gene IDs.
         transcriptBeginEnd: mapping of transcript ID to a (begin,end)
             pair; used to set the extent of a newly created transcript.
         GFF: not used by this method.
         transcripts: mapping of transcript ID to Transcript; a new entry
             is added when the ID has not been seen before.
         readOrder: stored on a newly created transcript.
         genes: mapping of gene ID to Gene; a new entry is added when the
             ID has not been seen before.

     Raises:
         Exception: if neither a transcript ID nor a gene ID is found.
     """
     # Column 3 is shifted down by one while column 4 is kept as is
     # (presumably 1-based inclusive -> 0-based half-open; confirm).
     exonBegin=int(fields[3])-1
     exonEnd=int(fields[4])
     exonScore=fields[5]
     strand=fields[6]
     frame=fields[7]
     rex=Rex()

     # Transcript ID: first matching attribute style wins.  The patterns
     # are raw strings so the regex escapes are not treated as (invalid)
     # Python string escapes.
     transcriptId=None
     if(rex.find(r'transgrp[:=]\s*(\S+)',line)): transcriptId=rex[1]
     elif(rex.find(r'transcript_id[:=]?\s*"?([^\s";]+)"?',line)):
         transcriptId=rex[1]
     elif(rex.find(r'Parent=([^;,\s]+)',line)): transcriptId=rex[1]

     geneId=None
     if(rex.find(r'genegrp=(\S+)',line)): geneId=rex[1]
     elif(rex.find(r'gene_id[:=]?\s*"?([^\s\;"]+)"?',line)): geneId=rex[1]

     # Each ID falls back to the other when only one of them is present
     if(transcriptId is None): transcriptId=geneId
     if(geneId is None): geneId=transcriptId
     if(transcriptId is None):
         raise Exception(line+" : no transcript ID found")

     # Drop a trailing semicolon captured along with the ID
     if(rex.find(r"(\S+);$",transcriptId)): transcriptId=rex[1]
     if(rex.find(r"(\S+);$",geneId)): geneId=rex[1]

     # Every extra column is followed by a single space
     extra="".join([field+" " for field in fields[8:]])
     if(exonBegin>exonEnd): (exonBegin,exonEnd)=(exonEnd,exonBegin)

     transcript=transcripts.get(transcriptId,None)
     # Explicit None test (as in loadGFF_CDS): an existing transcript must
     # never be replaced just because it happens to evaluate as false.
     if(transcript is None):
         transcript=Transcript(transcriptId,strand)
         transcripts[transcriptId]=transcript
         transcript.setStopCodons(self.stopCodons)
         transcript.readOrder=readOrder
         # NOTE(review): this only changes the local name; if readOrder is
         # a plain int the caller never sees the increment -- confirm how
         # the caller advances its counter.
         readOrder+=1
         transcript.substrate=fields[0]
         transcript.source=fields[1]
         if(transcriptBeginEnd.get(transcriptId,None) is not None):
             (begin,end)=transcriptBeginEnd[transcriptId]
             transcript.setBegin(begin)
             transcript.setEnd(end)
         else:
             # No recorded extent: start from this feature's coordinates
             transcript.setBegin(exonBegin)
             transcript.setEnd(exonEnd)
     transcript.geneId=geneId

     gene=genes.get(geneId,None)
     if(gene is None):
         genes[geneId]=gene=Gene(); gene.setId(geneId)
     transcript.setGene(gene)

     exon=Exon(exonBegin,exonEnd,transcript)
     exon.extraFields=extra
     # When the transcript keeps a rawExons list the feature always goes
     # there; otherwise it is added to UTR unless exonOverlapsExon()
     # reports an overlap, in which case it is dropped.
     if(transcript.rawExons is not None):
         destination=transcript.rawExons
     elif(not transcript.exonOverlapsExon(exon)):
         destination=transcript.UTR
     else:
         destination=None
     if(destination is not None):
         exon.frame=frame
         exon.score=exonScore
         exon.type=fields[2]
         destination.append(exon) # OK -- we sort later
     gene.addTranscript(transcript)
Пример #2
0
    def loadGFF_CDS(self,fields,line,transcriptBeginEnd,GFF,
                    transcripts,readOrder,genes):
        """Parse one CDS record and attach it to its transcript and gene.

        Parameters:
            fields: the record split into columns. Columns 0-7 are read as
                substrate, source, feature type, begin, end, score, strand
                and frame; columns 8 and beyond are kept as extra text.
            line: the raw record text, searched for transcript and gene
                IDs.
            transcriptBeginEnd: mapping of transcript ID to a (begin,end)
                pair; when it has an entry for a newly created transcript
                that extent is applied, otherwise none is set here.
            GFF: not used by this method.
            transcripts: mapping of transcript ID to Transcript; a new
                entry is added when the ID has not been seen before.
            readOrder: stored on a newly created transcript.
            genes: mapping of gene ID to Gene; a new entry is added when
                the ID has not been seen before.

        Raises:
            Exception: if neither a transcript ID nor a gene ID is found.
        """
        # Column 3 is shifted down by one while column 4 is kept as is
        # (presumably 1-based inclusive -> 0-based half-open; confirm).
        exonBegin=int(fields[3])-1
        exonEnd=int(fields[4])
        exonScore=fields[5]
        strand=fields[6]
        frame=fields[7]
        rex=Rex()

        # Transcript ID: first matching attribute style wins.  The
        # patterns are raw strings so the regex escapes are not treated
        # as (invalid) Python string escapes.
        transcriptId=None
        if(rex.find(r'transgrp[:=]\s*(\S+)',line)): transcriptId=rex[1]
        elif(rex.find(r'transcript_id[:=]?\s*"?([^\s";]+)"?',line)):
            transcriptId=rex[1]
        elif(rex.find(r'Parent=([^;,\s]+)',line)): transcriptId=rex[1]

        geneId=None
        if(rex.find(r'genegrp=(\S+)',line)): geneId=rex[1]
        elif(rex.find(r'gene_id[:=]?\s*"?([^\s\;"]+)"?',line)):
            geneId=rex[1]

        # Each ID falls back to the other when only one is present
        if(transcriptId is None): transcriptId=geneId
        if(geneId is None): geneId=transcriptId
        if(transcriptId is None):
            raise Exception(line+" : no transcript ID found")

        # Drop a trailing semicolon captured along with the ID
        if(rex.find(r'(\S+);$',transcriptId)): transcriptId=rex[1]
        if(rex.find(r'(\S+);$',geneId)): geneId=rex[1]

        # Every extra column is followed by a single space
        extra="".join([field+" " for field in fields[8:]])
        if(exonBegin>exonEnd): (exonBegin,exonEnd)=(exonEnd,exonBegin)

        transcript=transcripts.get(transcriptId,None)
        if(transcript is None):
            transcript=Transcript(transcriptId,strand)
            transcripts[transcriptId]=transcript
            transcript.setStopCodons(self.stopCodons)
            transcript.readOrder=readOrder
            # NOTE(review): this only changes the local name; if readOrder
            # is a plain int the caller never sees the increment --
            # confirm how the caller advances its counter.
            readOrder+=1
            transcript.substrate=fields[0]
            transcript.source=fields[1]
            if(transcriptBeginEnd.get(transcriptId,None) is not None):
                (begin,end)=transcriptBeginEnd[transcriptId]
                transcript.setBegin(begin)
                transcript.setEnd(end)
        transcript.geneId=geneId

        gene=genes.get(geneId,None)
        if(gene is None):
            genes[geneId]=gene=Gene(); gene.setId(geneId)
        transcript.setGene(gene)

        exon=Exon(exonBegin,exonEnd,transcript)
        exon.extraFields=extra
        # The exon is dropped when exonOverlapsExon() reports an overlap
        if(not transcript.exonOverlapsExon(exon)):
            exon.frame=frame
            exon.score=exonScore
            exon.type=fields[2]
            transcript.exons.append(exon) # OK -- we sort later
        gene.addTranscript(transcript)
Пример #3
0
 def __init__(self,id,strand=None):
     """Build a transcript from an ID and strand, or from an EssexNode.

     Parameters:
         id: either the transcript identifier, or an EssexNode (or
             subclass instance) from which the transcript is populated.
         strand: strand to record; only used when id is not an EssexNode.

     When built from an EssexNode, the attributes "ID", "strand",
     "source", "begin", "end", "gene" and "substrate" are copied from the
     node, and each element of its "exons" and "UTR" children (if
     present) becomes an Exon in self.exons / self.UTR respectively.
     When built from a plain ID, only transcriptId and strand are set in
     addition to the common defaults.
     """
     # State shared by both construction paths
     self.exons=[]
     self.UTR=[]
     self.rawExons=None
     self.startCodon=None
     self.extraFields=None
     self.stopCodons={"TAG":1,"TGA":1,"TAA":1}

     # isinstance (rather than an exact type comparison) so that
     # subclasses of EssexNode are also recognized as nodes.
     if(not isinstance(id,EssexNode)):
         self.transcriptId=id
         self.strand=strand
         return

     essex=id
     self.transcriptId=essex.getAttribute("ID")
     self.strand=essex.getAttribute("strand")
     self.source=essex.getAttribute("source")
     # NOTE(review): begin/end are stored exactly as getAttribute returns
     # them, with no int() conversion -- confirm what callers expect.
     self.begin=essex.getAttribute("begin")
     self.end=essex.getAttribute("end")
     self.geneId=essex.getAttribute("gene")
     self.substrate=essex.getAttribute("substrate")

     # Each child element supplies its coordinates as sub-elements 0 and 1
     for (tag,destination) in (("exons",self.exons),("UTR",self.UTR)):
         container=essex.findChild(tag)
         if(not container): continue
         for i in range(container.numElements()):
             elem=container.getIthElem(i)
             begin=int(elem.getIthElem(0))
             end=int(elem.getIthElem(1))
             destination.append(Exon(begin,end,self))
Пример #4
0
 def makeExon(self, root):
     """Build an Exon from a keyed record.

     Reads root["begin"] and root["end"] (converted to int) for the
     coordinates, copies "strand", "frame", "type", "score" and
     "substrate" onto the exon unchanged, and flattens root["extra"] into
     a "key=value;" string stored as extraFields.  The exon is created
     without a parent transcript.
     """
     exon = Exon(int(root["begin"]), int(root["end"]), None)

     # Attributes copied verbatim from the record, in this order
     for attribute in ("strand", "frame", "type", "score", "substrate"):
         setattr(exon, attribute, root[attribute])

     extra = root["extra"]
     exon.extraFields = "".join(
         [key + "=" + extra[key] + ";" for key in extra])
     return exon
Пример #5
0
 def makeExon(self,root):
     """Convert a keyed record into an Exon with no parent transcript.

     The record must provide "begin", "end", "strand", "frame", "type",
     "score", "substrate" and "extra".  Coordinates are converted to int;
     the entries of "extra" are serialized as consecutive "key=value;"
     pairs into extraFields.
     """
     start=int(root["begin"])
     stop=int(root["end"])
     result=Exon(start,stop,None)
     result.strand=root["strand"]
     result.frame=root["frame"]
     result.type=root["type"]
     result.score=root["score"]
     result.substrate=root["substrate"]

     attributes=root["extra"]
     pieces=[]
     for name in attributes:
         pieces.append(name+"="+attributes[name]+";")
     result.extraFields="".join(pieces)
     return result
Пример #6
0
    def organize_features( self, isoform, bool_exon = True ):
        """
        Args:
            isoform = cruzdb object that contains information for a specific isoform.
                Reads isoform.exons & isoform.exonFrames (comma-separated string) when
                bool_exon is True, isoform.introns otherwise, & isoform.cds when bool_exon is True.
            bool_exon = if True, organize exons (with reading frames & CDS); if False, organize introns
        Function: this will organize the feats, cds, & reading frame for a specific gene
        Returns:
            dict where key = string that is feat range (built by Isoform.make_key_range
            from self.chrom & the feat's start/end), value = Exon object
        """
        hash_feats = {}       #key = string that is feat range (chrom:start-end), value = feat object

        #get the features &, for exons, their reading frames
        if bool_exon:
            list_features = isoform.exons
            #build a real list (not a lazy map object) because the frames are indexed by feature position below
            feat_frames = [ int( x ) for x in isoform.exonFrames.split(',') if x ]
        else:
            list_features = isoform.introns
            feat_frames = None      #introns carry no reading frame

        for i, feat in enumerate( list_features ):      #i = feat number
            key_range = Isoform.make_key_range( self.chrom, feat[0], feat[1] )

            #calculate the feat number: counted from the first feature when self.strand == 1, from the last otherwise.
            #The reverse count mirrors the forward one ( position-from-end + exon_base ), so it gives
            #len - i - 1 for 0-based numbering & len - i for 1-based numbering.
            #NOTE(review): exon_base is not defined in this method -- presumably a module-level 0/1 setting; confirm.
            if self.strand == 1:
                feat_num = i + exon_base
            else:
                feat_num = ( len( list_features ) - 1 - i ) + exon_base

            #NOTE(review): the original comment says UCSC coordinates are 0-based & that a '+1' is added for that;
            #no '+1' is applied here, so presumably the True argument asks get_feat_info to do it -- confirm.
            if bool_exon:
                feat_info = self.get_feat_info( feat[0], feat[1], feat_frames[i], feat_num, True, 'exon' )
            else:
                feat_info = self.get_feat_info( feat[0], feat[1], None, feat_num, True, 'intron' )
            hash_feats[ key_range ] = Exon( feat_info )

        #go through all cds, assign CDS to each feat
        if bool_exon:
            for each_cds in isoform.cds:     #each_cds = tuple where [0] = start position & [1] = end position
                hash_feats = self.organize_exons_cds( each_cds[0], each_cds[1], hash_feats )

        return hash_feats