def shift_sites(self, sites):
    """
    Translate splice sites from the original string to the mutated string.

    @arg sites: List of splice sites on the original string.
    @type sites: list(int)

    @return: List of splice sites on the mutated string.
    @rtype: list(int)

    Example 1 (DNA): NG_012772.1(BRCA2_v001)

              ...---------[=========]----------...
                          ^         ^
                        18964     19013

      Variant           Expected new location for splice site 18964
      g.18963del        18963
      g.18964del        18964
      g.18963_18964ins  18964
      g.18964_18965ins  18964

      Variant           Expected new location for splice site 19013
      g.19013del        19012
      g.19014del        19013
      g.19013_19014ins  19014

    Example 2 (RNA): NM_000088.3

                  ...============][==============...
                                 ^^
                              229  230

      Variant           Expected new location for splice sites 229,230
      n.228del          228,229
      n.229del          228,229
      n.230del          229,230
      n.231del          229,230
      n.228_229ins      230,231
      n.229_230ins      229,230 or 230,231
      n.230_231ins      229,230
    """
    # Rather than shifting position i directly, most sites are shifted as
    # shift(i + 1) - 1 (donors) or shift(i - 1) + 1 (acceptors). This
    # keeps an insertion that sits directly before or after an exon
    # inside that exon:
    #
    #   -----SPLICE[======]SPLICE----------SPLICE[=======]SPLICE-----
    #                      ^                     ^
    #                      ins                   ins
    #
    # Both insertions above do not touch the (biological) splice sites,
    # so they should end up inside their exons.
    #
    # The same trick makes a deletion of the last exon base really shrink
    # the exon: only positions after the deletion are shifted, while the
    # splice site is stored as the position of the last exon base, so
    # without the detour the site would not be decremented.

    # Sites registered as removed take no part in the mapping.
    kept = [site for site in sites if site not in self._removed_sites]

    shifted = []
    last_donor = None
    for acceptor, donor in util.grouper(kept):
        # The -1+1 detour is skipped for an acceptor when
        #   1) it is the first site in the list (keeps insertions directly
        #      in front of CDS start out of the CDS; this also affects
        #      translation start, which should be no problem), or
        #   2) the previous exon ends directly before this one (so an
        #      insertion between two adjacent exons counts as an insertion
        #      in the first exon), or
        #   3) there is a deletion directly before the exon.
        use_detour = (
            last_donor
            and last_donor != acceptor - 1
            and not self._shift_minus_at(acceptor)
        )
        if use_detour:
            shifted.append(self.shift(acceptor - 1) + 1)
        else:
            shifted.append(self.shift(acceptor))

        # Should never happen since splice sites come in pairs.
        if not donor:
            continue

        # The +1-1 detour is skipped for the last site in the list, which
        # keeps insertions directly at CDS end out of the CDS (this also
        # affects translation end, which should be no problem).
        # NOTE(review): the comparison is against the last entry of the
        # unfiltered `sites`, not of the filtered list -- confirm intent.
        if donor != sites[-1]:
            shifted.append(self.shift(donor + 1) - 1)
        else:
            shifted.append(self.shift(donor))

        last_donor = donor

    return shifted
def checkRecord(self):
    """
    Check if the record in self.record is compatible with mutalyzer.

    For every transcript of every gene in self.record.geneList:

      - If the transcript has no mRNA, one is chosen: the exon list when
        it is present and usable, otherwise the CDS, otherwise a new PList
        carrying the gene location. Warnings about these fallbacks are
        reported through the output object.
      - An empty mRNA position list is filled from the mRNA location.
      - When both an mRNA position list and a CDS with a position list
        (possibly empty) are available, an empty CDS position list is
        constructed and a coding Crossmap is attached. Otherwise the
        transcript is marked as non-coding ('n') and gets a Crossmap
        without CDS, or the description '?' when there are no mRNA
        positions at all.

    Nothing is returned; the transcripts are updated in place.

    @todo: This function should really check the record for minimal
           requirements.
    """
    # TODO: This function should really check
    #       the record for minimal requirements.
    for gene in self.record.geneList:
        for transcript in gene.transcriptList:
            if not transcript.mRNA:
                usable_exons = self.__checkExonList(transcript.exon,
                                                    transcript.CDS)

                if self.record.molType == "n" and transcript.exon:
                    # Skipping the first and last position and pairing
                    # the rest; presumably each pair is (end of exon,
                    # start of next exon) -- confirm against util.grouper.
                    inner = transcript.exon.positionList[1:-1]
                    if not all(p1 + 1 == p2
                               for p1, p2 in util.grouper(inner)):
                        code = ("WEXON_ANNOTATION" if transcript.current
                                else "WEXON_ANNOTATION_OTHER")
                        self.__output.addMessage(
                            __file__, 2, code,
                            "Exons for gene %s, transcript variant %s were "
                            "found not to be adjacent. This signifies a "
                            "possible problem in the annotation of the "
                            "reference sequence."
                            % (gene.name, transcript.name))

                if not transcript.exon or not usable_exons:
                    if self.record.molType == "g":
                        code = ("WNOMRNA" if transcript.current
                                else "WNOMRNA_OTHER")
                        self.__output.addMessage(
                            __file__, 2, code,
                            "No mRNA field found for gene %s, transcript "
                            "variant %s in record, constructing "
                            "it from CDS. Please note that descriptions "
                            "exceeding CDS boundaries are invalid."
                            % (gene.name, transcript.name))

                    if (transcript.exon and transcript.exon.positionList
                            and not usable_exons):
                        code = ("WNOMRNA" if transcript.current
                                else "WNOMRNA_OTHER")
                        self.__output.addMessage(
                            __file__, 2, code,
                            "Exons were found for gene %s, transcript "
                            "variant %s but were not usable. "
                            "Please note that descriptions "
                            "exceeding CDS boundaries are invalid."
                            % (gene.name, transcript.name))

                    if transcript.CDS:
                        # The mRNA becomes an alias of the CDS object, so
                        # filling its position list also fills the CDS one.
                        transcript.mRNA = transcript.CDS
                        if not transcript.CDS.positionList:
                            transcript.mRNA.positionList = \
                                transcript.CDS.location
                        transcript.linkMethod = "construction"
                        transcript.transcribe = True
                        transcript.translate = True
                    else:
                        self.__output.addMessage(
                            __file__, 2, "WNOCDS",
                            "No CDS found for gene %s, transcript "
                            "variant %s in record, "
                            "constructing it from gene location."
                            % (gene.name, transcript.name))
                        transcript.CDS = None
                        transcript.mRNA = PList()
                        transcript.mRNA.location = gene.location
                        transcript.molType = "n"
                else:
                    transcript.mRNA = transcript.exon

            if not transcript.mRNA.positionList:
                transcript.mRNA.positionList = transcript.mRNA.location

            # `is not None` on purpose: an empty CDS position list still
            # counts as "CDS present" and is constructed below.
            if (transcript.mRNA.positionList and transcript.CDS
                    and transcript.CDS.positionList is not None):
                if not transcript.CDS.positionList:
                    # The mRNA position list is known to be non-empty
                    # here (checked by the enclosing condition).
                    transcript.CDS.positionList = self.__constructCDS(
                        transcript.mRNA.positionList,
                        transcript.CDS.location)
                    transcript.transcribe = True
                    transcript.translate = True
                transcript.CM = Crossmap.Crossmap(
                    transcript.mRNA.positionList,
                    transcript.CDS.location,
                    gene.orientation)
            else:
                transcript.molType = "n"
                if transcript.mRNA.positionList:
                    transcript.CM = Crossmap.Crossmap(
                        transcript.mRNA.positionList, [], gene.orientation)
                    transcript.transcribe = True
                else:
                    transcript.description = "?"
def checkRecord(self):
    """
    Check if the record in self.record is compatible with mutalyzer.

    For every transcript of every gene in self.record.geneList:

      - If the transcript has no mRNA, one is chosen: the exon list when
        it is present and usable, otherwise the CDS, otherwise a new PList
        carrying the gene location. Warnings about these fallbacks are
        reported through the output object.
      - An empty mRNA position list is filled from the mRNA location.
      - When both an mRNA position list and a CDS with a position list
        (possibly empty) are available, an empty CDS position list is
        constructed and a coding Crossmap is attached. Otherwise the
        transcript is marked as non-coding ('n') and gets a Crossmap
        without CDS, or the description '?' when there are no mRNA
        positions at all.

    Nothing is returned; the transcripts are updated in place.

    @todo: This function should really check the record for minimal
           requirements.
    """
    # TODO: This function should really check
    #       the record for minimal requirements.
    for gene in self.record.geneList:
        for transcript in gene.transcriptList:
            if not transcript.mRNA:
                usable_exons = self.__checkExonList(transcript.exon,
                                                    transcript.CDS)

                if self.record.molType == 'n' and transcript.exon:
                    # Skipping the first and last position and pairing
                    # the rest; presumably each pair is (end of exon,
                    # start of next exon) -- confirm against util.grouper.
                    inner = transcript.exon.positionList[1:-1]
                    if not all(p1 + 1 == p2
                               for p1, p2 in util.grouper(inner)):
                        code = ('WEXON_ANNOTATION' if transcript.current
                                else 'WEXON_ANNOTATION_OTHER')
                        self.__output.addMessage(
                            __file__, 2, code,
                            "Exons for gene %s, transcript variant %s were "
                            "found not to be adjacent. This signifies a "
                            "possible problem in the annotation of the "
                            "reference sequence."
                            % (gene.name, transcript.name))

                if not transcript.exon or not usable_exons:
                    if self.record.molType == 'g':
                        code = ('WNOMRNA' if transcript.current
                                else 'WNOMRNA_OTHER')
                        self.__output.addMessage(
                            __file__, 2, code,
                            "No mRNA field found for gene %s, transcript "
                            "variant %s in record, constructing "
                            "it from CDS. Please note that descriptions "
                            "exceeding CDS boundaries are invalid."
                            % (gene.name, transcript.name))

                    if (transcript.exon and transcript.exon.positionList
                            and not usable_exons):
                        code = ('WNOMRNA' if transcript.current
                                else 'WNOMRNA_OTHER')
                        self.__output.addMessage(
                            __file__, 2, code,
                            "Exons were found for gene %s, transcript "
                            "variant %s but were not usable. "
                            "Please note that descriptions "
                            "exceeding CDS boundaries are invalid."
                            % (gene.name, transcript.name))

                    if transcript.CDS:
                        # The mRNA becomes an alias of the CDS object, so
                        # filling its position list also fills the CDS one.
                        transcript.mRNA = transcript.CDS
                        if not transcript.CDS.positionList:
                            transcript.mRNA.positionList = \
                                transcript.CDS.location
                        transcript.linkMethod = "construction"
                        transcript.transcribe = True
                        transcript.translate = True
                    else:
                        self.__output.addMessage(
                            __file__, 2, "WNOCDS",
                            "No CDS found for gene %s, transcript "
                            "variant %s in record, "
                            "constructing it from gene location."
                            % (gene.name, transcript.name))
                        transcript.CDS = None
                        transcript.mRNA = PList()
                        transcript.mRNA.location = gene.location
                        transcript.molType = 'n'
                else:
                    transcript.mRNA = transcript.exon

            if not transcript.mRNA.positionList:
                transcript.mRNA.positionList = transcript.mRNA.location

            # `is not None` on purpose: an empty CDS position list still
            # counts as "CDS present" and is constructed below.
            if (transcript.mRNA.positionList and transcript.CDS
                    and transcript.CDS.positionList is not None):
                if not transcript.CDS.positionList:
                    # The mRNA position list is known to be non-empty
                    # here (checked by the enclosing condition).
                    transcript.CDS.positionList = self.__constructCDS(
                        transcript.mRNA.positionList,
                        transcript.CDS.location)
                    transcript.transcribe = True
                    transcript.translate = True
                transcript.CM = Crossmap.Crossmap(
                    transcript.mRNA.positionList,
                    transcript.CDS.location,
                    gene.orientation)
            else:
                transcript.molType = 'n'
                if transcript.mRNA.positionList:
                    transcript.CM = Crossmap.Crossmap(
                        transcript.mRNA.positionList, [], gene.orientation)
                    transcript.transcribe = True
                else:
                    transcript.description = '?'