def LoadExonBoundaryCoordinatesFromFile(self, path): #{
    """Record the internal exon boundaries of the transcripts in a file.

    Every transcript read from the annotations file at "path" is walked
    exon by exon.  For each exon side that is kept, the transcript id is
    registered in self.exon_bound_coords through AddToMultiDict() under
    the key path [chrom, side, coord, other_coord].  The left side of a
    transcript's first exon and the right side of its last exon are not
    registered.

    Transcripts whose normalized chromosome name is rejected by
    NonStandardChr() are skipped, and the skipped chromosome names are
    reported once at the end.

    Raises ExonBoundCounterError when the exons returned by
    transcript.SortedExons() are not ordered by their "min" coordinate.
    """
    DebugMsg(self, "Gene annotation file: %s" % path)
    # open the annotations file
    annotations_file = GeneAnnotationParserCls(path, log_info=self.log_info)
    skipped_chroms = set()
    # the parser is closed in the "finally" clause so that an error raised
    # while reading (e.g. out-of-order exons) does not leak the open file
    try: #{
        # get the coordinates from the file
        for transcript in annotations_file: #{
            # fix chromosome names, if needed
            chrom = NormalizeChrID(transcript.chrom)
            if NonStandardChr(chrom): #{
                ExtremeDebugMsg(self,
                    "Skipping transcript in strange chromosome: "
                    "%s (%s)" % (chrom, transcript.chrom))
                skipped_chroms.add(chrom)
                continue
            #} end if
            prev_exon = None
            for (index, exon) in enumerate(transcript.SortedExons()): #{
                (exon.left, exon.right) = (exon.min, exon.max)
                # assume that exon list is sorted by left coordinate
                if (prev_exon is not None and prev_exon.min > exon.min): #{
                    raise ExonBoundCounterError("Transcript %s exons are not in "
                        "order: %s, %s" % (transcript.transcript_id,
                        prev_exon.ToString(), exon.ToString()))
                #} end if
                prev_exon = exon
                # do not include the left side of the first exon
                if (0 == index): #{
                    exon.left = None
                #} end if
                # do not include the right side of the last exon
                if (len(transcript.exons) == (index + 1)): #{
                    exon.right = None
                #} end if
                # exon_bound_coords[chrom][prime_side][coord1][coord2] = gene_list
                # NOTE(review): SIDES is presumably ("left", "right"), matching
                # the attributes set above -- confirm against its definition
                for side in SIDES: #{
                    coord = getattr(exon, side)
                    if (coord is None): #{
                        # this side was blanked out above (transcript end)
                        continue
                    #} end if
                    keys = [chrom, side, coord, getattr(exon, OtherSide(side))]
                    AddToMultiDict(self.exon_bound_coords, keys,
                        transcript.transcript_id)
                #} end for
            #} end for
        #} end for
        if skipped_chroms: #{
            DebugMsg(self, "Skipped transcripts in chromosomes: %s" %
                ", ".join(sorted(skipped_chroms)))
        #} end if
    finally:
        # close the file
        annotations_file.close()
    #} end try
#} end def
def __init__(self, path, log_info=None): #{
    """Store the logging settings and open a parser on the annotations file.

    path:     location of the gene annotations file, handed unchanged to
              GeneAnnotationParserCls
    log_info: logging settings, kept on the instance and also forwarded
              to the parser (default: None)
    """
    self.log_info = log_info
    # the parser that reads the annotations file
    annotation_parser = GeneAnnotationParserCls(path, log_info=log_info)
    self.parser = annotation_parser
#} end def
class AnnotationsFileCls: #{
    """Iterator over the usable transcripts of a gene annotations file.

    Wraps a GeneAnnotationParserCls.  Each transcript handed out by next()
    has been passed through FixAnnotation(), filtered (see next()), had
    its exons split into UTR and coding segments (see SeparateUTRs()) and,
    for negative-strand transcripts, had its exon lists reversed.
    """

    def __init__(self, path, log_info=None): #{
        """Store the logging settings and open the annotations file.

        path:     location of the gene annotations file
        log_info: logging settings, also forwarded to the parser
        """
        self.log_info = log_info
        # open the annotations file
        self.parser = GeneAnnotationParserCls(path, log_info=log_info)
    #} end def

    def __iter__(self): #{
        """Return the object itself: it is its own iterator."""
        return self
    #} end def

    def next(self): #{
        """Return the next transcript that passes the filters.

        A transcript is skipped when its chromosome is rejected by
        NonStandardChr(), when its chromosome is "M", or when its gene
        name (compared in lower case) starts with "trna_" or ends with
        "_rrna".

        On the returned transcript:
          * isoform is set to 1;
          * non_coding is set to True when cdsStart >= cdsEnd;
          * num_coding_exons, utr_flags and split_exons are filled in by
            SeparateUTRs();
          * exons, split_exons and utr_flags are reversed when the strand
            is "-".

        Whatever self.parser.next() raises when the file is exhausted
        propagates to the caller unchanged.
        """
        transcript = self._NextFixedTranscript()
        # ensure that the transcript is from a "normal" chromosome,
        # not including mitochondrial DNA, and is not a tRNA or rRNA
        while self._SkipTranscript(transcript): #{
            ExtremeDebugMsg(self, " Skipping...")
            transcript = self._NextFixedTranscript()
        #} end while
        transcript.isoform = 1
        # check whether the transcript is coding or non-coding
        # NOTE(review): non_coding is only ever set to True here, so it
        # presumably defaults to a false value on parsed transcripts --
        # confirm against the parser
        if (transcript.cdsStart >= transcript.cdsEnd): #{
            transcript.non_coding = True
        #} end if
        # separate the exons into UTRs and coding exons
        self.SeparateUTRs(transcript)
        # reverse the order of the exons if
        # the transcript is on the negative strand
        if ("-" == transcript.strand): #{
            transcript.exons.reverse()
            transcript.split_exons.reverse()
            transcript.utr_flags.reverse()
        #} end if
        return transcript
    #} end def

    # Python 3 looks up __next__ rather than next when iterating; the alias
    # lets "for transcript in annotations_file" work on either version
    __next__ = next

    def _NextFixedTranscript(self): #{
        """Read one transcript from the parser, fix it up and log it."""
        # replace spaces in the alias and
        # get rid of any "chr" in the chromosome name
        transcript = FixAnnotation(self.parser.next(), use_chr=False)
        ExtremeDebugMsg(self, "T: %s" % transcript)
        return transcript
    #} end def

    def _SkipTranscript(self, transcript): #{
        """Return True when next() should not hand out the transcript."""
        # the chromosome tests come first so that the gene name is only
        # examined for transcripts on accepted chromosomes
        if (NonStandardChr(transcript.chrom) or "M" == transcript.chrom): #{
            return True
        #} end if
        gene_name = transcript.gene_name.lower()
        return (gene_name.startswith("trna_") or gene_name.endswith("_rrna"))
    #} end def

    def _AppendSegment(self, transcript, start, end, is_utr): #{
        """Append one [start, end] segment together with its UTR flag."""
        transcript.utr_flags.append(is_utr)
        transcript.split_exons.append([start, end])
    #} end def

    def SeparateUTRs(self, transcript): #{
        """Split the exons of a transcript into UTR and coding segments.

        Sets on the transcript:
          num_coding_exons: number of segments flagged as coding
          split_exons:      list of [start, end] segments
          utr_flags:        parallel list, True where the segment is UTR

        transcript.exons is reversed in place first if its first exon
        starts after its last one.  For a transcript whose non_coding
        attribute is true, the three attributes are reset (0 and empty
        lists) and nothing else is done.

        Raises ExonCoordsError when an exon overlapping the CDS cannot be
        classified, and ChimeraSimulatorError when split_exons and
        utr_flags end up with different lengths.
        """
        # ensure that exons are ordered by start coordinate
        if (transcript.exons[0][0] > transcript.exons[-1][0]): #{
            transcript.exons.reverse()
        #} end if
        transcript.num_coding_exons = 0
        transcript.utr_flags = list()
        transcript.split_exons = list()
        if (transcript.non_coding): #{
            ExtremeDebugMsg(self, "Not separating UTRs for non-coding gene")
            return
        #} end if
        cds_start = transcript.cdsStart
        cds_end = transcript.cdsEnd
        ExtremeDebugMsg(self, "Separating UTRs from coding exons...\n"
            "cdsStart: %i, cdsEnd: %i" % (cds_start, cds_end))
        for (e_start, e_end) in transcript.exons: #{
            ExtremeDebugMsg(self, "Exon start: %i, end: %i" % (e_start, e_end))
            # if the exon ends before the CDS start or
            # the exon starts after the CDS end,
            # the full exon is a UTR
            if (e_end < cds_start or cds_end < e_start): #{
                self._AppendSegment(transcript, e_start, e_end, True)
                ExtremeDebugMsg(self, " full UTR")
                continue
            #} end if
            # if the exon starts before the CDS start and
            # ends after the CDS start,
            # the first part of the exon is a UTR
            if (e_start < cds_start): #{
                self._AppendSegment(transcript, e_start, cds_start - 1, True)
                utr_start = e_start
                # the rest of the exon begins at the CDS start
                e_start = cds_start
                ExtremeDebugMsg(self, " UTR start: %i-%i\n New start: %i" %
                    (utr_start, cds_start - 1, e_start))
            #} end if
            if (cds_end < e_end): #{
                # if the exon starts before the CDS end and
                # ends after the CDS end,
                # the second part of the exon is a UTR
                transcript.num_coding_exons += 1
                self._AppendSegment(transcript, e_start, cds_end, False)
                self._AppendSegment(transcript, cds_end + 1, e_end, True)
                ExtremeDebugMsg(self, " exon start: %i-%i\n UTR end: %i-%i" %
                    (e_start, cds_end, cds_end + 1, e_end))
            elif (e_start <= e_end):
                # if the exon starts after the CDS start and
                # ends before the CDS end,
                # the full exon is really an exon
                transcript.num_coding_exons += 1
                self._AppendSegment(transcript, e_start, e_end, False)
                ExtremeDebugMsg(self, " full exon: %i-%i" % (e_start, e_end))
            else:
                raise ExonCoordsError("cannot determine exon type: "
                    "%s: CDS:%i-%i, Exon:%i-%i" % (transcript.alias,
                    cds_start, cds_end, e_start, e_end))
            #} end if
        #} end for
        if (len(transcript.split_exons) != len(transcript.utr_flags)): #{
            # report the count that was actually compared (split_exons),
            # not the number of unsplit exons
            raise ChimeraSimulatorError("error loading transcript: # exons (%i)" %
                len(transcript.split_exons) + " not equal to # UTR flags (%i)" %
                len(transcript.utr_flags))
        #} end if
    #} end def
#} end class