def __repr__(self):
    """
    Returns a constructor-style string describing this interval. The name is only included when one is set.
    """
    fields = [self.chromosome, self.start, self.stop, convert_strand(self.strand)]
    template = "ChromosomeInterval('{}', {}, {}, '{}')"
    if self.name is not None:
        # named intervals get a fifth, quoted, field
        template = "ChromosomeInterval('{}', {}, {}, '{}', '{}')"
        fields.append(self.name)
    return template.format(*fields)
def __init__(self, gene_pred_tokens):
    """
    Builds a transcript from the tokens of a single genePred record. The genePred exon start/end columns are
    re-expressed as BED-like block sizes and block starts (relative to the transcript start) so that the
    interval and exon builders can be handed BED-style tokens.
    """
    def _int_list(field):
        # comma separated integers; empty pieces are skipped
        return [int(piece) for piece in field.split(",") if piece != ""]

    chrom_token = gene_pred_tokens[1]
    strand_token = gene_pred_tokens[2]

    # Text genePred fields
    self.name = gene_pred_tokens[0]
    self.chromosome = chrom_token
    self.strand = convert_strand(strand_token)

    # Integer genePred fields
    self.start = int(gene_pred_tokens[3])
    self.stop = int(gene_pred_tokens[4])
    self.thick_start = int(gene_pred_tokens[5])
    self.thick_stop = int(gene_pred_tokens[6])

    # genePred files carry neither a score nor an RGB value, so fixed placeholders are used
    self.score = 0
    self.rgb = "128,0,0"

    # genePred specific fields
    self.id = gene_pred_tokens[10]
    self.name2 = gene_pred_tokens[11]
    self.cds_start_stat = gene_pred_tokens[12]
    self.cds_end_stat = gene_pred_tokens[13]
    self.exon_frames = _int_list(gene_pred_tokens[14])

    # convert genePred format coordinates to BED-like coordinates to make intervals
    self.block_count = gene_pred_tokens[7]
    exon_starts = _int_list(gene_pred_tokens[8])
    exon_ends = _int_list(gene_pred_tokens[9])
    self.block_sizes = ",".join(str(end - begin) for begin, end in izip(exon_starts, exon_ends))
    self.block_starts = ",".join(str(begin - self.start) for begin in exon_starts)
    bed_tokens = [
        chrom_token, self.start, self.stop, self.name, self.score, strand_token,
        self.thick_start, self.thick_stop, self.rgb,
        self.block_count, self.block_sizes, self.block_starts,
    ]

    # build chromosome intervals for exons and introns
    self.exon_intervals = self._get_exon_intervals(bed_tokens)
    self.intron_intervals = self._get_intron_intervals()

    # build Exons mapping transcript space coordinates to chromosome
    self.exons = self._get_exons(bed_tokens)

    # calculate sizes
    self._get_cds_size()
    self._get_size()
def __init__(self, chromosome, start, stop, strand, name=None):
    """
    Creates an interval on a chromosome.

    chromosome: chromosome name; stored as a string.
    start: 0 based start position; anything int() accepts (e.g. a BED token string).
    stop: exclusive stop position; anything int() accepts.
    strand: values equal to True, False or None are stored unchanged. Anything else is passed through
            convert_strand first.
    name: optional name for this interval.

    Raises AssertionError if start is greater than stop.
    """
    self.chromosome = str(chromosome)
    # Convert before comparing: string coordinates would otherwise be ordered lexicographically
    # ("9" > "10"), and mixed str/int comparison is an error on Python 3.
    start, stop = int(start), int(stop)
    assert start <= stop, (chromosome, start, stop)
    self.start = start  # 0 based
    self.stop = stop  # exclusive
    if strand not in [True, False, None]:
        strand = convert_strand(strand)
    self.strand = strand  # True or False
    self.name = name
def chromosome_region_to_bed(t, start, stop, rgb, name):
    """
    Produces a single-block BED12 record for an arbitrary chromosome region, labelled with the transcript t.
    Unlike chromosome_coordinate_to_bed, the transcript's own BED information is not resized; start and stop
    may be any coordinates on the chromosome. The record takes its chromosome and strand from t.
    """
    strand_symbol = convert_strand(t.strand)
    chromosome = t.chromosome
    assert start is not None and stop is not None, (t.name, start, stop, name)
    assert stop >= start, (t.name, start, stop, name)
    label = name + "/" + t.name
    width = stop - start
    # thick start/stop mirror the region itself; one block covering the whole region
    return [chromosome, start, stop, label, 0, strand_symbol, start, stop, rgb, 1, width, 0]
def splice_intron_interval_to_bed(t, intron_interval, rgb, name):
    """
    Turns an intron interval into a two-block BED12 record covering only its first two and last two bases
    (the splice sites). The intron must be longer than 4 bases.
    """
    left, right = intron_interval.start, intron_interval.stop
    assert right >= left, (t.name, t.chromosome)
    assert right - left - 2 > 2, (t.name, t.chromosome)
    # second block begins two bases before the end of the intron
    second_block_offset = right - left - 2
    label = "/".join([name, t.name])
    strand_symbol = convert_strand(intron_interval.strand)
    return [
        intron_interval.chromosome, left, right, label, 0, strand_symbol,
        left, right, rgb, 2, "2,2", "0,{}".format(second_block_offset),
    ]
def _get_exon_intervals(self, bed_tokens):
    """
    Builds the list of exon intervals in chromosome coordinate space from BED tokens.
    The exons are in (+) strand order regardless of the transcript strand, so (-) strand genes
    are represented backwards.
    """
    # the stop column is parsed like the start column even though only the start is needed below
    chrom_start, _chrom_stop = int(bed_tokens[1]), int(bed_tokens[2])
    chrom, strand = bed_tokens[0], convert_strand(bed_tokens[5])
    sizes = [int(piece) for piece in bed_tokens[10].split(",") if piece != ""]
    offsets = [int(piece) for piece in bed_tokens[11].split(",") if piece != ""]
    # block starts are relative to the record start, so shift them into chromosome coordinates
    return [
        ChromosomeInterval(chrom, chrom_start + offset, chrom_start + offset + size, strand)
        for size, offset in izip(sizes, offsets)
    ]
def __init__(self, bed_tokens):
    """
    Builds a transcript from the 12 tokens of a BED12 record, then derives its exon/intron intervals,
    its exons and its sizes.
    """
    # text columns are stored as given
    self.chromosome, self.name, self.rgb = bed_tokens[0], bed_tokens[3], bed_tokens[8]
    # coordinate and score columns are stored as integers
    self.start, self.stop = int(bed_tokens[1]), int(bed_tokens[2])
    self.score = int(bed_tokens[4])
    self.strand = convert_strand(bed_tokens[5])
    self.thick_start, self.thick_stop = int(bed_tokens[6]), int(bed_tokens[7])
    # block columns are kept in their raw token form
    self.block_count = bed_tokens[9]
    self.block_sizes = bed_tokens[10]
    self.block_starts = bed_tokens[11]

    # build chromosome intervals for exons and introns
    self.exon_intervals = self._get_exon_intervals(bed_tokens)
    self.intron_intervals = self._get_intron_intervals()

    # build Exons mapping transcript space coordinates to chromosome
    self.exons = self._get_exons(bed_tokens)

    # calculate sizes
    self._get_cds_size()
    self._get_size()
def get_bed(self, rgb, name):
    """
    Returns this interval as BED12 tokens: a single block spanning the whole interval, with a score of 0 and
    the thick region equal to the interval. Both a name and a rgb value must be supplied.
    """
    strand_symbol = convert_strand(self.strand)
    block_size = len(self)
    return [
        self.chromosome, self.start, self.stop, name, 0, strand_symbol,
        self.start, self.stop, rgb, 1, block_size, 0,
    ]
def interval_to_bed(t, interval, rgb, name):
    """
    Turns an interval object into a single-block BED12 record labelled with the transcript object t.
    Interval objects should always have start <= stop (+ strand chromosome ordering).
    """
    begin, end = interval.start, interval.stop
    assert end >= begin, (t.name, t.chromosome)
    label = name + "/" + t.name
    strand_symbol = convert_strand(interval.strand)
    # thick start/stop mirror the interval itself; one block covering all of it
    return [interval.chromosome, begin, end, label, 0, strand_symbol, begin, end, rgb, 1, end - begin, 0]
def get_bed(self, rgb=None, name=None, start_offset=None, stop_offset=None):
    """
    Returns this transcript as a BED record with optional changes to rgb and name.
    If start_offset or stop_offset are set (chromosome coordinates), then this record will be changed to only
    show results within that region, which is defined in chromosome coordinates.

    rgb: replaces self.rgb in the record when given.
    name: when given, the record is named name + "/" + self.name; otherwise self.name is used.
    start_offset: optional new start; asserted to be >= self.start.
    stop_offset: optional new stop; asserted to be <= self.stop.

    Returns a list of 12 BED fields: chromosome, start, stop, name, score, strand, thick start, thick stop,
    rgb, block count, block sizes, block starts.

    NOTE(review): the trimming path slices and indexes the results of map() and uses xrange, so it relies on
    Python 2 behaviour (map returning a list).
    """
    # sanity check the requested window against itself and against the transcript bounds
    if start_offset is not None and stop_offset is not None:
        assert start_offset <= stop_offset
    if start_offset is not None:
        assert start_offset >= self.start
    if stop_offset is not None:
        assert stop_offset <= self.stop
    if rgb is None:
        rgb = self.rgb
    if name is not None:
        name += "/" + self.name
    else:
        name = self.name
    if start_offset is None and stop_offset is None:
        # no window requested - report the stored record unchanged
        return [self.chromosome, self.start, self.stop, name, self.score, convert_strand(self.strand),
                self.thick_start, self.thick_stop, rgb, self.block_count, self.block_sizes, self.block_starts]
    elif start_offset == stop_offset:
        # zero-length window: emit a single empty block at that position
        assert self.chromosome_coordinate_to_transcript(start_offset) is not None  # no intron records
        return [self.chromosome, start_offset, stop_offset, name, self.score, convert_strand(self.strand),
                start_offset, stop_offset, rgb, 1, 0, 0]

    def _move_start(exon_intervals, block_count, block_starts, block_sizes, start, start_offset):
        # Drops the leading blocks that end at or before start_offset, then trims the first remaining block
        # so that the record begins at start_offset. Returns the updated (start, count, starts, sizes).
        to_remove = len([x for x in exon_intervals if x.start <= start_offset and x.stop <= start_offset])
        assert to_remove < len(exon_intervals)
        if to_remove > 0:
            block_count -= to_remove
            block_sizes = block_sizes[to_remove:]
            # the record now begins at the first surviving block
            start += block_starts[to_remove]
            # rebuild the block starts relative to the first surviving block
            new_block_starts = [0]
            for i in xrange(to_remove, len(block_starts) - 1):
                new_block_starts.append(block_starts[i + 1] - block_starts[i] + new_block_starts[-1])
            block_starts = new_block_starts
        if start_offset > start:
            # start - start_offset is negative here: shrink the first block and pull later blocks forward
            block_sizes[0] += start - start_offset
            block_starts[1:] = [x + start - start_offset for x in block_starts[1:]]
            start = start_offset
        return start, block_count, block_starts, block_sizes

    def _move_stop(exon_intervals, block_count, block_starts, block_sizes, stop, start,
                   stop_offset):
        # Drops the trailing blocks that begin at or after stop_offset, then trims the last remaining block
        # so that the record ends at stop_offset. Returns the updated (stop, count, starts, sizes).
        to_remove = len([x for x in exon_intervals if x.stop >= stop_offset and x.start >= stop_offset])
        assert to_remove < len(exon_intervals)
        if to_remove > 0:
            block_count -= to_remove
            block_sizes = block_sizes[:-to_remove]
            block_starts = block_starts[:-to_remove]
            assert len(block_sizes) == len(block_starts)
            if len(block_sizes) == 0:
                # nothing left after trimming both ends - fall back to one empty block
                block_sizes = block_starts = [0]
                block_count = 1
            # the record now ends where the last surviving block ends
            stop = start + block_sizes[-1] + block_starts[-1]
        if start + block_starts[-1] < stop_offset < stop:
            # stop_offset falls inside the last block - shorten that block
            block_sizes[-1] = stop_offset - start - block_starts[-1]
            stop = stop_offset
        return stop, block_count, block_starts, block_sizes

    # work on parsed copies of the stored comma separated block fields
    block_count = int(self.block_count)
    block_starts = map(int, self.block_starts.split(","))
    block_sizes = map(int, self.block_sizes.split(","))
    start = self.start
    stop = self.stop
    thick_start = self.thick_start
    thick_stop = self.thick_stop

    # the start is moved first; _move_stop is then given the already updated start and blocks
    if start_offset is not None and start_offset > start:
        start, block_count, block_starts, block_sizes = _move_start(self.exon_intervals, block_count,
                                                                    block_starts, block_sizes, start,
                                                                    start_offset)
    if stop_offset is not None and stop_offset < stop:
        stop, block_count, block_starts, block_sizes = _move_stop(self.exon_intervals, block_count,
                                                                  block_starts, block_sizes, stop, start,
                                                                  stop_offset)
    # clamp the thick region to the new bounds
    if start > thick_start:
        thick_start = start
    if stop < thick_stop:
        thick_stop = stop
    # if the new record lies wholly to one side of the thick region, report no thick region at all
    if (start > thick_stop and stop > thick_stop) or (start < thick_start and stop < thick_start):
        thick_start = 0
        thick_stop = 0
    block_starts = ",".join(map(str, block_starts))
    block_sizes = ",".join(map(str, block_sizes))
    return [self.chromosome, start, stop, name, self.score, convert_strand(self.strand), thick_start,
            thick_stop, rgb, block_count, block_sizes, block_starts]
def get_interval(self):
    """
    Builds a ChromosomeInterval covering this transcript from its start to its stop, on the transcript's
    chromosome and strand.
    """
    strand_symbol = convert_strand(self.strand)
    full_span = ChromosomeInterval(self.chromosome, self.start, self.stop, strand_symbol)
    return full_span
def _get_exons(self, bed_tokens):
    """
    Get a list of Exons representing the exons in transcript coordinate space. This is in transcript order.
    See the Exon class for more.

    bed_tokens: BED12-style tokens; columns 1, 2, 5, 6, 7, 9, 10 and 11 are read here.

    Each Exon is constructed positionally from: transcript start, transcript stop, strand, chromosome start,
    chromosome stop, CDS start, CDS stop and CDS position. The three CDS values stay None when no branch
    below assigns them.

    NOTE(review): chrom_stop and chrom are assigned but never used in this method.
    NOTE(review): blocks are only reversed when strand is False, while every strand that is not True
    (including None, if convert_strand can return it) takes the minus-strand branch - confirm intended.
    """
    exons = []
    chrom_start, chrom_stop = int(bed_tokens[1]), int(bed_tokens[2])
    thick_start, thick_stop = int(bed_tokens[6]), int(bed_tokens[7])
    # an empty thick region is normalised to 0/0, which the branches below treat as non-coding
    if thick_start == thick_stop:
        thick_start = thick_stop = 0
    chrom, strand = bed_tokens[0], convert_strand(bed_tokens[5])
    block_count = int(bed_tokens[9])
    block_sizes = [int(x) for x in bed_tokens[10].split(",") if x != ""]
    block_starts = [int(x) for x in bed_tokens[11].split(",") if x != ""]
    ##################################################################
    # HERE BE DRAGONS
    # this is seriously ugly code to maintain proper mapping
    # between coordinate spaces. See the unit tests.
    ##################################################################
    # walk the blocks in transcript order: last block first when strand is False
    if strand is False:
        block_sizes = reversed(block_sizes)
        block_starts = reversed(block_starts)
    # t_pos: running transcript offset of the current block; cds_pos: running count of coding bases so far
    t_pos, cds_pos = 0, None
    for block_size, block_start in izip(block_sizes, block_starts):
        # calculate transcript relative coordinates
        this_start = t_pos
        this_stop = t_pos + block_size
        # calculate chromosome relative coordinates
        this_chrom_start = chrom_start + block_start
        this_chrom_stop = chrom_start + block_start + block_size
        # calculate transcript-relative CDS positions
        # cds_pos is pos of first coding base in CDS coordinates
        this_cds_start, this_cds_stop, this_cds_pos = None, None, None
        if strand is True:
            # special case - single exon
            # NOTE(review): this is tested before the non-coding case, so a single-exon record with an
            # empty thick region still gets CDS values computed from thick_start = thick_stop = 0.
            if block_count == 1:
                this_cds_pos = 0
                this_cds_start = thick_start - this_chrom_start
                this_cds_stop = thick_stop - this_chrom_start
            # special case - entirely non-coding
            elif thick_start == thick_stop == 0:
                this_cds_start, this_cds_stop, this_cds_pos = None, None, None
            # special case - CDS starts and stops on the same exon
            elif (this_chrom_start <= thick_start < this_chrom_stop and
                    this_chrom_start < thick_stop <= this_chrom_stop):
                this_cds_pos = 0
                cds_pos = this_chrom_stop - thick_start
                this_cds_start = this_start + thick_start - this_chrom_start
                this_cds_stop = this_stop + thick_stop - this_chrom_stop
            # is this the start codon containing exon?
            elif this_chrom_start <= thick_start < this_chrom_stop:
                cds_pos = this_chrom_stop - thick_start
                this_cds_pos = 0
                this_cds_start = this_start + thick_start - this_chrom_start
            # is this the stop codon containing exon?
            elif this_chrom_start < thick_stop <= this_chrom_stop:
                this_cds_pos = cds_pos
                cds_pos += thick_stop - this_chrom_start
                this_cds_stop = this_stop + thick_stop - this_chrom_stop
            # is this exon all coding?
            elif (this_cds_stop is None and this_cds_start is None and thick_stop >= this_chrom_stop and
                    thick_start < this_chrom_start):
                this_cds_pos = cds_pos
                cds_pos += block_size
        else:
            # minus strand: transcript coordinates run from this_chrom_stop down to this_chrom_start,
            # so the CDS begins at thick_stop and ends at thick_start
            # special case - single exon
            # NOTE(review): this_cds_stop below simplifies to this_chrom_stop - this_chrom_start (the block
            # size) regardless of thick_start - confirm whether thick_start was intended in place of
            # this_chrom_start.
            if block_count == 1:
                this_cds_pos = 0
                this_cds_start = this_chrom_stop - thick_stop
                this_cds_stop = thick_stop - this_chrom_start + this_cds_start
            # special case - entirely non-coding
            elif thick_start == thick_stop == 0:
                this_cds_start, this_cds_stop, this_cds_pos = None, None, None
            # special case - start and stop codons are on the same exon
            elif (this_chrom_start < thick_stop <= this_chrom_stop and
                    this_chrom_start <= thick_start < this_chrom_stop):
                cds_pos = thick_stop - this_chrom_start
                this_cds_pos = 0
                this_cds_start = this_start + this_chrom_stop - thick_stop
                this_cds_stop = this_start + this_chrom_stop - thick_start
            # is this the start codon containing exon?
            elif this_chrom_start < thick_stop <= this_chrom_stop:
                cds_pos = thick_stop - this_chrom_start
                this_cds_pos = 0
                this_cds_start = this_start + this_chrom_stop - thick_stop
            # is this the stop codon containing exon?
            # (unlike the plus-strand branch, cds_pos is not advanced here)
            elif this_chrom_start <= thick_start < this_chrom_stop:
                this_cds_pos = cds_pos
                this_cds_stop = this_start + this_chrom_stop - thick_start
            # is this exon all coding?
            elif (this_cds_stop is None and this_cds_start is None and thick_stop >= this_chrom_stop and
                    thick_start < this_chrom_start):
                this_cds_pos = cds_pos
                cds_pos += block_size
        exons.append(Exon(this_start, this_stop, strand, this_chrom_start, this_chrom_stop, this_cds_start,
                          this_cds_stop, this_cds_pos))
        t_pos += block_size
    return exons