def _map(self, from_pos, to_pos, pos, end, strict_bounds):
    """Map a position from one side of the alignment onto the other.

    Positions in this function are 0-based, base-counting.

    Args:
        from_pos: boundaries of the aligned segments on the source side;
            ``from_pos[i]`` and ``from_pos[i + 1]`` bracket the segment
            whose operation is ``self.cigar_op[i]``.
        to_pos: boundaries on the destination side, indexed like ``from_pos``.
        pos: source position to map.
        end: only consulted for ``D``/``I`` segments, where ``"start"``
            moves the mapped position back by one.
        strict_bounds: when true, positions below 0 or above
            ``from_pos[-1]`` are rejected.

    Returns:
        Tuple ``(mapped_pos, mapped_pos_offset, cigar_op)``.

    Raises:
        HGVSInvalidIntervalError: ``strict_bounds`` is set and ``pos`` is
            outside ``[0, from_pos[-1]]``.
    """
    if strict_bounds and (pos < 0 or pos > from_pos[-1]):
        raise HGVSInvalidIntervalError("Position is beyond the bounds of transcript record")

    # Pick the aligned segment used as the basis for mapping: the first one
    # whose upper boundary lies beyond pos.  When no segment qualifies the
    # loop simply finishes on the last one, so positions before the first
    # or after the last boundary are still mapped (extrapolated).
    for seg, op in enumerate(self.cigar_op):
        if pos < from_pos[seg + 1]:
            break

    if op in "=MX":
        # Aligned segment: carry the distance from the segment start across.
        mapped = to_pos[seg] + (pos - from_pos[seg])
        shift = 0
    elif op in "DI":
        mapped = to_pos[seg] - 1 if end == "start" else to_pos[seg]
        shift = 0
    elif op == "N":
        # Report the position as an offset from whichever segment edge is
        # closer; ties go to the leading edge.
        past_leading_edge = pos - from_pos[seg] + 1
        before_trailing_edge = from_pos[seg + 1] - pos
        if past_leading_edge <= before_trailing_edge:
            mapped, shift = to_pos[seg] - 1, past_leading_edge
        else:
            mapped, shift = to_pos[seg], -before_trailing_edge

    return mapped, shift, op
def _map(self, from_pos, to_pos, pos, base):
    """Map a position from one aligned sequence onto the other.

    Positions in this function are 0-based.

    Args:
        from_pos: boundaries of the aligned segments on the source side;
            ``from_pos[i]`` and ``from_pos[i + 1]`` bracket the segment
            whose operation is ``self.cigar_op[i]``.
        to_pos: boundaries on the destination side, indexed like ``from_pos``.
        pos: source position to map.
        base: ``"start"`` or ``"end"``; only consulted for ``D``/``I``
            segments, where ``"start"`` yields the position one before
            the segment's destination boundary.

    Returns:
        Tuple ``(mapped_pos, mapped_pos_offset, cigar_op)``.

    Raises:
        HGVSInvalidIntervalError: ``pos`` lies before ``from_pos[0]`` or at
            or beyond the final boundary.
    """
    ops = self.cigar_op
    op_count = len(ops)

    # Advance past every boundary that pos has reached.  seg stays at -1
    # when pos precedes the first boundary and ends at op_count when pos
    # has reached the last one.
    seg = -1
    while seg < op_count and pos >= from_pos[seg + 1]:
        seg += 1

    if seg in (-1, op_count):
        raise HGVSInvalidIntervalError("Position is beyond the bounds of transcript record")

    op = ops[seg]
    if op in "=MX":
        # Aligned segment: carry the distance from the segment start across.
        mapped = to_pos[seg] + (pos - from_pos[seg])
        shift = 0
    elif op in "DI":
        if base == "start":
            mapped = to_pos[seg] - 1
        elif base == "end":
            mapped = to_pos[seg]
        shift = 0
    elif op == "N":
        # Report the position as an offset from whichever segment edge is
        # closer; ties go to the leading edge.
        past_leading_edge = pos - from_pos[seg] + 1
        before_trailing_edge = from_pos[seg + 1] - pos
        if past_leading_edge <= before_trailing_edge:
            mapped, shift = to_pos[seg] - 1, past_leading_edge
        else:
            mapped, shift = to_pos[seg], -before_trailing_edge

    return mapped, shift, op
def _map(from_ivs, to_ivs, from_start_i, from_end_i, max_extent):
    """Project ``[from_start_i, from_end_i]`` from ``from_ivs`` onto ``to_ivs``.

    ``from_ivs`` and ``to_ivs`` are index-aligned sequences of objects with
    ``start_i`` and ``end_i`` attributes.  The start is measured from the
    start of its containing interval and the end from the end of its
    containing interval; each result is clipped to the matching ``to_ivs``
    entry.

    Args:
        from_ivs: intervals in the source coordinate system.
        to_ivs: intervals in the destination system, paired by index.
        from_start_i: start coordinate to map.
        from_end_i: end coordinate to map; must not precede ``from_start_i``.
        max_extent: when a coordinate touches more than one interval, a
            true value picks the widest span (first hit for the start,
            last hit for the end); a false value picks the narrowest.

    Returns:
        Tuple ``(to_start_i, to_end_i)``.

    Raises:
        HGVSInvalidIntervalError: the start or the end falls in no interval.
    """

    def locate():
        """Return indexes (si, ei) of the intervals holding start and end."""
        # A 0-width interval that matches both coordinates exactly wins.
        exact = [
            idx for idx, iv in enumerate(from_ivs)
            if iv.start_i == from_start_i and iv.end_i == from_end_i
        ]
        if exact:
            return exact[0], exact[0]

        start_hits = [idx for idx, iv in enumerate(from_ivs) if iv.start_i <= from_start_i <= iv.end_i]
        end_hits = [idx for idx, iv in enumerate(from_ivs) if iv.start_i <= from_end_i <= iv.end_i]
        if not start_hits or not end_hits:
            raise HGVSInvalidIntervalError("start or end or both are beyond the bounds of transcript record")
        if max_extent:
            return start_hits[0], end_hits[-1]
        return start_hits[-1], end_hits[0]

    assert from_start_i <= from_end_i, "expected from_start_i <= from_end_i"
    try:
        si, ei = locate()
    except ValueError:
        raise HGVSInvalidIntervalError("start_i,end_i interval out of bounds")

    start_src, start_dst = from_ivs[si], to_ivs[si]
    unclipped_start = start_dst.start_i + (from_start_i - start_src.start_i)
    to_start_i = max(start_dst.start_i, min(start_dst.end_i, unclipped_start))

    end_src, end_dst = from_ivs[ei], to_ivs[ei]
    unclipped_end = end_dst.end_i - (end_src.end_i - from_end_i)
    to_end_i = max(end_dst.start_i, min(end_dst.end_i, unclipped_end))

    return to_start_i, to_end_i
def n_to_c(self, n_interval, strict_bounds=None):
    """Convert a transcript cDNA (n.) interval to a transcript CDS (c.) interval.

    Args:
        n_interval: interval whose ``start`` and ``end`` carry ``base`` and
            ``offset``; its ``uncertain`` flag is copied to the result.
        strict_bounds: whether to reject coordinates outside the
            transcript; ``None`` defers to
            ``global_config.mapping.strict_bounds``.

    Returns:
        A ``hgvs.location.BaseOffsetInterval`` whose endpoints are relative
        to ``Datum.CDS_START`` or ``Datum.CDS_END``.

    Raises:
        HGVSUsageError: ``self.cds_start_i`` is ``None``.
        HGVSInvalidIntervalError: strict bounds are on and the start base
            is <= 0 or the end base exceeds ``self.tgt_len``.
    """
    if strict_bounds is None:
        strict_bounds = global_config.mapping.strict_bounds

    # Per the original note, cds_start_i is defined iff cds_end_i is, so
    # testing one of them is sufficient.
    if self.cds_start_i is None:
        raise HGVSUsageError(
            "CDS is undefined for {self.tx_ac}; cannot map to c. coordinate (non-coding transcript?)"
            .format(self=self))

    if strict_bounds:
        start_too_low = n_interval.start.base <= 0
        if start_too_low or n_interval.end.base > self.tgt_len:
            raise HGVSInvalidIntervalError(
                "The given coordinate is outside the bounds of the reference sequence.")

    def pos_n_to_c(pos):
        """Re-express one n. position relative to the CDS start or end."""
        n = pos.base
        if n <= self.cds_start_i:
            # At or before the CDS start.  Positive n. bases get an extra
            # -1; non-positive ones do not.
            rel = n - self.cds_start_i
            if n > 0:
                rel -= 1
            anchor = Datum.CDS_START
        elif self.cds_start_i < n <= self.cds_end_i:
            rel = n - self.cds_start_i
            anchor = Datum.CDS_START
        else:
            rel = n - self.cds_end_i
            anchor = Datum.CDS_END
        return hgvs.location.BaseOffsetPosition(base=rel, offset=pos.offset, datum=anchor)

    return hgvs.location.BaseOffsetInterval(
        start=pos_n_to_c(n_interval.start),
        end=pos_n_to_c(n_interval.end),
        uncertain=n_interval.uncertain,
    )
def __init__(self, ref, tgt):
    """Store a reference/target interval pair after checking their lengths.

    The pair is accepted when both intervals have the same ``len``, or
    when exactly one of them has ``len == 0``.

    Args:
        ref: reference-side interval; must expose ``len``.
        tgt: target-side interval; must expose ``len``.

    Raises:
        HGVSInvalidIntervalError: the lengths differ and neither is zero.
    """
    same_length = ref.len == tgt.len
    only_tgt_has_length = ref.len == 0 and tgt.len != 0
    only_ref_has_length = ref.len != 0 and tgt.len == 0

    if not (same_length or only_tgt_has_length or only_ref_has_length):
        raise HGVSInvalidIntervalError(
            "IntervalPair doesn't represent a match, insertion, or deletion"
        )

    self.ref = ref
    self.tgt = tgt
def check_datum(self):
    """Verify that the start and end datums form a permitted combination.

    Permitted (start, end) pairs are SEQ_START/SEQ_START,
    CDS_START/CDS_START, CDS_START/CDS_END and CDS_END/CDS_END.

    Raises:
        HGVSInvalidIntervalError: the pair is not one of the above.
    """
    permitted_pairs = (
        (Datum.SEQ_START, Datum.SEQ_START),
        (Datum.CDS_START, Datum.CDS_START),
        (Datum.CDS_START, Datum.CDS_END),
        (Datum.CDS_END, Datum.CDS_END),
    )
    datum_pair = (self.start.datum, self.end.datum)
    if datum_pair in permitted_pairs:
        return
    raise HGVSInvalidIntervalError("BaseOffsetInterval start datum and end datum are incompatible")
def iv_map(from_ivs, to_ivs, from_start_i, from_end_i, max_extent):
    """Return indexes ``(si, ei)`` of the intervals holding start and end.

    Args:
        from_ivs: sequence of objects with ``start_i`` and ``end_i``.
        to_ivs: not used here; kept so the signature stays unchanged.
        from_start_i: start coordinate to locate.
        from_end_i: end coordinate to locate.
        max_extent: when a coordinate touches more than one interval, a
            true value picks the first hit for the start and the last hit
            for the end; a false value picks the opposite.

    Returns:
        Tuple ``(si, ei)`` of indexes into ``from_ivs``.

    Raises:
        HGVSInvalidIntervalError: the start or the end lies in no interval.
    """
    # A 0-width interval matching both coordinates exactly takes priority.
    exact = [
        idx for idx, iv in enumerate(from_ivs)
        if iv.start_i == from_start_i and iv.end_i == from_end_i
    ]
    if exact:
        return exact[0], exact[0]

    start_hits = [idx for idx, iv in enumerate(from_ivs) if iv.start_i <= from_start_i <= iv.end_i]
    end_hits = [idx for idx, iv in enumerate(from_ivs) if iv.start_i <= from_end_i <= iv.end_i]
    if not start_hits or not end_hits:
        raise HGVSInvalidIntervalError("start or end or both are beyond the bounds of transcript record")

    if max_extent:
        return start_hits[0], end_hits[-1]
    return start_hits[-1], end_hits[0]
def pos_c_to_n(pos):
    """Convert one c. position into an n. position (datum ``SEQ_START``).

    Reads ``self`` and ``strict_bounds`` from the enclosing scope.

    Raises:
        HGVSInvalidIntervalError: ``strict_bounds`` is true and the result
            is <= 0 or greater than ``self.tgt_len``.
    """
    c_base = pos.base
    if pos.datum == Datum.CDS_START:
        # correct for lack of c.0 coordinate: negative bases move up by one
        n = c_base + self.cds_start_i + (1 if c_base < 0 else 0)
    elif pos.datum == Datum.CDS_END:
        n = c_base + self.cds_end_i

    # correct for lack of n.0 coordinate
    if n <= 0:
        n -= 1

    out_of_bounds = n <= 0 or n > self.tgt_len
    if out_of_bounds and strict_bounds:
        raise HGVSInvalidIntervalError(f"c.{pos} coordinate is out of bounds")

    return hgvs.location.BaseOffsetPosition(base=n, offset=pos.offset, datum=Datum.SEQ_START)
def c_to_n(self, c_interval, strict_bounds=None):
    """Convert a transcript CDS (c.) interval to a transcript cDNA (n.) interval.

    Args:
        c_interval: interval whose ``start`` and ``end`` carry ``base``,
            ``offset`` and ``datum``; its ``uncertain`` flag is copied to
            the result.
        strict_bounds: whether to reject coordinates outside the
            transcript; ``None`` defers to
            ``global_config.mapping.strict_bounds``.

    Returns:
        A ``hgvs.location.BaseOffsetInterval`` whose endpoints use
        ``Datum.SEQ_START``.

    Raises:
        HGVSUsageError: ``self.cds_start_i`` is ``None``.
        HGVSInvalidIntervalError: an endpoint cannot be mapped (base 0
            relative to CDS_START, or a datum other than CDS_START /
            CDS_END), or strict bounds are on and the mapped start is <= 0
            or the mapped end exceeds ``self.tgt_len``.
    """
    if strict_bounds is None:
        strict_bounds = global_config.mapping.strict_bounds

    # Per the original note, cds_start_i is defined iff cds_end_i is, so
    # testing one of them is sufficient.
    if self.cds_start_i is None:
        raise HGVSUsageError(
            "CDS is undefined for {self.tx_ac}; cannot map from c. coordinate (non-coding transcript?)"
            .format(self=self))

    def base_c_to_n(pos):
        """Return the n. base for one c. endpoint."""
        if pos.datum == Datum.CDS_START and pos.base < 0:
            # No c.0 exists, so negative c. bases sit one step closer.
            return pos.base + self.cds_start_i + 1
        if pos.datum == Datum.CDS_START and pos.base > 0:
            return pos.base + self.cds_start_i
        if pos.datum == Datum.CDS_END:
            return pos.base + self.cds_end_i
        # Previously this fell through with the result unassigned and
        # surfaced as UnboundLocalError; report it as an invalid interval.
        raise HGVSInvalidIntervalError(
            "Cannot map position with base {pos.base} and datum {pos.datum} to an n. coordinate "
            "(there is no c.0, and the datum must be CDS_START or CDS_END)".format(pos=pos))

    n_start = base_c_to_n(c_interval.start)
    n_end = base_c_to_n(c_interval.end)

    if strict_bounds and (n_start <= 0 or n_end > self.tgt_len):
        raise HGVSInvalidIntervalError(
            "The given coordinate is outside the bounds of the reference sequence.")

    return hgvs.location.BaseOffsetInterval(
        start=hgvs.location.BaseOffsetPosition(
            base=n_start, offset=c_interval.start.offset, datum=Datum.SEQ_START),
        end=hgvs.location.BaseOffsetPosition(
            base=n_end, offset=c_interval.end.offset, datum=Datum.SEQ_START),
        uncertain=c_interval.uncertain,
    )
def n_to_c(self, n_interval):
    """Convert a transcript cDNA (n.) interval to a transcript CDS (c.) interval.

    Args:
        n_interval: interval whose ``start`` and ``end`` carry ``base`` and
            ``offset``; its ``uncertain`` flag is copied to the result.

    Returns:
        A ``hgvs.location.BaseOffsetInterval`` whose endpoints are relative
        to ``Datum.CDS_START`` or ``Datum.CDS_END``.

    Raises:
        HGVSUsageError: ``self.cds_start_i`` is ``None``.
        HGVSInvalidIntervalError: the start base is <= 0 or the end base
            exceeds ``self.tgt_len``.
    """
    # Per the original note, cds_start_i is defined iff cds_end_i is, so
    # testing one of them is sufficient.
    if self.cds_start_i is None:
        raise HGVSUsageError(
            "CDS is undefined for {self.tx_ac}; cannot map to c. coordinate (non-coding transcript?)"
            .format(self=self))

    start_too_low = n_interval.start.base <= 0
    if start_too_low or n_interval.end.base > self.tgt_len:
        raise HGVSInvalidIntervalError(
            "The given coordinate is outside the bounds of the reference sequence."
        )

    def rebase(n):
        """Return (c_base, datum) for one n. base; both endpoints share this rule."""
        if n <= self.cds_start_i:
            return n - (self.cds_start_i + 1), Datum.CDS_START
        if self.cds_start_i < n <= self.cds_end_i:
            return n - self.cds_start_i, Datum.CDS_START
        return n - self.cds_end_i, Datum.CDS_END

    cs, cs_datum = rebase(n_interval.start.base)
    ce, ce_datum = rebase(n_interval.end.base)

    return hgvs.location.BaseOffsetInterval(
        start=hgvs.location.BaseOffsetPosition(
            base=cs, offset=n_interval.start.offset, datum=cs_datum),
        end=hgvs.location.BaseOffsetPosition(
            base=ce, offset=n_interval.end.offset, datum=ce_datum),
        uncertain=n_interval.uncertain,
    )
def __init__(self, start_i, end_i):
    """Store the interval bounds after checking their order.

    Args:
        start_i: interval start.
        end_i: interval end; must not be less than ``start_i``.

    Raises:
        HGVSInvalidIntervalError: ``start_i <= end_i`` does not hold.
    """
    correctly_ordered = start_i <= end_i
    if not correctly_ordered:
        raise HGVSInvalidIntervalError(
            "start_i must be less than or equal to end_i")

    self.start_i, self.end_i = start_i, end_i