예제 #1
0
 def __lt__(lhs, rhs):
     assert type(lhs) == type(
         rhs), "Cannot compare coordinates of different representations"
     if lhs.uncertain or rhs.uncertain:
         raise HGVSUnsupportedOperationError(
             "Cannot compare coordinates of uncertain positions")
     if lhs.datum == rhs.datum:
         if lhs.base == rhs.base:
             return lhs.offset < rhs.offset
         else:
             if ((rhs.base - lhs.base == 1 and lhs.offset > 0
                  and rhs.offset < 0)
                     or (lhs.base - rhs.base == 1 and rhs.offset > 0
                         and lhs.offset < 0)):
                 raise HGVSUnsupportedOperationError(
                     "Cannot compare coordinates in the same intron with one based on end of exon and the other based on start of next exon"
                 )
             else:
                 return lhs.base < rhs.base
     else:
         if lhs.datum == Datum.SEQ_START or rhs.datum == Datum.SEQ_START:
             raise HGVSUnsupportedOperationError(
                 "Cannot compare coordinates of datum SEQ_START with CDS_START or CDS_END"
             )
         else:
             return lhs.datum < rhs.datum
예제 #2
0
 def __sub__(lhs, rhs):
     assert type(lhs) == type(
         rhs), "Cannot substract coordinates of different representations"
     if lhs.datum != rhs.datum:
         raise HGVSUnsupportedOperationError(
             "Interval length measured from different datums is ill-defined"
         )
     if lhs.base == rhs.base:
         return lhs.offset - rhs.offset
     if lhs.offset != 0 or rhs.offset != 0:
         raise HGVSUnsupportedOperationError(
             "Interval length with intronic offsets is ill-defined")
     straddles_zero = 1 if (lhs.base > 0 and rhs.base < 0) else 0
     return lhs.base - rhs.base - straddles_zero
예제 #3
0
 def __lt__(lhs, rhs):
     assert type(lhs) == type(
         rhs), "Cannot compare coordinates of different representations"
     if lhs.uncertain or rhs.uncertain:
         raise HGVSUnsupportedOperationError(
             "Cannot compare coordinates of uncertain positions")
     return lhs.base < rhs.base
예제 #4
0
    def _get_altered_sequence(self, strand, interval, var):
        seq = list(
            self.hdp.get_seq(var.ac, interval.start.base - 1,
                             interval.end.base))
        # positions are 0-based and half-open
        pos_start = var.posedit.pos.start.base - interval.start.base
        pos_end = var.posedit.pos.end.base - interval.start.base + 1
        edit = var.posedit.edit

        if edit.type == 'sub':
            seq[pos_start] = edit.alt
        elif edit.type == 'del':
            del seq[pos_start:pos_end]
        elif edit.type == 'ins':
            seq.insert(pos_start + 1, edit.alt)
        elif edit.type == 'delins':
            del seq[pos_start:pos_end]
            seq.insert(pos_start, edit.alt)
        elif edit.type == 'dup':
            seq.insert(pos_end, ''.join(seq[pos_start:pos_end]))
        elif edit.type == 'inv':
            seq[pos_start:pos_end] = list(
                reverse_complement(''.join(seq[pos_start:pos_end])))
        elif edit.type == 'identity':
            pass
        else:
            raise HGVSUnsupportedOperationError(
                "Getting altered sequence for {type} is unsupported".format(
                    type=edit.type))

        seq = ''.join(seq)
        if strand == -1:
            seq = reverse_complement(seq)
        return seq
예제 #5
0
    def n_to_c(self, var_n):
        """Convert a parsed n. variant into a c. variant on the same accession.

        The variant is validated when a validator is configured, its
        reference is filled in from ``self.hdp``, and its position is mapped
        with the transcript self-alignment (``alt_aln_method="transcript"``).
        The edit is deep-copied onto the new variant.

        :param hgvs.sequencevariant.SequenceVariant var_n: a variant object
        :returns: variant object (:class:`vvhgvs.sequencevariant.SequenceVariant`)
        :raises HGVSInvalidVariantError: if var_n is not of type "n"
        :raises HGVSUnsupportedOperationError: if the edit is not a
            NARefAlt, Dup or Inv

        """
        if var_n.type != "n":
            message = "Expected n. variant; got " + str(var_n)
            raise HGVSInvalidVariantError(message)

        if self._validator:
            self._validator.validate(var_n)
        var_n.fill_ref(self.hdp)

        mapper = self._fetch_AlignmentMapper(tx_ac=var_n.ac,
                                             alt_ac=var_n.ac,
                                             alt_aln_method="transcript")
        pos_c = mapper.n_to_c(var_n.posedit.pos)

        supported_edits = (vvhgvs.edit.NARefAlt, vvhgvs.edit.Dup,
                           vvhgvs.edit.Inv)
        if not isinstance(var_n.posedit.edit, supported_edits):
            raise HGVSUnsupportedOperationError(
                "Only NARefAlt/Dup/Inv types are currently implemented")
        edit_c = copy.deepcopy(var_n.posedit.edit)

        posedit_c = vvhgvs.posedit.PosEdit(pos_c, edit_c)
        var_c = vvhgvs.sequencevariant.SequenceVariant(ac=var_n.ac,
                                                       type="c",
                                                       posedit=posedit_c)
        if self.replace_reference:
            self._replace_reference(var_c)
        return var_c
예제 #6
0
    def _replace_reference(self, var):
        """fetch reference sequence for variant and update (in-place) if necessary"""

        if var.type not in "cgmnr":
            raise HGVSUnsupportedOperationError(
                "Can only update references for type c, g, m, n, r")

        if var.posedit.edit.type == "ins":
            # insertions have no reference sequence (zero-width), so return as-is
            return var
        if var.posedit.edit.type == "con":
            # conversions have no reference sequence (zero-width), so return as-is
            return var

        pos = var.posedit.pos
        if ((isinstance(pos.start, vvhgvs.location.BaseOffsetPosition)
             and pos.start.offset != 0)
                or (isinstance(pos.end, vvhgvs.location.BaseOffsetPosition)
                    and pos.end.offset != 0)):
            _logger.info(
                "Can't update reference sequence for intronic variant {}".
                format(var))
            return var

        # For c. variants, we need coords on underlying sequences
        if var.type == "c":
            tm = self._fetch_AlignmentMapper(tx_ac=var.ac,
                                             alt_ac=var.ac,
                                             alt_aln_method="transcript")
            pos = tm.c_to_n(var.posedit.pos)
        else:
            pos = var.posedit.pos
        seq = self.hdp.get_seq(var.ac, pos.start.base - 1, pos.end.base)

        edit = var.posedit.edit
        if edit.ref != seq:
            _logger.debug(
                "Replaced reference sequence in {var} with {seq}".format(
                    var=var, seq=seq))
            edit.ref = seq

        return var
예제 #7
0
 def _del_ins_lengths(self, ilen):
     """Always raise: this variant type provides no ``_del_ins_lengths``.

     :param ilen: unused
     :raises HGVSUnsupportedOperationError: unconditionally
     """
     message = "internal function _del_ins_lengths not implemented for this variant type"
     raise HGVSUnsupportedOperationError(message)
예제 #8
0
    def normalize(self, var):
        """Perform sequence variants normalization for single variant.

        The variant is validated when ``self.validator`` is set and its
        reference is filled in from ``self.hdp``.  c. variants are converted
        to n. for normalization and converted back before returning.

        :param var: a parsed variant
            (:class:`vvhgvs.sequencevariant.SequenceVariant`)
        :returns: ``var`` itself when it has no posedit, an uncertain
            posedit or no position; a deep copy for identity edits;
            otherwise a new variant carrying the normalized position and edit
        :raises AssertionError: if ``var`` is not a SequenceVariant
        :raises HGVSUnsupportedOperationError: for protein variants,
            conversion edits, and n./r. variants with a non-zero offset
        """
        assert isinstance(
            var, vvhgvs.sequencevariant.SequenceVariant
        ), "variant must be a parsed HGVS sequence variant object"

        if self.validator:
            self.validator.validate(var)

        if var.posedit is None or var.posedit.uncertain or var.posedit.pos is None:
            return var

        # Remember the caller's variant type: ``var`` may be rebound to an
        # n. variant below and must be converted back at the end.
        var_type = var.type

        if var_type == "p":
            raise HGVSUnsupportedOperationError(
                "Unsupported normalization of protein level variants: {0}".
                format(var))
        if var.posedit.edit.type == "con":
            # NOTE: this must be ``.format`` (method call); a comma here
            # would pass the unformatted template and the variant as two
            # separate exception arguments.
            raise HGVSUnsupportedOperationError(
                "Unsupported normalization of conversion variants: {0}".
                format(var))

        var.fill_ref(self.hdp)

        if var.posedit.edit.type == "identity":
            var_norm = copy.deepcopy(var)
            return var_norm

        # For c. variants normalization, first convert to n. variant
        # and perform normalization at the n. level, then convert the
        # normalized n. variant back to c. variant.
        if var_type == "c":
            var = self.hm.c_to_n(var)

        if var.type in "nr":
            if var.posedit.pos.start.offset != 0 or var.posedit.pos.end.offset != 0:
                raise HGVSUnsupportedOperationError(
                    "Normalization of intronic variants is not supported")

        # g, m, n, r sequences all use sequence start as the datum
        # That's an essential assumption herein
        # (this is why we may have converted from c to n above)
        assert var.type in "gmnr", "Internal Error: variant must be of type g, m, n, r"

        bound_s, bound_e = self._get_boundary(var)
        boundary = (bound_s, bound_e)
        start, end, (ref, alt) = self._normalize_alleles(var, boundary)

        ref_len = len(ref)
        alt_len = len(alt)

        # Generate normalized variant
        if alt_len == ref_len:
            ref_start = start
            ref_end = end - 1
            # inversion
            if ref_len > 1 and ref == reverse_complement(alt):
                edit = vvhgvs.edit.Inv(ref=ref)
            # ident
            elif ref_len == 0 and alt_len == 0:
                ref_start = ref_end
                edit = vvhgvs.edit.NARefAlt(ref=ref, alt=alt)
            # substitution or delins
            else:
                edit = vvhgvs.edit.NARefAlt(ref=ref, alt=alt)
        elif alt_len < ref_len:
            # del or delins
            ref_start = start
            ref_end = end - 1
            edit = vvhgvs.edit.NARefAlt(ref=ref,
                                        alt=None if alt_len == 0 else alt)
        elif alt_len > ref_len:
            # ins or dup
            if ref_len == 0:
                # Fetch the sequence adjacent to the insertion point to see
                # whether the inserted bases repeat it (i.e. a duplication).
                if self.shuffle_direction == 3:
                    adj_seq = self._fetch_bounded_seq(var, start - alt_len - 1,
                                                      end - 1, 0, boundary)
                else:
                    adj_seq = self._fetch_bounded_seq(var, start - 1,
                                                      start + alt_len - 1, 0,
                                                      boundary)
                # ins
                if alt != adj_seq:
                    ref_start = start - 1
                    ref_end = end
                    edit = vvhgvs.edit.NARefAlt(ref=None, alt=alt)
                # dup
                else:
                    if self.shuffle_direction == 3:
                        ref_start = start - alt_len
                        ref_end = end - 1
                        edit = vvhgvs.edit.Dup(ref=alt)
                    else:
                        ref_start = start
                        ref_end = start + alt_len - 1
                        edit = vvhgvs.edit.Dup(ref=alt)
            # delins
            else:
                ref_start = start
                ref_end = end - 1
                edit = vvhgvs.edit.NARefAlt(ref=ref, alt=alt)

        # ensure the start is not 0
        if ref_start == 0:
            ref = self._fetch_bounded_seq(var, 0, 1, 0, boundary)
            alt = alt + ref
            edit = vvhgvs.edit.NARefAlt(ref=ref, alt=alt)
            ref_start = 1
            ref_end = 1

        # ensure the end is not outside of reference sequence
        tgt_len = self._get_tgt_length(var)
        if ref_end == tgt_len + 1:
            ref = self._fetch_bounded_seq(var, tgt_len - 1, tgt_len, 0,
                                          boundary)
            alt = ref + alt
            edit = vvhgvs.edit.NARefAlt(ref=ref, alt=alt)
            ref_start = tgt_len
            ref_end = tgt_len

        var_norm = copy.deepcopy(var)
        var_norm.posedit.edit = edit
        var_norm.posedit.pos.start.base = ref_start
        var_norm.posedit.pos.end.base = ref_end

        if var_type == "c":
            var_norm = self.hm.n_to_c(var_norm)

        return var_norm
예제 #9
0
    def _get_boundary(self, var):
        """Get the position of exon-intron boundary for current variant
        """
        if var.type == "r" or var.type == "n":
            if self.cross_boundaries:
                return 0, float("inf")
            else:
                # Get genomic sequence access number for this transcript
                map_info = self.hdp.get_tx_mapping_options(var.ac)
                if not map_info:
                    raise HGVSDataNotAvailableError(
                        "No mapping info available for {ac}".format(ac=var.ac))
                map_info = [
                    item for item in map_info
                    if item["alt_aln_method"] == self.alt_aln_method
                ]
                alt_ac = map_info[0]["alt_ac"]

                # Get tx info
                tx_info = self.hdp.get_tx_info(var.ac, alt_ac,
                                               self.alt_aln_method)
                cds_start = tx_info["cds_start_i"]
                cds_end = tx_info["cds_end_i"]

                # Get exon info
                exon_info = self.hdp.get_tx_exons(var.ac, alt_ac,
                                                  self.alt_aln_method)
                exon_starts = [exon["tx_start_i"] for exon in exon_info]
                exon_ends = [exon["tx_end_i"] for exon in exon_info]
                exon_starts.sort()
                exon_ends.sort()
                exon_starts.append(exon_ends[-1])
                exon_ends.append(float("inf"))

                # Find the end pos of the exon where the var locates
                left = 0
                right = float("inf")

                # TODO: #242: implement methods to find tx regions
                for i in range(0, len(exon_starts)):
                    if (var.posedit.pos.start.base - 1 >= exon_starts[i]
                            and var.posedit.pos.start.base - 1 < exon_ends[i]):
                        break

                for j in range(0, len(exon_starts)):
                    if (var.posedit.pos.end.base - 1 >= exon_starts[j]
                            and var.posedit.pos.end.base - 1 < exon_ends[j]):
                        break

                if i != j:
                    raise HGVSUnsupportedOperationError(
                        "Unsupported normalization of variants spanning the exon-intron boundary ({var})"
                        .format(var=var))

                left = exon_starts[i]
                right = exon_ends[i]

                if cds_start is None:
                    pass
                elif var.posedit.pos.end.base - 1 < cds_start:
                    right = min(right, cds_start)
                elif var.posedit.pos.start.base - 1 >= cds_start:
                    left = max(left, cds_start)
                else:
                    raise HGVSUnsupportedOperationError(
                        "Unsupported normalization of variants spanning the UTR-exon boundary ({var})"
                        .format(var=var))

                if cds_end is None:
                    pass
                elif var.posedit.pos.start.base - 1 >= cds_end:
                    left = max(left, cds_end)
                elif var.posedit.pos.end.base - 1 < cds_end:
                    right = min(right, cds_end)
                else:
                    raise HGVSUnsupportedOperationError(
                        "Unsupported normalization of variants spanning the exon-UTR boundary ({var})"
                        .format(var=var))

                return left, right
        else:
            # For variant type of g and m etc.
            return 0, float("inf")