def test_retrieve_searched_region(self):
        """Bisecting on a personalised-ref position yields the index of the region holding it.

        Two chromosomes are registered in the map; for each one a mock record
        is bisected by position and the region fetched back through the
        returned index must equal the expected entry of that chromosome's list.
        """
        chr1_regions = [
            SeqRegion(base_ref_start=1, pers_ref_start=1, length=1),
            SeqRegion(
                base_ref_start=2,
                pers_ref_start=2,
                length=5,
                vcf_record_ref="TAT",
                vcf_record_alt="GCCAC",
            ),
        ]
        chr2_regions = [SeqRegion(base_ref_start=1, pers_ref_start=1, length=200)]
        region_map = SearchableSeqRegionsMap(
            {"chr1": chr1_regions, "chr2": chr2_regions}
        )

        # Each case: (keyword arguments of the mock record, region expected back).
        cases = (
            ({"pos": 100, "ref": "T", "alts": ["A"], "chrom": "chr2"}, chr2_regions[0]),
            ({"pos": 4, "ref": "C", "alts": ["A"], "chrom": "chr1"}, chr1_regions[1]),
        )
        for record_kwargs, expected_region in cases:
            record = _MockVcfRecord(**record_kwargs)
            found_index = region_map.bisect(
                record.chrom, record.pos, BisectTarget.PERS_REF
            )
            found_region = region_map.get_region(record.chrom, found_index)
            self.assertEqual(found_region, expected_region)
    def test_base_ref_pers_ref_same_results(self):
        """Every BisectTarget gives the same index when both coordinate systems coincide.

        All regions here have identical base-ref and personalised-ref starts,
        so bisecting a position must return the same region index whichever
        target is requested.
        """
        regions = [
            SeqRegion(base_ref_start=1, pers_ref_start=1, length=1),
            SeqRegion(
                base_ref_start=2,
                pers_ref_start=2,
                length=3,
                vcf_record_ref="TAT",
                vcf_record_alt="GCC",
            ),
            SeqRegion(base_ref_start=5, pers_ref_start=5, length=3),
        ]

        record_in_variant = _MockVcfRecord(pos=2, ref="GC", alts=["GA"])
        record_in_invariant = _MockVcfRecord(pos=1, ref="A", alts=["T"])
        region_map = SearchableSeqRegionsMap({"JAC": regions})

        # (record, expected region index); the variant-region record is checked first.
        expectations = ((record_in_variant, 1), (record_in_invariant, 0))
        for record, expected_index in expectations:
            for target in BisectTarget:
                found_index = region_map.bisect("JAC", record.pos, target)
                self.assertEqual(expected_index, found_index)
    def test_base_ref_further_than_pers_ref(self):
        """The same position can bisect to different regions in each coordinate system.

        The third region starts at 5 in base-ref coordinates but at 7 in
        personalised-ref coordinates, so position 6 falls in region index 1
        for PERS_REF and region index 2 for BASE_REF.
        """
        regions = [
            SeqRegion(base_ref_start=1, pers_ref_start=1, length=1),
            SeqRegion(
                base_ref_start=2,
                pers_ref_start=2,
                length=5,
                vcf_record_ref="TAT",
                vcf_record_alt="GCCAC",
            ),
            SeqRegion(
                base_ref_start=5,
                pers_ref_start=7,
                length=3,
                vcf_record_ref="G",
                vcf_record_alt="TTT",
            ),
        ]

        record = _MockVcfRecord(pos=6, ref="T", alts=["A"])
        region_map = SearchableSeqRegionsMap({"JAC": regions})

        # Personalised-ref lookup is asserted before the base-ref lookup.
        expected_by_target = (
            (BisectTarget.PERS_REF, 1),
            (BisectTarget.BASE_REF, 2),
        )
        for target, expected_index in expected_by_target:
            found_index = region_map.bisect("JAC", record.pos, target)
            self.assertEqual(expected_index, found_index)
示例#4
0
def _rebase_vcf_record(
    vcf_record: VariantRecord, chrom: Chrom, region_searcher: SearchableSeqRegionsMap
):
    """Change `vcf_record` to be expressed relative to a different reference.

    The record's position is located in personalised-reference coordinates
    (`BisectTarget.PERS_REF`), then consecutive regions are walked until the
    whole of `vcf_record.ref` has been accounted for. Along the way a new ref
    sequence is assembled, and the record's first alt is extended on either
    side with the region's alt sequence when the record starts or ends
    part-way through a variant region.

    Args:
        vcf_record: Record to rebase; its `pos`, `ref` and `alts[0]` are read.
        chrom: Key identifying which region list of `region_searcher` to use.
        region_searcher: Map queried through `bisect` and `get_region`.

    Returns:
        The result of `_modify_vcf_record` called with the rebased `pos`,
        `ref` and a single-element `alts` list.
    """
    record_pos = vcf_record.pos
    record_ref = vcf_record.ref
    ref_len = len(record_ref)

    # Index of the personalised-ref region containing the record's start.
    idx = region_searcher.bisect(chrom, record_pos, BisectTarget.PERS_REF)

    # The new alt starts out as the record's first alt; it may grow below.
    new_ref = ""
    new_alt = str(vcf_record.alts[0])

    start_region = region_searcher.get_region(chrom, idx)
    if not start_region.is_variant_region:
        # Invariant region: keep the same offset into the region, expressed
        # in base-ref coordinates.
        new_pos = start_region.base_ref_start + (
            record_pos - start_region.pers_ref_start
        )
    else:
        # Variant region: the rebased record starts where the region starts.
        new_pos = start_region.base_ref_start
        if record_pos > start_region.pers_ref_start:
            # Carry over the part of the region's alt that precedes the record.
            lead_in = record_pos - start_region.pers_ref_start
            new_alt = start_region.vcf_record_alt[:lead_in] + new_alt

    consumed = 0  # Number of bases of the record's ref handled so far
    while True:
        region = region_searcher.get_region(chrom, idx)
        idx += 1

        # Bases of this region lying at or after the current position.
        # NOTE: for a variant region `length` is the length of its alt
        # sequence, which may differ from the length of its ref sequence.
        available = region.length - (record_pos + consumed - region.pers_ref_start)
        remaining = ref_len - consumed
        is_last_region = available >= remaining
        taken = min(available, remaining)

        if region.is_variant_region:
            # A variant region always contributes its whole ref sequence.
            new_ref += region.vcf_record_ref
        else:
            # In an invariant region the record's ref is also the base ref.
            new_ref += record_ref[consumed : consumed + taken]

        consumed += taken
        if is_last_region:
            break

    assert consumed == ref_len

    # If the record ends inside a variant region, append the part of that
    # region's alt which lies after the end of the record.
    if region.is_variant_region:
        # Negative when the region extends beyond the end of the record.
        overshoot = (record_pos + consumed) - (region.pers_ref_start + region.length)
        if overshoot < 0:
            new_alt += region.vcf_record_alt[overshoot:]

    return _modify_vcf_record(vcf_record, pos=new_pos, ref=new_ref, alts=[new_alt])
示例#5
0
def _rebase_vcf_record(vcf_record: VariantRecord, chrom: Chrom,
                       region_searcher: SearchableSeqRegionsMap):
    """
    Re-express `vcf_record` against the base reference.

    Terminology:
       - base reference: the reference the record is rebased onto
       - personalised reference: the reference `vcf_record` is currently
         expressed against

    How it works:
       - `region_searcher` maps regions between the two coordinate systems.
         We bisect into it using the record's position in personalised
         reference coordinates, then walk successive regions until the last
         base of the record's ref has been covered.
       - A variant region contributes its whole ref sequence to the new ref.
         If the record starts or ends part-way through such a region, the
         region's alt bases lying before/after the record are added to the
         new alt.
       - An invariant region contributes only the slice of the record's own
         ref that overlaps it; if the record starts inside one, the new
         position is shifted by the same offset.

    Only the first entry of `vcf_record.alts` is used. Returns the result of
    `_modify_vcf_record` called with the new pos, ref and alts.
    """
    record_start = vcf_record.pos
    record_ref = vcf_record.ref

    idx = region_searcher.bisect(chrom, record_start, BisectTarget.PERS_REF)
    region = region_searcher.get_region(chrom, idx)

    rebased_alt = str(vcf_record.alts[0])
    # Last personalised-ref position covered by the record's ref (inclusive).
    record_end = record_start + len(record_ref) - 1

    # Handle a record that starts part-way into its first region.
    rebased_pos = region.base_ref_start
    lead_in = record_start - region.pers_ref_start
    if lead_in > 0:
        if not region.is_variant_region:
            rebased_pos += lead_in
        else:
            rebased_alt = region.vcf_record_alt[:lead_in] + rebased_alt

    rebased_ref = ""
    cursor = record_start  # Next personalised-ref position still to cover
    while cursor <= record_end:
        region = region_searcher.get_region(chrom, idx)
        idx += 1

        region_end = region.pers_ref_start + region.length - 1
        # Last position of this region that the record actually covers, and
        # how many bases of the region lie beyond the record's end.
        last_covered = min(region_end, record_end)
        overhang = region_end - last_covered

        if not region.is_variant_region:
            slice_from = cursor - record_start
            slice_to = last_covered - record_start + 1
            rebased_ref += record_ref[slice_from:slice_to]
        else:
            rebased_ref += region.vcf_record_ref
            if overhang > 0:
                # Keep the region's alt bases that follow the record's end.
                rebased_alt += region.vcf_record_alt[region.length - overhang:]

        cursor = region_end + 1

    return _modify_vcf_record(vcf_record,
                              pos=rebased_pos,
                              ref=rebased_ref,
                              alts=[rebased_alt])