Exemplo n.º 1
0
    def test_retrieve_searched_region(self):
        """Bisecting with BisectTarget.PERS_REF gives the index of the expected region, per chromosome."""
        chr1_regions = [
            SeqRegion(base_ref_start=1, pers_ref_start=1, length=1),
            SeqRegion(base_ref_start=2, pers_ref_start=2, length=5,
                      vcf_record_ref="TAT", vcf_record_alt="GCCAC"),
        ]
        chr2_regions = [
            SeqRegion(base_ref_start=1, pers_ref_start=1, length=200)
        ]
        regions_map = SearchableSeqRegionsMap({
            "chr1": chr1_regions,
            "chr2": chr2_regions,
        })

        # Each query record is paired with the region it is expected to hit.
        cases = [
            (_MockVcfRecord(pos=100, ref="T", alts=["A"], chrom="chr2"),
             chr2_regions[0]),
            (_MockVcfRecord(pos=4, ref="C", alts=["A"], chrom="chr1"),
             chr1_regions[1]),
        ]
        for query, expected_region in cases:
            region_index = regions_map.bisect(query.chrom, query.pos,
                                              BisectTarget.PERS_REF)
            found = regions_map.get_region(query.chrom, region_index)
            self.assertEqual(found, expected_region)
Exemplo n.º 2
0
    def test_multiple_deletions(self):
        """
        Rebase a deletion discovered on top of an already-deleted variant site.

        The discovered record removes one base at the end of a variant site
        (whose chosen allele carries a deletion and a SNP, making it a
        plausible inferred allele) and one base of the following non-variant
        site.

        Because the discovered record starts inside a variant site, the rebased
        alt is expected to be elongated, and the rebased ref is expected to
        contain every deleted base.
        """
        # base reference:     CAA C GCTA CAA
        # inferred reference: C   C GAT  CAA
        initial_records = [
            _MockVcfRecord(pos=1, ref="CAA", alts=["C"]),
            _MockVcfRecord(pos=5, ref="GCTA", alts=["GAT"]),
        ]
        region_map = SeqRegionMapper(initial_records, {"JAC": 11}).get_map()
        searcher = SearchableSeqRegionsMap(region_map)

        discovered = _MockVcfRecord(pos=4, ref="ATC", alts=["A"])
        rebased = discover._rebase_vcf_record(discovered, "JAC", searcher)

        # Copy pos/ref/alts into a fresh mock record for the comparison.
        actual = _MockVcfRecord(rebased.pos, rebased.ref, rebased.alts)
        self.assertEqual(_MockVcfRecord(pos=5, ref="GCTAC", alts=["GA"]),
                         actual)
Exemplo n.º 3
0
    def test_SNP_OnTopOfIndel(self):
        """
        Rebase a SNP discovered inside an insertion carried by the inferred reference.

        The discovered record only spells out the changed base, so the rebased
        alt is expected to also carry the flanking bases of the inserted allele.
        """
        # base sequ: T TAT CGG T     A
        # secondary: T G   CGG TCTGC A
        initial_records = [
            _MockVcfRecord(pos=2, ref="TAT", alts=["G"]),
            _MockVcfRecord(pos=8, ref="T", alts=["TCTGC"]),
        ]
        region_map = SeqRegionMapper(initial_records, {"JAC": 9}).get_map()
        searcher = SearchableSeqRegionsMap(region_map)

        discovered = _MockVcfRecord(pos=9, ref="G", alts=["A"])
        rebased = discover._rebase_vcf_record(discovered, "JAC", searcher)

        # Copy pos/ref/alts into a fresh mock record for the comparison.
        actual = _MockVcfRecord(rebased.pos, rebased.ref, rebased.alts)
        self.assertEqual(_MockVcfRecord(8, "T", ["TCTAC"]), actual)
Exemplo n.º 4
0
    def test_base_ref_pers_ref_same_results(self):
        """Every BisectTarget gives the same index when base and pers starts coincide in all regions."""
        regions = [
            SeqRegion(base_ref_start=1, pers_ref_start=1, length=1),
            SeqRegion(base_ref_start=2, pers_ref_start=2, length=3,
                      vcf_record_ref="TAT", vcf_record_alt="GCC"),
            SeqRegion(base_ref_start=5, pers_ref_start=5, length=3),
        ]
        in_var_region = _MockVcfRecord(pos=2, ref="GC", alts=["GA"])
        in_nonvar_region = _MockVcfRecord(pos=1, ref="A", alts=["T"])
        regions_map = SearchableSeqRegionsMap({"JAC": regions})

        # (query record, expected region index) - checked for every target.
        for record, expected_index in ((in_var_region, 1),
                                       (in_nonvar_region, 0)):
            for target in BisectTarget:
                self.assertEqual(
                    expected_index,
                    regions_map.bisect("JAC", record.pos, target))
Exemplo n.º 5
0
    def test_TwoRecords_CorrectRegions(self):
        """Two non-adjacent records give five regions: invariant and variant ones alternating."""
        # base sequence:      T TAT    C G   G
        # derived sequence:   T GCCAC  C TTT G
        records = [
            _MockVcfRecord(pos=2, ref="TAT", alts=["GCCAC"]),
            _MockVcfRecord(pos=6, ref="G", alts=["TTT"]),
        ]
        region_map = SeqRegionMapper(records, {"JAC": 7}).get_map()

        expected_regions = [
            SeqRegion(base_ref_start=1, pers_ref_start=1, length=1),
            SeqRegion(base_ref_start=2, pers_ref_start=2, length=5,
                      vcf_record_ref="TAT", vcf_record_alt="GCCAC"),
            SeqRegion(base_ref_start=5, pers_ref_start=7, length=1),
            SeqRegion(base_ref_start=6, pers_ref_start=8, length=3,
                      vcf_record_ref="G", vcf_record_alt="TTT"),
            SeqRegion(base_ref_start=7, pers_ref_start=11, length=1),
        ]
        self.assertEqual(expected_regions, region_map["JAC"])
Exemplo n.º 6
0
 def test_two_snps_same_chrom(self, mock_var_file, mock_load_fasta):
     """Two SNP records on the same chromosome each produce their own site.

     The first record has one alternative allele, the second has two.
     mock_var_file / mock_load_fasta are presumably injected by patch
     decorators outside this view (TODO confirm); they are configured here to
     serve the records and self.chroms.
     """
     recs = [
         # alts is given as a list, like every other record in these tests
         # (a bare string only worked because it iterates as one character).
         _MockVcfRecord(1, "A", ["G"], chrom="ref1"),
         _MockVcfRecord(3, "C", ["T", "G"], chrom="ref1"),
     ]
     mock_var_file.return_value.fetch.return_value = iter(recs)
     mock_load_fasta.return_value = self.chroms
     converter = Vcf_to_prg("", "", "")
     self.assertEqual("5A6G6G7C8T8G8AGCCCCGGG", converter._get_string())
Exemplo n.º 7
0
 def test_snp_inside_del(self, mock_var_file, mock_load_fasta):
     """A SNP record lying inside the span of a preceding deletion record.

     The previous body held two SNPs at the same position and no deletion, so
     it did not exercise what the test name describes. The records below are a
     deletion covering positions 1-4 followed by a SNP at position 2; the
     expected string contains a single site holding only the deletion's
     alleles.

     mock_var_file / mock_load_fasta are presumably injected by patch
     decorators outside this view (TODO confirm).
     """
     recs = [
         _MockVcfRecord(1, "TTTT", ["T"], chrom="ref1"),
         _MockVcfRecord(2, "T", ["C"], chrom="ref1"),
     ]
     mock_var_file.return_value.fetch.return_value = iter(recs)
     mock_load_fasta.return_value = self.chroms
     converter = Vcf_to_prg("", "", "")
     self.assertEqual("5TTTT6T6CCC", converter._get_string())
Exemplo n.º 8
0
 def test_adjacent_snps_kept(self, mock_var_file, mock_load_fasta):
     """SNPs at consecutive positions of ref2 both appear as sites in the output string.

     NOTE(review): the mock arguments are presumably injected by patch
     decorators outside this view - confirm.
     """
     adjacent_snps = [
         _MockVcfRecord(1, "C", ["G"], chrom="ref2"),
         _MockVcfRecord(2, "C", ["A"], chrom="ref2"),
     ]
     mock_load_fasta.return_value = self.chroms
     mock_var_file.return_value.fetch.return_value = iter(adjacent_snps)
     prg_string = Vcf_to_prg("", "", "")._get_string()
     self.assertEqual("AGCAGC5C6G67C8A8CGGG", prg_string)
Exemplo n.º 9
0
 def test_snps_at_same_position(self, mock_var_file, mock_load_fasta):
     """Two SNP records sharing one position on the same chromosome.

     The previous body held a deletion at position 1 and a SNP at position 2,
     so the records were not at the same position as the test name states.
     The records below are two SNPs at position 2; the expected string
     contains a single site holding only the first record's alleles.

     mock_var_file / mock_load_fasta are presumably injected by patch
     decorators outside this view (TODO confirm).
     """
     recs = [
         _MockVcfRecord(2, "T", ["G"], chrom="ref1"),
         _MockVcfRecord(2, "T", ["C"], chrom="ref1"),
     ]
     mock_var_file.return_value.fetch.return_value = iter(recs)
     mock_load_fasta.return_value = self.chroms
     converter = Vcf_to_prg("", "", "")
     self.assertEqual("T5T6G6TTCCC", converter._get_string())
Exemplo n.º 10
0
 def test_one_ins_and_one_del_diff_chroms(self, mock_var_file, mock_load_fasta):
     """An insertion on ref1 and a deletion on ref2 each become a site in the output string.

     NOTE(review): the mock arguments are presumably injected by patch
     decorators outside this view - confirm.
     """
     insertion = _MockVcfRecord(3, "C", ["CGG"], chrom="ref1")
     deletion = _MockVcfRecord(1, "CCC", ["C"], chrom="ref2")
     mock_var_file.return_value.fetch.return_value = iter([insertion, deletion])
     mock_load_fasta.return_value = self.chroms
     prg_string = Vcf_to_prg("", "", "")._get_string()
     self.assertEqual("AG5C6CGG6AGC7CCC8C8GGG", prg_string)
 def test_rebasing_in_unknown_chromosome_fails(self):
     """Rebasing a record on chr1 raises KeyError when only one chromosome size is given.

     The only base record is on chr0 and a single chromosome size is supplied,
     while the discovered record is on chr1.
     """
     base_records = [
         _MockVcfRecord(pos=2, ref="T", alts=["G"], chrom="chr0")
     ]
     discov_record = _MockVcfRecord(pos=3,
                                    ref="C",
                                    alts=["G"],
                                    chrom="chr1")
     with self.assertRaises(KeyError):
         # The call is expected to raise, so its return value is not bound.
         run_rebase(discov_record, base_records, chrom_sizes=[5])
    def test_VariantInInvariantChromosome(self):
        """A record on a chromosome with no base records comes back from rebasing equal to the input."""
        chr0_records = [
            _MockVcfRecord(pos=2, ref="TAT", alts=["G"], chrom="chr0")
        ]
        discovered = _MockVcfRecord(pos=1, ref="TT", alts=["GA"], chrom="chr1")

        rebased = run_rebase(discovered, chr0_records, chrom_sizes=[7, 5])

        # The expected output is the discovered record itself.
        self.assertEqual(discovered, rebased)
    def test_rebasing_with_too_short_chromosome_fails(self):
        """Rebasing raises IndexError when the chromosome size given is 3 and the record is at position 4."""
        # base sequence:      AA ATCTA
        # derived sequence:   T  ATCTA
        base_records = [
            _MockVcfRecord(pos=1, ref="AA", alts=["T"], chrom="chr0")
        ]
        discov_record = _MockVcfRecord(pos=4,
                                       ref="C",
                                       alts=["G"],
                                       chrom="chr0")

        with self.assertRaises(IndexError):
            # The call is expected to raise, so its return value is not bound.
            run_rebase(discov_record, base_records, chrom_sizes=[3])
    def test_VariantCoveringAllOfVariantRegion(self):
        """A SNP at derived position 3 is expected at base position 5 after rebasing.

        NOTE(review): derived position 3 comes after the single-base alt at
        position 2, so this looks like it checks the coordinate shift past the
        variant site rather than a record covering the site itself - confirm
        against the test name.
        """
        # base sequence:      - TAT ---
        # derived sequence:   - G   ---
        chr0_records = [
            _MockVcfRecord(pos=2, ref="TAT", alts=["G"], chrom="chr0")
        ]
        discovered = _MockVcfRecord(pos=3, ref="G", alts=["C"], chrom="chr0")

        rebased = run_rebase(discovered, chr0_records, chrom_sizes=[7])

        self.assertEqual(
            _MockVcfRecord(pos=5, ref="G", alts=["C"], chrom="chr0"), rebased)
    def test_VariantCoveringPartOfInvariantRegion(self):
        """A SNP in the invariant stretch after a 2-to-1 base deletion is shifted right by one."""
        # base sequence:      AA ATCTA
        # derived sequence:   T  ATCTA
        chr0_records = [
            _MockVcfRecord(pos=1, ref="AA", alts=["T"], chrom="chr0")
        ]
        discovered = _MockVcfRecord(pos=4, ref="C", alts=["G"], chrom="chr0")
        expected_record = _MockVcfRecord(pos=5,
                                         ref="C",
                                         alts=["G"],
                                         chrom="chr0")

        rebased = run_rebase(discovered, chr0_records, chrom_sizes=[7])

        self.assertEqual(expected_record, rebased)
Exemplo n.º 16
0
    def test_base_ref_further_than_pers_ref(self):
        """The same position maps to different region indices depending on the bisect target."""
        regions = [
            SeqRegion(base_ref_start=1, pers_ref_start=1, length=1),
            SeqRegion(base_ref_start=2, pers_ref_start=2, length=5,
                      vcf_record_ref="TAT", vcf_record_alt="GCCAC"),
            SeqRegion(base_ref_start=5, pers_ref_start=7, length=3,
                      vcf_record_ref="G", vcf_record_alt="TTT"),
        ]
        query = _MockVcfRecord(pos=6, ref="T", alts=["A"])
        regions_map = SearchableSeqRegionsMap({"JAC": regions})

        # With PERS_REF the third region starts at 7, so position 6 is
        # expected in the second region (index 1).
        index_in_pers_ref = regions_map.bisect("JAC", query.pos,
                                               BisectTarget.PERS_REF)
        self.assertEqual(1, index_in_pers_ref)

        # With BASE_REF the third region starts at 5, so position 6 is
        # expected in the third region (index 2).
        index_in_base_ref = regions_map.bisect("JAC", query.pos,
                                               BisectTarget.BASE_REF)
        self.assertEqual(2, index_in_base_ref)
Exemplo n.º 17
0
    def test_chrom_with_no_records(self):
        """
        Chromosomes without any initial variation must be mapped as well.
        """
        records = [
            _MockVcfRecord(pos=2, ref="T", alts=["A"], chrom="Chrom_2")
        ]
        region_map = SeqRegionMapper(records, {
            "Chrom_1": 4,
            "Chrom_2": 5
        }).get_map()

        expected_by_chrom = {
            # No record on Chrom_1: one region spanning its 4 bases.
            "Chrom_1": [
                SeqRegion(base_ref_start=1, pers_ref_start=1, length=4),
            ],
            "Chrom_2": [
                SeqRegion(base_ref_start=1, pers_ref_start=1, length=1),
                SeqRegion(base_ref_start=2, pers_ref_start=2, length=1,
                          vcf_record_ref="T", vcf_record_alt="A"),
                SeqRegion(base_ref_start=3, pers_ref_start=3, length=3),
            ],
        }
        for chrom, expected_regions in expected_by_chrom.items():
            self.assertEqual(expected_regions, region_map[chrom])
    def test_VariantOverlapsTwoRegions_PartOfInvarAllOfVar(self):
        """A record spanning the end of an invariant stretch and a whole variant site is rebased to the base ref."""
        # base sequence:      AAA  AGA A
        # derived sequence:   TTTT AGA C
        # overlap:                  -- -
        chr0_records = [
            _MockVcfRecord(pos=1, ref="AAA", alts=["TTTT"], chrom="chr0"),
            _MockVcfRecord(pos=7, ref="A", alts=["C"], chrom="chr0"),
        ]
        discovered = _MockVcfRecord(pos=6, ref="GAC", alts=["AAT"], chrom="chr0")

        rebased = run_rebase(discovered, chr0_records, chrom_sizes=[7])

        self.assertEqual(
            _MockVcfRecord(pos=5, ref="GAA", alts=["AAT"], chrom="chr0"),
            rebased)
Exemplo n.º 19
0
 def test_one_variant_chroms_with_no_vars_in_same_order(self, mock_var_file, mock_load_fasta):
     """With a single record on ref3, the site is expected after plain sequence in the output string.

     NOTE(review): the mock arguments are presumably injected by patch
     decorators outside this view - confirm.
     """
     only_record = _MockVcfRecord(2, "G", ["CAAA", "CA"], chrom="ref3")
     mock_load_fasta.return_value = self.chroms
     mock_var_file.return_value.fetch.return_value = iter([only_record])
     prg_string = Vcf_to_prg("", "", "")._get_string()
     self.assertEqual("AGCAGCCCCG5G6CAAA6CA6G", prg_string)
 def test_filter_pass_record_kept(self, mock_var_file, mock_load_fasta):
     """A record left with its default filter appears as a site in the output string.

     mock_var_file / mock_load_fasta are presumably injected by patch
     decorators outside this view (TODO confirm).
     """
     # Default filter: PASS. alts is given as a list, like the other records
     # in these tests, instead of a bare string.
     recs = [_MockVcfRecord(2, "A", ["G"], chrom="JAC")]
     mock_var_file.return_value.fetch.return_value = iter(recs)
     mock_load_fasta.return_value = self.chroms
     converter = Vcf_to_prg("", "", "")
     # Expected value first, matching the sibling tests, so failure messages
     # read the same way everywhere.
     self.assertEqual("A5A6G6C", converter._get_string())
Exemplo n.º 21
0
    def test_StartsAtNonSite_EndsAtSite(self):
        """A record starting in a non-variant stretch and ending on a variant site gets the site's base ref."""
        # base sequence:      T TAT CGG
        # derived sequence:   T G   CGG
        initial_records = [_MockVcfRecord(pos=2, ref="TAT", alts=["G"])]
        region_map = SeqRegionMapper(initial_records, {"JAC": 7}).get_map()
        searcher = SearchableSeqRegionsMap(region_map)

        discovered = _MockVcfRecord(pos=1, ref="TG", alts=["TAA"])
        rebased = discover._rebase_vcf_record(discovered, "JAC", searcher)

        # Copy pos/ref/alts into a fresh mock record for the comparison.
        actual = _MockVcfRecord(rebased.pos, rebased.ref, rebased.alts)
        self.assertEqual(_MockVcfRecord(1, "TTAT", ["TAA"]), actual)
Exemplo n.º 22
0
    def test_SingleSNPInNonSite(self):
        """A SNP discovered after the variant site is shifted to its base-reference position."""
        # base sequence:      T TAT CGG
        # derived sequence:   T G   CGG
        # The base sequence above has 7 bases; the size previously given here
        # (5) did not match it or the sibling tests using the same sequence.
        chrom_sizes = {"JAC": 7}
        base_records = [_MockVcfRecord(pos=2, ref="TAT", alts=["G"])]
        region_map = SeqRegionMapper(base_records, chrom_sizes).get_map()
        region_searcher = SearchableSeqRegionsMap(region_map)

        # Position 3 of the derived sequence is the C right after the site.
        discov_record = _MockVcfRecord(pos=3, ref="C", alts=["G"])
        new_vcf_record = discover._rebase_vcf_record(discov_record, "JAC",
                                                     region_searcher)

        # Copy pos/ref/alts into a fresh mock record for the comparison.
        result = _MockVcfRecord(new_vcf_record.pos, new_vcf_record.ref,
                                new_vcf_record.alts)
        expected = _MockVcfRecord(pos=5, ref="C", alts=["G"])

        self.assertEqual(expected, result)
    def test_VariantOverlapsThreeRegions_InVarThenVarThenInvar_PartialSpan(self):
        """A deletion spanning part of an invariant stretch, a whole site, and part of the next stretch."""
        # base sequ: T TAT GGG T     ATTTT
        # secondary: T GG  GGG TCTGT ATTTT
        # overlap:          -- ----- --
        chr0_records = [
            _MockVcfRecord(pos=2, ref="TAT", alts=["GG"], chrom="chr0"),
            _MockVcfRecord(pos=8, ref="T", alts=["TCTGT"], chrom="chr0"),
        ]
        discovered = _MockVcfRecord(pos=5, ref="GGTCTGTAT", alts=["T"], chrom="chr0")

        rebased = run_rebase(discovered, chr0_records, chrom_sizes=[13])

        self.assertEqual(
            _MockVcfRecord(pos=6, ref="GGTAT", alts=["T"], chrom="chr0"),
            rebased)
Exemplo n.º 24
0
class Test_Representation(TestCase):
    """Check the string (legacy mode) and integer outputs of Vcf_to_prg for one SNP and one insertion.

    NOTE(review): the test methods take mock_var_file / mock_load_fasta;
    presumably these are injected by patch decorators outside this view -
    confirm.
    """
    # Sequences handed back by the mocked FASTA loader.
    chroms = {"ref1": "ACACAA"}
    # Records handed back by the mocked variant file's fetch().
    recs = [
        _MockVcfRecord(1, "A", ["G"], chrom="ref1"),
        _MockVcfRecord(5, "A", ["AAA"], chrom="ref1"),
    ]

    def test_legacy_representation(self, mock_var_file, mock_load_fasta):
        """With mode="legacy", the output string matches the expected marker layout."""
        mock_load_fasta.return_value = self.chroms
        mock_var_file.return_value.fetch.return_value = iter(self.recs)
        legacy_string = Vcf_to_prg("", "", "", mode="legacy")._get_string()
        self.assertEqual("5A6G5CAC7A8AAA7A", legacy_string)

    def test_integer_representation(self, mock_var_file, mock_load_fasta):
        """With the default mode, _get_ints returns the expected integer list."""
        mock_load_fasta.return_value = self.chroms
        mock_var_file.return_value.fetch.return_value = iter(self.recs)
        expected_ints = [5, 1, 6, 3, 6, 2, 1, 2, 7, 1, 8, 1, 1, 1, 8, 1]
        self.assertEqual(expected_ints, Vcf_to_prg("", "", "")._get_ints())
Exemplo n.º 25
0
    def test_variant_in_chromo_with_no_prg_variants(self):
        """A record discovered on a chromosome without base records comes back equal to the input."""
        # chr1 base:    T TAT CGG
        # chr1 derived: T G   CGG
        # chr2 base:    TTTTT
        # chr2 derived: TTTTT
        initial_records = [
            _MockVcfRecord(pos=2, ref="TAT", alts=["G"], chrom="chr1")
        ]
        region_map = SeqRegionMapper(initial_records, {
            "chr1": 7,
            "chr2": 5
        }).get_map()
        searcher = SearchableSeqRegionsMap(region_map)

        discovered = _MockVcfRecord(pos=1, ref="TT", alts=["GA"], chrom="chr2")
        rebased = discover._rebase_vcf_record(discovered, "chr2", searcher)

        self.assertEqual(discovered, rebased)
Exemplo n.º 26
0
 def test_ref_call_produces_invariant_region_only(self):
     """A record whose sample genotype is [0] gives a single region spanning the chromosome."""
     # base sequence:      T TAT CGG
     # derived sequence:   ^^^^^^^^^
     ref_call = _MockVcfRecord(pos=2,
                               ref="TAT",
                               alts=["G"],
                               samples=[{"GT": [0]}])
     region_map = SeqRegionMapper([ref_call], {"JAC": 7}).get_map()

     whole_chrom = SeqRegion(base_ref_start=1, pers_ref_start=1, length=7)
     self.assertEqual([whole_chrom], region_map["JAC"])
Exemplo n.º 27
0
    def test_ThreeAdjacentRecords_CorrectRegions(self):
        """Three back-to-back records give three consecutive variant regions with no invariant region between them."""
        # base sequence:      T TAT    C   G  G
        # derived sequence:   T GCCAC  TCT AA G
        base_records = [
            _MockVcfRecord(pos=2, ref="TAT", alts=["GCCAC"]),
            _MockVcfRecord(pos=5, ref="C", alts=["TCT"]),
            _MockVcfRecord(pos=6, ref="G", alts=["AA"]),
        ]
        chrom_sizes = {"JAC": 7}
        result = SeqRegionMapper(base_records, chrom_sizes).get_map()

        expected = [
            SeqRegion(base_ref_start=1, pers_ref_start=1, length=1),
            SeqRegion(
                base_ref_start=2,
                pers_ref_start=2,
                length=5,
                vcf_record_ref="TAT",
                vcf_record_alt="GCCAC",
            ),
            SeqRegion(
                base_ref_start=5,
                pers_ref_start=7,
                length=3,
                vcf_record_ref="C",
                vcf_record_alt="TCT",
            ),
            SeqRegion(
                base_ref_start=6,
                pers_ref_start=10,
                length=2,
                vcf_record_ref="G",
                vcf_record_alt="AA",
            ),
            SeqRegion(base_ref_start=7, pers_ref_start=12, length=1),
        ]

        # Look the chromosome up by name, as the sibling mapper tests do,
        # rather than relying on it being the first value in the map.
        self.assertEqual(expected, result["JAC"])
Exemplo n.º 28
0
    def test_TwoRecords_TwoDifferentChroms(self):
        """Records on different chromosomes are mapped separately, each with its own coordinates."""
        records = [
            _MockVcfRecord(pos=4, ref="ATTC", alts=["A"], chrom="Chrom_1"),
            _MockVcfRecord(pos=6, ref="A", alts=["AAC"], chrom="Chrom_2"),
        ]
        region_map = SeqRegionMapper(records, {
            "Chrom_1": 10,
            "Chrom_2": 8
        }).get_map()

        expected_by_chrom = {
            # Deletion: the region after the site starts later in the base
            # ref (8) than in the pers ref (5).
            "Chrom_1": [
                SeqRegion(base_ref_start=1, pers_ref_start=1, length=3),
                SeqRegion(base_ref_start=4, pers_ref_start=4, length=1,
                          vcf_record_ref="ATTC", vcf_record_alt="A"),
                SeqRegion(base_ref_start=8, pers_ref_start=5, length=3),
            ],
            # Insertion: the region after the site starts later in the pers
            # ref (9) than in the base ref (7).
            "Chrom_2": [
                SeqRegion(base_ref_start=1, pers_ref_start=1, length=5),
                SeqRegion(base_ref_start=6, pers_ref_start=6, length=3,
                          vcf_record_ref="A", vcf_record_alt="AAC"),
                SeqRegion(base_ref_start=7, pers_ref_start=9, length=2),
            ],
        }
        for chrom, expected_regions in expected_by_chrom.items():
            self.assertEqual(expected_regions, region_map[chrom])
Exemplo n.º 29
0
    def test_SiteInBetweenNonSites(self):
        """
        Rebase a record that overlaps a non-variant site, a variant site and
        another non-variant site of the prg, in that order.

        The rebased ref is expected to cover all three sites.
        """
        # base sequ: T TAT CGG
        # secondary: T G   CGG
        initial_records = [_MockVcfRecord(pos=2, ref="TAT", alts=["G"])]
        region_map = SeqRegionMapper(initial_records, {"JAC": 7}).get_map()
        searcher = SearchableSeqRegionsMap(region_map)

        discovered = _MockVcfRecord(pos=1, ref="TGCG", alts=["GGCT"])
        rebased = discover._rebase_vcf_record(discovered, "JAC", searcher)

        # Copy pos/ref/alts into a fresh mock record for the comparison.
        actual = _MockVcfRecord(rebased.pos, rebased.ref, rebased.alts)
        self.assertEqual(_MockVcfRecord(pos=1, ref="TTATCG", alts=["GGCT"]),
                         actual)
Exemplo n.º 30
0
    def test_SingleBaseAlt_CorrectRegion(self):
        """A 3-base ref replaced by a 1-base alt gives a variant region of length 1 and shifts what follows."""
        # base sequence:      T TAT CGG
        # derived sequence:   T G   CGG
        records = [_MockVcfRecord(pos=2, ref="TAT", alts=["G"])]
        region_map = SeqRegionMapper(records, {"JAC": 7}).get_map()

        expected_regions = [
            SeqRegion(base_ref_start=1, pers_ref_start=1, length=1),
            SeqRegion(base_ref_start=2, pers_ref_start=2, length=1,
                      vcf_record_ref="TAT", vcf_record_alt="G"),
            SeqRegion(base_ref_start=5, pers_ref_start=3, length=3),
        ]
        self.assertEqual(expected_regions, region_map["JAC"])