def test_SNP_OnTopOfIndel(self):
    """
    A SNP is discovered inside an insertion carried by the inferred reference.

    The rebased alt must contain the flanking bases of the inserted allele,
    since the discovered record implies they are present.
    """
    # base sequ: T TAT CGG T     A
    # secondary: T G   CGG TCTGC A
    base_records = [
        _MockVcfRecord(pos=2, ref="TAT", alts=["G"]),
        _MockVcfRecord(pos=8, ref="T", alts=["TCTGC"]),
    ]
    chrom_sizes = {"JAC": 9}
    searcher = SearchableSeqRegionsMap(
        SeqRegionMapper(base_records, chrom_sizes).get_map()
    )

    # Position 9 of the secondary sequence is the 'G' inside 'TCTGC'.
    discovered = _MockVcfRecord(pos=9, ref="G", alts=["A"])
    rebased = discover._rebase_vcf_record(discovered, "JAC", searcher)

    # Re-wrap as a mock record so that only pos/ref/alts are compared.
    observed = _MockVcfRecord(rebased.pos, rebased.ref, rebased.alts)
    self.assertEqual(_MockVcfRecord(8, "T", ["TCTAC"]), observed)
def test_TwoRecords_CorrectRegions(self):
    """Two non-adjacent records produce alternating invariant/variant regions."""
    # base sequence:    T TAT   C G   G
    # derived sequence: T GCCAC C TTT G
    first_record = _MockVcfRecord(pos=2, ref="TAT", alts=["GCCAC"])
    second_record = _MockVcfRecord(pos=6, ref="G", alts=["TTT"])
    region_map = SeqRegionMapper(
        [first_record, second_record], {"JAC": 7}
    ).get_map()

    first_site = SeqRegion(
        base_ref_start=2,
        pers_ref_start=2,
        length=5,
        vcf_record_ref="TAT",
        vcf_record_alt="GCCAC",
    )
    second_site = SeqRegion(
        base_ref_start=6,
        pers_ref_start=8,
        length=3,
        vcf_record_ref="G",
        vcf_record_alt="TTT",
    )
    expected_regions = [
        SeqRegion(base_ref_start=1, pers_ref_start=1, length=1),
        first_site,
        SeqRegion(base_ref_start=5, pers_ref_start=7, length=1),
        second_site,
        SeqRegion(base_ref_start=7, pers_ref_start=11, length=1),
    ]
    self.assertEqual(expected_regions, region_map["JAC"])
def test_chrom_with_no_records(self):
    """
    Chromosomes without any initial variation must still be mapped:
    here Chrom_1 has no records and is expected as one invariant region.
    """
    chrom_sizes = {"Chrom_1": 4, "Chrom_2": 5}
    only_record = _MockVcfRecord(pos=2, ref="T", alts=["A"], chrom="Chrom_2")
    region_map = SeqRegionMapper([only_record], chrom_sizes).get_map()

    expectations = {
        "Chrom_1": [SeqRegion(base_ref_start=1, pers_ref_start=1, length=4)],
        "Chrom_2": [
            SeqRegion(base_ref_start=1, pers_ref_start=1, length=1),
            SeqRegion(
                base_ref_start=2,
                pers_ref_start=2,
                length=1,
                vcf_record_ref="T",
                vcf_record_alt="A",
            ),
            SeqRegion(base_ref_start=3, pers_ref_start=3, length=3),
        ],
    }
    for chrom, expected_regions in expectations.items():
        self.assertEqual(expected_regions, region_map[chrom])
def test_multiple_deletions(self):
    """
    A deletion is discovered on top of a deletion inside a variant site,
    together with an extra deleted base in the following non-variant site.

    The original deletion also carries a SNP, to make it plausible that
    quasimap/infer picks this variant. The discovered variation starts inside
    the variant site, so the rebased alt is expected to be elongated, and the
    rebased ref is expected to include all deleted bases.
    """
    # base reference:     CAA C GCTA CAA
    # inferred reference: C   C GAT  CAA
    base_records = [
        _MockVcfRecord(pos=1, ref="CAA", alts=["C"]),
        _MockVcfRecord(pos=5, ref="GCTA", alts=["GAT"]),
    ]
    chrom_sizes = {"JAC": 11}
    region_map = SeqRegionMapper(base_records, chrom_sizes).get_map()
    searcher = SearchableSeqRegionsMap(region_map)

    discovered = _MockVcfRecord(pos=4, ref="ATC", alts=["A"])
    rebased = discover._rebase_vcf_record(discovered, "JAC", searcher)

    # Re-wrap as a mock record so that only pos/ref/alts are compared.
    observed = _MockVcfRecord(rebased.pos, rebased.ref, rebased.alts)
    self.assertEqual(
        _MockVcfRecord(pos=5, ref="GCTAC", alts=["GA"]),
        observed,
    )
def test_NoRecords(self):
    """
    Building a map from no records at all is expected to raise ValueError.

    Imagining a vcf from `infer` has no records, `build` would not have
    succeeded in the first place, having not built a prg given no variants.
    """
    # base sequence:    TTATCGG
    # derived sequence: TTATCGG
    chrom_sizes = {}
    base_records = []
    # Only the raised exception matters, so the return value is not bound.
    with self.assertRaises(ValueError):
        SeqRegionMapper(base_records, chrom_sizes).get_map()
def test_ref_call_produces_invariant_region_only(self):
    """A record genotyped as the reference allele (GT 0) yields one invariant region."""
    # base sequence:    T TAT CGG
    # derived sequence: ^^^^^^^^^
    ref_call = _MockVcfRecord(
        pos=2, ref="TAT", alts=["G"], samples=[{"GT": [0]}]
    )
    region_map = SeqRegionMapper([ref_call], {"JAC": 7}).get_map()

    whole_chrom = SeqRegion(base_ref_start=1, pers_ref_start=1, length=7)
    self.assertEqual([whole_chrom], region_map["JAC"])
def test_ThreeAdjacentRecords_CorrectRegions(self):
    """Three back-to-back records give three consecutive variant regions."""
    # base sequence:    T TAT   C   G  G
    # derived sequence: T GCCAC TCT AA G
    chrom_sizes = {"JAC": 7}
    base_records = [
        _MockVcfRecord(pos=2, ref="TAT", alts=["GCCAC"]),
        _MockVcfRecord(pos=5, ref="C", alts=["TCT"]),
        _MockVcfRecord(pos=6, ref="G", alts=["AA"]),
    ]
    region_map = SeqRegionMapper(base_records, chrom_sizes).get_map()

    leading_invariant = SeqRegion(base_ref_start=1, pers_ref_start=1, length=1)
    variant_sites = [
        SeqRegion(
            base_ref_start=2,
            pers_ref_start=2,
            length=5,
            vcf_record_ref="TAT",
            vcf_record_alt="GCCAC",
        ),
        SeqRegion(
            base_ref_start=5,
            pers_ref_start=7,
            length=3,
            vcf_record_ref="C",
            vcf_record_alt="TCT",
        ),
        SeqRegion(
            base_ref_start=6,
            pers_ref_start=10,
            length=2,
            vcf_record_ref="G",
            vcf_record_alt="AA",
        ),
    ]
    trailing_invariant = SeqRegion(base_ref_start=7, pers_ref_start=12, length=1)
    expected_regions = [leading_invariant, *variant_sites, trailing_invariant]

    # The regions of the first chromosome in the map are the ones checked.
    first_chrom_regions = list(region_map.values())[0]
    self.assertEqual(expected_regions, first_chrom_regions)
def _make_rebasing_map(geno_paths: GenotypePaths):
    """
    Build and dump the coordinate-translation map between the original
    reference (the one the genotyped vcf uses as REF) and the
    gramtools-induced personalised reference.

    The map allows translating a position from either reference coordinate
    space to the other. It is used in `discover` for rebasing newly found
    variants against the original reference.

    The map is written to `geno_paths.rebasing_map`, without sequences.
    """
    sizes: ChromSizes = common.load_fasta(geno_paths.pers_ref, sizes_only=True)
    genotyped_records = VariantFile(geno_paths.geno_vcf).fetch()

    mapper = SeqRegionMapper(genotyped_records, sizes)
    regions: SeqRegionsMap = mapper.get_map()

    searchable_map = SearchableSeqRegionsMap(regions)
    searchable_map.dump_to(geno_paths.rebasing_map, dump_sequences=False)
def _rebase_vcf(disco_paths: DiscoverPaths, check_records=True):
    """
    Rebase the discovery vcf so that it uses the same reference as base_vcf.

    (* marks a file that is not an input/output, shown for illustration only)

    Input:   discovery.vcf              personalised_ref.vcf
                   |                            |
             personalised_ref.fasta       *base_ref.fasta

    Output:  discovery.vcf
                   |
             *base_ref.fasta

    When `check_records` is true, each discovered record is first passed to
    `check_ref_consistent` together with its chromosome's personalised
    reference sequence; records for which that check fails are not rebased,
    and a warning reporting how many were skipped is logged.

    Returns the list of rebased records.
    """
    if check_records:
        inferred_refs = load_fasta(disco_paths.pers_ref)
        # Presumably filled in by check_ref_consistent with a description of
        # each rejected record; it is only read for logging below.
        var_unplaced_records = []

    _add_contig_lines(disco_paths)

    base_records = VariantFile(disco_paths.geno_vcf).fetch()
    derived_records = VariantFile(disco_paths.discov_vcf_cortex).fetch()

    # Not loading genotype-produced rebasing map here, because it lacks the sequences
    chrom_sizes: ChromSizes = load_fasta(disco_paths.pers_ref, sizes_only=True)
    mapper = SeqRegionMapper(base_records, chrom_sizes)
    region_map: SeqRegionsMap = mapper.get_map()
    region_searcher = SearchableSeqRegionsMap(region_map)

    new_vcf_records = []
    for record in derived_records:
        chrom = record.chrom
        # Without checking, every record is accepted and the checker is never called.
        placeable = not check_records or check_ref_consistent(
            record, inferred_refs[chrom], var_unplaced_records
        )
        if not placeable:
            continue  # Do not process inconsistent records
        rebased_record = _rebase_vcf_record(record, chrom, region_searcher)
        new_vcf_records.append(rebased_record)

    if check_records and var_unplaced_records:
        log.warning(
            f"{len(var_unplaced_records)} new variant records were skipped, "
            f"because record pos and ref do not coincide with personalised reference"
        )
        log.debug("Skipped records: {}".format("\n".join(var_unplaced_records)))

    return new_vcf_records
def test_StartsAtNonSite_EndsAtSite(self):
    """A discovered record starting in a non-variant region and ending in a variant site."""
    # base sequence:    T TAT CGG
    # derived sequence: T G   CGG
    base_records = [_MockVcfRecord(pos=2, ref="TAT", alts=["G"])]
    chrom_sizes = {"JAC": 7}
    searcher = SearchableSeqRegionsMap(
        SeqRegionMapper(base_records, chrom_sizes).get_map()
    )

    discovered = _MockVcfRecord(pos=1, ref="TG", alts=["TAA"])
    rebased = discover._rebase_vcf_record(discovered, "JAC", searcher)

    # Re-wrap as a mock record so that only pos/ref/alts are compared.
    observed = _MockVcfRecord(rebased.pos, rebased.ref, rebased.alts)
    self.assertEqual(_MockVcfRecord(1, "TTAT", ["TAA"]), observed)
def test_SingleSNPInNonSite(self):
    """
    A SNP discovered in a non-variant region, downstream of a deletion,
    only needs its position shifted back to base reference coordinates.
    """
    # base sequence:    T TAT CGG
    # derived sequence: T G   CGG
    # Length of the base sequence above (7 bases), as in the other tests
    # built on this same sequence.
    chrom_sizes = {"JAC": 7}
    base_records = [_MockVcfRecord(pos=2, ref="TAT", alts=["G"])]
    region_map = SeqRegionMapper(base_records, chrom_sizes).get_map()
    region_searcher = SearchableSeqRegionsMap(region_map)

    # Position 3 of the derived sequence is the 'C' at base position 5.
    discov_record = _MockVcfRecord(pos=3, ref="C", alts=["G"])
    new_vcf_record = discover._rebase_vcf_record(
        discov_record, "JAC", region_searcher
    )

    # Re-wrap as a mock record so that only pos/ref/alts are compared.
    result = _MockVcfRecord(
        new_vcf_record.pos, new_vcf_record.ref, new_vcf_record.alts
    )
    expected = _MockVcfRecord(pos=5, ref="C", alts=["G"])
    self.assertEqual(expected, result)
def test_variant_in_chromo_with_no_prg_variants(self):
    """
    A record discovered on a chromosome that has no base records is
    expected to compare equal to its rebased counterpart.
    """
    # chr1 base:    T TAT CGG
    # chr1 derived: T G   CGG
    # chr2 base:    TTTTT
    # chr2 derived: TTTTT
    base_records = [_MockVcfRecord(pos=2, ref="TAT", alts=["G"], chrom="chr1")]
    chrom_sizes = {"chr1": 7, "chr2": 5}
    region_map = SeqRegionMapper(base_records, chrom_sizes).get_map()
    searcher = SearchableSeqRegionsMap(region_map)

    discovered = _MockVcfRecord(pos=1, ref="TT", alts=["GA"], chrom="chr2")
    rebased = discover._rebase_vcf_record(discovered, "chr2", searcher)

    self.assertEqual(discovered, rebased)
def test_SingleBaseAlt_CorrectRegion(self):
    """A deletion with a single-base alt gives a variant region of length one."""
    # base sequence:    T TAT CGG
    # derived sequence: T G   CGG
    deletion = _MockVcfRecord(pos=2, ref="TAT", alts=["G"])
    region_map = SeqRegionMapper([deletion], {"JAC": 7}).get_map()

    variant_site = SeqRegion(
        base_ref_start=2,
        pers_ref_start=2,
        length=1,
        vcf_record_ref="TAT",
        vcf_record_alt="G",
    )
    expected_regions = [
        SeqRegion(base_ref_start=1, pers_ref_start=1, length=1),
        variant_site,
        SeqRegion(base_ref_start=5, pers_ref_start=3, length=3),
    ]
    self.assertEqual(expected_regions, region_map["JAC"])
def test_TwoRecords_TwoDifferentChroms(self):
    """Records on two different chromosomes are mapped independently per chromosome."""
    chrom_sizes = {"Chrom_1": 10, "Chrom_2": 8}
    base_records = [
        _MockVcfRecord(pos=4, ref="ATTC", alts=["A"], chrom="Chrom_1"),
        _MockVcfRecord(pos=6, ref="A", alts=["AAC"], chrom="Chrom_2"),
    ]
    region_map = SeqRegionMapper(base_records, chrom_sizes).get_map()

    expectations = {
        # Deletion: the personalised reference lags behind the base one.
        "Chrom_1": [
            SeqRegion(base_ref_start=1, pers_ref_start=1, length=3),
            SeqRegion(
                base_ref_start=4,
                pers_ref_start=4,
                length=1,
                vcf_record_ref="ATTC",
                vcf_record_alt="A",
            ),
            SeqRegion(base_ref_start=8, pers_ref_start=5, length=3),
        ],
        # Insertion: the personalised reference runs ahead of the base one.
        "Chrom_2": [
            SeqRegion(base_ref_start=1, pers_ref_start=1, length=5),
            SeqRegion(
                base_ref_start=6,
                pers_ref_start=6,
                length=3,
                vcf_record_ref="A",
                vcf_record_alt="AAC",
            ),
            SeqRegion(base_ref_start=7, pers_ref_start=9, length=2),
        ],
    }
    for chrom, expected_regions in expectations.items():
        self.assertEqual(expected_regions, region_map[chrom])
def test_SiteInBetweenNonSites(self):
    """
    The variation found on top of the inferred reference overlaps, in the
    prg: a non-variant site, then a variant site, then a non-variant site.

    The rebased ref is expected to include all three.
    """
    # base sequ: T TAT CGG
    # secondary: T G   CGG
    base_records = [_MockVcfRecord(pos=2, ref="TAT", alts=["G"])]
    chrom_sizes = {"JAC": 7}
    region_map = SeqRegionMapper(base_records, chrom_sizes).get_map()
    searcher = SearchableSeqRegionsMap(region_map)

    discovered = _MockVcfRecord(pos=1, ref="TGCG", alts=["GGCT"])
    rebased = discover._rebase_vcf_record(discovered, "JAC", searcher)

    # Re-wrap as a mock record so that only pos/ref/alts are compared.
    observed = _MockVcfRecord(rebased.pos, rebased.ref, rebased.alts)
    self.assertEqual(
        _MockVcfRecord(pos=1, ref="TTATCG", alts=["GGCT"]),
        observed,
    )
def make_map(base_records: MockVcfRecords, chrom_sizes: List[int]) -> SearchableSeqRegionsMap:
    """
    Build a searchable region map from records and a list of chromosome sizes.

    Chromosomes are named by their index in `chrom_sizes`: "chr0", "chr1", ...
    """
    sizes_by_name = {
        f"chr{index}": size for index, size in enumerate(chrom_sizes)
    }
    mapper = SeqRegionMapper(base_records, sizes_by_name)
    return SearchableSeqRegionsMap(mapper.get_map())