Пример #1
0
    def test_write_vars_plus_flanks_to_fasta(self):
        '''test _write_vars_plus_flanks_to_fasta

        Calls _write_vars_plus_flanks_to_fasta with the records and reference
        sequences below (the final argument 3 is presumably the flank
        length - confirm against the implementation) and compares the file
        it writes, byte for byte, with the expected file in data_dir.
        '''
        ref_seqs = {
            #                                         12345678901234567890
            'ref1': pyfastaq.sequences.Fasta('ref1', 'AGTGGATAGCTAGCTAGAGA'),
            'ref2': pyfastaq.sequences.Fasta('ref2', 'AGGAGAGAGAGAGAGAA'),
            'ref3': pyfastaq.sequences.Fasta('ref3', 'AGCTTCATAGAGAGGTTTA'),
        }

        vcf_records = {
            'ref1': [
                vcf_record.VcfRecord(
                    'ref1\t3\tid_1\tT\tC,AG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80'
                ),
                vcf_record.VcfRecord(
                    'ref1\t10\tid_2\tCT\tA\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80'
                ),
            ],
            'ref3': [
                vcf_record.VcfRecord(
                    'ref3\t4\tid_3\tT\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80'
                ),
            ],
        }

        tmp_file = 'tmp.mapping_based_verifier.write_vars_plus_flanks_to_fasta.fa'
        expected_file = os.path.join(data_dir,
                                     'write_vars_plus_flanks_to_fasta.fa')
        try:
            mapping_based_verifier.MappingBasedVerifier._write_vars_plus_flanks_to_fasta(
                tmp_file, vcf_records, ref_seqs, 3)
            self.assertTrue(filecmp.cmp(expected_file, tmp_file, shallow=False))
        finally:
            # Always remove the scratch file, so a failed write or comparison
            # does not leave it behind in the working directory.
            if os.path.exists(tmp_file):
                os.unlink(tmp_file)
    def test_load_vcf_files(self):
        '''_load_vcf_files: check sample name, headers and records

        Loads the first test VCF on its own, then both test VCFs together.
        '''
        first_vcf = os.path.join(data_dir, 'load_vcf_files.1.vcf')
        second_vcf = os.path.join(data_dir, 'load_vcf_files.2.vcf')
        load = vcf_clusterer.VcfClusterer._load_vcf_files

        want_headers = {first_vcf: ['#file1 header1', '#file1 header2']}
        want_records = {
            'ref.1': [
                vcf_record.VcfRecord('ref.1\t5\tid3\tG\tA\tPASS\tSVTYPE=SNP\tGT\t1/1'),
                vcf_record.VcfRecord('ref.1\t10\tid1\tA\tT\tPASS\tSVTYPE=SNP\tGT\t1/1'),
            ],
            'ref.2': [
                vcf_record.VcfRecord('ref.2\t42\tid2\tG\tC\tPASS\tSVTYPE=SNP\tGT\t1/1'),
            ],
        }

        # First file only
        sample, headers, records = load([first_vcf])
        self.assertEqual('sample', sample)
        self.assertEqual(want_headers, headers)
        self.assertEqual(want_records, records)

        # Both files: the expectations grow by the second file's headers
        # and records, and the expected sample name is the one in the
        # second file's #CHROM line.
        want_headers[second_vcf] = [
            '#file2 header',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_from_vcf_2',
        ]
        want_records['ref.3'] = [
            vcf_record.VcfRecord('ref.3\t8\tid5\tA\tG\tPASS\tSVTYPE=SNP\tGT\t1/1'),
        ]
        want_records['ref.1'].insert(
            1, vcf_record.VcfRecord('ref.1\t8\tid4\tC\tG\tPASS\tSVTYPE=SNP\tGT\t1/1'))

        sample, headers, records = load([first_vcf, second_vcf])
        self.assertEqual('sample_from_vcf_2', sample)
        self.assertEqual(want_headers, headers)
        self.assertEqual(want_records, records)
Пример #3
0
 def test_update_vcf_record_using_gramtools_allele_depths_homozygous(self):
     """update_vcf_record_using_gramtools_allele_depths: homozygous case.

     Checks two things: the record passed in is expected to be modified in
     place, and the returned record is expected to keep only the called
     ALT allele.
     """
     vcf = vcf_record.VcfRecord(
         "ref\t4\t.\tT\tTC,G\t228\t.\tINDEL;IDV=54;IMF=0.885246;DP=61;VDB=7.33028e-19;SGB=-0.693147;MQSB=0.9725;MQ0F=0;AC=2;AN=2;DP4=0,0,23,31;MQ=57\tGT:PL\t1/1:255,163,0"
     )
     want_in_place = vcf_record.VcfRecord(
         "ref\t4\t.\tT\tTC,G\t.\t.\t.\tGT:DP:COV:GT_CONF\t2/2:81:1,0,80:87.29"
     )
     want_filtered = vcf_record.VcfRecord(
         "ref\t4\t.\tT\tG\t.\t.\t.\tGT:DP:COV:GT_CONF\t1/1:81:1,80:87.29")

     filtered = gramtools.update_vcf_record_using_gramtools_allele_depths(
         vcf,
         {"1": 1, "2": 80},  # allele_combination_cov
         [[1], [0, 0], [80]],  # allele_per_base_cov
         {"1": {0}, "2": {2}, "3": {1, 2}},  # allele_groups_dict
         85,  # mean_depth
         0.001,  # error_rate
     )

     self.assertEqual(want_in_place, vcf)
     self.assertEqual(want_filtered, filtered)
Пример #4
0
    def test_vcf_file_to_dict(self):
        '''vcf_file_to_dict: plain and gzipped input, then remove_asterisk_alts=True'''
        want_header = ['# header1', '# header2']
        vcf_lines = [
            'ref_42\t11\tid_foo\tA\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80',
            'ref_42\t12\tid_foo\tC\tG\t42.43\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,53:39.81',
            'ref_43\t42\tid_foo\tT\tG\t43.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,54:39.82',
            'ref_43\t43\tid_foo\tT\tG,*\t43.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,54:39.83',
            'ref_43\t44\tid_foo\tT\t*\t43.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,54:39.84',
        ]
        want_records = {
            'ref_42': [vcf_record.VcfRecord(text) for text in vcf_lines[:2]],
            'ref_43': [vcf_record.VcfRecord(text) for text in vcf_lines[2:]],
        }

        plain_vcf = os.path.join(data_dir, 'vcf_file_to_dict.vcf')
        gzipped_vcf = os.path.join(data_dir, 'vcf_file_to_dict.vcf.gz')

        # The same expectations hold for the plain and the gzipped file
        for vcf_path in (plain_vcf, gzipped_vcf):
            header, records = vcf_file_read.vcf_file_to_dict(vcf_path)
            self.assertEqual(want_records, records)
            self.assertEqual(want_header, header)

        # With asterisk removal the last record (ALT is only '*') is not
        # expected at all, and the one before it is expected without '*'.
        ref_43_records = want_records['ref_43']
        ref_43_records.pop()
        ref_43_records[-1].remove_asterisk_alts()
        header, records = vcf_file_read.vcf_file_to_dict(
            plain_vcf, remove_asterisk_alts=True)
        self.assertEqual(want_records, records)
        self.assertEqual(want_header, header)
Пример #5
0
    def test_VcfRecord_constructor(self):
        '''VcfRecord constructor: every column of the line lands in its attribute'''
        parsed = vcf_record.VcfRecord(
            'ref_42\t11\tid_foo\tA\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80\n'
        )
        # (attribute name, expected value). Note POS: the line says 11, the
        # record is expected to hold 10.
        wanted = (
            ('CHROM', 'ref_42'),
            ('POS', 10),
            ('ID', 'id_foo'),
            ('REF', 'A'),
            ('ALT', ['G']),
            ('QUAL', 42.42),
            ('FILTER', 'PASS'),
            ('INFO', {'KMER': '31', 'SVLEN': '0', 'SVTYPE': 'SNP'}),
            ('FORMAT', {'GT': '1/1', 'COV': '0,52', 'GT_CONF': '39.80'}),
        )
        for attribute, value in wanted:
            self.assertEqual(getattr(parsed, attribute), value)

        # '.' for QUAL is expected to give None, and a comma-separated ALT
        # column is expected to be split into a list
        parsed = vcf_record.VcfRecord(
            'ref_42\t11\tid_foo\tA\tG,TC\t.\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80\n'
        )
        self.assertEqual(parsed.QUAL, None)
        self.assertEqual(parsed.ALT, ['G', 'TC'])
Пример #6
0
 def test_make_all_variants_intervals(self):
     """_make_all_variants_intervals: VCF records plus big-variant intervals

     Checks the per-sequence intervals returned when small variants (as
     VcfRecords) are combined with pre-existing big variant intervals.
     """
     make_interval = pyfastaq.intervals.Interval
     deletion = vcf_record.VcfRecord(
         "seq.1\t15\t.\tAGTTGTC\tA\t.\t.\tSVTYPE=DEL")
     snp = vcf_record.VcfRecord("seq.1\t100\t.\tT\tA\t.\t.\tSVTYPE=SNP")
     # NOTE(review): this record is filed under "seq.2" but its line says
     # CHROM seq.1 - looks like a typo in the test data, confirm.
     insertion = vcf_record.VcfRecord(
         "seq.1\t43\t.\tA\tACGTA\t.\t.\tSVTYPE=INS")

     small_variants = {"seq.1": [deletion, snp], "seq.2": [insertion]}
     large_variant_intervals = {
         "seq.1": [make_interval(9, 19), make_interval(50, 60)],
         "seq.3": [make_interval(42, 45)],
     }
     want = {
         "seq.1": [
             make_interval(9, 20),
             make_interval(50, 60),
             make_interval(99, 99),
         ],
         "seq.2": [make_interval(42, 42)],
         "seq.3": [make_interval(42, 45)],
     }

     result = dnadiff.Dnadiff._make_all_variants_intervals(
         small_variants, large_variant_intervals)
     self.assertEqual(want, result)
Пример #7
0
 def test_update_vcf_record_using_gramtools_allele_depths_heterozygous(
         self):
     """update_vcf_record_using_gramtools_allele_depths: heterozygous case.

     Checks two things: the record passed in is expected to be modified in
     place (genotype 0/2), and the returned record is expected to keep
     only the called ALT allele (genotype 0/1).
     """
     vcf = vcf_record.VcfRecord(
         "ref\t4\t.\tT\tA,G,TC\t228\t.\tINDEL;IDV=54;IMF=0.885246;DP=61;VDB=7.33028e-19;SGB=-0.693147;MQSB=0.9725;MQ0F=0;AC=2;AN=2;DP4=0,0,23,31;MQ=57\tGT:PL\t1/1:255,163,0"
     )
     want_in_place = vcf_record.VcfRecord(
         "ref\t4\t.\tT\tA,G,TC\t.\t.\t.\tGT:DP:COV:GT_CONF\t0/2:17:9,0,7,0:54.46"
     )
     want_filtered = vcf_record.VcfRecord(
         "ref\t4\t.\tT\tG\t.\t.\t.\tGT:DP:COV:GT_CONF\t0/1:17:9,7:54.46")

     filtered = gramtools.update_vcf_record_using_gramtools_allele_depths(
         vcf,
         {"1": 9, "2": 7, "3": 1},  # allele_combination_cov
         [[0], [9], [7], [1, 0]],  # allele_per_base_cov
         {"1": {0}, "2": {2}, "3": {2, 3}},  # allele_groups_dict
         15,  # mean_depth
         0.001,  # error_rate
     )

     self.assertEqual(want_in_place, vcf)
     self.assertEqual(want_filtered, filtered)
Пример #8
0
    def test_write_vars_plus_flanks_to_fasta(self):
        """test _write_vars_plus_flanks_to_fasta

        Calls _write_vars_plus_flanks_to_fasta with the records and reference
        sequences below (the final argument 3 is presumably the flank
        length - confirm against the implementation) and compares the file
        it writes, byte for byte, with the expected file in data_dir.
        """
        ref_seqs = {
            #                                         12345678901234567890
            "ref1": pyfastaq.sequences.Fasta("ref1", "AGTGGATAGCTAGCTAGAGA"),
            "ref2": pyfastaq.sequences.Fasta("ref2", "AGGAGAGAGAGAGAGAA"),
            "ref3": pyfastaq.sequences.Fasta("ref3", "AGCTTCATAGAGAGGTTTA"),
        }

        vcf_records = {
            "ref1": [
                vcf_record.VcfRecord(
                    "ref1\t3\tid_1\tT\tC,AG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80"
                ),
                vcf_record.VcfRecord(
                    "ref1\t10\tid_2\tCT\tA\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80"
                ),
            ],
            "ref3": [
                vcf_record.VcfRecord(
                    "ref3\t4\tid_3\tT\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80"
                )
            ],
        }

        tmp_file = "tmp.mapping_based_verifier.write_vars_plus_flanks_to_fasta.fa"
        expected_file = os.path.join(data_dir,
                                     "write_vars_plus_flanks_to_fasta.fa")
        try:
            mapping_based_verifier.MappingBasedVerifier._write_vars_plus_flanks_to_fasta(
                tmp_file, vcf_records, ref_seqs, 3)
            self.assertTrue(filecmp.cmp(expected_file, tmp_file, shallow=False))
        finally:
            # Always remove the scratch file, so a failed write or comparison
            # does not leave it behind in the working directory.
            if os.path.exists(tmp_file):
                os.unlink(tmp_file)
Пример #9
0
    def test_VcfRecord_constructor(self):
        """VcfRecord constructor: column parsing, including FILTER as a set"""
        parsed = vcf_record.VcfRecord(
            "ref_42\t11\tid_foo\tA\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80\n"
        )
        want_info = {"KMER": "31", "SVLEN": "0", "SVTYPE": "SNP"}
        want_format = {"GT": "1/1", "COV": "0,52", "GT_CONF": "39.80"}
        self.assertEqual(parsed.CHROM, "ref_42")
        # The line says POS 11; the record is expected to hold 10
        self.assertEqual(parsed.POS, 10)
        self.assertEqual(parsed.ID, "id_foo")
        self.assertEqual(parsed.REF, "A")
        self.assertEqual(parsed.ALT, ["G"])
        self.assertEqual(parsed.QUAL, 42.42)
        self.assertEqual(parsed.FILTER, {"PASS"})
        self.assertEqual(parsed.INFO, want_info)
        self.assertEqual(parsed.FORMAT, want_format)

        # '.' QUAL, two ALTs, and two ';'-separated filters
        parsed = vcf_record.VcfRecord(
            "ref_42\t11\tid_foo\tA\tG,TC\t.\tFilter1;Filter2\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80\n"
        )
        self.assertEqual(parsed.QUAL, None)
        self.assertEqual(parsed.ALT, ["G", "TC"])
        self.assertEqual(parsed.FILTER, {"Filter1", "Filter2"})

        # '.' in the FILTER column is expected to give an empty set
        parsed = vcf_record.VcfRecord(
            "ref_42\t11\tid_foo\tA\tG,TC\t.\t.\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80\n"
        )
        self.assertEqual(parsed.FILTER, set())
Пример #10
0
    def test_add_vcf_record_and_len(self):
        '''add_vcf_record and len: cluster grows only when a record is accepted'''
        vcf_lines = (
            'ref_42\t11\tid_1\tA\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80',
            'ref_42\t12\tid_2\tC\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80',
            'ref_42\t15\tid_2\tC\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80',
            'ref_42\t19\tid_2\tCCCCC\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80',
            'ref_42\t23\tid_2\tC\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80',
        )
        records = [vcf_record.VcfRecord(text) for text in vcf_lines]

        cluster = vcf_record_cluster.VcfRecordCluster(
            max_distance_between_variants=3)
        self.assertEqual(0, len(cluster))

        # The first three records are accepted with a max distance of 3
        for size_after, record in enumerate(records[:3], start=1):
            self.assertTrue(cluster.add_vcf_record(record))
            self.assertEqual(size_after, len(cluster))

        # The fourth is refused and leaves the cluster unchanged...
        self.assertFalse(cluster.add_vcf_record(records[3]))
        self.assertEqual(3, len(cluster))

        # ...until the allowed distance is raised to 5
        cluster.max_distance_between_variants = 5
        for size_after, record in enumerate(records[3:], start=4):
            self.assertTrue(cluster.add_vcf_record(record))
            self.assertEqual(size_after, len(cluster))
Пример #11
0
def test_vcf_records_make_same_allele_combination():
    """vcf_records_make_same_allele_combination: same CHROM versus different CHROM."""
    same_combination = variant_tracking.vcf_records_make_same_allele_combination
    ref_seqs = {"ref1": "GCTGT"}
    first = vcf_record.VcfRecord("ref1\t1\t.\tGCT\tGC,GCGT\t.\t.\t.")
    on_ref1 = vcf_record.VcfRecord("ref1\t5\t.\tT\tTGG,G\t.\t.\t.")
    on_ref2 = vcf_record.VcfRecord("ref2\t5\t.\tT\tTGG,G\t.\t.\t.")

    # Identical second record apart from CHROM: expected to match only
    # when it sits on the same reference as the first record.
    assert same_combination(first, on_ref1, ref_seqs)
    assert not same_combination(first, on_ref2, ref_seqs)
Пример #12
0
 def test_total_coverage(self):
     """total_coverage: None without a COV entry, otherwise the sum of COV"""
     # (VCF line, expected total coverage)
     cases = (
         (
             "ref\t3\tid_foo\tC\tA\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:GT_CONF\t1/1:39.80\n",
             None,
         ),
         (
             # COV is 1,2,39 and the expected total is 42
             "ref\t3\tid_foo\tC\tA\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:1,2,39:39.80\n",
             42,
         ),
     )
     for vcf_line, want in cases:
         parsed = vcf_record.VcfRecord(vcf_line)
         self.assertEqual(want, parsed.total_coverage())
Пример #13
0
 def test_to_record_per_alt(self):
     """to_record_per_alt: one output record for each ALT allele"""
     # A single ALT is expected to give back one equal record
     one_alt = vcf_record.VcfRecord("ref\t42\t.\tA\tC\t.\tPASS\tSVTYPE=SNP\n")
     self.assertEqual([one_alt], one_alt.to_record_per_alt())

     # Two ALTs are expected to be split into two records, in ALT order
     two_alts = vcf_record.VcfRecord(
         "ref\t42\t.\tA\tC,TC\t.\tPASS\tSVTYPE=SNP\n")
     want = [
         vcf_record.VcfRecord("ref\t42\t.\tA\tC\t.\tPASS\tSVTYPE=SNP\n"),
         vcf_record.VcfRecord("ref\t42\t.\tA\tTC\t.\tPASS\tSVTYPE=SNP\n"),
     ]
     self.assertEqual(want, two_alts.to_record_per_alt())
Пример #14
0
 def test_to_record_per_alt(self):
     '''to_record_per_alt: one output record for each ALT allele'''
     # A single ALT is expected to give back one equal record
     one_alt = vcf_record.VcfRecord('ref\t42\t.\tA\tC\t.\tPASS\tSVTYPE=SNP\n')
     self.assertEqual([one_alt], one_alt.to_record_per_alt())

     # Two ALTs are expected to be split into two records, in ALT order
     two_alts = vcf_record.VcfRecord(
         'ref\t42\t.\tA\tC,TC\t.\tPASS\tSVTYPE=SNP\n')
     want = [
         vcf_record.VcfRecord('ref\t42\t.\tA\tC\t.\tPASS\tSVTYPE=SNP\n'),
         vcf_record.VcfRecord('ref\t42\t.\tA\tTC\t.\tPASS\tSVTYPE=SNP\n'),
     ]
     self.assertEqual(want, two_alts.to_record_per_alt())
Пример #15
0
def test_vcf_file_to_dict():
    """recall._vcf_file_to_dict: records from the test VCF, keyed by reference name."""
    infile = os.path.join(data_dir, "vcf_file_to_dict.vcf")
    ref1_records = [
        vcf_record.VcfRecord("ref1\t42\t1\tT\tA\t.\tPASS\t.\tGT\t1/1"),
    ]
    ref2_records = [
        vcf_record.VcfRecord("ref2\t43\t3\tT\tA,C\t.\tPASS\t.\tGT\t2/2"),
        vcf_record.VcfRecord("ref2\t44\t2\tT\tA\t.\tPASS\t.\tGT\t1/1"),
    ]
    wanted = {"ref1": ref1_records, "ref2": ref2_records}

    loaded = recall._vcf_file_to_dict(infile)
    assert loaded == wanted
Пример #16
0
    def test_load_vcf_files(self):
        """_load_vcf_files: check sample name, headers and records

        Loads the first test VCF on its own, then both test VCFs together.
        The second argument to _load_vcf_files is None in both calls.
        """
        first_vcf = os.path.join(data_dir, "load_vcf_files.1.vcf")
        second_vcf = os.path.join(data_dir, "load_vcf_files.2.vcf")
        load = vcf_clusterer.VcfClusterer._load_vcf_files

        want_headers = {first_vcf: ["#file1 header1", "#file1 header2"]}
        want_records = {
            "ref.1": [
                vcf_record.VcfRecord(
                    "ref.1\t5\tid3\tG\tA\tPASS\tSVTYPE=SNP\tGT\t1/1"),
                vcf_record.VcfRecord(
                    "ref.1\t10\tid1\tA\tT\tPASS\tSVTYPE=SNP\tGT\t1/1"),
            ],
            "ref.2": [
                vcf_record.VcfRecord(
                    "ref.2\t42\tid2\tG\tC\tPASS\tSVTYPE=SNP\tGT\t1/1"),
            ],
        }

        # First file only
        sample, headers, records = load([first_vcf], None)
        self.assertEqual("sample", sample)
        self.assertEqual(want_headers, headers)
        self.assertEqual(want_records, records)

        # Both files: the expectations grow by the second file's headers
        # and records, and the expected sample name is the one in the
        # second file's #CHROM line.
        want_headers[second_vcf] = [
            "#file2 header",
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_from_vcf_2",
        ]
        want_records["ref.3"] = [
            vcf_record.VcfRecord(
                "ref.3\t8\tid5\tA\tG\tPASS\tSVTYPE=SNP\tGT\t1/1"),
        ]
        extra_ref1_record = vcf_record.VcfRecord(
            "ref.1\t8\tid4\tC\tG\tPASS\tSVTYPE=SNP\tGT\t1/1")
        want_records["ref.1"].insert(1, extra_ref1_record)

        sample, headers, records = load([first_vcf, second_vcf], None)
        self.assertEqual("sample_from_vcf_2", sample)
        self.assertEqual(want_headers, headers)
        self.assertEqual(want_records, records)
Пример #17
0
    def test_record_with_zero_pos_valueerror_raised(self):
        '''make_one_merged_vcf_record_for_gramtools: ValueError when a record has POS 0'''
        zero_pos_indel = vcf_record.VcfRecord(
            'ref\t0\t.\tC\tCG\t42.42\tPASS\tSVTPYPE=INDEL\tGT\t1/1')
        snp = vcf_record.VcfRecord(
            'ref\t1\t.\tT\tA\t42.42\tPASS\tSVTPYPE=SNP\tGT\t1/1')

        # Cluster seeded with the POS 0 record, then the SNP added to it
        cluster = vcf_record_cluster.VcfRecordCluster(
            vcf_record=zero_pos_indel, max_distance_between_variants=1)
        cluster.add_vcf_record(snp)

        reference = 'AGCTATCTGCGTATTCGATC'
        with self.assertRaises(ValueError):
            cluster.make_one_merged_vcf_record_for_gramtools(reference)
Пример #18
0
    def test_str(self):
        """__str__: a parsed line is expected to be printed back unchanged"""
        round_trip_lines = (
            # all ten columns
            "ref_42\t11\tid_foo\tA\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80",
            # eight columns, '.' for INFO
            "ref_42\t11\tid_foo\tA\tG\t42.42\tPASS\t.",
            # INFO with a key that has no value (FOO)
            "ref_42\t11\tid_foo\tA\tG\t42.42\tPASS\tFOO;KMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80",
        )
        for text in round_trip_lines:
            parsed = vcf_record.VcfRecord(text)
            self.assertEqual(text, str(parsed))
Пример #19
0
 def test_GT_always_printed_first_when_present(self):
     """GT is expected to come first in the printed FORMAT whenever present"""
     gt_first = "ref\t11\tid\tA\tG\t42.0\tPASS\t.\tGT:COV:GT_CONF\t1/1:0,52:39.80"
     gt_second = "ref\t11\tid\tA\tG\t42.0\tPASS\t.\tCOV:GT:GT_CONF\t0,52:1/1:39.80"
     without_gt = "ref\t11\tid\tA\tG\t42.0\tPASS\t.\tCOV:GT_CONF\t0,52:39.80"

     # Wherever GT sits in the input, the output is expected to be the
     # GT-first form
     for text in (gt_first, gt_second):
         self.assertEqual(gt_first, str(vcf_record.VcfRecord(text)))

     # No GT: printed back unchanged, then GT-first once GT has been set
     parsed = vcf_record.VcfRecord(without_gt)
     self.assertEqual(without_gt, str(parsed))
     parsed.set_format_key_value("GT", "1/1")
     self.assertEqual(gt_first, str(parsed))
Пример #20
0
 def test_remove_asterisk_alts(self):
     '''remove_asterisk_alts: '*' entries are expected to be dropped from ALT'''
     # (VCF line, expected ALT list after the call)
     cases = (
         ('ref.3\t8\tid5\tA\tG\tPASS\tSVTYPE=SNP\tGT\t1/1', ['G']),
         ('ref.3\t8\tid5\tA\tG,*\tPASS\tSVTYPE=SNP\tGT\t1/1', ['G']),
         ('ref.3\t8\tid5\tA\t*\tPASS\tSVTYPE=SNP\tGT\t1/1', []),
     )
     for vcf_line, want_alts in cases:
         parsed = vcf_record.VcfRecord(vcf_line)
         parsed.remove_asterisk_alts()
         self.assertEqual(want_alts, parsed.ALT)
Пример #21
0
    def test_merge(self):
        """merge: None for a record on another CHROM, merged record otherwise"""
        ref_seq = pyfastaq.sequences.Fasta("ref", "AGCTAGGTCAG")
        other_chrom = vcf_record.VcfRecord("wrong_ref\t3\t.\tC\tA\t228\t.\t.\t.")
        at_pos1 = vcf_record.VcfRecord(
            "ref\t1\t.\tAG\tGAA\t228\t.\tINDEL;IDV=54;IMF=0.885246;DP=61;VDB=7.33028e-19;SGB=-0.693147;MQSB=0.9725;MQ0F=0;AC=2;AN=2;DP4=0,0,23,31;MQ=57\tGT:PL\t1/1:255,163,0"
        )
        at_pos2 = vcf_record.VcfRecord(
            "ref\t2\t.\tG\tAA\t228\t.\tINDEL;IDV=54;IMF=0.885246;DP=61;VDB=7.33028e-19;SGB=-0.693147;MQSB=0.9725;MQ0F=0;AC=2;AN=2;DP4=0,0,23,31;MQ=57\tGT:PL\t1/1:255,163,0"
        )
        at_pos3 = vcf_record.VcfRecord(
            "ref\t3\t.\tC\tCAT\t21.4018\t.\tINDEL;IDV=2;IMF=0.0338983;DP=59;VDB=0.18;SGB=-0.453602;MQ0F=0;AC=2;AN=2;DP4=0,0,0,2;MQ=60\tGT:PL\t1/1:48,6,0"
        )
        at_pos7 = vcf_record.VcfRecord("ref\t7\t.\tG\tC\t21.4018\t.\t.\t.\t.")

        # Any pairing that involves the wrong_ref record is expected to
        # give None, whichever record merge() is called on
        unmergeable = (
            (other_chrom, at_pos1),
            (at_pos1, other_chrom),
            (other_chrom, at_pos2),
            (at_pos2, other_chrom),
        )
        for left, right in unmergeable:
            self.assertIsNone(left.merge(right, ref_seq))

        # Neighbouring records: the same result is expected in both
        # directions
        for left, right in ((at_pos2, at_pos3), (at_pos3, at_pos2)):
            merged = left.merge(right, ref_seq)
            want = vcf_record.VcfRecord(
                "ref\t2\t.\tGC\tAACAT\t.\t.\tSVTYPE=MERGED\tGT\t1/1")
            self.assertEqual(want, merged)

        # Records with reference bases between them
        merged = at_pos3.merge(at_pos7, ref_seq)
        want = vcf_record.VcfRecord(
            "ref\t3\t.\tCTAGG\tCATTAGC\t.\t.\tSVTYPE=MERGED\tGT\t1/1")
        self.assertEqual(want, merged)
Пример #22
0
 def test_intersects(self):
     """intersects: checked in both directions for each pair of records"""
     snp_11 = vcf_record.VcfRecord("ref_42\t11\t.\tA\tG\t42.42\tPASS\t.")
     snp_12 = vcf_record.VcfRecord("ref_42\t12\t.\tC\tT\t42.42\tPASS\t.")
     snp_12_other_ref = vcf_record.VcfRecord("ref_43\t12\t.\tC\tT\t42.42\tPASS\t.")
     deletion_11 = vcf_record.VcfRecord("ref_42\t11\t.\tCT\tT\t42.42\tPASS\t.")

     # (left record, right record, whether an intersection is expected)
     cases = (
         (snp_11, snp_11, True),
         (snp_12, snp_12, True),
         (snp_11, snp_12, False),
         (snp_12, snp_11, False),
         (snp_12_other_ref, snp_12, False),
         (snp_12, snp_12_other_ref, False),
         (snp_12, deletion_11, True),
         (deletion_11, snp_12, True),
     )
     for left, right, should_intersect in cases:
         check = self.assertTrue if should_intersect else self.assertFalse
         check(left.intersects(right))
Пример #23
0
 def test_remove_asterisk_alts(self):
     """remove_asterisk_alts: '*' entries are expected to be dropped from ALT"""
     # (VCF line, expected ALT list after the call)
     cases = (
         ("ref.3\t8\tid5\tA\tG\tPASS\tSVTYPE=SNP\tGT\t1/1", ["G"]),
         ("ref.3\t8\tid5\tA\tG,*\tPASS\tSVTYPE=SNP\tGT\t1/1", ["G"]),
         ("ref.3\t8\tid5\tA\t*\tPASS\tSVTYPE=SNP\tGT\t1/1", []),
     )
     for vcf_line, want_alts in cases:
         parsed = vcf_record.VcfRecord(vcf_line)
         parsed.remove_asterisk_alts()
         self.assertEqual(want_alts, parsed.ALT)
Пример #24
0
 def test_ref_string_matches_dict_of_ref_sequences(self):
     """ref_string_matches_dict_of_ref_sequences: match, mismatch, unknown name"""
     ref_seqs = {"ref1": "GTACG", "ref2": "TTTTT"}
     # (VCF line, assertion to apply to the result). All three records
     # have REF 'A' at POS 3; only the CHROM differs.
     cases = (
         # 'A' is the third base of ref1
         ("ref1\t3\t.\tA\tG\tPASS\tSVTYPE=SNP\tGT\t1/1", self.assertTrue),
         # ref2 is all T
         ("ref2\t3\t.\tA\tG\tPASS\tSVTYPE=SNP\tGT\t1/1", self.assertFalse),
         # ref3 is not in ref_seqs
         ("ref3\t3\t.\tA\tG\tPASS\tSVTYPE=SNP\tGT\t1/1", self.assertFalse),
     )
     for vcf_line, check in cases:
         parsed = vcf_record.VcfRecord(vcf_line)
         check(parsed.ref_string_matches_dict_of_ref_sequences(ref_seqs))
Пример #25
0
 def test_start_and_end(self):
     '''start_and_end: (None, None) when empty, then tracks the added records'''
     cluster = vcf_record_cluster.VcfRecordCluster(
         max_distance_between_variants=3)
     self.assertEqual((None, None), cluster.start_and_end())

     # (VCF line to add, expected (start, end) once it has been added)
     steps = (
         (
             'ref_42\t11\tid_1\tA\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80',
             (10, 10),
         ),
         (
             'ref_42\t12\tid_2\tC\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80',
             (10, 11),
         ),
     )
     for vcf_line, want_coords in steps:
         new_record = vcf_record.VcfRecord(vcf_line)
         self.assertTrue(cluster.add_vcf_record(new_record))
         self.assertEqual(want_coords, cluster.start_and_end())
Пример #26
0
    def test_get_total_length_of_expected_regions_called(self):
        '''_get_total_length_of_expected_regions_called: total and called lengths'''
        make_interval = pyfastaq.intervals.Interval
        regions = {
            'ref.1': [
                make_interval(101, 200),  # 100 long
                make_interval(251, 260),  # 10 long
            ],
            'ref.2': [
                make_interval(42, 43),  # 2 long
            ],
        }
        # The only called record is on ref.1, at POS 100 with a 10-base REF
        calls = {
            'ref.1': [
                vcf_record.VcfRecord(
                    'ref.1\t100\t.\tACGTACTGTA\tA,G\t42.0\t.\tDP4=42\tGT\t2/2'
                ),
            ],
        }

        total_length, called_length = mapping_based_verifier.MappingBasedVerifier._get_total_length_of_expected_regions_called(
            regions, calls)
        self.assertEqual(112, total_length)  # 100 + 10 + 2
        self.assertEqual(8, called_length)
Пример #27
0
 def test_ref_string_matches_ref_sequence(self):
     """ref_string_matches_ref_sequence: matches, mismatches, too-short sequences"""
     # (VCF line, ((reference sequence, whether a match is expected), ...))
     cases = (
         (
             "ref_name\t1\t.\tAGT\tG\tPASS\tSVTYPE=SNP\tGT\t1/1",
             (("AG", False),),
         ),
         (
             "ref_name\t3\t.\tA\tG\tPASS\tSVTYPE=SNP\tGT\t1/1",
             (("GCATG", True), ("GCxTG", False)),
         ),
         (
             "ref_name\t3\t.\tAGT\tG\tPASS\tSVTYPE=SNP\tGT\t1/1",
             (
                 ("GCAGT", True),
                 ("GCAGC", False),
                 ("GCAG", False),
                 ("GCA", False),
                 ("GA", False),
             ),
         ),
     )
     for vcf_line, sequence_checks in cases:
         parsed = vcf_record.VcfRecord(vcf_line)
         for ref_sequence, should_match in sequence_checks:
             check = self.assertTrue if should_match else self.assertFalse
             check(parsed.ref_string_matches_ref_sequence(ref_sequence))
Пример #28
0
    def make_separate_indels_and_one_alt_with_all_snps_no_combinations(self, ref_seq):
        '''Returns a VCF record, where each indel from this
        cluster is in a separate ALT. Then all the remaining SNPs are
        applied to make one ALT. If >1 SNP in same place, either one
        might be used.

        ref_seq is passed straight through to VcfRecord.merge() and
        VcfRecord.add_flanking_seqs().

        The returned record takes CHROM from the first record of the
        cluster, starts at the smallest POS in the cluster, has FILTER
        PASS, and has '.' for ID, QUAL and INFO. Only the first ALT of
        each intermediate record is used.

        Raises ValueError if the cluster has no records, because min()
        is called on an empty sequence.'''
        final_start_position = min(x.POS for x in self.vcf_records)
        final_end_position = max(x.ref_end_pos() for x in self.vcf_records)
        snps = []
        new_vcf_records = []

        # Work on copies throughout, so the records held by the cluster
        # are not the objects handed to add_flanking_seqs().
        for record in self.vcf_records:
            if record.is_snp():
                snps.append(copy.copy(record))
            else:
                new_record = copy.copy(record)
                new_record.add_flanking_seqs(ref_seq, final_start_position, final_end_position)
                new_vcf_records.append(new_record)

        if snps:
            # Fold all SNPs into one record. A merge that returns None is
            # skipped, and the record built so far is kept.
            new_record = copy.copy(snps[0])
            for snp in snps[1:]:
                merged = new_record.merge(snp, ref_seq)
                if merged is not None:
                    new_record = merged
            new_record.add_flanking_seqs(ref_seq, final_start_position, final_end_position)
            new_vcf_records.append(new_record)

        alts = ','.join(x.ALT[0] for x in new_vcf_records)
        fields = [
            self.vcf_records[0].CHROM,
            # + 1 presumably converts a 0-based POS back to the 1-based VCF column
            str(final_start_position + 1),
            '.',
            new_vcf_records[0].REF,
            alts,
            '.',
            'PASS',
            '.',
        ]
        return vcf_record.VcfRecord('\t'.join(fields))
    def _variant_cluster_to_vcf_line(self, variants, variant_ids, max_alleles=None):
        if len(variants) == 0:
            return None
        ref_seq = self.ref_seqs[self.ref_seq_names[variants[0].seq_id]]
        if logging.getLogger().level <= logging.DEBUG:
            logging.debug(f"Clustering variants:")
            for v in variants:
                logging.debug(f"  {ref_seq.id}\t{v.pos}\t{v.ref}\t{v.alt}")
        start, end, alts = allele_combinations.var_cluster_to_coords_and_alts(
            variants, ref_seq, max_alleles=max_alleles
        )
        info_field = "."
        if alts is None:
            alts = set()
            var_id_to_var = dict(zip(variant_ids, variants))
            var_patterns = var_patterns_from_block_slices(
                self.var_block_tabixes, var_id_to_var, variants[0].seq_id, start, end
            )
            for var_pattern in var_patterns:
                alt_alleles = var_pattern_to_alleles(
                    var_id_to_var,
                    var_pattern,
                    ref_seq,
                    start,
                    end,
                    max_alleles=max_alleles,
                )
                if alt_alleles is None:
                    logging.warning("Conflicting allele combination:")
                    for var_id in sorted(list(var_pattern)):
                        var = var_id_to_var[var_id]
                        logging.warning(
                            f"  {ref_seq.id} {var.pos+1} {var.ref} {var.alt}"
                        )
                else:
                    alts.update(alt_alleles)

            info_field = "High_variability"

        if len(alts) == 0:
            logging.warning("Could not make VCF record from these variants:")
            for variant in variants:
                logging.warning("    " + str(variant))
            return None
        else:
            return vcf_record.VcfRecord(
                "\t".join(
                    [
                        ref_seq.id,
                        str(start + 1),
                        ".",
                        ref_seq[start : end + 1],
                        ",".join(sorted(list(alts))),
                        ".",
                        "PASS",
                        info_field,
                    ]
                )
            )
Пример #30
0
    def test_ref_end_pos(self):
        """ref_end_pos: expected end position for REF strings of different lengths"""
        # (VCF line, expected ref_end_pos). Every line has POS 11; only the
        # REF column changes (A, AA, AAG, then '.').
        cases = (
            ("ref_42\t11\tid_foo\tA\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80\n", 10),
            ("ref_42\t11\tid_foo\tAA\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80\n", 11),
            ("ref_42\t11\tid_foo\tAAG\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80\n", 12),
            ("ref_42\t11\tid_foo\t.\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80\n", 10),
        )
        for vcf_line, want_end in cases:
            parsed = vcf_record.VcfRecord(vcf_line)
            self.assertEqual(want_end, parsed.ref_end_pos())