Пример #1
0
    def test_read_VCF_line(self):
        """Read ``vcf_example.vcf`` and verify its header and records.

        The first record is checked in full (variant, filters, INFO values and
        per-sample fields); the last record only gets a variant check to make
        sure reading carried on correctly after the first one.
        """
        # INFO keys with their expected values, listed in the order the keys
        # are expected to appear in the record's info mapping.
        expected_info = [
            ("PP", [3000]),
            ("DP", [250]),
            ("DPR", [140]),
            ("DPF", [110]),
            ("VC", [100]),
            ("VCR", [49]),
            ("VCF", [51]),
            ("ABPV", [0.2]),
            ("SBPV", [0.3]),
            ("MQ", [70]),
            ("BR", [31]),
            ("QD", [None]),
        ]
        vcf_path = os.path.join(self.data_dir, "vcf_example.vcf")

        with open(vcf_path, "r") as vcf_file:
            vcf_handler = VCFReader(vcf_file)
            vcf_handler.read_header()
            self.assertEqual(len(vcf_handler.header.file_metadata), 7)
            self.assertEqual(len(vcf_handler.header.samples), 2)

            records = list(vcf_handler.read_records())
            self.assertEqual(len(records), 2)

            # First record: verified in full.
            first = records[0]
            self.variant_is_equal(first, ("20", 9, set(), "CT", "C"))  # zero-based representation
            self.assertEqual(first.filters, set())
            self.assertEqual(first.passes_filter, True)

            self.assertEqual(len(first.info), 12)
            for info_key, info_value in expected_info:
                self.assertEqual(first.info[info_key], info_value)

            self.assertEqual(first.samples, ['sample1', 'sample2'])

            # (sample name, field name, expected value) for the per-sample data.
            expected_sample_fields = [
                ('sample1', "GT", GenotypeCall("0/1")),
                ('sample2', "GT", GenotypeCall("1/1")),
                ('sample1', 'PL', [3000, 0, 3000]),
                ('sample2', 'PL', [114, 0, 0]),
                ('sample1', 'GQ', [1000]),
                ('sample2', 'GQ', [None]),
            ]
            for sample_name, field_name, field_value in expected_sample_fields:
                self.assertEqual(
                    first.sample_info.get_field(sample_name, field_name),
                    field_value,
                )

            # The info mapping must preserve key ordering.
            expected_keys = [info_key for info_key, _ in expected_info]
            self.assertEqual(list(first.info.keys()), expected_keys)

            # Last record: make sure it is still being read correctly.
            self.variant_is_equal(records[-1], ("20", 10, set(), "T", "G"))
Пример #2
0
    def test_should_fail_on_unexpected_EOF(self):
        """Reading a header that ends after the file format line must fail.

        The input holds only the ``##fileformat`` line, so ``read_header`` is
        expected to raise an exception whose message matches 'unexpected EOF'.
        """
        lines = [
            '##fileformat=VCFv4.2\n',
        ]
        reader = VCFReader(iter(lines))

        # Call directly: wrapping the call in print() would only add stdout
        # noise in the case where no exception is raised.
        with self.assertRaisesRegex(Exception, 'unexpected EOF'):
            reader.read_header()
Пример #3
0
    def test_should_fail_if_column_header_line_is_missing(self):
        """A non-header line directly after the metadata must be rejected.

        ``read_header`` is expected to raise an exception whose message quotes
        the offending line.
        """
        lines = [
            '##fileformat=VCFv4.2\n',
            'the line after the header\n',
        ]
        reader = VCFReader(iter(lines))

        # Call directly: wrapping the call in print() would only add stdout
        # noise in the case where no exception is raised.
        with self.assertRaisesRegex(Exception, 'expected column header line: \'the line after the header\''):
            reader.read_header()
Пример #4
0
    def test_should_fail_if_version_is_not_defined(self):
        """A first line that is not a ``##fileformat`` line must be rejected.

        ``read_header`` is expected to raise an exception whose message quotes
        the unrecognised line.
        """
        lines = [
            '##notFileformat=foo\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
        ]
        reader = VCFReader(iter(lines))

        # Call directly: wrapping the call in print() would only add stdout
        # noise in the case where no exception is raised.
        with self.assertRaisesRegex(Exception, 'unrecognised file format line: \'##notFileformat=foo\''):
            reader.read_header()
Пример #5
0
    def test_should_fail_with_unexpected_version(self):
        """A file format line carrying version ``0.0`` must be rejected.

        ``read_header`` is expected to raise an exception whose message
        contains the literal text ``unexpected version: '0.0'``.
        """
        lines = [
            '##fileformat=VCFv0.0\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
        ]
        reader = VCFReader(iter(lines))

        # Escape the expected message: an unescaped '.' is a regex wildcard,
        # so the bare pattern would also accept e.g. '0x0' as the version.
        expected_message = re.escape('unexpected version: \'0.0\'')

        # Call directly: wrapping the call in print() would only add stdout
        # noise in the case where no exception is raised.
        with self.assertRaisesRegex(Exception, expected_message):
            reader.read_header()
Пример #6
0
    def test_should_store_header_as_attribute_of_parser(self):
        """The value returned by ``read_header`` equals the reader's ``header`` attribute."""
        vcf_lines = iter([
            '##fileformat=VCFv4.2\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
        ])
        parser = VCFReader(vcf_lines)

        returned_header = parser.read_header()

        self.assertEqual(returned_header, parser.header)
Пример #7
0
    def test_should_fail_to_parse_malformed_header_line(self):
        """A ``##`` line that cannot be parsed as a header line must be rejected.

        ``read_header`` is expected to raise an exception whose message quotes
        the malformed line.
        """
        lines = [
            '##fileformat=VCFv4.2\n',
            '##malformed line!\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
        ]
        reader = VCFReader(iter(lines))

        # Call directly: wrapping the call in print() would only add stdout
        # noise in the case where no exception is raised.
        with self.assertRaisesRegex(Exception, 'failed to parse header line: \'##malformed line!\''):
            reader.read_header()
Пример #8
0
    def test_should_parse_well_formatted_version(self):
        """A minimal header (file format plus column line) parses to a default ``Schema``."""
        header_lines = iter([
            '##fileformat=VCFv4.2\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
        ])

        parsed_header = VCFReader(header_lines).read_header()

        self.assertEqual(Schema(), parsed_header)
Пример #9
0
    def test_should_parse_column_headers_with_format_but_no_samples(self):
        """A column line ending in FORMAT with no sample names parses to a default ``Schema``."""
        header_lines = iter([
            '##fileformat=VCFv4.2\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\n',
        ])

        parsed_header = VCFReader(header_lines).read_header()

        self.assertEqual(Schema(), parsed_header)
Пример #10
0
    def test_should_fail_with_malformed_format_column_header(self):
        """A column line with ``FOO`` where FORMAT would go must be rejected.

        ``read_header`` is expected to raise an exception whose message quotes
        the column line with its tabs rendered as the two characters ``\\t``.
        """
        lines = [
            '##fileformat=VCFv4.2\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFOO\n',
        ]
        reader = VCFReader(iter(lines))

        # Call directly: wrapping the call in print() would only add stdout
        # noise in the case where no exception is raised.
        with self.assertRaisesRegex(
                Exception,
                re.escape('expected column header line: \'#CHROM\\tPOS\\tID\\tREF\\tALT\\tQUAL\\tFILTER\\tINFO\\tFOO\'')
        ):
            reader.read_header()
Пример #11
0
    def test_should_parse_column_headers_with_complex_sample_names(self):
        """A sample name containing '_', '-' and '.' is parsed as a single sample."""
        header_lines = iter([
            '##fileformat=VCFv4.2\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tOWEN_TOBY-RHYS.JONES\n',
        ])

        parsed_header = VCFReader(header_lines).read_header()

        # Schema expected from the header above: defaults plus the one sample.
        expected_schema = Schema()
        expected_schema.samples = ['OWEN_TOBY-RHYS.JONES']
        self.assertEqual(expected_schema, parsed_header)
Пример #12
0
    def test_should_fail_without_required_column_headers(self):
        """A column line that lacks the INFO column must be rejected.

        ``read_header`` is expected to raise an exception whose message quotes
        the column line with its tabs rendered as the two characters ``\\t``.
        """
        lines = [
            '##fileformat=VCFv4.2\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\n',
        ]
        reader = VCFReader(iter(lines))

        # Call directly: wrapping the call in print() would only add stdout
        # noise in the case where no exception is raised.
        with self.assertRaisesRegex(
                Exception,
                re.escape("expected column header line: '#CHROM\\tPOS\\tID\\tREF\\tALT\\tQUAL\\tFILTER'")
        ):
            reader.read_header()
Пример #13
0
    def test_should_parse_valid_filter_header_fields(self):
        """A ``##FILTER`` line is reflected in the parsed schema as a filter entry."""
        header_lines = iter([
            '##fileformat=VCFv4.2\n',
            '##FILTER=<ID=key,Description="description">\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
        ])

        parsed_header = VCFReader(header_lines).read_header()

        # Schema expected from the header above: defaults plus the one filter.
        expected_schema = Schema()
        expected_schema.set_filter('key', 'description')
        self.assertEqual(expected_schema, parsed_header)
Пример #14
0
    def test_should_parse_well_formatted_file_metadata(self):
        """A ``##key=value`` line is stored under ``file_metadata`` in the parsed schema."""
        header_lines = iter([
            '##fileformat=VCFv4.2\n',
            '##fileDate=2013-07-08\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
        ])

        parsed_header = VCFReader(header_lines).read_header()

        # Schema expected from the header above: defaults plus the file date.
        expected_schema = Schema()
        expected_schema.file_metadata['fileDate'] = '2013-07-08'
        self.assertEqual(expected_schema, parsed_header)
Пример #15
0
    def test_should_parse_valid_contig_header_fields(self):
        """A ``##contig`` line is reflected in the parsed schema as a contig entry."""
        header_lines = iter([
            '##fileformat=VCFv4.2\n',
            '##contig=<ID=key,length=666>\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
        ])

        parsed_header = VCFReader(header_lines).read_header()

        # Schema expected from the header above: defaults plus the one contig.
        expected_schema = Schema()
        expected_schema.set_contig('key', 666)
        self.assertEqual(expected_schema, parsed_header)
Пример #16
0
    def test_should_parse_valid_sample_header_fields(self):
        """A ``##FORMAT`` line is reflected in the parsed schema as sample data."""
        header_lines = iter([
            '##fileformat=VCFv4.2\n',
            '##FORMAT=<ID=key,Number=1,Type=String,Description="description">\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
        ])

        parsed_header = VCFReader(header_lines).read_header()

        # Schema expected from the header above: defaults plus the FORMAT entry.
        expected_schema = Schema()
        expected_schema.set_sample_data('key', '1', 'String', 'description')
        self.assertEqual(expected_schema, parsed_header)
Пример #17
0
    def test_should_not_parse_column_headers_with_sample_names_containing_white_space(self):
        """A sample name containing a space must make the column line invalid.

        ``read_header`` is expected to raise an exception whose message quotes
        the column line with its tabs rendered as the two characters ``\\t``.
        """
        lines = [
            '##fileformat=VCFv4.2\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tOWEN JONES\n',
        ]
        reader = VCFReader(iter(lines))

        # Call directly: wrapping the call in print() would only add stdout
        # noise in the case where no exception is raised.
        with self.assertRaisesRegex(
                Exception,
                re.escape(
                    'expected column header line: '
                    '\'#CHROM\\tPOS\\tID\\tREF\\tALT\\tQUAL\\tFILTER\\tINFO\\tFORMAT\\tOWEN JONES\''
                )
        ):
            reader.read_header()
Пример #18
0
    def test_should_parse_all_info_header_fields(self):
        """An ``##INFO`` line with Source and Version is reflected in the parsed schema."""
        header_lines = iter([
            '##fileformat=VCFv4.2\n',
            '##INFO=<ID=key,Number=1,Type=String,Description="description",Source="foo",Version="bar">\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
        ])

        parsed_header = VCFReader(header_lines).read_header()

        # Schema expected from the header above: defaults plus the INFO entry,
        # with arguments in the order the INFO line lists its fields.
        expected_schema = Schema()
        expected_schema.set_info_data('key', '1', 'String', 'description', 'foo', 'bar')
        self.assertEqual(expected_schema, parsed_header)