示例#1
0
    def test_doesnt_give_a_flying_damn_about_spurious_filter_header(self):
        # The genotyping VCF declares (and uses) a filter whose ID is made of
        # punctuation, quotes and brackets. The call is still expected to
        # succeed and report genotype 1/1 for the sample.
        chrom = "22"
        sample_name = "bobs_your_uncle"
        odd_filter_id = '.+-*\\/~@?!%^&><=\"\'(){}[]_|'
        variant = Variant(chrom, 11, "A", "C")

        schema = Schema()
        schema.set_filter(odd_filter_id, 'unusual characters')

        genotype_vcf_path = join(self.work_dir, "genotype.vcf")
        gv_builder = VCFBuilder(genotype_vcf_path, schema=schema)
        gv_builder.with_record_from_variant(variant, filters={odd_filter_id})
        gv_builder.build().index()

        driver = SVCDriver(self)
        with_reference = driver.with_ref_sequence(
            "ACGCCCCCTGCAAAAAAAAAA", chrom=chrom, pos_from=0)
        with_reads = with_reference.with_read(
            "...........C.........",
            n_fwd=5,
            n_rev=5,
            chrom=chrom,
            sample_name=sample_name)
        with_reads.with_genotype_alleles(gv_builder.compressed_filename)

        expect = driver.call(expected_success=True)
        (
            expect.with_output_vcf()
            .has_record_for_variant(variant)
            .with_sample(sample_name)
            .has_genotype("1/1")
        )
示例#2
0
 def __init__(self, filename, schema=None):
     """Store the filename, build a TabixIndexer for it and start with no records.

     Args:
         filename: passed on to ``TabixIndexer`` together with the "vcf" tag.
         schema: schema to use; a fresh ``Schema()`` is created when omitted.
     """
     self.__filename = filename
     self.__indexer = TabixIndexer(self.__filename, "vcf")
     self.schema = Schema() if schema is None else schema
     self.__records = []
示例#3
0
 def test_should_return_true_if_no_GL_or_PL_present(self):
     # Only GT is declared and supplied, so the first record must report
     # that it carries no likelihoods.
     schema = Schema()
     schema.set_sample_data('GT', '1', 'String', '')
     schema.samples = ['foo']
     columns = ['chrZ', '200', '.', 'C', 'A', '.', 'PASS', '.', 'GT', '0/1']
     records = list(generate_records(schema, columns))
     first_sample_info = records[0].sample_info
     self.assertTrue(first_sample_info.has_no_likelihoods())
示例#4
0
    def test_should_write_filter_in_expected_format(self):
        # One filter in the schema should produce one ##FILTER line between
        # the fileformat line and the column header.
        output = StringIO()
        schema = Schema()
        schema.set_filter('key', 'a filter')

        VCFWriter(output).write_header(schema)

        expected_header = (
            '##fileformat=VCFv4.2\n'
            '##FILTER=<ID=key,Description="a filter">\n'
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n'
        )
        self.assertEqual(expected_header, output.getvalue())
示例#5
0
    def test_should_write_sample_data_in_expected_format(self):
        # One sample-data field in the schema should produce one ##FORMAT
        # line between the fileformat line and the column header.
        output = StringIO()
        schema = Schema()
        schema.set_sample_data('key', '1', 'String', 'a sample field')

        VCFWriter(output).write_header(schema)

        expected_header = (
            '##fileformat=VCFv4.2\n'
            '##FORMAT=<ID=key,Number=1,Type=String,Description="a sample field">\n'
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n'
        )
        self.assertEqual(expected_header, output.getvalue())
示例#6
0
    def test_should_parse_column_headers_with_complex_sample_names(self):
        # A sample name containing underscore, hyphen and dot must be read
        # back unchanged from the column header line.
        header_lines = [
            '##fileformat=VCFv4.2\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tOWEN_TOBY-RHYS.JONES\n',
        ]

        parsed_schema = VCFReader(iter(header_lines)).read_header()

        expected_schema = Schema()
        expected_schema.samples = ['OWEN_TOBY-RHYS.JONES']
        self.assertEqual(expected_schema, parsed_schema)
示例#7
0
    def test_should_write_contig_in_expected_format(self):
        # One contig in the schema should produce one ##contig line between
        # the fileformat line and the column header.
        output = StringIO()
        schema = Schema()
        schema.set_contig('key', 666)

        VCFWriter(output).write_header(schema)

        expected_header = (
            '##fileformat=VCFv4.2\n'
            '##contig=<ID=key,length=666>\n'
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n'
        )
        self.assertEqual(expected_header, output.getvalue())
示例#8
0
 def test_should_format_multiple_values(self):
     # Four INFO fields of mixed types are declared; the values are handed
     # over in a different key order than the expected output string.
     schema = Schema()
     field_definitions = (
         ('K', 'A', 'Float'),
         ('K2', 'A', 'String'),
         ('K3', '0', 'Flag'),
         ('K4', 'A', 'String'),
     )
     for key, number, data_type in field_definitions:
         schema.set_info_data(key, number, data_type, 'K')

     values = {
         'K3': None,
         'K2': ['S2'],
         'K': [1.0, 2.66, 3.0],
         'K4': ['S4']
     }
     formatted = InfoData(schema, values).to_vcf()
     self.assertEqual('K=1.0,2.66,3.0;K2=S2;K3;K4=S4', formatted)
示例#9
0
    def test_should_parse_valid_contig_header_fields(self):
        # A ##contig line must end up in the parsed schema as a contig with
        # the given ID and an integer length.
        header_lines = [
            '##fileformat=VCFv4.2\n',
            '##contig=<ID=key,length=666>\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
        ]

        parsed_schema = VCFReader(iter(header_lines)).read_header()

        expected_schema = Schema()
        expected_schema.set_contig('key', 666)
        self.assertEqual(expected_schema, parsed_schema)
示例#10
0
    def test_should_parse_valid_filter_header_fields(self):
        # A ##FILTER line must end up in the parsed schema as a filter with
        # the given ID and description.
        header_lines = [
            '##fileformat=VCFv4.2\n',
            '##FILTER=<ID=key,Description="description">\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
        ]

        parsed_schema = VCFReader(iter(header_lines)).read_header()

        expected_schema = Schema()
        expected_schema.set_filter('key', 'description')
        self.assertEqual(expected_schema, parsed_schema)
示例#11
0
    def test_should_parse_valid_sample_header_fields(self):
        # A ##FORMAT line must end up in the parsed schema as a sample-data
        # field with the given ID, number, type and description.
        header_lines = [
            '##fileformat=VCFv4.2\n',
            '##FORMAT=<ID=key,Number=1,Type=String,Description="description">\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
        ]

        parsed_schema = VCFReader(iter(header_lines)).read_header()

        expected_schema = Schema()
        expected_schema.set_sample_data('key', '1', 'String', 'description')
        self.assertEqual(expected_schema, parsed_schema)
示例#12
0
 def test_should_warn_about_too_many_alts_in_field_of_allelic_cardinality(self, log):
     # The line has two alts but three values for a Number=A field. Each
     # generated record is compared with the matching entry of the expected
     # list, and exactly one warning is expected in the captured log.
     schema = Schema()
     schema.set_info_data('key', 'A', 'String', '')
     columns = ['chrZ', '200', '.', 'C', 'A,T', '.', 'PASS', 'key=a,b,c']
     records = list(generate_records(schema, columns))

     values_per_record = [['a'], ['b']]
     for position, record in enumerate(records):
         self.assertEqual(values_per_record[position], record.info['key'])

     expected_warning = (
         'wecall.vcfutils.fieldmetadata',
         'WARNING',
         'expected 2 items in {!r}'.format([['a'], ['b'], ['c']]),
     )
     log.check(expected_warning)
示例#13
0
    def test_should_parse_well_formatted_file_metadata(self):
        # A plain ##key=value line must be stored under that key in the
        # schema's file_metadata mapping.
        header_lines = [
            '##fileformat=VCFv4.2\n',
            '##fileDate=2013-07-08\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
        ]

        parsed_schema = VCFReader(iter(header_lines)).read_header()

        expected_schema = Schema()
        expected_schema.file_metadata['fileDate'] = '2013-07-08'
        self.assertEqual(expected_schema, parsed_schema)
示例#14
0
    def test_should_write_file_metadata_in_expected_format(self):
        # A file_metadata entry should be written as a ##key=value line
        # between the fileformat line and the column header.
        mock_file = StringIO()
        # isoformat() on a date yields YYYY-MM-DD on every platform; the '%F'
        # strftime directive is a C-library extension that is not available
        # everywhere.
        date = datetime.datetime.utcnow().date().isoformat()
        schema = Schema()
        schema.file_metadata['fileDate'] = date

        writer = VCFWriter(mock_file)
        writer.write_header(schema)

        expected_file = (
            '##fileformat=VCFv4.2\n'
            '##fileDate={date!s}\n'
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n'
        ).format(date=date)
        self.assertEqual(expected_file, mock_file.getvalue())
示例#15
0
 def test_should_warn_when_GT_is_not_present(self, log):
     # Only GL is declared and supplied (no GT). Every generated record is
     # expected to expose the raw GL strings, and one warning about the
     # unknown ploidy is expected in the captured log.
     schema = Schema()
     schema.set_sample_data('GL', 'G', 'Float', '')
     schema.samples = ['foo']
     columns = [
         'chrZ', '200', '.', 'C', 'A,T', '.', 'PASS', '.', 'GL', '1,2,3'
     ]
     records = list(generate_records(schema, columns))

     # The index is included in both tuples so a failure names the record.
     for position, record in enumerate(records):
         self.assertEqual(
             (position, ['1', '2', '3']),
             (position, record.sample_info.get_field('foo', 'GL')))

     expected_warning = (
         'wecall.vcfutils.fieldmetadata',
         'WARNING',
         'Unknown ploidy when parsing genotype likelihood',
     )
     log.check(expected_warning)
示例#16
0
 def test_should_write_empty_file_containing_expected_version_number(self):
     # An empty schema should produce only the fileformat line and the
     # column header line.
     output = StringIO()
     VCFWriter(output).write_header(Schema())
     expected_header = (
         '##fileformat=VCFv4.2\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n'
     )
     self.assertEqual(expected_header, output.getvalue())
示例#17
0
 def __enter__(self):
     """Open the output file, write the VCF header and return the writer.

     When ``self.header`` is None, a default ``Schema`` is created first and
     stamped with today's date under the ``fileDate`` metadata key.

     Returns:
         The ``VCFWriter`` bound to the newly opened file.

     Raises:
         Whatever opening the file or building/writing the header raises;
         in the latter case the file handle is closed before re-raising.
     """
     self.fp = open(self.filename, 'w')
     try:
         if self.header is None:
             self.header = Schema()
             # isoformat() yields YYYY-MM-DD on every platform; the '%F'
             # strftime directive is a C-library extension that is not
             # available everywhere.
             self.header.file_metadata['fileDate'] = \
                 datetime.date.today().isoformat()
         self.vcf_writer = VCFWriter(self.fp)
         self.vcf_writer.write_header(self.header)
     except Exception:
         # __exit__ is not invoked when __enter__ raises, so the handle
         # must be released here to avoid leaking it.
         self.fp.close()
         raise
     return self.vcf_writer
示例#18
0
    def test_should_parse_all_info_header_fields(self):
        # An ##INFO line that also carries Source and Version must be parsed
        # into an info field holding all six values.
        header_lines = [
            '##fileformat=VCFv4.2\n',
            '##INFO=<ID=key,Number=1,Type=String,Description="description",Source="foo",Version="bar">\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
        ]

        parsed_schema = VCFReader(iter(header_lines)).read_header()

        expected_schema = Schema()
        expected_schema.set_info_data(
            'key', '1', 'String', 'description', 'foo', 'bar')
        self.assertEqual(expected_schema, parsed_schema)
示例#19
0
    def test_should_add_default_parsing_rule_for_unknown_key_in_multiallelic_line(self):
        # The schema starts without INFO definitions. Reading the undeclared
        # key from the records is expected to register one definition whose
        # attributes are checked at the end.
        schema = Schema()
        columns = [
            'chrZ', '200', '.', 'C', 'A,T', '.', 'PASS', 'NEW_KEY=value'
        ]
        records = list(generate_records(schema, columns))

        # Nothing has been registered before the INFO values are accessed.
        self.assertEqual(0, len(list(schema.iter_info_data())))
        for position, record in enumerate(records):
            self.assertEqual(
                (position, ['value']), (position, record.info['NEW_KEY']))
        self.assertEqual(1, len(list(schema.iter_info_data())))

        inferred = schema.get_info_data('NEW_KEY')
        self.assertEqual('.', inferred.number)
        self.assertEqual('String', inferred.data_type)
        self.assertEqual(
            'Inferred from file content during parsing', inferred.description)
        self.assertEqual('vcfutils', inferred.source)
        self.assertEqual('undefined', inferred.version)
示例#20
0
    def test_should_parse_column_headers_with_format_but_no_samples(self):
        # A column header ending in FORMAT with no sample names after it
        # must parse to a schema equal to a default one.
        header_lines = [
            '##fileformat=VCFv4.2\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\n',
        ]

        parsed_schema = VCFReader(iter(header_lines)).read_header()

        self.assertEqual(Schema(), parsed_schema)
示例#21
0
    def test_should_parse_well_formatted_version(self):
        # A header made of just the fileformat line and the column header
        # must parse to a schema equal to a default one.
        header_lines = [
            '##fileformat=VCFv4.2\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
        ]

        parsed_schema = VCFReader(iter(header_lines)).read_header()

        self.assertEqual(Schema(), parsed_schema)
示例#22
0
    def test_should_write_sample_names_in_column_header_line(self):
        # With one sample in the schema, the column header must gain a
        # FORMAT column followed by the sample name.
        output = StringIO()
        schema = Schema()
        schema.samples.append('FOO')

        VCFWriter(output).write_header(schema)

        expected_header = (
            '##fileformat=VCFv4.2\n'
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tFOO\n'
        )
        self.assertEqual(expected_header, output.getvalue())
示例#23
0
 def test_should_warn_about_unrecognised_key_in_monoallelic_line(self, log):
     # The INFO key is not declared in the (empty) schema. Its value must
     # still be readable, and one warning is expected in the captured log.
     columns = ['chrZ', '200', '.', 'C', 'T', '.', 'PASS', 'NEW_KEY=value']
     records = list(generate_records(Schema(), columns))

     for position, record in enumerate(records):
         self.assertEqual(
             (position, ['value']), (position, record.info['NEW_KEY']))

     expected_warning = (
         'root',
         'WARNING',
         'info field {!r} not defined in schema'.format('NEW_KEY'),
     )
     log.check(expected_warning)
示例#24
0
 def test_should_return_false_if_one_sample_okay_for_GL(self):
     # One sample column carries real GL values and the other only missing
     # values; the record must not report "no likelihoods".
     schema = Schema()
     schema.set_sample_data('GT', '1', 'String', '')
     schema.set_sample_data('GL', 'G', 'Float', '')
     # NOTE(review): the line below supplies two sample columns but only
     # 'foo' is declared here - confirm this mismatch is intended.
     schema.samples = ['foo']
     columns = [
         'chrZ', '200', '.', 'C', 'A', '.', 'PASS', '.', 'GT:GL',
         '0/1:90,1,120', '0/1:.,.,.'
     ]
     records = list(generate_records(schema, columns))
     first_sample_info = records[0].sample_info
     self.assertFalse(first_sample_info.has_no_likelihoods())
示例#25
0
 def test_should_return_true_if_all_likelihoods_are_none_for_PL(self):
     # The only sample has nothing but missing values in PL, so the record
     # must report that it carries no likelihoods.
     schema = Schema()
     schema.set_sample_data('GT', '1', 'String', '')
     schema.set_sample_data('PL', 'G', 'Float', '')
     schema.samples = ['foo']
     columns = [
         'chrZ', '200', '.', 'C', 'A', '.', 'PASS', '.', 'GT:PL',
         '0/1:.,.,.'
     ]
     records = list(generate_records(schema, columns))
     first_sample_info = records[0].sample_info
     self.assertTrue(first_sample_info.has_no_likelihoods())
示例#26
0
    def test_should_drop_genotype_likelihood_with_mismatch_ploidy(self):
        # Four GL values are supplied for a two-alt line; both generated
        # records are expected to keep their genotype but hold only None
        # likelihoods.
        schema = Schema()
        schema.set_sample_data('GT', '1', 'String', '')
        schema.set_sample_data('GL', 'G', 'Float', '')
        schema.samples = ['foo']
        columns = [
            'chrZ', '200', '.', 'C', 'A,T', '.', 'PASS', '.', 'GT:GL',
            '0/1:1,2,3,4'
        ]
        records = list(generate_records(schema, columns))

        # (genotype, likelihoods) expected for records[0] and records[1].
        expected_per_record = (
            ("0/1", [None, None, None]),
            ("0/0", [None, None, None]),
        )
        for position, (genotype, likelihoods) in enumerate(expected_per_record):
            self.assertEqual(
                GenotypeCall(genotype),
                records[position].sample_info.get_field('foo', 'GT'))
            self.assertEqual(
                likelihoods,
                records[position].sample_info.get_field('foo', 'GL'))
示例#27
0
    def test_should_split_genotype_likelihood_properly(self):
        # Six GL values are supplied for a two-alt line; each generated
        # record is expected to receive its own three likelihoods.
        schema = Schema()
        schema.set_sample_data('GT', '1', 'String', '')
        schema.set_sample_data('GL', 'G', 'Float', '')
        schema.samples = ['foo']
        columns = [
            'chrZ', '200', '.', 'C', 'A,T', '.', 'PASS', '.', 'GT:GL',
            '0/1:1,2,3,4,5,6'
        ]
        records = list(generate_records(schema, columns))

        # (genotype, likelihoods) expected for records[0] and records[1].
        expected_per_record = (
            ("0/1", [1.0, 2.0, 3.0]),
            ("0/0", [1.0, 4.0, 6.0]),
        )
        for position, (genotype, likelihoods) in enumerate(expected_per_record):
            self.assertEqual(
                GenotypeCall(genotype),
                records[position].sample_info.get_field('foo', 'GT'))
            self.assertEqual(
                likelihoods,
                records[position].sample_info.get_field('foo', 'GL'))
示例#28
0
 def test_should_format_a_string_list(self):
     # A list of strings must be written as key=value,value.
     schema = Schema()
     schema.set_info_data('K', 'A', 'String', 'K')
     values = {'K': ['V1', 'V2']}
     formatted = InfoData(schema, values).to_vcf()
     self.assertEqual('K=V1,V2', formatted)
示例#29
0
 def test_should_format_an_int_list(self):
     # A list of ints must be written as key=value,value,value.
     schema = Schema()
     schema.set_info_data('K', 'A', 'Integer', 'K')
     values = {'K': [1, 2, 3]}
     formatted = InfoData(schema, values).to_vcf()
     self.assertEqual('K=1,2,3', formatted)
示例#30
0
 def test_should_format_a_float_list(self):
     # A list of floats must be written as key=value,value,value.
     schema = Schema()
     # The field is declared as Float so the declared type matches the
     # float values under test (it was previously declared as Integer).
     schema.set_info_data('K', 'A', 'Float', 'K')
     info_data = InfoData(schema, {'K': [1.0, 2.66, 3.0]})
     self.assertEqual('K=1.0,2.66,3.0', info_data.to_vcf())