def test_create_processed_variant_no_change(self):
     """Checks the factory output when alternate INFO splitting is disabled.

     With `split_alternate_allele_info_fields=False`, the processed variant is
     compared against a hand-built `ProcessedVariant` whose INFO fields all
     live in `_non_alt_info` and whose `_alternate_datas` carry only the
     alternate bases. The variant counter must read 1 and both annotation
     counters must stay at 0.
     """
     sample_variant = self._get_sample_variant()
     empty_headers = vcf_header_parser.HeaderFields({}, {})
     spy_factory = _CounterSpyFactory()
     variant_factory = processed_variant.ProcessedVariantFactory(
         empty_headers,
         split_alternate_allele_info_fields=False,
         counter_factory=spy_factory)
     actual = variant_factory.create_processed_variant(sample_variant)

     # Build the expected object by hand: same underlying variant, INFO copied
     # verbatim, and one bare AlternateBaseData per alternate allele.
     expected = processed_variant.ProcessedVariant(sample_variant)
     expected._non_alt_info = {
         'A1': 'some data',
         'A2': ['data1', 'data2']
     }
     expected._alternate_datas = [
         processed_variant.AlternateBaseData('A'),
         processed_variant.AlternateBaseData('TT'),
     ]
     self.assertEqual([expected], [actual])

     expected_counts = [
         (CEnum.VARIANT, 1),
         (CEnum.ANNOTATION_ALT_MATCH, 0),
         (CEnum.ANNOTATION_ALT_MISMATCH, 0),
     ]
     for counter_enum, expected_value in expected_counts:
         self.assertEqual(
             spy_factory.counter_map[counter_enum.value].get_value(),
             expected_value)
 def test_no_header_fields(self):
     """Checks the schema generated from headers with no INFO/FORMAT entries.

     The generated schema must match `_generate_expected_fields()` called with
     no arguments.
     """
     empty_headers = vcf_header_parser.HeaderFields({}, {})
     expected_fields = self._generate_expected_fields()
     variant_factory = processed_variant.ProcessedVariantFactory(empty_headers)
     actual_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
         empty_headers, variant_factory)
     self._assert_fields_equal(expected_fields, actual_schema)
 def test_bigquery_field_name_sanitize(self):
     """Checks that header IDs are mapped to the expected schema field names.

     INFO and FORMAT IDs containing characters such as `-`, `*` and `^`, or
     starting with `_` or a digit, are expected to show up under the sanitized
     names listed below, while IDs like `OK_info_09` are expected unchanged.
     """
     infos = OrderedDict([
         ('_', Info('_', 1, 'String', 'desc', 'src', 'v')),
         ('_A', Info('_A', 1, 'String', 'desc', 'src', 'v')),
         ('0a', Info('0a', 1, 'String', 'desc', 'src', 'v')),
         ('A-B*C', Info('A-B*C', 1, 'String', 'desc', 'src', 'v')),
         ('I-A', Info('I-A', field_counts['A'], 'Float', 'desc', 'src',
                      'v')),
         # INFO entries are all built with `Info` so the header map is
         # homogeneous; `Format` is reserved for the FORMAT map below.
         ('OK_info_09', Info('OK_info_09', 1, 'String', 'desc', 'src', 'v'))
     ])
     formats = OrderedDict([('a^b', Format('a^b', 1, 'String', 'desc')),
                            ('OK_format_09',
                             Format('OK_format_09', 1, 'String', 'desc'))])
     header_fields = vcf_header_parser.HeaderFields(infos, formats)
     expected_fields = self._generate_expected_fields(
         alt_fields=['I_A'],
         call_fields=['a_b', 'OK_format_09'],
         info_fields=[
             'field__', 'field__A', 'field_0a', 'A_B_C', 'OK_info_09'
         ])
     self._assert_fields_equal(
         expected_fields,
         bigquery_vcf_schema.generate_schema_from_header_fields(
             header_fields,
             processed_variant.ProcessedVariantFactory(header_fields)))
    def test_info_header_fields(self):
        """Checks schema generation from INFO-only headers.

        Exercises the default factory (fields with count `A` are expected as
        alternate fields) and a factory with
        `split_alternate_allele_info_fields=False` (the same fields are
        expected as ordinary INFO fields). For the latter schema, the type and
        mode of each listed field found in `actual_schema.fields` is verified.
        """
        per_alt_count = field_counts['A']
        infos = OrderedDict([
            ('I1', Info('I1', 1, 'String', 'desc', 'src', 'v')),
            ('I2', Info('I2', 2, 'Integer', 'desc', 'src', 'v')),
            ('IA', Info('IA', per_alt_count, 'Float', 'desc', 'src', 'v')),
            ('IU',
             Info('IU', field_counts['.'], 'Character', 'desc', 'src', 'v')),
            ('IG',
             Info('IG', field_counts['G'], 'String', 'desc', 'src', 'v')),
            ('I0', Info('I0', 0, 'Flag', 'desc', 'src', 'v')),
            ('IA2', Info('IA2', per_alt_count, 'Float', 'desc', 'src', 'v')),
            # END should not be included in the generated schema.
            ('END', Info('END', 1, 'Integer', 'Special END key', 'src', 'v'))
        ])
        header_fields = vcf_header_parser.HeaderFields(infos, {})

        # Default factory settings.
        expected_split = self._generate_expected_fields(
            alt_fields=['IA', 'IA2'],
            info_fields=['I1', 'I2', 'IU', 'IG', 'I0'])
        split_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
            header_fields,
            processed_variant.ProcessedVariantFactory(header_fields))
        self._assert_fields_equal(expected_split, split_schema)

        # Test with split_alternate_allele_info_fields=False.
        non_split_factory = processed_variant.ProcessedVariantFactory(
            header_fields, split_alternate_allele_info_fields=False)
        actual_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
            header_fields, non_split_factory)
        expected_non_split = self._generate_expected_fields(
            info_fields=['I1', 'I2', 'IA', 'IU', 'IG', 'I0', 'IA2'])
        self._assert_fields_equal(expected_non_split, actual_schema)

        # Verify types and modes.
        nullable = TableFieldConstants.MODE_NULLABLE
        repeated = TableFieldConstants.MODE_REPEATED
        expected_type_modes = {
            'I1': (TableFieldConstants.TYPE_STRING, nullable),
            'I2': (TableFieldConstants.TYPE_INTEGER, repeated),
            'IA': (TableFieldConstants.TYPE_FLOAT, repeated),
            'IU': (TableFieldConstants.TYPE_STRING, repeated),
            'IG': (TableFieldConstants.TYPE_STRING, repeated),
            'I0': (TableFieldConstants.TYPE_BOOLEAN, nullable),
            'IA2': (TableFieldConstants.TYPE_FLOAT, repeated),
        }
        for schema_field in actual_schema.fields:
            # Fields outside the INFO set above are not checked here.
            if schema_field.name not in expected_type_modes:
                continue
            want_type, want_mode = expected_type_modes[schema_field.name]
            self.assertEqual(want_type, schema_field.type)
            self.assertEqual(want_mode, schema_field.mode)
 def _get_row_list_from_variant(self, variant, **kwargs):
     """Returns the list of rows produced for `variant`.

     The variant is first turned into a processed variant by a factory built
     from empty header fields; `kwargs` are forwarded unchanged to
     `bigquery_vcf_schema.get_rows_from_variant`.
     """
     # TODO(bashir2): To make this more of a "unit" test, we should create
     # ProcessedVariant instances directly (instead of Variant) and avoid calling
     # create_processed_variant here. Then we should also add cases that
     # have annotation fields.
     empty_headers = vcf_header_parser.HeaderFields({}, {})
     variant_factory = processed_variant.ProcessedVariantFactory(empty_headers)
     processed = variant_factory.create_processed_variant(variant)
     row_iter = bigquery_vcf_schema.get_rows_from_variant(processed, **kwargs)
     return list(row_iter)
 def test_create_processed_variant_move_alt_info(self):
     """Checks that per-alternate INFO values are moved onto each alternate.

     With `split_alternate_allele_info_fields=True`, the sample variant's `A2`
     values are expected to be distributed one per alternate base ('A' and
     'TT') and `A2` must no longer appear in `non_alt_info`.
     """
     variant = self._get_sample_variant()
     header_fields = vcf_header_parser.HeaderFields({}, {})
     factory = processed_variant.ProcessedVariantFactory(
         header_fields, split_alternate_allele_info_fields=True)
     proc_var = factory.create_processed_variant(variant)
     alt1 = processed_variant.AlternateBaseData('A')
     alt1._info = {'A2': 'data1'}
     alt2 = processed_variant.AlternateBaseData('TT')
     alt2._info = {'A2': 'data2'}
     self.assertEqual(proc_var.alternate_data_list, [alt1, alt2])
     # `dict.has_key` does not exist on Python 3; a membership assertion works
     # on both major versions and reports the offending key on failure.
     self.assertNotIn('A2', proc_var.non_alt_info)
 def _get_sample_variant_and_header_with_csq(self):
     """Returns `(variant, header_fields)` with a `CSQ` annotation field.

     The sample variant gets a `CSQ` INFO entry holding three pipe-separated
     annotation strings, and the returned header fields describe `CSQ` with
     the format `Allele|Consequence|IMPACT|SYMBOL|Gene` in its description.
     """
     csq_description = 'some desc Allele|Consequence|IMPACT|SYMBOL|Gene'
     csq_annotations = ['A|C1|I1|S1|G1', 'TT|C2|I2|S2|G2', 'A|C3|I3|S3|G3']

     sample = self._get_sample_variant()
     sample.info['CSQ'] = vcfio.VariantInfo(
         data=csq_annotations, field_count='.')

     csq_header = parser._Info(
         id=None,
         num='.',
         type=None,
         desc=csq_description,
         source=None,
         version=None)
     headers = vcf_header_parser.HeaderFields(
         infos={'CSQ': csq_header}, formats={})
     return sample, headers
 def test_variant_merger_modify_schema(self):
     """Checks schema generation when a variant merger is supplied.

     With `_DummyVariantMergeStrategy` passed as `variant_merger`, the schema
     is expected to contain an extra `ADDED_BY_MERGER` INFO field on top of
     the fields derived from the headers.
     """
     info_i1 = Info('I1', 1, 'String', 'desc', 'src', 'v')
     info_ia = Info('IA', field_counts['A'], 'Integer', 'desc', 'src', 'v')
     infos = OrderedDict([('I1', info_i1), ('IA', info_ia)])
     formats = OrderedDict([('F1', Format('F1', 1, 'String', 'desc'))])
     header_fields = vcf_header_parser.HeaderFields(infos, formats)

     expected_fields = self._generate_expected_fields(
         alt_fields=['IA'],
         call_fields=['F1'],
         info_fields=['I1', 'ADDED_BY_MERGER'])
     actual_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
         header_fields,
         processed_variant.ProcessedVariantFactory(header_fields),
         variant_merger=_DummyVariantMergeStrategy())
     self._assert_fields_equal(expected_fields, actual_schema)
Example #9
0
 def test_convert_variant_to_bigquery_row(self):
   """Runs three sample variants through `ConvertToBigQueryTableRow`.

   Each sample helper returns a `(variant, expected_row)` pair. The variants
   are turned into processed variants (one fresh factory per variant), fed
   through a test pipeline, and the output is compared with the expected rows.
   """
   samples = [
       self._get_sample_variant_1(),
       self._get_sample_variant_2(),
       self._get_sample_variant_3(),
   ]
   header_fields = vcf_header_parser.HeaderFields({}, {})
   processed_variants = []
   expected_rows = []
   for variant, expected_row in samples:
     variant_factory = processed_variant.ProcessedVariantFactory(header_fields)
     processed_variants.append(
         variant_factory.create_processed_variant(variant))
     expected_rows.append(expected_row)

   pipeline = TestPipeline()
   bigquery_rows = (
       pipeline
       | Create(processed_variants)
       | 'ConvertToRow' >> ParDo(ConvertToBigQueryTableRow()))
   assert_that(bigquery_rows, equal_to(expected_rows))
   pipeline.run()
    def _get_row_list_from_variant(self,
                                   variant,
                                   schema_descriptor=None,
                                   allow_incompatible_records=False,
                                   **kwargs):
        """Returns the list of rows produced for `variant`.

        The variant is converted to a processed variant using a factory built
        from empty header fields. A falsy `schema_descriptor` is replaced by a
        `MockSchemaDescriptor`. The descriptor, `self._conflict_resolver`,
        `allow_incompatible_records` and `kwargs` are passed on to
        `bigquery_vcf_schema.get_rows_from_variant`.
        """
        # TODO(bashir2): To make this more of a "unit" test, we should create
        # ProcessedVariant instances directly (instead of Variant) and avoid calling
        # create_processed_variant here. Then we should also add cases that
        # have annotation fields.
        empty_headers = vcf_header_parser.HeaderFields({}, {})
        variant_factory = processed_variant.ProcessedVariantFactory(
            empty_headers)
        processed = variant_factory.create_processed_variant(variant)
        descriptor = (
            schema_descriptor or
            mock_bigquery_schema_descriptor.MockSchemaDescriptor())
        row_iter = bigquery_vcf_schema.get_rows_from_variant(
            processed, descriptor, self._conflict_resolver,
            allow_incompatible_records, **kwargs)
        return list(row_iter)
 def test_info_and_format_header_fields(self):
     """Checks schema generation when both INFO and FORMAT headers exist.

     `GT` and `PS` are declared as FORMAT headers but are expected to be
     absent from the generated call fields.
     """
     info_i1 = Info('I1', 1, 'String', 'desc', 'src', 'v')
     info_ia = Info('IA', field_counts['A'], 'Integer', 'desc', 'src', 'v')
     infos = OrderedDict([('I1', info_i1), ('IA', info_ia)])
     # GT and PS should not be set as they're already included in special
     # 'genotype' and 'phaseset' fields.
     formats = OrderedDict([
         ('F1', Format('F1', 1, 'String', 'desc')),
         ('F2', Format('F2', 2, 'Integer', 'desc')),
         ('FU', Format('FU', field_counts['.'], 'Float', 'desc')),
         ('GT', Format('GT', 2, 'Integer', 'Special GT key')),
         ('PS', Format('PS', 1, 'Integer', 'Special PS key'))
     ])
     header_fields = vcf_header_parser.HeaderFields(infos, formats)

     expected_fields = self._generate_expected_fields(
         alt_fields=['IA'],
         call_fields=['F1', 'F2', 'FU'],
         info_fields=['I1'])
     actual_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
         header_fields,
         processed_variant.ProcessedVariantFactory(header_fields))
     self._assert_fields_equal(expected_fields, actual_schema)
 def test_create_processed_variant_annotation_alt_allele_num(self):
     """Checks annotation-to-alternate matching driven by `ALLELE_NUM`.

     The factory is built with `use_allele_num=True` for the `CSQ` annotation
     field. Of the six annotations, the two with ALLELE_NUM 1 and 2 are
     expected on the first and second alternate respectively; the other four
     carry invalid ALLELE_NUM values and are expected to be counted under
     `ALLELE_NUM_INCORRECT` only.
     """
     csq_info = parser._Info(
         id=None,
         num='.',
         type=None,
         desc='some desc Allele|Consequence|IMPACT|ALLELE_NUM',
         source=None,
         version=None)
     header_fields = vcf_header_parser.HeaderFields(infos={'CSQ': csq_info},
                                                    formats={})
     variant = vcfio.Variant(
         reference_name='19',
         start=11,
         end=12,
         reference_bases='C',
         # The following represent a SNV and an insertion, resp.
         alternate_bases=['T', 'CT'],
         names=['rs1'],
         quality=2,
         filters=['PASS'],
         # Note that in the minimal mode of VEP, 'T' is an ambiguous annotation
         # ALT because it can map to either the 'T' SNV or the 'CT' insertion.
         # But because there is ALLELE_NUM there should be no ambiguity.
         # The last four annotations have incorrect ALLELE_NUMs.
         info={
             'CSQ':
             vcfio.VariantInfo(data=[
                 'T|C1|I1|1', 'T|C2|I2|2', 'T|C3|I3|0', 'T|C4|I4|3',
                 'T|C5|I5|TEST', 'T|C6|I6|'
             ],
                               field_count='.')
         })
     counter_factory = _CounterSpyFactory()
     factory = processed_variant.ProcessedVariantFactory(
         header_fields,
         split_alternate_allele_info_fields=True,
         annotation_fields=['CSQ'],
         use_allele_num=True,
         minimal_match=True,  # This should be ignored by the factory method.
         counter_factory=counter_factory)
     proc_var = factory.create_processed_variant(variant)
     alt1 = processed_variant.AlternateBaseData('T')
     alt1._info = {
         'CSQ': [{
             processed_variant._ANNOTATION_ALT: 'T',
             'Consequence': 'C1',
             'IMPACT': 'I1',
             'ALLELE_NUM': '1'
         }]
     }
     alt2 = processed_variant.AlternateBaseData('CT')
     alt2._info = {
         'CSQ': [{
             processed_variant._ANNOTATION_ALT: 'T',
             'Consequence': 'C2',
             'IMPACT': 'I2',
             'ALLELE_NUM': '2'
         }]
     }
     self.assertEqual(proc_var.alternate_data_list, [alt1, alt2])
     # `dict.has_key` does not exist on Python 3; a membership assertion works
     # on both major versions and reports the offending key on failure.
     self.assertNotIn('CSQ', proc_var.non_alt_info)

     # Expected counter readings, checked in this order.
     expected_counts = [
         (CEnum.VARIANT, 1),
         (CEnum.ANNOTATION_ALT_MATCH, 2),
         (CEnum.ANNOTATION_ALT_MISMATCH, 0),
         (CEnum.ANNOTATION_ALT_MINIMAL_MATCH, 0),
         (CEnum.ANNOTATION_ALT_MINIMAL_AMBIGUOUS, 0),
         (CEnum.ALLELE_NUM_INCORRECT, 4),
     ]
     for counter_enum, expected_value in expected_counts:
         self.assertEqual(
             counter_factory.counter_map[counter_enum.value].get_value(),
             expected_value)