def _get_sample_variant_and_header_with_csq(self, additional_infos=None):
        """Returns a sample variant with `CSQ` annotations and a matching header.

        The variant obtained from `self._get_sample_variant()` is given three
        pipe-delimited `CSQ` entries. The header defines `A1`, `A2` and `CSQ`;
        the `CSQ` description carries the annotation names
        `Allele|Consequence|IMPACT|SYMBOL|Gene`.

        Args:
          additional_infos: Optional list of (key, `Info`) tuples added to the
            header infos after the defaults; an existing key is overwritten.

        Returns:
          A `(variant, header)` tuple, where `header` is the
          `vcf_header_io.VcfHeader` built from the infos.
        """
        sample_variant = self._get_sample_variant()
        sample_variant.info['CSQ'] = [
            'A|C1|I1|S1|G1', 'TT|C2|I2|S2|G2', 'A|C3|I3|S3|G3'
        ]

        # Entries are inserted one by one so the header keeps this exact order.
        header_infos = OrderedDict()
        header_infos['A1'] = Info('A1', 1, None, '', None, None)
        header_infos['A2'] = Info(
            'A2', parser.field_counts['A'], None, '', None, None)
        header_infos['CSQ'] = Info(
            'CSQ', parser.field_counts['.'], None,
            'some desc Allele|Consequence|IMPACT|SYMBOL|Gene', None, None)
        if additional_infos is not None:
            for info_key, info in additional_infos:
                header_infos[info_key] = info

        return sample_variant, vcf_header_io.VcfHeader(infos=header_infos)
示例#2
0
    def test_infer_annotation_pipeline(self):
        """Runs `InferHeaderFields` over two variants carrying `CSQ` values.

        Only annotation types are requested (`infer_headers=False`). Judging by
        the expected header, a column mixing integer- and float-looking values
        ends up as `Float`, non-numeric text forces `String`, and empty values
        do not influence the result.
        """
        defined_header = self._get_sample_header_fields(with_annotation=True)

        first_variant = self._get_sample_variant_1()
        first_variant.info['CSQ'] = [
            'A|1|100|1.2', 'A|2|101|1.3', 'A|12|start|0', 'TT|13|end|7'
        ]
        # Same base variant, but with empty scores and float-looking genes.
        second_variant = self._get_sample_variant_1()
        second_variant.info['CSQ'] = [
            'A|1|100|', 'A|2|101|', 'A|1.2|102|0', 'TT|1.3|103|7'
        ]

        description = 'Inferred type field for annotation {}.'
        expected_infos = {
            'CSQ_Gene_TYPE':
            Info('CSQ_Gene_TYPE', 1, 'Float', description.format('Gene'),
                 '', ''),
            'CSQ_Position_TYPE':
            Info('CSQ_Position_TYPE', 1, 'String',
                 description.format('Position'), '', ''),
            'CSQ_Score_TYPE':
            Info('CSQ_Score_TYPE', 1, 'Float', description.format('Score'),
                 '', ''),
        }
        expected_header = vcf_header_io.VcfHeader(infos=expected_infos)

        with TestPipeline() as pipeline:
            variants = pipeline | Create([first_variant, second_variant])
            inferred_headers = (
                variants
                | 'InferAnnotationTypes' >> infer_headers.InferHeaderFields(
                    defined_headers=defined_header,
                    infer_headers=False,
                    annotation_fields_to_infer=['CSQ']))
            assert_that(
                inferred_headers,
                asserts.header_fields_equal_ignore_order([expected_header]))
            # NOTE(review): the `with` block presumably runs the pipeline on
            # exit too, which would make this explicit run redundant - confirm.
            pipeline.run()
 def test_bigquery_field_name_sanitize(self):
     """Checks that header keys are sanitized into BigQuery field names.

     According to the expectations below, keys starting with `_` or a digit
     get a `field_` prefix, and characters such as `-`, `*` and `^` are
     replaced by `_`. Keys that are already valid are kept unchanged.
     """
     infos = OrderedDict([
         ('_', Info('_', 1, 'String', 'desc', 'src', 'v')),
         ('_A', Info('_A', 1, 'String', 'desc', 'src', 'v')),
         ('0a', Info('0a', 1, 'String', 'desc', 'src', 'v')),
         ('A-B*C', Info('A-B*C', 1, 'String', 'desc', 'src', 'v')),
         ('I-A', Info('I-A', field_counts['A'], 'Float', 'desc', 'src',
                      'v')),
         # This is an INFO entry, so it is built with `Info` like its
         # siblings (it was previously constructed with `Format`).
         ('OK_info_09', Info('OK_info_09', 1, 'String', 'desc', 'src', 'v'))
     ])
     formats = OrderedDict([('a^b', Format('a^b', 1, 'String', 'desc')),
                            ('OK_format_09',
                             Format('OK_format_09', 1, 'String', 'desc'))])
     header_fields = vcf_header_parser.HeaderFields(infos, formats)
     self._assert_fields_equal(
         self._generate_expected_fields(alt_fields=['I_A'],
                                        call_fields=['a_b', 'OK_format_09'],
                                        info_fields=[
                                            'field__', 'field__A',
                                            'field_0a', 'A_B_C',
                                            'OK_info_09'
                                        ]),
         bigquery_vcf_schema.generate_schema_from_header_fields(
             header_fields,
             processed_variant.ProcessedVariantFactory(header_fields)))
    def test_header_fields_inferred_from_two_variants(self):
        """Infers headers from two sample variants with no defined header.

        With `defined_headers=None`, the transform is expected to emit a
        single `VcfHeader` holding the info and format fields listed below.
        """
        with TestPipeline() as pipeline:
            sample_variants = [
                self._get_sample_variant_1(),
                self._get_sample_variant_2(),
            ]
            inferred_headers = (
                pipeline
                | Create(sample_variants)
                | 'InferUndefinedHeaderFields' >>
                infer_undefined_headers.InferUndefinedHeaderFields(
                    defined_headers=None))

            expected_header = vcf_header_io.VcfHeader(
                infos={
                    'IS': Info('IS', 1, 'String', '', '', ''),
                    'IF': Info('IF', 0, 'Flag', '', '', ''),
                    'IA': Info('IA', None, 'Float', '', '', ''),
                    'IS_2': Info('IS_2', 1, 'String', '', '', ''),
                },
                formats={
                    'FI': Format('FI', 1, 'Integer', ''),
                    'FU': Format('FU', None, 'Float', ''),
                    'FI_2': Format('FI_2', 1, 'Integer', ''),
                })
            assert_that(inferred_headers, equal_to([expected_header]))
            # NOTE(review): the `with` block presumably runs the pipeline on
            # exit too, which would make this explicit run redundant - confirm.
            pipeline.run()
 def _get_sample_header_fields(self):
   """Provides a simple `VcfHeader` with three info and five format fields.

   Each `Info`/`Format` id matches the key it is stored under. `IS` and `IF`
   previously both carried the copy-pasted id `I1`.

   Returns:
     A `vcf_header_io.VcfHeader` built from the sample infos and formats.
   """
   infos = OrderedDict([
       ('IS', Info('IS', 1, 'String', 'desc', 'src', 'v')),
       # NOTE(review): a `Flag` field normally has Number=0 in VCF; the count
       # of 1 is kept as it was - confirm whether this is intentional.
       ('IF', Info('IF', 1, 'Flag', 'desc', 'src', 'v')),
       ('IA', Info('IA', field_counts['A'], 'Integer', 'desc', 'src', 'v'))])
   formats = OrderedDict([
       ('FS', Format('FS', 1, 'String', 'desc')),
       ('FI', Format('FI', 2, 'Integer', 'desc')),
       ('FU', Format('FU', field_counts['.'], 'Float', 'desc')),
       ('GT', Format('GT', 2, 'Integer', 'Special GT key')),
       ('PS', Format('PS', 1, 'Integer', 'Special PS key'))])
   return vcf_header_io.VcfHeader(infos=infos, formats=formats)
 def test_create_alt_bases_field_schema_types(self):
     """Checks the types of the `CSQ` sub-fields in the alt-bases schema.

     Four `CSQ_<name>_TYPE` infos are added to the sample header. Any header
     info whose type is still `None` is set to `String` before the factory is
     built, and each sub-field of the resulting `CSQ` record is compared
     against the expected BigQuery type.
     """
     typed_ids = zip(
         ['CSQ_Allele_TYPE', 'CSQ_Consequence_TYPE', 'CSQ_IMPACT_TYPE',
          'CSQ_SYMBOL_TYPE'],
         ['String', 'Integer', 'Integer', 'Float'])
     type_infos = [(info_id, Info(info_id, 1, info_type, '', None, None))
                   for info_id, info_type in typed_ids]
     _, header_fields = self._get_sample_variant_and_header_with_csq(
         additional_infos=type_infos)

     for header_info in header_fields.infos.values():
         if header_info['type'] is None:
             header_info['type'] = 'String'

     variant_factory = processed_variant.ProcessedVariantFactory(
         header_fields,
         split_alternate_allele_info_fields=True,
         annotation_fields=['CSQ'])
     alt_schema = variant_factory.create_alt_bases_field_schema()
     csq_record = [candidate for candidate in alt_schema.fields
                   if candidate.name == 'CSQ'][0]

     expected_type_by_name = {
         'CSQ': 'RECORD',
         'allele': 'STRING',
         'Consequence': 'INTEGER',
         'IMPACT': 'INTEGER',
         'SYMBOL': 'FLOAT',
         'Gene': 'STRING'
     }
     for sub_field in csq_record.fields:
         self.assertEqual(sub_field.type,
                          expected_type_by_name[sub_field.name])
示例#7
0
    def _infer_undefined_info_fields(self, variant, defined_headers):
        """Returns info fields not defined in the headers.

        Args:
          variant (:class:`vcfio.Variant`): variant obj.
          defined_headers (:class:`vcf_header_io.VcfHeader`): header fields
            defined in header section of VCF files. May be falsy, in which
            case every info field of `variant` is treated as undefined.

        Returns:
          A dict of (info_key(str), :class:`Info`) for any info field in
          `variant` that is not defined in the header.

        Raises:
          ValueError: if the same info key is encountered more than once.
        """
        infos = {}
        # `items()` exists on both Python 2 and Python 3 mappings, whereas
        # `iteritems()` is Python 2 only.
        for info_field_key, variant_info in variant.info.items():
            info_field_value = variant_info.data
            if defined_headers and info_field_key in defined_headers.infos:
                continue
            if info_field_key in infos:
                raise ValueError(
                    'Invalid VCF file. Duplicate INFO field in variant {}'.
                    format(variant))
            infos[info_field_key] = Info(
                info_field_key,
                self._get_field_count(info_field_value),
                self._get_field_type(info_field_value),
                '',  # NO_DESCRIPTION
                '',  # UNKNOWN_SOURCE
                '')  # UNKNOWN_VERSION
        return infos
  def test_report_multiple_files(self):
    """Checks the conflict report when one definition comes from two files.

    `NS` is defined as Float in `file1` and `file2` and as Integer in `file3`,
    and is resolved to Float. The expected report lists one row per file.
    """
    delimiter = preprocess_reporter._DELIMITER

    header_definitions = merge_header_definitions.VcfHeaderDefinitions()
    header_definitions._infos = {
        'NS': {Definition(1, 'Float'): ['file1', 'file2'],
               Definition(1, 'Integer'): ['file3']}
    }
    resolved_headers = VcfHeader(infos=OrderedDict([
        ('NS', Info('NS', 1, 'Float', 'Number samples', None, None))]))

    conflict_rows = [
        ['NS', 'INFO', 'num=1 type=Float', 'file1', 'num=1 type=Float\n'],
        [' ', ' ', ' ', 'file2', ' \n'],
        [' ', ' ', 'num=1 type=Integer', 'file3', ' \n'],
    ]
    expected = [
        preprocess_reporter._InconsistencyType.HEADER_CONFLICTS + '\n',
        preprocess_reporter._HeaderLine.CONFLICTS_HEADER + '\n',
    ]
    expected.extend(delimiter.join(row) for row in conflict_rows)
    expected.append('\n')

    self._generate_report_and_assert_contents_equal(
        expected, header_definitions, resolved_headers)
示例#9
0
    def _infer_annotation_type_info_fields(self, variant, infos,
                                           defined_headers):
        # type: (vcfio.Variant, Dict[str, Info], vcf_header_io.VcfHeader) -> None
        """Adds inferred annotation type info fields to `infos` in place.

        For every annotation field listed in `self._annotation_fields_to_infer`
        that is present in `variant`, each annotation name taken from the
        field's header description yields one `Info` entry. Its ID is derived
        from the annotation field and name, and its type is the result of
        merging the inferred types of all non-empty values the variant holds
        for that name.

        Args:
          variant: variant object.
          infos: dict of (info_key, `Info`) that receives the new entries.
          defined_headers: header fields defined in header section of VCF
            files; the annotation names are read from its info descriptions.

        Raises:
          ValueError: if the annotation names and the value lists of the
            variant do not all have the same length.
        """
        def _ensure_consistent_lengths(names, values):
            lengths = set(len(v) for v in values)
            lengths.add(len(names))
            if len(lengths) == 1:
                return
            raise ValueError(
                ('Annotation lists have inconsistent lengths: {}.\nnames={}\n'
                 'values={}').format(lengths, names, values))

        def _merge_value_types(values):
            # Empty values are skipped; once the merged type is a string no
            # further value is inspected.
            merged_type = None
            for value in values:
                if not value:
                    continue
                merged_type = resolver.resolve_attribute_conflict(
                    _HeaderKeyConstants.TYPE, merged_type,
                    self._get_field_type(value))
                if merged_type == _HeaderTypeConstants.STRING:
                    break
            return merged_type

        resolver = vcf_field_conflict_resolver.FieldConflictResolver(
            resolve_always=True)
        for field in self._annotation_fields_to_infer:
            if field not in variant.info:
                continue
            header_description = (
                defined_headers.infos[field][_HeaderKeyConstants.DESC])
            annotation_names = annotation_parser.extract_annotation_names(
                header_description)
            # First element (ALT) is ignored, since its type is hard-coded as
            # string.
            rows = [
                annotation_parser.extract_annotation_list_with_alt(entry)[1:]
                for entry in variant.info[field]
            ]
            _ensure_consistent_lengths(annotation_names, rows)
            # Transpose so each item groups all values of one annotation name.
            columns = zip(*rows)
            for name, column in zip(annotation_names, columns):
                merged_type = _merge_value_types(column)
                key_id = get_inferred_annotation_type_header_key(field, name)
                infos[key_id] = Info(
                    key_id,
                    1,  # field count
                    merged_type,
                    ('Inferred type field for annotation {}.'.format(name)),
                    '',  # UNKNOWN_SOURCE
                    '')  # UNKNOWN_VERSION
示例#10
0
 def test_infer_annotation_types_with_multiple_annotation_fields(self):
     """Checks type inference when two annotation fields are requested.

     Both `CSQ` and `CSQ_VT` carry the same annotation values, so each is
     expected to yield Integer/Integer/Float for Gene/Position/Score.
     """
     anno_fields = ['CSQ', 'CSQ_VT']
     csq_vt = [
         ('CSQ_VT',
          Info('CSQ_VT', -1, 'String',
               'Annotations from VEP. Format: Allele|Gene|Position|Score',
               'source', 'v'))
     ]
     header = self._get_sample_header_fields(with_annotation=csq_vt)
     variant = self._get_sample_variant_1()
     variant.info['CSQ_VT'] = ['A|1|100|1.2', 'A|2|101|1.3']
     variant.info['CSQ'] = ['A|1|100|1.2', 'A|2|101|1.3']
     infer_header_fields = infer_headers._InferHeaderFields(
         False, anno_fields)
     inferred_headers = next(infer_header_fields.process(variant, header))
     expected_types = {
         'CSQ_Gene_TYPE': 'Integer',
         'CSQ_Position_TYPE': 'Integer',
         'CSQ_Score_TYPE': 'Float',
         'CSQ_VT_Gene_TYPE': 'Integer',
         'CSQ_VT_Position_TYPE': 'Integer',
         'CSQ_VT_Score_TYPE': 'Float'
     }
     # Comparing whole dicts reports missing, unexpected and mistyped keys as
     # one assertion failure (an unexpected key used to raise KeyError).
     # `items()` also works on Python 3, unlike `iteritems()`.
     inferred_types = {
         key: item['type'] for key, item in inferred_headers.infos.items()
     }
     self.assertEqual(expected_types, inferred_types)
  def test_report_conflicted_and_inferred_headers(self):
    """Checks a report containing both a conflict and an undefined header.

    `NS` has conflicting definitions in two files and is resolved to Float.
    `DP` is only present in the inferred headers, so it is expected in the
    undefined-headers section of the report.
    """
    delimiter = preprocess_reporter._DELIMITER

    header_definitions = merge_header_definitions.VcfHeaderDefinitions()
    header_definitions._infos = {'NS': {Definition(1, 'Float'): ['file1'],
                                        Definition(1, 'Integer'): ['file2']}}

    ns_infos = OrderedDict([
        ('NS', Info('NS', 1, 'Float', 'Number samples', None, None))])
    dp_formats = OrderedDict([
        ('DP', Format('DP', 2, 'Float', 'Total Depth'))])
    resolved_headers = VcfHeader(infos=ns_infos, formats=dp_formats)
    inferred_headers = VcfHeader(formats=dp_formats)

    conflict_section = [
        preprocess_reporter._InconsistencyType.HEADER_CONFLICTS + '\n',
        preprocess_reporter._HeaderLine.CONFLICTS_HEADER + '\n',
        delimiter.join([
            'NS', 'INFO', 'num=1 type=Float', 'file1', 'num=1 type=Float\n']),
        delimiter.join([
            ' ', ' ', 'num=1 type=Integer', 'file2', ' \n']),
        '\n',
    ]
    undefined_section = [
        preprocess_reporter._InconsistencyType.UNDEFINED_HEADERS + '\n',
        preprocess_reporter._HeaderLine.UNDEFINED_FIELD_HEADER + '\n',
        delimiter.join(['DP', 'FORMAT', 'num=2 type=Float\n']),
        '\n',
    ]
    expected = conflict_section + undefined_section

    self._generate_report_and_assert_contents_equal(
        expected, header_definitions, resolved_headers, inferred_headers)
    def test_info_header_fields(self):
        """Checks the schema generated from a header with only INFO fields.

        Covers the default factory (the `A`-count infos are expected among
        the alternate-base fields), the factory with
        `split_alternate_allele_info_fields=False` (all infos expected as
        info fields), and the BigQuery type/mode of each info field. `END`
        is expected to be left out of the schema in both cases.
        """
        infos = OrderedDict([
            ('I1', Info('I1', 1, 'String', 'desc', 'src', 'v')),
            ('I2', Info('I2', 2, 'Integer', 'desc', 'src', 'v')),
            ('IA', Info('IA', field_counts['A'], 'Float', 'desc', 'src', 'v')),
            ('IU', Info('IU', field_counts['.'], 'Character', 'desc', 'src',
                        'v')),
            ('IG', Info('IG', field_counts['G'], 'String', 'desc', 'src',
                        'v')),
            ('I0', Info('I0', 0, 'Flag', 'desc', 'src', 'v')),
            ('IA2', Info('IA2', field_counts['A'], 'Float', 'desc', 'src',
                         'v')),
            # END should not be included in the generated schema.
            ('END', Info('END', 1, 'Integer', 'Special END key', 'src', 'v')),
        ])
        header_fields = vcf_header_parser.HeaderFields(infos, {})

        # Default factory settings.
        expected_split_fields = self._generate_expected_fields(
            alt_fields=['IA', 'IA2'],
            info_fields=['I1', 'I2', 'IU', 'IG', 'I0'])
        split_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
            header_fields,
            processed_variant.ProcessedVariantFactory(header_fields))
        self._assert_fields_equal(expected_split_fields, split_schema)

        # Test with split_alternate_allele_info_fields=False.
        unsplit_factory = processed_variant.ProcessedVariantFactory(
            header_fields, split_alternate_allele_info_fields=False)
        actual_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
            header_fields, unsplit_factory)
        self._assert_fields_equal(
            self._generate_expected_fields(
                info_fields=['I1', 'I2', 'IA', 'IU', 'IG', 'I0', 'IA2']),
            actual_schema)

        # Verify types and modes.
        nullable = TableFieldConstants.MODE_NULLABLE
        repeated = TableFieldConstants.MODE_REPEATED
        expected_type_modes = {
            'I1': (TableFieldConstants.TYPE_STRING, nullable),
            'I2': (TableFieldConstants.TYPE_INTEGER, repeated),
            'IA': (TableFieldConstants.TYPE_FLOAT, repeated),
            'IU': (TableFieldConstants.TYPE_STRING, repeated),
            'IG': (TableFieldConstants.TYPE_STRING, repeated),
            'I0': (TableFieldConstants.TYPE_BOOLEAN, nullable),
            'IA2': (TableFieldConstants.TYPE_FLOAT, repeated),
        }
        # Only schema fields with an entry above are checked.
        for schema_field in actual_schema.fields:
            if schema_field.name not in expected_type_modes:
                continue
            wanted_type, wanted_mode = expected_type_modes[schema_field.name]
            self.assertEqual(wanted_type, schema_field.type)
            self.assertEqual(wanted_mode, schema_field.mode)
 def test_variant_merger_modify_schema(self):
     """Checks that a variant merger strategy can extend the schema.

     With `_DummyVariantMergeStrategy` passed as `variant_merger`, the
     generated schema is expected to contain the extra info field
     `ADDED_BY_MERGER` next to the fields derived from the header.
     """
     header_infos = OrderedDict([
         ('I1', Info('I1', 1, 'String', 'desc', 'src', 'v')),
         ('IA', Info('IA', field_counts['A'], 'Integer', 'desc', 'src',
                     'v')),
     ])
     header_formats = OrderedDict([
         ('F1', Format('F1', 1, 'String', 'desc')),
     ])
     header_fields = vcf_header_parser.HeaderFields(header_infos,
                                                    header_formats)

     expected_fields = self._generate_expected_fields(
         alt_fields=['IA'],
         call_fields=['F1'],
         info_fields=['I1', 'ADDED_BY_MERGER'])
     merged_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
         header_fields,
         processed_variant.ProcessedVariantFactory(header_fields),
         variant_merger=_DummyVariantMergeStrategy())
     self._assert_fields_equal(expected_fields, merged_schema)
示例#14
0
    def _get_sample_header_fields(self, with_annotation=False):
        """Provides a simple `VcfHeader` with info and format fields.

        Each `Info`/`Format` id matches the key it is stored under. `IS` and
        `IB` previously both carried the copy-pasted id `I1`.

        Args:
          with_annotation: Can be bool or list of tuples. Any truthy value
            adds a `CSQ` info whose description lists the annotation names
            `Allele|Gene|Position|Score`. Tuples should be additional
            annotation fields in the format (key, `Info`).

        Returns:
          A `vcf_header_io.VcfHeader` built from the sample infos and formats.
        """
        infos = OrderedDict([
            ('IS', Info('IS', 1, 'String', 'desc', 'src', 'v')),
            # NOTE(review): 'Int' is not one of the VCF type names
            # ('Integer' is); kept unchanged - confirm whether intentional.
            ('ISI', Info('ISI', 1, 'Int', 'desc', 'src', 'v')),
            ('ISF', Info('ISF', 1, 'Float', 'desc', 'src', 'v')),
            ('IF', Info('IF', 1, 'Float', 'desc', 'src', 'v')),
            ('IB', Info('IB', 1, 'Flag', 'desc', 'src', 'v')),
            ('IA', Info('IA', field_counts['A'], 'Integer', 'desc', 'src',
                        'v'))
        ])
        if with_annotation:
            infos['CSQ'] = Info(
                'CSQ', field_counts['.'], 'String',
                'Annotations from VEP. Format: Allele|Gene|Position|Score',
                'src', 'v')
            if isinstance(with_annotation, list):
                for key, value in with_annotation:
                    infos[key] = value
        formats = OrderedDict([
            ('FS', Format('FS', 1, 'String', 'desc')),
            ('FI', Format('FI', 2, 'Integer', 'desc')),
            ('FU', Format('FU', field_counts['.'], 'Float', 'desc')),
            ('GT', Format('GT', 2, 'Integer', 'Special GT key')),
            ('PS', Format('PS', 1, 'Integer', 'Special PS key'))
        ])
        return vcf_header_io.VcfHeader(infos=infos, formats=formats)
  def test_pipeline(self):
    """Runs `InferHeaderFields` end to end with mismatching definitions.

    The defined header has no `IF` info and no `FU` format, and defines `IA`
    and `FI` as Integer. The expected output contains the two missing fields
    plus corrected `IA` and `FI` definitions.
    """
    defined_infos = {'IS': Info('IS', 1, 'String', '', '', ''),
                     'ISI': Info('ISI', 1, 'Integer', '', '', ''),
                     'ISF': Info('ISF', 1, 'Float', '', '', ''),
                     'IB': Info('IB', 0, 'Flag', '', '', ''),
                     'IA': Info('IA', -1, 'Integer', '', '', '')}
    defined_formats = OrderedDict([
        ('FS', Format('FS', 1, 'String', 'desc')),
        ('FI', Format('FI', 2, 'Integer', 'desc')),
        ('GT', Format('GT', 2, 'Integer', 'Special GT key')),
        ('PS', Format('PS', 1, 'Integer', 'Special PS key'))])

    with TestPipeline() as pipeline:
      sample_variants = [
          self._get_sample_variant_info_ia_cardinality_mismatch(),
          self._get_sample_variant_format_fi_float_value(),
      ]
      defined_header = vcf_header_io.VcfHeader(infos=defined_infos,
                                               formats=defined_formats)
      inferred_headers = (
          pipeline
          | Create(sample_variants)
          | 'InferHeaderFields' >>
          infer_headers.InferHeaderFields(
              defined_headers=defined_header,
              allow_incompatible_records=True))

      expected_header = vcf_header_io.VcfHeader(
          infos={'IA': Info('IA', None, 'Float', '', '', ''),
                 'IF': Info('IF', 1, 'Float', '', '', '')},
          formats={'FI': Format('FI', 2, 'Float', 'desc'),
                   'FU': Format('FU', None, 'Float', '')})
      assert_that(inferred_headers,
                  asserts.header_fields_equal_ignore_order([expected_header]))
      # NOTE(review): the `with` block presumably runs the pipeline on exit
      # too, which would make this explicit run redundant - confirm.
      pipeline.run()
 def test_info_and_format_header_fields(self):
     """Checks the schema generated from both INFO and FORMAT headers.

     `GT` and `PS` are defined as formats but are not expected among the
     generated call fields.
     """
     header_infos = OrderedDict([
         ('I1', Info('I1', 1, 'String', 'desc', 'src', 'v')),
         ('IA', Info('IA', field_counts['A'], 'Integer', 'desc', 'src',
                     'v')),
     ])
     # GT and PS should not be set as they're already included in special
     # 'genotype' and 'phaseset' fields.
     header_formats = OrderedDict([
         ('F1', Format('F1', 1, 'String', 'desc')),
         ('F2', Format('F2', 2, 'Integer', 'desc')),
         ('FU', Format('FU', field_counts['.'], 'Float', 'desc')),
         ('GT', Format('GT', 2, 'Integer', 'Special GT key')),
         ('PS', Format('PS', 1, 'Integer', 'Special PS key')),
     ])
     header_fields = vcf_header_parser.HeaderFields(header_infos,
                                                    header_formats)

     expected_fields = self._generate_expected_fields(
         alt_fields=['IA'],
         call_fields=['F1', 'F2', 'FU'],
         info_fields=['I1'])
     generated_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
         header_fields,
         processed_variant.ProcessedVariantFactory(header_fields))
     self._assert_fields_equal(expected_fields, generated_schema)
  def test_report_no_conflicts(self):
    """Checks the report when every header has a single definition.

    `NS` has exactly one definition as an info and one as a format, so the
    report is expected to contain only the no-conflicts message.
    """
    header_definitions = merge_header_definitions.VcfHeaderDefinitions()
    header_definitions._infos = {'NS': {Definition(1, 'Float'): ['file1']}}
    header_definitions._formats = {'NS': {Definition(1, 'Float'): ['file2']}}

    resolved_infos = OrderedDict([
        ('NS', Info('NS', 1, 'Integer', 'Number samples', None, None))])
    resolved_formats = OrderedDict([
        ('NS', Format('NS', 1, 'Float', 'Number samples'))])
    resolved_headers = VcfHeader(infos=resolved_infos,
                                 formats=resolved_formats)

    self._generate_report_and_assert_contents_equal(
        ['No Header Conflicts Found.\n', '\n'],
        header_definitions,
        resolved_headers)
 def test_infer_info_fields_combined_conflicts(self):
   """Checks inference with an undefined field and a mismatched one.

   The defined header has no `IF` and defines `IA` as Integer with count -1.
   Both an inferred `IF` and a corrected `IA` are expected.
   """
   sample_variant = self._get_sample_variant_info_ia_cardinality_mismatch()
   defined_infos = {'IS': Info('IS', 1, 'String', '', '', ''),
                    'ISI': Info('ISI', 1, 'Integer', '', '', ''),
                    'ISF': Info('ISF', 1, 'Float', '', '', ''),
                    'IB': Info('IB', 0, 'Flag', '', '', ''),
                    'IA': Info('IA', -1, 'Integer', '', '', '')}
   inferrer = infer_headers._InferHeaderFields()
   actual_infos = inferrer._infer_info_fields(
       sample_variant, vcf_header_io.VcfHeader(infos=defined_infos))
   self.assertEqual(
       {'IF': Info('IF', 1, 'Float', '', '', ''),
        'IA': Info('IA', None, 'Float', '', '', '')},
       actual_infos)
 def test_infer_mismatched_info_field_correct_num(self):
   """Checks that a count mismatch on `IA` corrects the count to `None`.

   `IA` is defined as Float with count -1; the corrected definition is
   expected to keep the Float type.
   """
   sample_variant = self._get_sample_variant_info_ia_cardinality_mismatch()
   defined_infos = {'IS': Info('IS', 1, 'String', '', '', ''),
                    'ISI': Info('ISI', 1, 'Integer', '', '', ''),
                    'ISF': Info('ISF', 1, 'Float', '', '', ''),
                    'IF': Info('IF', 1, 'Float', '', '', ''),
                    'IB': Info('IB', 0, 'Flag', '', '', ''),
                    'IA': Info('IA', -1, 'Float', '', '', '')}
   inferrer = infer_headers._InferHeaderFields()
   actual_info = inferrer._infer_mismatched_info_field(
       'IA', sample_variant.info.get('IA'),
       vcf_header_io.VcfHeader(infos=defined_infos).infos.get('IA'),
       len(sample_variant.alternate_bases))
   self.assertEqual(Info('IA', None, 'Float', '', '', ''), actual_info)
 def test_infer_info_fields_no_conflicts(self):
   """Checks that nothing is inferred for the first sample variant.

   The header below defines six info fields, and `_infer_info_fields` is
   expected to return an empty dict for `_get_sample_variant_1()`.
   """
   sample_variant = self._get_sample_variant_1()
   defined_infos = {'IS': Info('IS', 1, 'String', '', '', ''),
                    'ISI': Info('ISI', 1, 'Integer', '', '', ''),
                    'ISF': Info('ISF', 1, 'Float', '', '', ''),
                    'IF': Info('IF', 1, 'Float', '', '', ''),
                    'IB': Info('IB', 0, 'Flag', '', '', ''),
                    'IA': Info('IA', -1, 'Float', '', '', '')}
   inferrer = infer_headers._InferHeaderFields()
   actual_infos = inferrer._infer_info_fields(
       sample_variant, vcf_header_io.VcfHeader(infos=defined_infos))
   self.assertEqual({}, actual_infos)
  def _infer_info_fields(self, variant, defined_headers):
    # type: (vcfio.Variant, vcf_header_io.VcfHeader) -> Dict[str, Info]
    """Returns inferred info fields.

    Two types of info fields are inferred:
    - The info fields are undefined in the headers.
    - The info fields' definitions provided by the header does not match the
      field value.
    A warning is logged for every inferred field.

    Args:
      variant: variant obj.
      defined_headers: header fields defined in header section of VCF files.
        May be falsy, in which case every info field is treated as undefined.

    Returns:
      A dict of (info_key(str), :class:`Info`) for any info field in `variant`
      that is not defined in the header or the definition mismatches the field
      values.

    Raises:
      ValueError: if the same undefined info key is encountered twice.
    """
    infos = {}
    # `items()` exists on both Python 2 and Python 3 mappings, whereas
    # `iteritems()` is Python 2 only.
    for info_field_key, info_field_value in variant.info.items():
      if not defined_headers or info_field_key not in defined_headers.infos:
        if info_field_key in infos:
          raise ValueError(
              'Duplicate INFO field "{}" in variant "{}"'.format(
                  info_field_key, variant))
        logging.warning('Undefined INFO field "%s" in variant "%s"',
                        info_field_key, str(variant))
        infos[info_field_key] = Info(info_field_key,
                                     self._get_field_count(info_field_value),
                                     self._get_field_type(info_field_value),
                                     '',  # NO_DESCRIPTION
                                     '',  # UNKNOWN_SOURCE
                                     '')  # UNKNOWN_VERSION
      else:
        defined_header = defined_headers.infos.get(info_field_key)
        corrected_info = self._infer_mismatched_info_field(
            info_field_key, info_field_value,
            defined_header, len(variant.alternate_bases))
        # A falsy result means the definition already matches the value.
        if corrected_info:
          logging.warning(
              'Incorrect INFO field "%s". Defined as "type=%s,num=%s", '
              'got "%s", in variant "%s"',
              info_field_key, defined_header.get(_HeaderKeyConstants.TYPE),
              str(defined_header.get(_HeaderKeyConstants.NUM)),
              str(info_field_value), str(variant))
          infos[info_field_key] = corrected_info
    return infos
 def test_infer_mismatched_info_field_no_mismatches(self):
   """Checks that no correction is returned when `IA` matches its header.

   `_infer_mismatched_info_field` is expected to return `None` here.
   """
   sample_variant = self._get_sample_variant_info_ia_float_2_0_in_list()
   # NOTE(review): `IA` uses the string 'A' as its count here, while sibling
   # tests use -1 - confirm this is intentional.
   defined_infos = {'IS': Info('IS', 1, 'String', '', '', ''),
                    'ISI': Info('ISI', 1, 'Integer', '', '', ''),
                    'ISF': Info('ISF', 1, 'Float', '', '', ''),
                    'IF': Info('IF', 1, 'Float', '', '', ''),
                    'IB': Info('IB', 0, 'Flag', '', '', ''),
                    'IA': Info('IA', 'A', 'Integer', '', '', '')}
   inferrer = infer_headers._InferHeaderFields()
   actual_info = inferrer._infer_mismatched_info_field(
       'IA', sample_variant.info.get('IA'),
       vcf_header_io.VcfHeader(infos=defined_infos).infos.get('IA'),
       len(sample_variant.alternate_bases))
   self.assertEqual(None, actual_info)
  def test_defined_fields_filtered_two_variants(self):
    """Checks that fields already defined in the header are not re-inferred.

    The sample header is passed to the transform as a singleton side input.
    Only `IS_2` and `FI_2` are expected in the inferred header.
    """
    # Only INFO and FORMAT in the first variants are already defined in the
    # header section of the VCF files.
    with TestPipeline() as pipeline:
      defined_header = self._get_sample_header_fields()
      header_side_input = pipeline | 'vcf_header' >> Create([defined_header])
      sample_variants = [
          self._get_sample_variant_1(),
          self._get_sample_variant_2(),
      ]
      inferred_headers = (
          pipeline
          | Create(sample_variants)
          | 'InferUndefinedHeaderFields' >>
          infer_undefined_headers.InferUndefinedHeaderFields(
              pvalue.AsSingleton(header_side_input)))

      expected_header = vcf_header_io.VcfHeader(
          infos={'IS_2': Info('IS_2', 1, 'String', '', '', '')},
          formats={'FI_2': Format('FI_2', 1, 'String', '')})
      assert_that(inferred_headers, equal_to([expected_header]))
      # NOTE(review): the `with` block presumably runs the pipeline on exit
      # too, which would make this explicit run redundant - confirm.
      pipeline.run()
def _infer_mismatched_info_field(
        field_key,  # type: str
        field_value,  # type: Any
        defined_header,  # type: Dict
        num_alternate_bases  # type: int
):
    # type: (...) -> Optional[Info]
    """Returns a corrected `Info` when the value disagrees with its header.

    Two mismatches are handled:
    - Defined num is `A`, but the number of provided values differs from the
      number of alternate bases. The num is replaced by `field_counts['.']`.
    - The type returned by `_get_corrected_type` for the defined type and
      the value differs from the defined type (e.g. defined `Integer`, but
      the provided value is float). That type is used instead.

    Args:
      field_key: the info field key.
      field_value: the value of the field key given in the variant.
      defined_header: The definition of `field_key` in the header.
      num_alternate_bases: number of the alternate bases.

    Returns:
      The corrected info definition, carrying over the description, source
      and version of `defined_header`, or `None` if nothing had to change.
    """
    defined_num = defined_header.get(_HeaderKeyConstants.NUM)
    defined_type = defined_header.get(_HeaderKeyConstants.TYPE)

    num = defined_num
    if defined_num == field_counts[_FIELD_COUNT_ALTERNATE_ALLELE]:
        # One value per alternate base is expected for this count.
        if len(field_value) != num_alternate_bases:
            num = field_counts['.']

    inferred_type = _get_corrected_type(defined_type, field_value)

    if inferred_type == defined_type and num == defined_num:
        return None
    return Info(field_key, num, inferred_type,
                defined_header.get(_HeaderKeyConstants.DESC),
                defined_header.get(_HeaderKeyConstants.SOURCE),
                defined_header.get(_HeaderKeyConstants.VERSION))