예제 #1
0
    def test_header_fields_inferred_one_variant(self):
        """Checks the header inferred from a single variant.

        No headers are predefined (`defined_headers=None`), and the pipeline
        output is expected to be exactly one `VcfHeader` holding the INFO and
        FORMAT fields listed below.
        """
        info_specs = (
            ('IS', 1, 'String'),
            ('ISI', 1, 'Integer'),
            ('ISF', 1, 'Float'),
            ('IF', 1, 'Float'),
            ('IB', 0, 'Flag'),
            ('IA', None, 'Integer'),
        )
        format_specs = (
            ('FI', 1, 'Integer'),
            ('FU', None, 'Float'),
        )
        # Expected fields carry empty description/source/version strings.
        expected_header = vcf_header_io.VcfHeader(
            infos={key: Info(key, num, kind, '', '', '')
                   for key, num, kind in info_specs},
            formats={key: Format(key, num, kind, '')
                     for key, num, kind in format_specs})

        with TestPipeline() as p:
            sample = self._get_sample_variant_1()
            variants = p | Create([sample])
            inferred_headers = (
                variants
                | 'InferHeaderFields' >> infer_headers.InferHeaderFields(
                    defined_headers=None, infer_headers=True))
            assert_that(inferred_headers, equal_to([expected_header]))
            p.run()
 def test_bigquery_field_name_sanitize(self):
     """Checks that header keys are sanitized in the generated schema.

     The expected schema uses 'field__', 'field__A' and 'field_0a' for the
     keys '_', '_A' and '0a', and has underscores in place of the '-', '*'
     and '^' characters of 'A-B*C', 'I-A' and 'a^b'. The keys 'OK_info_09'
     and 'OK_format_09' are expected to come through unchanged.
     """
     infos = OrderedDict([
         ('_', Info('_', 1, 'String', 'desc', 'src', 'v')),
         ('_A', Info('_A', 1, 'String', 'desc', 'src', 'v')),
         ('0a', Info('0a', 1, 'String', 'desc', 'src', 'v')),
         ('A-B*C', Info('A-B*C', 1, 'String', 'desc', 'src', 'v')),
         ('I-A', Info('I-A', field_counts['A'], 'Float', 'desc', 'src',
                      'v')),
         # Built with `Info` (not `Format`) so every INFO entry has the same
         # record type as its neighbours.
         ('OK_info_09', Info('OK_info_09', 1, 'String', 'desc', 'src', 'v'))
     ])
     formats = OrderedDict([('a^b', Format('a^b', 1, 'String', 'desc')),
                            ('OK_format_09',
                             Format('OK_format_09', 1, 'String', 'desc'))])
     header_fields = vcf_header_parser.HeaderFields(infos, formats)
     self._assert_fields_equal(
         self._generate_expected_fields(alt_fields=['I_A'],
                                        call_fields=['a_b', 'OK_format_09'],
                                        info_fields=[
                                            'field__', 'field__A',
                                            'field_0a', 'A_B_C',
                                            'OK_info_09'
                                        ]),
         bigquery_vcf_schema.generate_schema_from_header_fields(
             header_fields,
             processed_variant.ProcessedVariantFactory(header_fields)))
예제 #3
0
    def _get_sample_header_fields(self, with_annotation=False):
        """Builds a small `VcfHeader` with sample INFO and FORMAT fields.

        Args:
          with_annotation: Either a bool or a list of tuples. Any truthy
            value adds a 'CSQ' info field; a list additionally contributes
            its (key, `Info`) tuples as extra info fields.

        Returns:
          A `vcf_header_io.VcfHeader` built from the sample fields.
        """
        formats = OrderedDict()
        formats['FS'] = Format('FS', 1, 'String', 'desc')
        formats['FI'] = Format('FI', 2, 'Integer', 'desc')
        formats['FU'] = Format('FU', field_counts['.'], 'Float', 'desc')
        formats['GT'] = Format('GT', 2, 'Integer', 'Special GT key')
        formats['PS'] = Format('PS', 1, 'Integer', 'Special PS key')

        # NOTE(review): 'IS' and 'IB' carry the id 'I1' and 'ISI' uses the
        # type 'Int'; kept exactly as found -- confirm whether intentional.
        infos = OrderedDict()
        infos['IS'] = Info('I1', 1, 'String', 'desc', 'src', 'v')
        infos['ISI'] = Info('ISI', 1, 'Int', 'desc', 'src', 'v')
        infos['ISF'] = Info('ISF', 1, 'Float', 'desc', 'src', 'v')
        infos['IF'] = Info('IF', 1, 'Float', 'desc', 'src', 'v')
        infos['IB'] = Info('I1', 1, 'Flag', 'desc', 'src', 'v')
        infos['IA'] = Info('IA', field_counts['A'], 'Integer', 'desc', 'src',
                           'v')

        if with_annotation:
            csq_description = (
                'Annotations from VEP. Format: Allele|Gene|Position|Score')
            infos['CSQ'] = Info('CSQ', field_counts['.'], 'String',
                                csq_description, 'src', 'v')
            if isinstance(with_annotation, list):
                # Caller-supplied (key, `Info`) pairs are added last.
                for extra_key, extra_info in with_annotation:
                    infos[extra_key] = extra_info

        return vcf_header_io.VcfHeader(infos=infos, formats=formats)
  def test_header_fields_inferred_from_two_variants(self):
    """Checks the header inferred from two variants with no defined headers.

    The pipeline output is compared against a single expected `VcfHeader`
    using `asserts.header_fields_equal_ignore_order`.
    """
    info_specs = (
        ('IS', 1, 'String'),
        ('ISI', 1, 'Integer'),
        ('ISF', 1, 'Float'),
        ('IF', 1, 'Float'),
        ('IB', 0, 'Flag'),
        ('IA', None, 'Float'),
        ('IS_2', 1, 'String'),
    )
    format_specs = (
        ('FI', 1, 'Integer'),
        ('FU', None, 'Float'),
        ('FI_2', 1, 'Integer'),
    )
    # Expected fields carry empty description/source/version strings.
    expected = vcf_header_io.VcfHeader(
        infos={key: Info(key, num, kind, '', '', '')
               for key, num, kind in info_specs},
        formats={key: Format(key, num, kind, '')
                 for key, num, kind in format_specs})

    with TestPipeline() as p:
      samples = [self._get_sample_variant_1(), self._get_sample_variant_2()]
      inferred_headers = (
          p
          | Create(samples)
          | 'InferUndefinedHeaderFields' >>
          infer_undefined_headers.InferUndefinedHeaderFields(
              defined_headers=None))
      assert_that(inferred_headers,
                  asserts.header_fields_equal_ignore_order([expected]))
      p.run()
 def _get_sample_header_fields(self):
   """Builds a small `VcfHeader` with three INFO and five FORMAT fields.

   Returns:
     A `vcf_header_io.VcfHeader` with infos 'IS', 'IF', 'IA' and formats
     'FS', 'FI', 'FU', 'GT', 'PS'.
   """
   # NOTE(review): 'IS' and 'IF' both carry the id 'I1'; kept exactly as
   # found -- confirm whether intentional.
   infos = OrderedDict()
   infos['IS'] = Info('I1', 1, 'String', 'desc', 'src', 'v')
   infos['IF'] = Info('I1', 1, 'Flag', 'desc', 'src', 'v')
   infos['IA'] = Info('IA', field_counts['A'], 'Integer', 'desc', 'src', 'v')

   formats = OrderedDict()
   formats['FS'] = Format('FS', 1, 'String', 'desc')
   formats['FI'] = Format('FI', 2, 'Integer', 'desc')
   formats['FU'] = Format('FU', field_counts['.'], 'Float', 'desc')
   formats['GT'] = Format('GT', 2, 'Integer', 'Special GT key')
   formats['PS'] = Format('PS', 1, 'Integer', 'Special PS key')

   return vcf_header_io.VcfHeader(infos=infos, formats=formats)
 def test_infer_format_fields_no_conflicts(self):
   """Expects an empty result when 'FS', 'FI', 'FU', 'GT', 'PS' are defined.

   Runs `_infer_format_fields` on the first sample variant against a header
   that defines those five FORMAT fields and asserts it returns `{}`.
   """
   sample = self._get_sample_variant_1()

   defined_formats = OrderedDict()
   defined_formats['FS'] = Format('FS', 1, 'String', 'desc')
   defined_formats['FI'] = Format('FI', 2, 'Integer', 'desc')
   defined_formats['FU'] = Format('FU', field_counts['.'], 'Float', 'desc')
   defined_formats['GT'] = Format('GT', 2, 'Integer', 'Special GT key')
   defined_formats['PS'] = Format('PS', 1, 'Integer', 'Special PS key')

   inferrer = infer_headers._InferHeaderFields()
   defined_header = vcf_header_io.VcfHeader(formats=defined_formats)
   inferred = inferrer._infer_format_fields(sample, defined_header)
   self.assertEqual({}, inferred)
  def test_report_conflicted_and_inferred_headers(self):
    """Checks a report containing both a conflict and an undefined header.

    'NS' is registered with two INFO definitions (Float in 'file1', Integer
    in 'file2') and 'DP' is supplied as an inferred FORMAT field. The
    expected report lists the conflict section first, then the undefined
    header section.
    """
    delimiter = preprocess_reporter._DELIMITER

    header_definitions = merge_header_definitions.VcfHeaderDefinitions()
    header_definitions._infos = {
        'NS': {
            Definition(1, 'Float'): ['file1'],
            Definition(1, 'Integer'): ['file2'],
        }
    }

    ns_infos = OrderedDict()
    ns_infos['NS'] = Info('NS', 1, 'Float', 'Number samples', None, None)
    dp_formats = OrderedDict()
    dp_formats['DP'] = Format('DP', 2, 'Float', 'Total Depth')
    resolved_headers = VcfHeader(infos=ns_infos, formats=dp_formats)
    inferred_headers = VcfHeader(formats=dp_formats)

    conflict_section = [
        preprocess_reporter._InconsistencyType.HEADER_CONFLICTS + '\n',
        preprocess_reporter._HeaderLine.CONFLICTS_HEADER + '\n',
        delimiter.join(
            ['NS', 'INFO', 'num=1 type=Float', 'file1', 'num=1 type=Float\n']),
        # Second definition row: the first two and the last columns are
        # single spaces.
        delimiter.join([' ', ' ', 'num=1 type=Integer', 'file2', ' \n']),
        '\n',
    ]
    undefined_section = [
        preprocess_reporter._InconsistencyType.UNDEFINED_HEADERS + '\n',
        preprocess_reporter._HeaderLine.UNDEFINED_FIELD_HEADER + '\n',
        delimiter.join(['DP', 'FORMAT', 'num=2 type=Float\n']),
        '\n',
    ]
    expected = conflict_section + undefined_section

    self._generate_report_and_assert_contents_equal(
        expected, header_definitions, resolved_headers, inferred_headers)
  def _infer_mismatched_format_field(self,
                                     field_key,  # type: str
                                     field_value,  # type: Any
                                     defined_header  # type: Dict
                                    ):
    # type: (...) -> Optional[Format]
    """Returns a corrected `Format` when the defined type needs adjusting.

    The defined type is passed with `field_value` to
    `self._get_corrected_type`. The original author describes the handled
    mismatch as: the defined type is `Integer` but the provided value is a
    float, so the type is corrected to `Float`.

    Args:
      field_key: the format field key.
      field_value: the value of the field key given in the variant.
      defined_header: The definition of `field_key` in the header.

    Returns:
      A `Format` carrying the corrected type together with the defined
      number and description, or `None` when the corrected type equals the
      defined type.
    """
    defined_type = defined_header.get(_HeaderKeyConstants.TYPE)
    corrected_type = self._get_corrected_type(defined_type, field_value)
    if corrected_type == defined_type:
      # Definition already agrees with the value; nothing to correct.
      return None
    defined_num = defined_header.get(_HeaderKeyConstants.NUM)
    defined_desc = defined_header.get(_HeaderKeyConstants.DESC)
    return Format(field_key, defined_num, corrected_type, defined_desc)
예제 #9
0
    def _infer_undefined_format_fields(self, variant, defined_headers):
        """Returns format fields not defined in the headers.

    Args:
      variant (:class:`vcfio.Variant`): variant obj.
      defined_headers (:class:`vcf_header_io.VcfHeader`): header fields defined
        in header section of VCF files.
    Returns:
      A dict of (format_key(str), :class:`Format`) for any format key in
      `variant` that is not defined in the header.
    """
        formats = {}
        for call in variant.calls:
            for format_key, format_value in call.info.iteritems():
                if not defined_headers or format_key not in defined_headers.formats:
                    if format_key in formats:
                        raise ValueError(
                            'Invalid VCF file. Duplicate FORMAT field in variant {}'
                            .format(variant))
                    formats[format_key] = Format(
                        format_key, self._get_field_count(format_value),
                        self._get_field_type(format_value),
                        '')  # NO_DESCRIPTION
            # No point in proceeding. All other calls have the same FORMAT.
            break
        return formats
  def test_pipeline(self):
    """Runs `InferHeaderFields` with defined headers and two variants.

    The defined headers declare 'IA' with count -1 and 'FI' as 'Integer',
    and do not declare 'IF' or 'FU'. The expected output holds 'IA' as
    'Float' with count `None`, 'FI' as 'Float', plus 'IF' and 'FU'.
    """
    defined_info_specs = (
        ('IS', 1, 'String'),
        ('ISI', 1, 'Integer'),
        ('ISF', 1, 'Float'),
        ('IB', 0, 'Flag'),
        ('IA', -1, 'Integer'),
    )
    infos = {key: Info(key, num, kind, '', '', '')
             for key, num, kind in defined_info_specs}

    formats = OrderedDict()
    formats['FS'] = Format('FS', 1, 'String', 'desc')
    formats['FI'] = Format('FI', 2, 'Integer', 'desc')
    formats['GT'] = Format('GT', 2, 'Integer', 'Special GT key')
    formats['PS'] = Format('PS', 1, 'Integer', 'Special PS key')

    defined_headers = vcf_header_io.VcfHeader(infos=infos, formats=formats)

    expected = vcf_header_io.VcfHeader(
        infos={
            'IA': Info('IA', None, 'Float', '', '', ''),
            'IF': Info('IF', 1, 'Float', '', '', ''),
        },
        formats={
            'FI': Format('FI', 2, 'Float', 'desc'),
            'FU': Format('FU', None, 'Float', ''),
        })

    with TestPipeline() as p:
      samples = [
          self._get_sample_variant_info_ia_cardinality_mismatch(),
          self._get_sample_variant_format_fi_float_value(),
      ]
      inferred_headers = (
          p
          | Create(samples)
          | 'InferHeaderFields' >>
          infer_headers.InferHeaderFields(
              defined_headers=defined_headers,
              allow_incompatible_records=True))
      assert_that(inferred_headers,
                  asserts.header_fields_equal_ignore_order([expected]))
      p.run()
 def test_info_and_format_header_fields(self):
     """Checks the schema generated from INFO and FORMAT header fields.

     Expects 'IA' among the alt fields, 'I1' among the info fields and
     'F1', 'F2', 'FU' among the call fields.
     """
     infos = OrderedDict()
     infos['I1'] = Info('I1', 1, 'String', 'desc', 'src', 'v')
     infos['IA'] = Info('IA', field_counts['A'], 'Integer', 'desc', 'src',
                        'v')

     # GT and PS should not be set as they're already included in special
     # 'genotype' and 'phaseset' fields.
     formats = OrderedDict()
     formats['F1'] = Format('F1', 1, 'String', 'desc')
     formats['F2'] = Format('F2', 2, 'Integer', 'desc')
     formats['FU'] = Format('FU', field_counts['.'], 'Float', 'desc')
     formats['GT'] = Format('GT', 2, 'Integer', 'Special GT key')
     formats['PS'] = Format('PS', 1, 'Integer', 'Special PS key')

     header_fields = vcf_header_parser.HeaderFields(infos, formats)
     expected_fields = self._generate_expected_fields(
         alt_fields=['IA'],
         call_fields=['F1', 'F2', 'FU'],
         info_fields=['I1'])
     actual_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
         header_fields,
         processed_variant.ProcessedVariantFactory(header_fields))
     self._assert_fields_equal(expected_fields, actual_schema)
  def test_report_no_conflicts(self):
    """Expects the 'No Header Conflicts Found.' report.

    'NS' has exactly one INFO definition and one FORMAT definition, and no
    inferred headers are passed to the report helper.
    """
    ns_definition = Definition(1, 'Float')
    header_definitions = merge_header_definitions.VcfHeaderDefinitions()
    header_definitions._infos = {'NS': {ns_definition: ['file1']}}
    header_definitions._formats = {'NS': {Definition(1, 'Float'): ['file2']}}

    infos = OrderedDict()
    infos['NS'] = Info('NS', 1, 'Integer', 'Number samples', None, None)
    formats = OrderedDict()
    formats['NS'] = Format('NS', 1, 'Float', 'Number samples')
    resolved_headers = VcfHeader(infos=infos, formats=formats)

    expected = ['No Header Conflicts Found.\n', '\n']
    self._generate_report_and_assert_contents_equal(
        expected, header_definitions, resolved_headers)
def infer_format_fields(variant, defined_headers):
    # type: (vcfio.Variant, vcf_header_io.VcfHeader) -> Dict[str, Format]
    """Returns inferred format fields.

    Two types of format fields are inferred:
    - The format fields are undefined in the headers. Only the first call of
      `variant` is inspected for these.
    - The format definition provided by the headers does not match the field
      values. Every call of `variant` is inspected for these.

    A warning is logged for each inferred field.

    Args:
      variant: variant object
      defined_headers: header fields defined in header section of VCF files.
        A falsy value means that every format key is treated as undefined.

    Returns:
      A dict of (format_key, `Format`) for any format key in
      `variant` that is not defined in the header or the definition mismatches
      the field values.

    Raises:
      ValueError: if an undefined format key is encountered twice.
    """
    formats = {}
    for call in variant.calls:
        # `items()` works on both Python 2 and 3, whereas `iteritems()` only
        # exists on Python 2 dicts.
        for format_key, format_value in call.info.items():
            if not defined_headers or format_key not in defined_headers.formats:
                if format_key in formats:
                    raise ValueError(
                        'Duplicate FORMAT field "{}" in variant "{}"'.format(
                            format_key, variant))
                logging.warning('Undefined FORMAT field "%s" in variant "%s"',
                                format_key, str(variant))
                formats[format_key] = Format(format_key,
                                             _get_field_count(format_value),
                                             _get_field_type(format_value),
                                             '')  # NO_DESCRIPTION
        # No point in proceeding. All other calls have the same FORMAT.
        break
    # Mismatch detection depends on the values, so every call is checked.
    for call in variant.calls:
        for format_key, format_value in call.info.items():
            if defined_headers and format_key in defined_headers.formats:
                defined_header = defined_headers.formats.get(format_key)
                corrected_format = _infer_mismatched_format_field(
                    format_key, format_value, defined_header)
                if corrected_format:
                    logging.warning(
                        'Incorrect FORMAT field "%s". Defined as "type=%s,num=%s", '
                        'got "%s" in variant "%s"', format_key,
                        defined_header.get(_HeaderKeyConstants.TYPE),
                        str(defined_header.get(_HeaderKeyConstants.NUM)),
                        str(format_value), str(variant))
                    formats[format_key] = corrected_format
    return formats
    def test_report_inferred_headers_only(self):
        """Checks a report that has no conflicts but one undefined header.

        Only an inferred 'DP' FORMAT field is supplied; the expected report
        starts with the no-conflict message and then lists 'DP' in the
        undefined header section.
        """
        delimiter = preprocess_reporter._DELIMITER
        header_definitions = merge_header_definitions.VcfHeaderDefinitions()

        dp_formats = OrderedDict()
        dp_formats['DP'] = Format('DP', 2, 'Float', 'Total Depth')
        inferred_headers = VcfHeader(formats=dp_formats)

        no_conflict_section = ['No Header Conflicts Found.\n', '\n']
        undefined_section = [
            preprocess_reporter._InconsistencyType.UNDEFINED_HEADERS + '\n',
            preprocess_reporter._HeaderLine.UNDEFINED_FIELD_HEADER + '\n',
            delimiter.join(['DP', 'FORMAT', 'num=2 type=Float\n']),
            '\n',
        ]
        expected = no_conflict_section + undefined_section

        self._generate_report_and_assert_contents_equal(
            expected, header_definitions, inferred_headers=inferred_headers)
 def test_variant_merger_modify_schema(self):
     """Checks the schema generated when a variant merger is supplied.

     With `_DummyVariantMergeStrategy` passed as `variant_merger`, the
     expected info fields contain 'ADDED_BY_MERGER' in addition to 'I1'.
     """
     infos = OrderedDict()
     infos['I1'] = Info('I1', 1, 'String', 'desc', 'src', 'v')
     infos['IA'] = Info('IA', field_counts['A'], 'Integer', 'desc', 'src',
                        'v')
     formats = OrderedDict()
     formats['F1'] = Format('F1', 1, 'String', 'desc')

     header_fields = vcf_header_parser.HeaderFields(infos, formats)
     expected_fields = self._generate_expected_fields(
         alt_fields=['IA'],
         call_fields=['F1'],
         info_fields=['I1', 'ADDED_BY_MERGER'])
     actual_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
         header_fields,
         processed_variant.ProcessedVariantFactory(header_fields),
         variant_merger=_DummyVariantMergeStrategy())
     self._assert_fields_equal(expected_fields, actual_schema)
 def test_infer_format_fields_combined_conflicts(self):
   """Expects both a corrected and an undefined FORMAT field.

   The defined header declares 'FI' as 'Integer' and does not declare 'FU'.
   The expected result holds 'FI' as 'Float' (description kept) and 'FU' as
   'Float' with an empty description.
   """
   sample = self._get_sample_variant_format_fi_float_value()

   defined_formats = OrderedDict()
   defined_formats['FS'] = Format('FS', 1, 'String', 'desc')
   defined_formats['FI'] = Format('FI', 2, 'Integer', 'desc')
   defined_formats['GT'] = Format('GT', 2, 'Integer', 'Special GT key')
   defined_formats['PS'] = Format('PS', 1, 'Integer', 'Special PS key')

   inferrer = infer_headers._InferHeaderFields()
   defined_header = vcf_header_io.VcfHeader(formats=defined_formats)
   inferred_formats = inferrer._infer_format_fields(sample, defined_header)

   expected_formats = {
       'FI': Format('FI', 2, 'Float', 'desc'),
       'FU': Format('FU', field_counts['.'], 'Float', ''),
   }
   self.assertEqual(expected_formats, inferred_formats)
 def test_infer_mismatched_format_field(self):
   """Expects 'FI' to be corrected from 'Integer' to 'Float'.

   Passes the 'FI' value of the sample variant's first call and the defined
   'FI' entry to `_infer_mismatched_format_field`, and expects a `Format`
   that keeps the count 2 and the description 'desc'.
   """
   sample = self._get_sample_variant_format_fi_float_value()

   defined_formats = OrderedDict()
   defined_formats['FS'] = Format('FS', 1, 'String', 'desc')
   defined_formats['FI'] = Format('FI', 2, 'Integer', 'desc')
   defined_formats['FU'] = Format('FU', field_counts['.'], 'Float', 'desc')
   defined_formats['GT'] = Format('GT', 2, 'Integer', 'Special GT key')
   defined_formats['PS'] = Format('PS', 1, 'Integer', 'Special PS key')

   inferrer = infer_headers._InferHeaderFields()
   fi_value = sample.calls[0].info.get('FI')
   defined_header = vcf_header_io.VcfHeader(formats=defined_formats)
   fi_definition = defined_header.formats.get('FI')
   corrected_format = inferrer._infer_mismatched_format_field(
       'FI', fi_value, fi_definition)

   self.assertEqual(Format('FI', 2, 'Float', 'desc'), corrected_format)
  def test_defined_fields_filtered_two_variants(self):
    """Checks that only fields missing from the defined headers are emitted.

    The sample headers are supplied as a singleton side input; the expected
    output holds only 'IS_2' and 'FI_2'.
    """
    # Only INFO and FORMAT in the first variants are already defined in the
    # header section of the VCF files.
    expected = vcf_header_io.VcfHeader(
        infos={'IS_2': Info('IS_2', 1, 'String', '', '', '')},
        formats={'FI_2': Format('FI_2', 1, 'String', '')})

    with TestPipeline() as p:
      defined = self._get_sample_header_fields()
      vcf_headers_side_input = p | 'vcf_header' >> Create([defined])
      samples = [self._get_sample_variant_1(), self._get_sample_variant_2()]
      inferred_headers = (
          p
          | Create(samples)
          | 'InferUndefinedHeaderFields' >>
          infer_undefined_headers.InferUndefinedHeaderFields(
              pvalue.AsSingleton(vcf_headers_side_input)))
      assert_that(inferred_headers, equal_to([expected]))
      p.run()