def test_info_header_fields(self):
        """Checks the schema generated from a header that only has INFO fields.

        The schema is generated twice: with a default ProcessedVariantFactory,
        where 'IA' and 'IA2' are expected among the alternate-base fields, and
        with split_alternate_allele_info_fields=False, where they are expected
        among the regular INFO fields. For the second schema the type and mode
        of each listed INFO field are checked as well.
        """
        # One (key, num, type, description) tuple per INFO header entry.
        info_specs = [
            ('I1', 1, 'String', 'desc'),
            ('I2', 2, 'Integer', 'desc'),
            ('IA', field_counts['A'], 'Float', 'desc'),
            ('IU', field_counts['.'], 'Character', 'desc'),
            ('IG', field_counts['G'], 'String', 'desc'),
            ('I0', 0, 'Flag', 'desc'),
            ('IA2', field_counts['A'], 'Float', 'desc'),
            # END should not be included in the generated schema.
            ('END', 1, 'Integer', 'Special END key'),
        ]
        infos = OrderedDict(
            (key, Info(key, num, vcf_type, desc, 'src', 'v'))
            for key, num, vcf_type, desc in info_specs)
        header_fields = vcf_header_parser.HeaderFields(infos, {})

        # Default factory settings.
        expected_split = self._generate_expected_fields(
            alt_fields=['IA', 'IA2'],
            info_fields=['I1', 'I2', 'IU', 'IG', 'I0'])
        split_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
            header_fields,
            processed_variant.ProcessedVariantFactory(header_fields))
        self._assert_fields_equal(expected_split, split_schema)

        # Test with split_alternate_allele_info_fields=False.
        unsplit_factory = processed_variant.ProcessedVariantFactory(
            header_fields, split_alternate_allele_info_fields=False)
        actual_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
            header_fields, unsplit_factory)
        expected_unsplit = self._generate_expected_fields(
            info_fields=['I1', 'I2', 'IA', 'IU', 'IG', 'I0', 'IA2'])
        self._assert_fields_equal(expected_unsplit, actual_schema)

        # Verify types and modes.
        string_type = TableFieldConstants.TYPE_STRING
        integer_type = TableFieldConstants.TYPE_INTEGER
        float_type = TableFieldConstants.TYPE_FLOAT
        boolean_type = TableFieldConstants.TYPE_BOOLEAN
        nullable = TableFieldConstants.MODE_NULLABLE
        repeated = TableFieldConstants.MODE_REPEATED
        expected_type_modes = {
            'I1': (string_type, nullable),
            'I2': (integer_type, repeated),
            'IA': (float_type, repeated),
            'IU': (string_type, repeated),
            'IG': (string_type, repeated),
            'I0': (boolean_type, nullable),
            'IA2': (float_type, repeated),
        }
        for field in actual_schema.fields:
            expectation = expected_type_modes.get(field.name)
            if expectation is None:
                # Fields without an entry above are not type/mode checked.
                continue
            expected_type, expected_mode = expectation
            self.assertEqual(expected_type, field.type)
            self.assertEqual(expected_mode, field.mode)
 def test_no_header_fields(self):
     """Checks the schema generated from header fields built from empty dicts.

     The generated schema must match what _generate_expected_fields() returns
     when called without arguments.
     """
     empty_header = vcf_header_parser.HeaderFields({}, {})
     expected_fields = self._generate_expected_fields()
     generated_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
         empty_header,
         processed_variant.ProcessedVariantFactory(empty_header))
     self._assert_fields_equal(expected_fields, generated_schema)
 def test_bigquery_field_name_sanitize(self):
     """Checks that header keys are sanitized into the expected field names.

     The expectations encoded below are: keys starting with '_' or a digit
     gain a 'field_' prefix, characters such as '-', '*' and '^' are replaced
     by '_', and keys such as 'OK_info_09' / 'OK_format_09' are kept as-is.
     """
     infos = OrderedDict([
         ('_', Info('_', 1, 'String', 'desc', 'src', 'v')),
         ('_A', Info('_A', 1, 'String', 'desc', 'src', 'v')),
         ('0a', Info('0a', 1, 'String', 'desc', 'src', 'v')),
         ('A-B*C', Info('A-B*C', 1, 'String', 'desc', 'src', 'v')),
         ('I-A', Info('I-A', field_counts['A'], 'Float', 'desc', 'src',
                      'v')),
         # INFO entries are built with Info (not Format) so that every value
         # in this mapping has the same shape as the other INFO entries.
         ('OK_info_09', Info('OK_info_09', 1, 'String', 'desc', 'src', 'v'))
     ])
     formats = OrderedDict([('a^b', Format('a^b', 1, 'String', 'desc')),
                            ('OK_format_09',
                             Format('OK_format_09', 1, 'String', 'desc'))])
     header_fields = vcf_header_parser.HeaderFields(infos, formats)
     self._assert_fields_equal(
         self._generate_expected_fields(alt_fields=['I_A'],
                                        call_fields=['a_b', 'OK_format_09'],
                                        info_fields=[
                                            'field__', 'field__A',
                                            'field_0a', 'A_B_C',
                                            'OK_info_09'
                                        ]),
         bigquery_vcf_schema.generate_schema_from_header_fields(
             header_fields,
             processed_variant.ProcessedVariantFactory(header_fields)))
  def __init__(
      self,
      output_table,  # type: str
      header_fields,  # type: vcf_header_io.VcfHeader
      variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
      proc_var_factory=None,  # type: processed_variant.ProcessedVariantFactory
      append=False,  # type: bool
      update_schema_on_append=False,  # type: bool
      allow_incompatible_records=False,  # type: bool
      omit_empty_sample_calls=False,  # type: bool
      num_bigquery_write_shards=1  # type: int
      ):
    # type: (...) -> None
    """Initializes the transform.

    Args:
      output_table: Full path of the output BigQuery table.
      header_fields: Representative header fields for all variants. The
        BigQuery schema is generated dynamically from them.
      variant_merger: The strategy used for merging variants (if any). It is
        passed to schema generation because some strategies may change the
        schema.
      proc_var_factory: The factory class that knows how to convert Variant
        instances to ProcessedVariant. It is passed to schema generation
        because it also knows how to modify the BigQuery schema based on the
        ProcessedVariants that it generates.
      append: If true, existing records in output_table will not be
        overwritten. New records will be appended to those that already exist.
      update_schema_on_append: If true, BigQuery schema will be updated by
        combining the existing schema and the new schema if they are
        compatible.
      allow_incompatible_records: If true, field values are cast to the
        BigQuery schema if there is a mismatch.
      omit_empty_sample_calls: If true, samples that don't have a given call
        will be omitted.
      num_bigquery_write_shards: If > 1, we will limit number of sources which
        are used for writing to the output BigQuery table.
    """
    # Plain configuration values, stored as given.
    self._output_table = output_table
    self._header_fields = header_fields
    self._variant_merger = variant_merger
    self._proc_var_factory = proc_var_factory
    self._append = append
    self._allow_incompatible_records = allow_incompatible_records
    self._omit_empty_sample_calls = omit_empty_sample_calls
    self._num_bigquery_write_shards = num_bigquery_write_shards

    self._schema = bigquery_vcf_schema.generate_schema_from_header_fields(
        header_fields, proc_var_factory, variant_merger)

    schema_descriptor = bigquery_schema_descriptor.SchemaDescriptor(
        self._schema)
    # Resolver makes extra effort to resolve conflict when flag
    # allow_incompatible_records is set.
    conflict_resolver = vcf_field_conflict_resolver.FieldConflictResolver(
        resolve_always=allow_incompatible_records)
    self._bigquery_row_generator = bigquery_row_generator.BigQueryRowGenerator(
        schema_descriptor, conflict_resolver)

    # Runs last, after every attribute above has been assigned.
    if update_schema_on_append:
      self._update_bigquery_schema_on_append()
Пример #5
0
 def expand(self, pcoll):
   """Converts the input to BigQuery table rows and writes them to the table.

   Args:
     pcoll: The input PCollection this transform is applied to.

   Returns:
     The result of applying the 'WriteToBigQuery' step.
   """
   table_rows = pcoll | 'ConvertToBigQueryTableRow' >> beam.ParDo(
       _ConvertToBigQueryTableRow(self._omit_empty_sample_calls))

   table_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
       self._header_fields,
       self._proc_var_factory,
       self._variant_merger)

   # Append to existing records only when requested; otherwise replace them.
   if self._append:
     write_mode = beam.io.BigQueryDisposition.WRITE_APPEND
   else:
     write_mode = beam.io.BigQueryDisposition.WRITE_TRUNCATE

   sink = beam.io.BigQuerySink(
       self._output_table,
       schema=table_schema,
       create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
       write_disposition=write_mode)
   return table_rows | 'WriteToBigQuery' >> beam.io.Write(sink)
 def test_variant_merger_modify_schema(self):
     """Checks the schema generated when a variant merger is supplied.

     With _DummyVariantMergeStrategy passed as variant_merger, the expected
     INFO fields include 'ADDED_BY_MERGER', which is not declared in the
     header built here.
     """
     infos = OrderedDict()
     infos['I1'] = Info('I1', 1, 'String', 'desc', 'src', 'v')
     infos['IA'] = Info('IA', field_counts['A'], 'Integer', 'desc', 'src',
                        'v')
     formats = OrderedDict()
     formats['F1'] = Format('F1', 1, 'String', 'desc')
     header_fields = vcf_header_parser.HeaderFields(infos, formats)

     expected_fields = self._generate_expected_fields(
         alt_fields=['IA'],
         call_fields=['F1'],
         info_fields=['I1', 'ADDED_BY_MERGER'])
     merged_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
         header_fields,
         processed_variant.ProcessedVariantFactory(header_fields),
         variant_merger=_DummyVariantMergeStrategy())
     self._assert_fields_equal(expected_fields, merged_schema)
Пример #7
0
  def __init__(
      self,
      output_table,  # type: str
      header_fields,  # type: vcf_header_parser.HeaderFields
      variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
      proc_var_factory=None,  # type: processed_variant.ProcessedVariantFactory
      append=False,  # type: bool
      allow_incompatible_records=False,  # type: bool
      omit_empty_sample_calls=False  # type: bool
  ):
    """Initializes the transform.

    Args:
      output_table: Full path of the output BigQuery table.
      header_fields: A `namedtuple` containing representative header fields for
        all variants. The BigQuery schema is generated dynamically from it.
      variant_merger: The strategy used for merging variants (if any). It is
        passed to schema generation because some strategies may change the
        schema.
      proc_var_factory: The factory class that knows how to convert Variant
        instances to ProcessedVariant. It is passed to schema generation
        because it also knows how to modify the BigQuery schema based on the
        ProcessedVariants that it generates.
      append: If true, existing records in output_table will not be
        overwritten. New records will be appended to those that already exist.
      allow_incompatible_records: Stored on the instance unchanged.
        NOTE(review): its effect is not visible in this constructor; confirm
        against the code that reads it.
      omit_empty_sample_calls: If true, samples that don't have a given call
        will be omitted.
    """
    # Plain configuration values, stored as given.
    self._output_table = output_table
    self._header_fields = header_fields
    self._variant_merger = variant_merger
    self._proc_var_factory = proc_var_factory
    self._append = append
    self._allow_incompatible_records = allow_incompatible_records
    self._omit_empty_sample_calls = omit_empty_sample_calls

    # The schema is generated once here, and a descriptor is built over it.
    generated_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
        header_fields, proc_var_factory, variant_merger)
    self._schema = generated_schema
    self._schema_descriptor = bigquery_schema_descriptor.SchemaDescriptor(
        generated_schema)
 def test_info_and_format_header_fields(self):
     """Checks the schema generated from both INFO and FORMAT header fields.

     'IA' is expected among the alternate-base fields, 'I1' among the INFO
     fields, and 'F1', 'F2' and 'FU' among the call fields; 'GT' and 'PS' are
     not expected as call fields.
     """
     infos = OrderedDict()
     infos['I1'] = Info('I1', 1, 'String', 'desc', 'src', 'v')
     infos['IA'] = Info('IA', field_counts['A'], 'Integer', 'desc', 'src',
                        'v')

     # GT and PS should not be set as they're already included in special
     # 'genotype' and 'phaseset' fields.
     format_specs = [
         ('F1', 1, 'String', 'desc'),
         ('F2', 2, 'Integer', 'desc'),
         ('FU', field_counts['.'], 'Float', 'desc'),
         ('GT', 2, 'Integer', 'Special GT key'),
         ('PS', 1, 'Integer', 'Special PS key'),
     ]
     formats = OrderedDict(
         (key, Format(key, num, vcf_type, desc))
         for key, num, vcf_type, desc in format_specs)
     header_fields = vcf_header_parser.HeaderFields(infos, formats)

     expected_fields = self._generate_expected_fields(
         alt_fields=['IA'],
         call_fields=['F1', 'F2', 'FU'],
         info_fields=['I1'])
     generated_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
         header_fields,
         processed_variant.ProcessedVariantFactory(header_fields))
     self._assert_fields_equal(expected_fields, generated_schema)