def test_info_header_fields(self):
  """Schema generation handles all INFO counts/types; END is excluded."""
  infos = OrderedDict([
      ('I1', Info('I1', 1, 'String', 'desc', 'src', 'v')),
      ('I2', Info('I2', 2, 'Integer', 'desc', 'src', 'v')),
      ('IA', Info('IA', field_counts['A'], 'Float', 'desc', 'src', 'v')),
      ('IU', Info('IU', field_counts['.'], 'Character', 'desc', 'src', 'v')),
      ('IG', Info('IG', field_counts['G'], 'String', 'desc', 'src', 'v')),
      ('I0', Info('I0', 0, 'Flag', 'desc', 'src', 'v')),
      ('IA2', Info('IA2', field_counts['A'], 'Float', 'desc', 'src', 'v')),
      # The special END key must not appear in the generated schema.
      ('END', Info('END', 1, 'Integer', 'Special END key', 'src', 'v')),
  ])
  header_fields = vcf_header_parser.HeaderFields(infos, {})

  # Default factory settings: per-alt (count 'A') INFO fields are split
  # into the alternate_bases record.
  split_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
      header_fields,
      processed_variant.ProcessedVariantFactory(header_fields))
  self._assert_fields_equal(
      self._generate_expected_fields(
          alt_fields=['IA', 'IA2'],
          info_fields=['I1', 'I2', 'IU', 'IG', 'I0']),
      split_schema)

  # With split_alternate_allele_info_fields=False all INFO fields stay
  # at the top level of the schema.
  actual_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
      header_fields,
      processed_variant.ProcessedVariantFactory(
          header_fields,
          split_alternate_allele_info_fields=False))
  self._assert_fields_equal(
      self._generate_expected_fields(
          info_fields=['I1', 'I2', 'IA', 'IU', 'IG', 'I0', 'IA2']),
      actual_schema)

  # Verify the BigQuery type and mode chosen for each count/type combo.
  expected_type_modes = {
      'I1': (TableFieldConstants.TYPE_STRING,
             TableFieldConstants.MODE_NULLABLE),
      'I2': (TableFieldConstants.TYPE_INTEGER,
             TableFieldConstants.MODE_REPEATED),
      'IA': (TableFieldConstants.TYPE_FLOAT,
             TableFieldConstants.MODE_REPEATED),
      'IU': (TableFieldConstants.TYPE_STRING,
             TableFieldConstants.MODE_REPEATED),
      'IG': (TableFieldConstants.TYPE_STRING,
             TableFieldConstants.MODE_REPEATED),
      'I0': (TableFieldConstants.TYPE_BOOLEAN,
             TableFieldConstants.MODE_NULLABLE),
      'IA2': (TableFieldConstants.TYPE_FLOAT,
              TableFieldConstants.MODE_REPEATED),
  }
  for field in actual_schema.fields:
    expectation = expected_type_modes.get(field.name)
    if expectation is not None:
      self.assertEqual(expectation[0], field.type)
      self.assertEqual(expectation[1], field.mode)
def test_no_header_fields(self):
  """An empty VCF header still produces the fixed base schema fields."""
  empty_header = vcf_header_parser.HeaderFields({}, {})
  generated = bigquery_vcf_schema.generate_schema_from_header_fields(
      empty_header,
      processed_variant.ProcessedVariantFactory(empty_header))
  self._assert_fields_equal(self._generate_expected_fields(), generated)
def test_bigquery_field_name_sanitize(self):
  """Tests that invalid BigQuery column characters are sanitized.

  BigQuery column names must start with a letter or underscore and may
  only contain letters, digits and underscores, so names starting with a
  digit or underscore get a 'field_' prefix and other invalid characters
  are replaced with underscores.
  """
  infos = OrderedDict([
      ('_', Info('_', 1, 'String', 'desc', 'src', 'v')),
      ('_A', Info('_A', 1, 'String', 'desc', 'src', 'v')),
      ('0a', Info('0a', 1, 'String', 'desc', 'src', 'v')),
      ('A-B*C', Info('A-B*C', 1, 'String', 'desc', 'src', 'v')),
      ('I-A', Info('I-A', field_counts['A'], 'Float', 'desc', 'src', 'v')),
      # Bug fix: this INFO entry was previously constructed with the
      # Format namedtuple; it only worked because Info and Format share
      # their first four fields.
      ('OK_info_09', Info('OK_info_09', 1, 'String', 'desc', 'src', 'v'))
  ])
  formats = OrderedDict([
      ('a^b', Format('a^b', 1, 'String', 'desc')),
      ('OK_format_09', Format('OK_format_09', 1, 'String', 'desc'))])
  header_fields = vcf_header_parser.HeaderFields(infos, formats)
  self._assert_fields_equal(
      self._generate_expected_fields(
          alt_fields=['I_A'],
          call_fields=['a_b', 'OK_format_09'],
          info_fields=[
              'field__', 'field__A', 'field_0a', 'A_B_C', 'OK_info_09'
          ]),
      bigquery_vcf_schema.generate_schema_from_header_fields(
          header_fields,
          processed_variant.ProcessedVariantFactory(header_fields)))
def __init__(
    self,
    output_table,  # type: str
    header_fields,  # type: vcf_header_io.VcfHeader
    variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
    proc_var_factory=None,  # type: processed_variant.ProcessedVariantFactory
    append=False,  # type: bool
    update_schema_on_append=False,  # type: bool
    allow_incompatible_records=False,  # type: bool
    omit_empty_sample_calls=False,  # type: bool
    num_bigquery_write_shards=1  # type: int
    ):
  # type: (...) -> None
  """Initializes the transform.

  Args:
    output_table: Full path of the output BigQuery table.
    header_fields: Representative header fields for all variants. This
      is needed for dynamically generating the schema.
    variant_merger: The strategy used for merging variants (if any).
      Some strategies may change the schema, which is why this may be
      needed here.
    proc_var_factory: The factory class that knows how to convert
      Variant instances to ProcessedVariant. As a side effect it also
      knows how to modify BigQuery schema based on the ProcessedVariants
      that it generates. The latter functionality is what is needed here.
    append: If true, existing records in output_table will not be
      overwritten. New records will be appended to those that already
      exist.
    update_schema_on_append: If true, the BigQuery schema will be updated
      by combining the existing schema and the new schema if they are
      compatible.
    allow_incompatible_records: If true, field values are cast to the
      BigQuery schema type when there is a mismatch.
    omit_empty_sample_calls: If true, samples that don't have a given
      call will be omitted.
    num_bigquery_write_shards: If > 1, we will limit the number of
      sources which are used for writing to the output BigQuery table.
  """
  self._output_table = output_table
  self._header_fields = header_fields
  self._variant_merger = variant_merger
  self._proc_var_factory = proc_var_factory
  self._append = append
  self._allow_incompatible_records = allow_incompatible_records
  self._omit_empty_sample_calls = omit_empty_sample_calls
  self._num_bigquery_write_shards = num_bigquery_write_shards
  self._schema = bigquery_vcf_schema.generate_schema_from_header_fields(
      self._header_fields, self._proc_var_factory, self._variant_merger)
  # The resolver makes an extra effort to resolve conflicts when the
  # allow_incompatible_records flag is set.
  self._bigquery_row_generator = bigquery_row_generator.BigQueryRowGenerator(
      bigquery_schema_descriptor.SchemaDescriptor(self._schema),
      vcf_field_conflict_resolver.FieldConflictResolver(
          resolve_always=allow_incompatible_records))
  if update_schema_on_append:
    self._update_bigquery_schema_on_append()
def expand(self, pcoll):
  """Converts variants to BigQuery rows and writes them to the table.

  Fix: reuses the schema already computed and stored as `self._schema`
  in `__init__` (from the same header fields, factory and merger)
  instead of regenerating an identical schema on every call.
  """
  return (pcoll
          | 'ConvertToBigQueryTableRow' >> beam.ParDo(
              _ConvertToBigQueryTableRow(self._omit_empty_sample_calls))
          | 'WriteToBigQuery' >> beam.io.Write(beam.io.BigQuerySink(
              self._output_table,
              schema=self._schema,
              create_disposition=(
                  beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
              write_disposition=(
                  beam.io.BigQueryDisposition.WRITE_APPEND
                  if self._append
                  else beam.io.BigQueryDisposition.WRITE_TRUNCATE))))
def test_variant_merger_modify_schema(self):
  """A variant merge strategy may add its own fields to the schema."""
  infos = OrderedDict([
      ('I1', Info('I1', 1, 'String', 'desc', 'src', 'v')),
      ('IA', Info('IA', field_counts['A'], 'Integer', 'desc', 'src', 'v'))])
  formats = OrderedDict([('F1', Format('F1', 1, 'String', 'desc'))])
  header_fields = vcf_header_parser.HeaderFields(infos, formats)
  schema = bigquery_vcf_schema.generate_schema_from_header_fields(
      header_fields,
      processed_variant.ProcessedVariantFactory(header_fields),
      variant_merger=_DummyVariantMergeStrategy())
  self._assert_fields_equal(
      self._generate_expected_fields(
          alt_fields=['IA'],
          call_fields=['F1'],
          # 'ADDED_BY_MERGER' is injected by _DummyVariantMergeStrategy.
          info_fields=['I1', 'ADDED_BY_MERGER']),
      schema)
def __init__(
    self,
    output_table,  # type: str
    header_fields,  # type: vcf_header_parser.HeaderFields
    variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
    proc_var_factory=None,  # type: processed_variant.ProcessedVariantFactory
    append=False,  # type: bool
    allow_incompatible_records=False,  # type: bool
    omit_empty_sample_calls=False  # type: bool
    ):
  """Initializes the transform.

  Args:
    output_table: Full path of the output BigQuery table.
    header_fields: A `namedtuple` containing representative header fields
      for all variants. This is needed for dynamically generating the
      schema.
    variant_merger: The strategy used for merging variants (if any).
      Some strategies may change the schema, which is why this may be
      needed here.
    proc_var_factory: The factory class that knows how to convert
      Variant instances to ProcessedVariant. As a side effect it also
      knows how to modify BigQuery schema based on the ProcessedVariants
      that it generates. The latter functionality is what is needed here.
    append: If true, existing records in output_table will not be
      overwritten. New records will be appended to those that already
      exist.
    allow_incompatible_records: If true, field values are cast to the
      BigQuery schema type when there is a mismatch.
    omit_empty_sample_calls: If true, samples that don't have a given
      call will be omitted.
  """
  self._output_table = output_table
  self._header_fields = header_fields
  self._variant_merger = variant_merger
  self._proc_var_factory = proc_var_factory
  self._append = append
  self._allow_incompatible_records = allow_incompatible_records
  self._omit_empty_sample_calls = omit_empty_sample_calls
  self._schema = bigquery_vcf_schema.generate_schema_from_header_fields(
      self._header_fields, self._proc_var_factory, self._variant_merger)
  self._schema_descriptor = bigquery_schema_descriptor.SchemaDescriptor(
      self._schema)
def test_info_and_format_header_fields(self):
  """INFO and FORMAT fields both land in the schema; GT/PS are skipped."""
  infos = OrderedDict([
      ('I1', Info('I1', 1, 'String', 'desc', 'src', 'v')),
      ('IA', Info('IA', field_counts['A'], 'Integer', 'desc', 'src', 'v'))])
  # GT and PS must not produce columns: they are already covered by the
  # special 'genotype' and 'phaseset' fields.
  formats = OrderedDict([
      ('F1', Format('F1', 1, 'String', 'desc')),
      ('F2', Format('F2', 2, 'Integer', 'desc')),
      ('FU', Format('FU', field_counts['.'], 'Float', 'desc')),
      ('GT', Format('GT', 2, 'Integer', 'Special GT key')),
      ('PS', Format('PS', 1, 'Integer', 'Special PS key')),
  ])
  header_fields = vcf_header_parser.HeaderFields(infos, formats)
  generated = bigquery_vcf_schema.generate_schema_from_header_fields(
      header_fields,
      processed_variant.ProcessedVariantFactory(header_fields))
  self._assert_fields_equal(
      self._generate_expected_fields(
          alt_fields=['IA'],
          call_fields=['F1', 'F2', 'FU'],
          info_fields=['I1']),
      generated)