def test_info_header_fields(self):
  infos = OrderedDict([
      ('I1', Info('I1', 1, 'String', 'desc', 'src', 'v')),
      ('I2', Info('I2', 2, 'Integer', 'desc', 'src', 'v')),
      ('IA', Info('IA', field_counts['A'], 'Float', 'desc', 'src', 'v')),
      ('IU', Info('IU', field_counts['.'], 'Character', 'desc', 'src', 'v')),
      ('IG', Info('IG', field_counts['G'], 'String', 'desc', 'src', 'v')),
      ('I0', Info('I0', 0, 'Flag', 'desc', 'src', 'v')),
      ('IA2', Info('IA2', field_counts['A'], 'Float', 'desc', 'src', 'v')),
      ('END',
       # END should not be included in the generated schema.
       Info('END', 1, 'Integer', 'Special END key', 'src', 'v'))])
  header_fields = vcf_header_io.VcfHeader(infos=infos)

  self._validate_schema(
      self._generate_expected_fields(
          alt_fields=['IA', 'IA2'],
          info_fields=['I1', 'I2', 'IU', 'IG', 'I0']),
      schema_converter.generate_schema_from_header_fields(
          header_fields,
          processed_variant.ProcessedVariantFactory(header_fields)))

  # Test with split_alternate_allele_info_fields=False.
  actual_schema = schema_converter.generate_schema_from_header_fields(
      header_fields,
      processed_variant.ProcessedVariantFactory(
          header_fields, split_alternate_allele_info_fields=False))
  self._validate_schema(
      self._generate_expected_fields(
          info_fields=['I1', 'I2', 'IA', 'IU', 'IG', 'I0', 'IA2']),
      actual_schema)

  # Verify types and modes.
  expected_type_modes = {
      'I1': (TableFieldConstants.TYPE_STRING,
             TableFieldConstants.MODE_NULLABLE),
      'I2': (TableFieldConstants.TYPE_INTEGER,
             TableFieldConstants.MODE_REPEATED),
      'IA': (TableFieldConstants.TYPE_FLOAT,
             TableFieldConstants.MODE_REPEATED),
      'IU': (TableFieldConstants.TYPE_STRING,
             TableFieldConstants.MODE_REPEATED),
      'IG': (TableFieldConstants.TYPE_STRING,
             TableFieldConstants.MODE_REPEATED),
      'I0': (TableFieldConstants.TYPE_BOOLEAN,
             TableFieldConstants.MODE_NULLABLE),
      'IA2': (TableFieldConstants.TYPE_FLOAT,
              TableFieldConstants.MODE_REPEATED)}
  for field in actual_schema.fields:
    if field.name in expected_type_modes:
      expected_type, expected_mode = expected_type_modes[field.name]
      self.assertEqual(expected_type, field.type)
      self.assertEqual(expected_mode, field.mode)
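# For orientation, the two schema shapes validated above (an illustrative
# sketch, not the exact generated schema): with
# split_alternate_allele_info_fields=True, Number='A' INFO fields nest under
# the repeated alternate_bases record, e.g.
#
#   alternate_bases: RECORD (REPEATED)
#     alt: STRING
#     IA:  FLOAT
#     IA2: FLOAT
#
# With the flag set to False, IA and IA2 instead become top-level REPEATED
# FLOAT columns alongside I1, I2, IU, IG and I0.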
def test_no_header_fields(self):
  header_fields = vcf_header_io.VcfHeader()
  self._validate_schema(
      self._generate_expected_fields(),
      schema_converter.generate_schema_from_header_fields(
          header_fields,
          processed_variant.ProcessedVariantFactory(header_fields)))
def test_bigquery_field_name_sanitize(self):
  infos = OrderedDict([
      ('_', Info('_', 1, 'String', 'desc', 'src', 'v')),
      ('_A', Info('_A', 1, 'String', 'desc', 'src', 'v')),
      ('0a', Info('0a', 1, 'String', 'desc', 'src', 'v')),
      ('A-B*C', Info('A-B*C', 1, 'String', 'desc', 'src', 'v')),
      ('I-A', Info('I-A', field_counts['A'], 'Float', 'desc', 'src', 'v')),
      ('OK_info_09', Info('OK_info_09', 1, 'String', 'desc', 'src', 'v'))])
  formats = OrderedDict([
      ('a^b', Format('a^b', 1, 'String', 'desc')),
      ('OK_format_09', Format('OK_format_09', 1, 'String', 'desc'))])
  header_fields = vcf_header_io.VcfHeader(infos=infos, formats=formats)
  self._validate_schema(
      self._generate_expected_fields(
          alt_fields=['I_A'],
          call_fields=['a_b', 'OK_format_09'],
          info_fields=[
              'field__', 'field__A', 'field_0a', 'A_B_C', 'OK_info_09']),
      schema_converter.generate_schema_from_header_fields(
          header_fields,
          processed_variant.ProcessedVariantFactory(header_fields)))
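# A minimal sketch of the sanitization rule the test above exercises
# (_sanitize_field_name_sketch is a hypothetical helper, not the library's
# actual implementation): invalid characters are replaced with '_', and the
# converter appears to require column names to start with a letter, so any
# other name gets a 'field_' prefix.
import re


def _sanitize_field_name_sketch(name):
  # type: (str) -> str
  # Replace every character outside [a-zA-Z0-9_] with an underscore.
  sanitized = re.sub(r'[^a-zA-Z0-9_]', '_', name)
  # Prefix names that do not start with a letter.
  if not re.match(r'[a-zA-Z]', sanitized):
    sanitized = 'field_' + sanitized
  return sanitized

# Examples matching the expectations above:
#   _sanitize_field_name_sketch('A-B*C')  -> 'A_B_C'
#   _sanitize_field_name_sketch('0a')     -> 'field_0a'
#   _sanitize_field_name_sketch('_')      -> 'field__'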
def test_no_header_fields_with_sample_name(self):
  header_fields = vcf_header_io.VcfHeader()
  self._validate_schema(
      self._generate_expected_fields(include_call_name=True),
      schema_converter.generate_schema_from_header_fields(
          header_fields,
          processed_variant.ProcessedVariantFactory(header_fields),
          include_call_name=True))
def test_schema_to_vcf_header_to_schema(self):
  original_schema = bigquery_schema_util.get_sample_table_schema()
  header = schema_converter.generate_header_fields_from_schema(
      original_schema)
  reconstructed_schema = (
      schema_converter.generate_schema_from_header_fields(
          header, processed_variant.ProcessedVariantFactory(header)))
  self.assertEqual(_get_fields_from_schema(reconstructed_schema),
                   _get_fields_from_schema(original_schema))
def test_variant_merger_modify_schema(self):
  infos = OrderedDict([
      ('I1', createInfo('I1', 1, 'String', 'desc', 'src', 'v')),
      ('IA', createInfo('IA', 'A', 'Integer', 'desc', 'src', 'v'))])
  formats = OrderedDict([('F1', createFormat('F1', 1, 'String', 'desc'))])
  header_fields = vcf_header_io.VcfHeader(infos=infos, formats=formats)
  self._validate_schema(
      self._generate_expected_fields(
          alt_fields=['IA'],
          call_fields=['F1'],
          info_fields=['I1', 'ADDED_BY_MERGER']),
      schema_converter.generate_schema_from_header_fields(
          header_fields,
          processed_variant.ProcessedVariantFactory(header_fields),
          variant_merger=_DummyVariantMergeStrategy()))
def __init__(
    self,
    output_path,  # type: str
    header_fields,  # type: vcf_header_io.VcfHeader
    proc_var_factory,  # type: processed_variant.ProcessedVariantFactory
    variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
    allow_incompatible_records=False,  # type: bool
    omit_empty_sample_calls=False,  # type: bool
    null_numeric_value_replacement=None  # type: int
    ):
  # type: (...) -> None
  """Initializes the transform.

  Args:
    output_path: The path under which output Avro files are generated.
    header_fields: Representative header fields for all variants. This is
      needed for dynamically generating the schema.
    proc_var_factory: The factory class that knows how to convert Variant
      instances to ProcessedVariant. As a side effect it also knows how to
      modify the BigQuery schema based on the ProcessedVariants that it
      generates. The latter functionality is what is needed here.
    variant_merger: The strategy used for merging variants (if any). Some
      strategies may change the schema, which is why this may be needed here.
    allow_incompatible_records: If true, field values are cast to match the
      BigQuery schema if there is a mismatch.
    omit_empty_sample_calls: If true, samples that don't have a given call
      will be omitted.
    null_numeric_value_replacement: The value to use instead of null for
      numeric (float/int/long) lists. For instance, [0, None, 1] will become
      [0, `null_numeric_value_replacement`, 1]. If not set, the value will be
      set to bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT.
  """
  self._output_path = output_path
  self._proc_var_factory = proc_var_factory
  table_schema = schema_converter.generate_schema_from_header_fields(
      header_fields, proc_var_factory, variant_merger)
  self._avro_schema = avro.schema.parse(
      schema_converter.convert_table_schema_to_json_avro_schema(
          table_schema))
  self._bigquery_row_generator = (
      bigquery_row_generator.VariantCallRowGenerator(
          bigquery_schema_descriptor.SchemaDescriptor(table_schema),
          vcf_field_conflict_resolver.FieldConflictResolver(
              resolve_always=allow_incompatible_records),
          null_numeric_value_replacement))
  self._allow_incompatible_records = allow_incompatible_records
  self._omit_empty_sample_calls = omit_empty_sample_calls
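# Illustrative use of this transform inside a pipeline (a sketch only; the
# upstream PCollection name is an assumption, and the transform is applied
# as variant_to_avro.VariantToAvroFiles based on the caller in run() below):
#
#   _ = (processed_variants
#        | 'VariantToAvro' >> variant_to_avro.VariantToAvroFiles(
#            output_path, header_fields, proc_var_factory))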
def test_vcf_header_to_schema_to_vcf_header(self):
  infos = OrderedDict([
      ('I1', createInfo('I1', '.', 'String', 'desc', None, None)),
      ('IA', createInfo('IA', '.', 'Integer', 'desc', None, None))])
  formats = OrderedDict([
      ('F1', createFormat('F1', '.', 'String', 'desc')),
      ('F2', createFormat('F2', '.', 'Integer', 'desc')),
      ('FU', createFormat('FU', '.', 'Float', 'desc'))])
  original_header = vcf_header_io.VcfHeader(infos=infos, formats=formats)
  schema = schema_converter.generate_schema_from_header_fields(
      original_header,
      processed_variant.ProcessedVariantFactory(original_header))
  reconstructed_header = (
      schema_converter.generate_header_fields_from_schema(schema))
  self.assertEqual(original_header, reconstructed_header)
def test_info_and_format_header_fields(self):
  infos = OrderedDict([
      ('I1', createInfo('I1', 1, 'String', 'desc', 'src', 'v')),
      ('IA', createInfo('IA', 'A', 'Integer', 'desc', 'src', 'v'))])
  # GT and PS should not be set as they're already included in the special
  # 'genotype' and 'phaseset' fields.
  formats = OrderedDict([
      ('F1', createFormat('F1', 1, 'String', 'desc')),
      ('F2', createFormat('F2', 2, 'Integer', 'desc')),
      ('FU', createFormat('FU', '.', 'Float', 'desc')),
      ('GT', createFormat('GT', 2, 'Integer', 'Special GT key')),
      ('PS', createFormat('PS', 1, 'Integer', 'Special PS key'))])
  header_fields = vcf_header_io.VcfHeader(infos=infos, formats=formats)
  self._validate_schema(
      self._generate_expected_fields(
          alt_fields=['IA'],
          call_fields=['F1', 'F2', 'FU'],
          info_fields=['I1']),
      schema_converter.generate_schema_from_header_fields(
          header_fields,
          processed_variant.ProcessedVariantFactory(header_fields)))
def run(argv=None):
  # type: (List[str]) -> None
  """Runs VCF to BigQuery pipeline."""
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = pipeline_common.parse_args(
      argv, _COMMAND_LINE_OPTIONS)
  if known_args.auto_flags_experiment:
    _get_input_dimensions(known_args, pipeline_args)

  annotated_vcf_pattern = _run_annotation_pipeline(known_args, pipeline_args)

  all_patterns = (
      [annotated_vcf_pattern] if annotated_vcf_pattern
      else known_args.all_patterns)

  variant_merger = _get_variant_merge_strategy(known_args)

  pipeline_mode = pipeline_common.get_pipeline_mode(all_patterns)

  beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
  avro_root_path = _get_avro_root_path(beam_pipeline_options)

  # Starts a pipeline to merge VCF headers in Beam if the total number of
  # files that match the input pattern exceeds _SMALL_DATA_THRESHOLD.
  _merge_headers(known_args, pipeline_args, pipeline_mode, avro_root_path,
                 annotated_vcf_pattern)

  # Retrieve merged headers prior to launching the pipeline. This is needed
  # since the BigQuery schema cannot yet be dynamically created based on
  # input. See https://issues.apache.org/jira/browse/BEAM-2801.
  header_fields = vcf_header_parser.get_vcf_headers(
      known_args.representative_header_file)
  counter_factory = metrics_util.CounterFactory()
  processed_variant_factory = processed_variant.ProcessedVariantFactory(
      header_fields,
      known_args.split_alternate_allele_info_fields,
      known_args.allow_malformed_records,
      known_args.annotation_fields,
      known_args.use_allele_num,
      known_args.minimal_vep_alt_matching,
      known_args.infer_annotation_types,
      counter_factory)
  schema = schema_converter.generate_schema_from_header_fields(
      header_fields, processed_variant_factory, variant_merger,
      known_args.use_1_based_coordinate, known_args.include_call_name)

  sharding = variant_sharding.VariantSharding(
      known_args.sharding_config_path)
  if sharding.should_keep_shard(sharding.get_residual_index()):
    num_shards = sharding.get_num_shards()
  else:
    num_shards = sharding.get_num_shards() - 1

  if known_args.update_schema_on_append:
    for i in range(num_shards):
      table_suffix = sharding.get_output_table_suffix(i)
      table_name = bigquery_util.compose_table_name(
          known_args.output_table, table_suffix)
      bigquery_util.update_bigquery_schema_on_append(
          schema.fields, table_name)

  pipeline = beam.Pipeline(options=beam_pipeline_options)
  variants = _read_variants(
      all_patterns, pipeline, known_args, pipeline_mode,
      use_1_based_coordinate=known_args.use_1_based_coordinate)
  if known_args.allow_malformed_records:
    variants |= 'DropMalformedRecords' >> filter_variants.FilterVariants()
  sharded_variants = variants | 'ShardVariants' >> beam.Partition(
      shard_variants.ShardVariants(sharding), sharding.get_num_shards())
  variants = []
  for i in range(num_shards):
    suffix = sharding.get_output_table_suffix(i)
    # Convert tuples to list.
    variants.append(sharded_variants[i])
    if variant_merger:
      variants[i] |= (
          'MergeVariants' + suffix >>
          merge_variants.MergeVariants(variant_merger))
    variants[i] |= (
        'ProcessVariants' + suffix >>
        beam.Map(processed_variant_factory.create_processed_variant).
        with_output_types(processed_variant.ProcessedVariant))
    _ = (variants[i]
         | 'VariantToAvro' + suffix >>
         variant_to_avro.VariantToAvroFiles(
             avro_root_path + suffix,
             schema,
             allow_incompatible_records=(
                 known_args.allow_incompatible_records),
             omit_empty_sample_calls=known_args.omit_empty_sample_calls,
             null_numeric_value_replacement=(
                 known_args.null_numeric_value_replacement),
             include_call_name=known_args.include_call_name))
  result = pipeline.run()
  try:
    state = result.wait_until_finish()
    if state != beam.runners.runner.PipelineState.DONE:
      logging.error(
          'Dataflow pipeline terminated in an unexpected state: %s', state)
      raise AssertionError(
          'Dataflow pipeline terminated in {} state'.format(state))
  except Exception as e:
    logging.error('Dataflow pipeline failed.')
    raise e
  else:
    logging.info('Dataflow pipeline finished successfully.')
    metrics_util.log_all_counters(result)

  # After the pipeline is done, create output tables and load the AVRO files
  # into them.
  schema_file = _write_schema_to_temp_file(schema, avro_root_path)
  suffixes = []
  try:
    for i in range(num_shards):
      suffixes.append(sharding.get_output_table_suffix(i))
      partition_range_end = sharding.get_output_table_partition_range_end(i)
      if not known_args.append:
        table_name = bigquery_util.compose_table_name(
            known_args.output_table, suffixes[i])
        partitioning.create_bq_table(
            table_name, schema_file,
            bigquery_util.ColumnKeyConstants.START_POSITION,
            partition_range_end)
        _record_newly_created_table(table_name)
        logging.info('Integer range partitioned table %s was created.',
                     table_name)
    if not known_args.append:
      _record_newly_created_table(
          sample_info_table_schema_generator.create_sample_info_table(
              known_args.output_table))
    suffixes.append(
        sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX)
    load_avro = avro_util.LoadAvro(
        avro_root_path, known_args.output_table, suffixes, False)
    not_empty_variant_suffixes = load_avro.start_loading()
    logging.info('The following tables were loaded with at least 1 row:')
    for suffix in not_empty_variant_suffixes:
      logging.info(
          bigquery_util.compose_table_name(known_args.output_table, suffix))
    # Remove the sample_info table from both lists to avoid duplicating it
    # when the --sample_lookup_optimized_output_table flag is set.
    suffixes.remove(
        sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX)
    if (sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX in
        not_empty_variant_suffixes):
      not_empty_variant_suffixes.remove(
          sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX)
  except Exception as e:
    logging.error(
        'Something unexpected happened during the loading of AVRO files '
        'to BigQuery: %s', str(e))
    logging.info(
        'Since the write to BigQuery stage failed, we did not delete the '
        'AVRO files in your GCS bucket. You can manually import them to '
        'BigQuery. To avoid extra storage charges, delete them if you do '
        'not need them; the AVRO files are located at: %s', avro_root_path)
    raise e
  else:
    logging.warning('All AVRO files were successfully loaded to BigQuery.')
    if known_args.keep_intermediate_avro_files:
      logging.info(
          'Since the "--keep_intermediate_avro_files" flag is set, the '
          'AVRO files are kept and stored at: %s', avro_root_path)
    else:
      if bigquery_util.delete_gcs_files(avro_root_path) != 0:
        logging.error(
            'Deletion of intermediate AVRO files located at "%s" has '
            'failed.', avro_root_path)

  if known_args.sample_lookup_optimized_output_table:
    flatten_call_column = partitioning.FlattenCallColumn(
        known_args.output_table, not_empty_variant_suffixes,
        known_args.append)
    try:
      flatten_schema_file = tempfile.mkstemp(
          suffix=_BQ_SCHEMA_FILE_SUFFIX)[1]
      if not flatten_call_column.get_flatten_table_schema(
          flatten_schema_file):
        raise ValueError('Failed to extract the schema of the flattened '
                         'table.')
      # Create the output flattened tables if needed.
      if not known_args.append:
        # Create all sample-optimized tables, including those that will be
        # empty.
        for suffix in suffixes:
          output_table_id = bigquery_util.compose_table_name(
              known_args.sample_lookup_optimized_output_table, suffix)
          partitioning.create_bq_table(
              output_table_id, flatten_schema_file,
              bigquery_util.ColumnKeyConstants.CALLS_SAMPLE_ID,
              partitioning.MAX_RANGE_END)
          _record_newly_created_table(output_table_id)
          logging.info('Sample lookup optimized table %s was created.',
                       output_table_id)
      # Copy from the variant lookup tables to the flattened sample lookup
      # tables. Note: uses WRITE_TRUNCATE to overwrite the existing tables
      # (issue #607).
      flatten_call_column.copy_to_flatten_table(
          known_args.sample_lookup_optimized_output_table)
      logging.info('All sample lookup optimized tables are fully loaded.')
    except Exception as e:
      logging.error(
          'Something unexpected happened while loading rows to the sample '
          'optimized tables: %s', str(e))
      raise e
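# A conventional entry point for running this module directly (a minimal
# sketch; argument parsing is already handled inside run()):
if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()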
def __init__(
    self,
    output_table,  # type: str
    header_fields,  # type: vcf_header_io.VcfHeader
    variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
    proc_var_factory=None,  # type: processed_variant.ProcessedVariantFactory
    # TODO(bashir2): proc_var_factory is a required argument and if `None` is
    # supplied this will fail in schema generation.
    append=False,  # type: bool
    update_schema_on_append=False,  # type: bool
    allow_incompatible_records=False,  # type: bool
    omit_empty_sample_calls=False,  # type: bool
    num_bigquery_write_shards=1,  # type: int
    null_numeric_value_replacement=None  # type: int
    ):
  # type: (...) -> None
  """Initializes the transform.

  Args:
    output_table: Full path of the output BigQuery table.
    header_fields: Representative header fields for all variants. This is
      needed for dynamically generating the schema.
    variant_merger: The strategy used for merging variants (if any). Some
      strategies may change the schema, which is why this may be needed here.
    proc_var_factory: The factory class that knows how to convert Variant
      instances to ProcessedVariant. As a side effect it also knows how to
      modify the BigQuery schema based on the ProcessedVariants that it
      generates. The latter functionality is what is needed here.
    append: If true, existing records in output_table will not be
      overwritten. New records will be appended to those that already exist.
    update_schema_on_append: If true, the BigQuery schema will be updated by
      combining the existing schema and the new schema if they are
      compatible.
    allow_incompatible_records: If true, field values are cast to match the
      BigQuery schema if there is a mismatch.
    omit_empty_sample_calls: If true, samples that don't have a given call
      will be omitted.
    num_bigquery_write_shards: If > 1, we will limit the number of sources
      which are used for writing to the output BigQuery table.
    null_numeric_value_replacement: The value to use instead of null for
      numeric (float/int/long) lists. For instance, [0, None, 1] will become
      [0, `null_numeric_value_replacement`, 1]. If not set, the value will be
      set to bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT.
  """
  self._output_table = output_table
  self._header_fields = header_fields
  self._variant_merger = variant_merger
  self._proc_var_factory = proc_var_factory
  self._append = append
  self._schema = schema_converter.generate_schema_from_header_fields(
      self._header_fields, self._proc_var_factory, self._variant_merger)
  # The resolver makes an extra effort to resolve conflicts when the
  # allow_incompatible_records flag is set.
  self._bigquery_row_generator = (
      bigquery_row_generator.VariantCallRowGenerator(
          bigquery_schema_descriptor.SchemaDescriptor(self._schema),
          vcf_field_conflict_resolver.FieldConflictResolver(
              resolve_always=allow_incompatible_records),
          null_numeric_value_replacement))
  self._allow_incompatible_records = allow_incompatible_records
  self._omit_empty_sample_calls = omit_empty_sample_calls
  self._num_bigquery_write_shards = num_bigquery_write_shards
  if update_schema_on_append:
    bigquery_util.update_bigquery_schema_on_append(
        self._schema.fields, self._output_table)
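# Illustrative application of this transform (a sketch; the PCollection name
# and the enclosing class name VariantToBigQuery are assumptions):
#
#   _ = (variants
#        | 'VariantToBigQuery' >> VariantToBigQuery(
#            output_table, header_fields, variant_merger, proc_var_factory,
#            allow_incompatible_records=True))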