def test_config_failed_missing_shard_name(self):
  """Configs lacking a usable table_name_suffix must raise ValueError."""
  tempdir = temp_dir.TempDir()
  # Case 1: the table_name_suffix field is absent entirely.
  config_no_suffix = [
      '- output_table:',
      ' regions:',
      ' - "chr1:0-1,000,000"',
      ' partition_range_end: 999999999',
  ]
  with self.assertRaisesRegex(
      ValueError,
      'Wrong sharding config file, table_name_suffix field missing.'):
    _ = variant_sharding.VariantSharding(
        tempdir.create_temp_file(
            suffix='.yaml', lines='\n'.join(config_no_suffix)))
  # Case 2: the field is present but contains only whitespace.
  config_blank_suffix = [
      '- output_table:',
      ' table_name_suffix: " "',
      ' regions:',
      ' - "chr1:0-1,000,000"',
      ' partition_range_end: 999999999',
  ]
  with self.assertRaisesRegex(
      ValueError,
      'Wrong sharding config file, table_name_suffix can not be empty.'):
    _ = variant_sharding.VariantSharding(
        tempdir.create_temp_file(
            suffix='.yaml', lines='\n'.join(config_blank_suffix)))
def test_config_failed_duplicate_table_name(self):
  """A repeated table_name_suffix anywhere in the config must be rejected."""
  tempdir = temp_dir.TempDir()
  config_repeated_suffix = [
      '- output_table:',
      ' table_name_suffix: "duplicate_name"',
      ' regions:',
      ' - "chr1:0-1,000,000"',
      ' partition_range_end: 999999999',
      '- output_table:',
      ' table_name_suffix: "all_remaining"',
      ' regions:',
      ' - "residual"',
      ' partition_range_end: 999999999',
      '- output_table:',
      ' table_name_suffix: "duplicate_name"',
      ' regions:',
      ' - "chr1:1,000,000-2,000,000"',
      ' partition_range_end: 999999999',
  ]
  with self.assertRaisesRegex(
      ValueError,
      'Wrong sharding config file, table name suffixes must be unique*'):
    _ = variant_sharding.VariantSharding(
        tempdir.create_temp_file(
            suffix='.yaml', lines='\n'.join(config_repeated_suffix)))
def test_config_failed_duplicate_residual_shard(self):
  """More than one 'residual' region in a config must raise ValueError."""
  tempdir = temp_dir.TempDir()
  config_two_residuals = [
      '- output_table:',
      ' table_name_suffix: "all_remaining"',
      ' regions:',
      ' - "residual"',
      ' partition_range_end: 999999999',
      '- output_table:',
      ' table_name_suffix: "chr01"',
      ' regions:',
      ' - "chr1"',
      ' partition_range_end: 999999999',
      '- output_table:',
      ' table_name_suffix: "all_remaining_2"',
      ' regions:',
      ' - "residual"',
      ' partition_range_end: 999999999',
  ]
  with self.assertRaisesRegex(
      ValueError,
      'Wrong sharding config file, there can be only one residual output*'):
    _ = variant_sharding.VariantSharding(
        tempdir.create_temp_file(
            suffix='.yaml', lines='\n'.join(config_two_residuals)))
def test_config_case_sensitive(self):
  """Reference-name matching is case sensitive."""
  sharder = variant_sharding.VariantSharding(
      'gcp_variant_transforms/testing/data/sharding_configs/'
      'residual_at_end.yaml')
  total_shards = sharder.get_num_shards()
  self.assertEqual(total_shards, 8)
  for shard_index in range(total_shards):
    self.assertTrue(sharder.should_keep_shard(shard_index))
  # Only the exact-case name matches 'chr1:0-1,000,000' (shard 0); any
  # other casing falls through to the residual shard (index 7).
  self.assertEqual(sharder.get_shard_index('chr1', 0), 0)
  for mixed_case_name in ('Chr1', 'CHr1', 'CHR1'):
    self.assertEqual(sharder.get_shard_index(mixed_case_name, 0), 7)
def test_config_non_existent_shard_name(self):
  """Out-of-range shard indices must raise ValueError."""
  sharder = variant_sharding.VariantSharding(
      'gcp_variant_transforms/testing/data/sharding_configs/'
      'residual_at_end.yaml')
  self.assertEqual(sharder.get_num_shards(), 8)
  # Valid indices are [0, 8); probe one value on each side.
  for bad_index in (-1, 8):
    with self.assertRaisesRegex(
        ValueError,
        'Given shard index {} is outside of expected range*'.format(
            bad_index)):
      sharder.get_output_table_suffix(bad_index)
def test_config_boundaries(self):
  """Region boundaries are half-open: start inclusive, end exclusive."""
  sharder = variant_sharding.VariantSharding(
      'gcp_variant_transforms/testing/data/sharding_configs/'
      'residual_at_end.yaml')
  self.assertEqual(sharder.get_num_shards(), 8)
  for i in range(sharder.get_num_shards()):
    self.assertTrue(sharder.should_keep_shard(i))
  # 'chr1:0-1,000,000'
  self.assertEqual(sharder.get_shard_index('chr1', 0), 0)
  self.assertEqual(sharder.get_shard_index('chr1', 999999), 0)
  # 'chr1:1,000,000-2,000,000'
  self.assertEqual(sharder.get_shard_index('chr1', 1000000), 1)
  self.assertEqual(sharder.get_shard_index('chr1', 1999999), 1)
  # 'chr1:2,000,000-999,999,999'
  self.assertEqual(sharder.get_shard_index('chr1', 2000000), 2)
  self.assertEqual(sharder.get_shard_index('chr1', 999999998), 2)
  # Position equal to the region end is excluded -> residual shard (7).
  self.assertEqual(sharder.get_shard_index('chr1', 999999999), 7)
  # 'chr2' OR 'chr2_alternate_name1' OR 'chr2_ALteRNate_NAME2' OR '2'.
  self.assertEqual(sharder.get_shard_index('chr2', 0), 3)
  self.assertEqual(sharder.get_shard_index('chr2', 999999999000), 3)
  self.assertEqual(sharder.get_shard_index('chr2_alternate_name1', 0), 3)
  self.assertEqual(
      sharder.get_shard_index('chr2_alternate_name1', 999999999000), 3)
  self.assertEqual(sharder.get_shard_index('chr2_ALteRNate_NAME2', 0), 3)
  self.assertEqual(sharder.get_shard_index('2', 0), 3)
  self.assertEqual(sharder.get_shard_index('2', 999999999000), 3)
  # Matching is case sensitive -> other casings go to residual shard (7).
  self.assertEqual(sharder.get_shard_index('CHR2', 0), 7)
  self.assertEqual(sharder.get_shard_index('chr2_alternate_name2', 0), 7)
  self.assertEqual(sharder.get_shard_index('CHR2_ALTERNATE_NAME2', 0), 7)
  # 'chr4' OR 'chr5' OR 'chr6:1,000,000-2,000,000'
  self.assertEqual(sharder.get_shard_index('chr4', 0), 4)
  self.assertEqual(sharder.get_shard_index('chr4', 999999999000), 4)
  self.assertEqual(sharder.get_shard_index('chr5', 0), 4)
  self.assertEqual(sharder.get_shard_index('chr5', 999999999000), 4)
  self.assertEqual(sharder.get_shard_index('chr6', 1000000), 4)
  self.assertEqual(sharder.get_shard_index('chr6', 2000000 - 1), 4)
  # chr6 positions outside [1,000,000, 2,000,000) -> residual shard (7).
  self.assertEqual(sharder.get_shard_index('chr6', 0), 7)
  self.assertEqual(sharder.get_shard_index('chr6', 999999), 7)
  self.assertEqual(sharder.get_shard_index('chr6', 2000000), 7)
  # '3:0-500,000'
  self.assertEqual(sharder.get_shard_index('3', 0), 5)
  self.assertEqual(sharder.get_shard_index('3', 499999), 5)
  # '3:500,000-1,000,000'
  self.assertEqual(sharder.get_shard_index('3', 500000), 6)
  self.assertEqual(sharder.get_shard_index('3', 999999), 6)
  # Past the last configured region of '3' -> residual shard (7).
  self.assertEqual(sharder.get_shard_index('3', 1000000), 7)
def _validate_output_tables(self, client, output_table_base_name,
                            sharding_config_path, append, is_main_output):
  """Validates that all sharded output tables can be written as requested.

  Builds the full list of output table names implied by the sharding config
  (plus the sample-info table when this is the main output), then checks
  each against BigQuery: with append=True every table must already exist;
  with append=False none of them may exist yet.

  Raises:
    ValueError: if the base name or any suffix contains the reserved
      separator string, or if a table's existence conflicts with the
      requested append/overwrite mode.
  """
  # The separator is reserved for composing "<base><sep><suffix>" names,
  # so it may not appear inside the user-supplied base name.
  if (output_table_base_name !=
      bigquery_util.get_table_base_name(output_table_base_name)):
    raise ValueError(
        ('Output table cannot contain "{}". we reserve this '
         'string to mark sharded output tables.').format(
             bigquery_util.TABLE_SUFFIX_SEPARATOR))
  project_id, dataset_id, table_id = bigquery_util.parse_table_reference(
      output_table_base_name)
  # Fail fast if the target dataset itself is missing.
  bigquery_util.raise_error_if_dataset_not_exists(
      client, project_id, dataset_id)
  all_output_tables = []
  if is_main_output:
    all_output_tables.append(
        bigquery_util.compose_table_name(table_id, SAMPLE_INFO_TABLE_SUFFIX))
  sharding = variant_sharding.VariantSharding(sharding_config_path)
  num_shards = sharding.get_num_shards()
  # In case there is no residual in config we will ignore the last shard.
  if not sharding.should_keep_shard(sharding.get_residual_index()):
    num_shards -= 1
  for i in range(num_shards):
    table_suffix = sharding.get_output_table_suffix(i)
    # Suffixes are also forbidden from containing the reserved separator.
    if table_suffix != bigquery_util.get_table_base_name(table_suffix):
      raise ValueError(
          ('Table suffix cannot contain "{}" we reserve this '
           'string to mark sharded output tables.').format(
               bigquery_util.TABLE_SUFFIX_SEPARATOR))
    all_output_tables.append(
        bigquery_util.compose_table_name(table_id, table_suffix))
  # Existence requirements are symmetric: append needs every table present,
  # overwrite needs every table absent.
  for output_table in all_output_tables:
    if append:
      if not bigquery_util.table_exist(client, project_id, dataset_id,
                                       output_table):
        raise ValueError(
            'Table {}:{}.{} does not exist, cannot append to it.'.
            format(project_id, dataset_id, output_table))
    else:
      if bigquery_util.table_exist(client, project_id, dataset_id,
                                   output_table):
        raise ValueError((
            'Table {}:{}.{} already exists, cannot overwrite it. Please '
            'set `--append True` if you want to append to it.'
        ).format(project_id, dataset_id, output_table))
def test_config_get_output_table_suffix(self):
  """Suffixes are returned in config order, one per shard."""
  sharder = variant_sharding.VariantSharding(
      'gcp_variant_transforms/testing/data/sharding_configs/'
      'residual_at_end.yaml')
  self.assertEqual(sharder.get_num_shards(), 8)
  for shard_index in range(sharder.get_num_shards()):
    self.assertTrue(sharder.should_keep_shard(shard_index))
  expected_suffixes = [
      'chr01_part1', 'chr01_part2', 'chr01_part3', 'chrom02',
      'chrom04_05_part_06', 'chr3_01', 'chr3_02', 'all_remaining',
  ]
  for shard_index, suffix in enumerate(expected_suffixes):
    self.assertEqual(sharder.get_output_table_suffix(shard_index), suffix)
def test_shard_variants(self):
  """Partitioning the flattened variants reproduces the expected shards."""
  shard_to_variants = self._get_expected_variant_shards()
  # Flatten all expected shards into one input list.
  all_variants = []
  for variant_list in shard_to_variants.values():
    all_variants.extend(variant_list)
  sharding = variant_sharding.VariantSharding(
      'gcp_variant_transforms/data/sharding_configs/'
      'homo_sapiens_default.yaml')
  num_shards = sharding.get_num_shards()
  pipeline = TestPipeline()
  shards = (
      pipeline
      | Create(all_variants, reshuffle=False)
      | 'ShardVariants' >> beam.Partition(
          shard_variants.ShardVariants(sharding), num_shards))
  for shard_index in range(num_shards):
    assert_that(
        shards[shard_index],
        equal_to(shard_to_variants.get(shard_index, [])),
        label=str(shard_index))
  pipeline.run()
def test_config_residual_shard_absent(self):
  """With no residual region configured, a dummy residual shard is appended
  at the end (index 4 here) and flagged as not-to-keep."""
  sharder = variant_sharding.VariantSharding(
      'gcp_variant_transforms/testing/data/sharding_configs/'
      'residual_missing.yaml')
  self.assertEqual(sharder.get_num_shards(), 5)
  # All shards except the last one (dummy residual) should be kept.
  for i in range(sharder.get_num_shards() - 1):
    self.assertTrue(sharder.should_keep_shard(i))
  self.assertFalse(sharder.should_keep_shard(5 - 1))
  # 'chr1:0-1,000,000'
  self.assertEqual(sharder.get_shard_index('chr1', 0), 0)
  self.assertEqual(sharder.get_shard_index('chr1', 999999), 0)
  # 'chr1:1,000,000-2,000,000'
  self.assertEqual(sharder.get_shard_index('chr1', 1000000), 1)
  self.assertEqual(sharder.get_shard_index('chr1', 1999999), 1)
  # 'chr2' OR 'ch2' OR 'c2' OR '2'
  self.assertEqual(sharder.get_shard_index('chr2', 0), 2)
  self.assertEqual(sharder.get_shard_index('chr2', 999999999000), 2)
  # '3:500,000-1,000,000'
  self.assertEqual(sharder.get_shard_index('3', 500000), 3)
  self.assertEqual(sharder.get_shard_index('3', 999999), 3)
  # All the followings are assigned to residual shard.
  self.assertEqual(sharder.get_shard_index('chr1', 2000000), 4)
  self.assertEqual(sharder.get_shard_index('chr1', 999999999), 4)
  self.assertEqual(sharder.get_shard_index('cHr1', 0), 4)
  self.assertEqual(sharder.get_shard_index('CHR1', 0), 4)
  self.assertEqual(sharder.get_shard_index('3', 0), 4)
  self.assertEqual(sharder.get_shard_index('3', 499999), 4)
  self.assertEqual(sharder.get_shard_index('3', 1000000), 4)
  self.assertEqual(sharder.get_shard_index('ch2', 0), 4)
  self.assertEqual(sharder.get_shard_index('c2', 0), 4)
  self.assertEqual(sharder.get_shard_index('2', 0), 4)
  self.assertEqual(sharder.get_shard_index('c4', 0), 4)
  self.assertEqual(sharder.get_shard_index('cr5', 0), 4)
  self.assertEqual(sharder.get_shard_index('chr6', 0), 4)
def test_config_residual_shard_in_middle(self):
  """A residual region need not be last; its configured position (index 1
  here) receives everything that no other region matches."""
  sharder = variant_sharding.VariantSharding(
      'gcp_variant_transforms/testing/data/sharding_configs/'
      'residual_in_middle.yaml')
  self.assertEqual(sharder.get_num_shards(), 5)
  for i in range(sharder.get_num_shards()):
    self.assertTrue(sharder.should_keep_shard(i))
  # 'chr1:0-1,000,000'
  self.assertEqual(sharder.get_shard_index('chr1', 0), 0)
  self.assertEqual(sharder.get_shard_index('chr1', 999999), 0)
  # 'chr1:1,000,000-2,000,000'
  self.assertEqual(sharder.get_shard_index('chr1', 1000000), 2)
  self.assertEqual(sharder.get_shard_index('chr1', 1999999), 2)
  # 'chr2' OR 'ch2' OR 'c2' OR '2'
  self.assertEqual(sharder.get_shard_index('chr2', 0), 3)
  self.assertEqual(sharder.get_shard_index('chr2', 999999999000), 3)
  # '3:500,000-1,000,000'
  self.assertEqual(sharder.get_shard_index('3', 500000), 4)
  self.assertEqual(sharder.get_shard_index('3', 999999), 4)
  # All the followings are assigned to residual shard.
  self.assertEqual(sharder.get_shard_index('chr1', 2000000), 1)
  self.assertEqual(sharder.get_shard_index('chr1', 999999999), 1)
  self.assertEqual(sharder.get_shard_index('cHr1', 0), 1)
  self.assertEqual(sharder.get_shard_index('CHR1', 0), 1)
  self.assertEqual(sharder.get_shard_index('3', 0), 1)
  self.assertEqual(sharder.get_shard_index('3', 499999), 1)
  self.assertEqual(sharder.get_shard_index('3', 1000000), 1)
  self.assertEqual(sharder.get_shard_index('ch2', 0), 1)
  self.assertEqual(sharder.get_shard_index('c2', 0), 1)
  self.assertEqual(sharder.get_shard_index('2', 0), 1)
  self.assertEqual(sharder.get_shard_index('c4', 0), 1)
  self.assertEqual(sharder.get_shard_index('cr5', 0), 1)
  self.assertEqual(sharder.get_shard_index('chr6', 0), 1)
def test_config_failed_missing_region(self):
  """An output_table whose regions list has no entries must be rejected."""
  tempdir = temp_dir.TempDir()
  config_empty_regions = [
      '- output_table:',
      ' table_name_suffix: "chr01_part1"',
      ' regions:',
      ' - "chr1:0-1,000,000"',
      ' partition_range_end: 999999999',
      '- output_table:',
      ' table_name_suffix: "all_remaining"',
      ' regions:',
      ' - "residual"',
      ' partition_range_end: 999999999',
      '- output_table:',
      ' table_name_suffix: "missing_region"',
      ' regions:',
      ' partition_range_end: 999999999',
  ]
  with self.assertRaisesRegex(
      ValueError, 'Wrong sharding config file, regions field missing.'):
    _ = variant_sharding.VariantSharding(
        tempdir.create_temp_file(
            suffix='.yaml', lines='\n'.join(config_empty_regions)))
def test_config_get_partition_range_end(self):
  """partition_range_end values are returned in config order."""
  sharder = variant_sharding.VariantSharding(
      'gcp_variant_transforms/testing/data/sharding_configs/'
      'residual_at_end.yaml')
  self.assertEqual(sharder.get_num_shards(), 8)
  for shard_index in range(sharder.get_num_shards()):
    self.assertTrue(sharder.should_keep_shard(shard_index))
  expected_range_ends = [
      1000000, 2000000, 249240615, 243189284, 191044274, 500000, 1000000,
      249240615,
  ]
  for shard_index, range_end in enumerate(expected_range_ends):
    self.assertEqual(
        sharder.get_output_table_partition_range_end(shard_index), range_end)
def test_config_failed_wrong_fields(self):
  """Each malformed field value raises ValueError with a specific message."""
  # Case: table_name_suffix is only whitespace.
  tempdir = temp_dir.TempDir()
  empty_suffix = [
      '- output_table:',
      ' table_name_suffix: " "',
      ' regions:',
      ' - "chr1"',
      ' - "1"',
      ' partition_range_end: 249240615'
  ]
  with self.assertRaisesRegex(
      ValueError,
      'Wrong sharding config file, table_name_suffix can not be empty.'):
    _ = variant_sharding.VariantSharding(
        tempdir.create_temp_file(suffix='.yaml',
                                 lines='\n'.join(empty_suffix)))
  # Case: suffix contains a character BigQuery table names do not allow.
  tempdir = temp_dir.TempDir()
  wrong_table_name = [
      '- output_table:',
      ' table_name_suffix: "chr#"',
      ' regions:',
      ' - "chr1"',
      ' - "1"',
      ' partition_range_end: 249240615'
  ]
  with self.assertRaisesRegex(
      ValueError,
      'Wrong sharding config file, BigQuery table name can only contain *'):
    _ = variant_sharding.VariantSharding(
        tempdir.create_temp_file(suffix='.yaml',
                                 lines='\n'.join(wrong_table_name)))
  # Case: the same suffix is used by two output tables.
  tempdir = temp_dir.TempDir()
  duplicate_suffix = [
      '- output_table:',
      ' table_name_suffix: "chr1"',
      ' regions:',
      ' - "chr1"',
      ' partition_range_end: 249240615',
      '- output_table:',
      ' table_name_suffix: "chr1"',
      ' regions:',
      ' - "chr2"',
      ' partition_range_end: 249240615'
  ]
  with self.assertRaisesRegex(
      ValueError,
      'Wrong sharding config file, table name suffixes must be unique*'):
    _ = variant_sharding.VariantSharding(
        tempdir.create_temp_file(suffix='.yaml',
                                 lines='\n'.join(duplicate_suffix)))
  # Case: a region entry is only whitespace.
  tempdir = temp_dir.TempDir()
  empty_chrom_value = [
      '- output_table:',
      ' table_name_suffix: "chr1"',
      ' regions:',
      ' - "chr1"',
      ' - " "',
      ' partition_range_end: 249240615'
  ]
  with self.assertRaisesRegex(
      ValueError,
      'Wrong sharding config file, reference_name can not be empty string: '):
    _ = variant_sharding.VariantSharding(
        tempdir.create_temp_file(suffix='.yaml',
                                 lines='\n'.join(empty_chrom_value)))
  # Case: the same region listed twice within one output table.
  tempdir = temp_dir.TempDir()
  duplicate_chrom_value1 = [
      '- output_table:',
      ' table_name_suffix: "chr1"',
      ' regions:',
      ' - "dup_value"',
      ' - "dup_value"',
      ' partition_range_end: 249240615'
  ]
  with self.assertRaisesRegex(
      ValueError,
      'Wrong sharding config file, regions must be unique in config file: *'):
    _ = variant_sharding.VariantSharding(
        tempdir.create_temp_file(
            suffix='.yaml', lines='\n'.join(duplicate_chrom_value1)))
  # Case: the same region listed in two different output tables.
  duplicate_chrom_value2 = [
      '- output_table:',
      ' table_name_suffix: "chr1"',
      ' regions:',
      ' - "dup_value"',
      ' partition_range_end: 249240615',
      '- output_table:',
      ' table_name_suffix: "chr2"',
      ' regions:',
      ' - "dup_value"',
      ' partition_range_end: 249240615'
  ]
  with self.assertRaisesRegex(
      ValueError,
      'Wrong sharding config file, regions must be unique in config file: *'):
    _ = variant_sharding.VariantSharding(
        tempdir.create_temp_file(
            suffix='.yaml', lines='\n'.join(duplicate_chrom_value2)))
  # Case: two output tables both claim the residual region.
  duplicate_residual = [
      '- output_table:',
      ' table_name_suffix: "residual1"',
      ' regions:',
      ' - "residual"',
      ' partition_range_end: 249240615',
      '- output_table:',
      ' table_name_suffix: "residual2"',
      ' regions:',
      ' - "residual"',
      ' partition_range_end: 249240615'
  ]
  with self.assertRaisesRegex(
      ValueError,
      'Wrong sharding config file, there can be only one residual output *'):
    _ = variant_sharding.VariantSharding(
        tempdir.create_temp_file(suffix='.yaml',
                                 lines='\n'.join(duplicate_residual)))
  # Case: partition_range_end is not an integer.
  not_int_partition_range_end = [
      '- output_table:',
      ' table_name_suffix: "chr1"',
      ' regions:',
      ' - "chr1"',
      ' - "1"',
      ' partition_range_end: "not int"'
  ]
  with self.assertRaisesRegex(
      ValueError,
      'Wrong sharding config file, each output table needs an integer for *'):
    _ = variant_sharding.VariantSharding(
        tempdir.create_temp_file(
            suffix='.yaml', lines='\n'.join(not_int_partition_range_end)))
  # Case: partition_range_end is negative.
  not_pos_partition_range_end = [
      '- output_table:',
      ' table_name_suffix: "chr1"',
      ' regions:',
      ' - "chr1"',
      ' - "1"',
      ' partition_range_end: -10'
  ]
  with self.assertRaisesRegex(
      ValueError,
      'Wrong sharding config file, each output table needs an integer for *'):
    _ = variant_sharding.VariantSharding(
        tempdir.create_temp_file(
            suffix='.yaml', lines='\n'.join(not_pos_partition_range_end)))
def test_config_failed_missing_fields(self):
  """Each missing required field raises ValueError with a specific message."""
  tempdir = temp_dir.TempDir()
  # Case: the output_table key itself is absent.
  missing_output_table = [
      '- missing__output_table:',
      ' table_name_suffix: "chr1"',
      ' regions:',
      ' - "chr1"',
      ' - "1"',
      ' partition_range_end: 249240615'
  ]
  # NOTE(review): 'sharing' (sic, not 'sharding') presumably mirrors a typo
  # in the implementation's error message -- confirm and fix both together.
  with self.assertRaisesRegex(
      ValueError,
      'Wrong sharing config file, output_table field missing.'):
    _ = variant_sharding.VariantSharding(
        tempdir.create_temp_file(
            suffix='.yaml', lines='\n'.join(missing_output_table)))
  # Case: table_name_suffix is absent.
  missing_table_name_suffix = [
      '- output_table:',
      ' regions:',
      ' - "chr1"',
      ' - "1"',
      ' partition_range_end: 249240615'
  ]
  with self.assertRaisesRegex(
      ValueError,
      'Wrong sharding config file, table_name_suffix field missing.'):
    _ = variant_sharding.VariantSharding(
        tempdir.create_temp_file(
            suffix='.yaml', lines='\n'.join(missing_table_name_suffix)))
  # Case: the regions key is absent entirely.
  missing_chrom_values = [
      '- output_table:',
      ' table_name_suffix: "chr1"',
      ' partition_range_end: 249240615'
  ]
  with self.assertRaisesRegex(
      ValueError, 'Wrong sharding config file, regions field missing.'):
    _ = variant_sharding.VariantSharding(
        tempdir.create_temp_file(
            suffix='.yaml', lines='\n'.join(missing_chrom_values)))
  # Case: the regions key is present but has no values.
  missing_filters = [
      '- output_table:',
      ' table_name_suffix: "chr1"',
      ' regions:',
      ' partition_range_end: 249240615'
  ]
  with self.assertRaisesRegex(
      ValueError, 'Wrong sharding config file, regions field missing.'):
    _ = variant_sharding.VariantSharding(
        tempdir.create_temp_file(suffix='.yaml',
                                 lines='\n'.join(missing_filters)))
  # Case: partition_range_end is absent.
  missing_partition_range_end = [
      '- output_table:',
      ' table_name_suffix: "chr1"',
      ' regions:',
      ' - "chr1"',
      ' - "1"',
  ]
  with self.assertRaisesRegex(
      ValueError,
      'Wrong sharding config file, partition_range_end field missing.'):
    _ = variant_sharding.VariantSharding(
        tempdir.create_temp_file(
            suffix='.yaml', lines='\n'.join(missing_partition_range_end)))
def run(argv=None):
  # type: (List[str]) -> None
  """Runs VCF to BigQuery pipeline.

  End-to-end flow:
    1. Parse flags, optionally run the annotation pipeline, and merge VCF
       headers to derive the BigQuery schema.
    2. Run a Beam pipeline that shards variants per the sharding config and
       writes one set of AVRO files per shard.
    3. After the pipeline finishes, create the output tables (unless
       appending) and load the AVRO files into BigQuery; optionally build
       sample-lookup-optimized tables.
  """
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = pipeline_common.parse_args(
      argv, _COMMAND_LINE_OPTIONS)
  if known_args.auto_flags_experiment:
    _get_input_dimensions(known_args, pipeline_args)
  annotated_vcf_pattern = _run_annotation_pipeline(known_args, pipeline_args)
  # If annotation ran, its output replaces the user-supplied input patterns.
  all_patterns = ([annotated_vcf_pattern] if annotated_vcf_pattern
                  else known_args.all_patterns)
  variant_merger = _get_variant_merge_strategy(known_args)
  pipeline_mode = pipeline_common.get_pipeline_mode(all_patterns)
  beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
  avro_root_path = _get_avro_root_path(beam_pipeline_options)
  # Starts a pipeline to merge VCF headers in beam if the total files that
  # match the input pattern exceeds _SMALL_DATA_THRESHOLD
  _merge_headers(known_args, pipeline_args, pipeline_mode, avro_root_path,
                 annotated_vcf_pattern)
  # Retrieve merged headers prior to launching the pipeline. This is needed
  # since the BigQuery schema cannot yet be dynamically created based on
  # input. See https://issues.apache.org/jira/browse/BEAM-2801.
  header_fields = vcf_header_parser.get_vcf_headers(
      known_args.representative_header_file)
  counter_factory = metrics_util.CounterFactory()
  processed_variant_factory = processed_variant.ProcessedVariantFactory(
      header_fields, known_args.split_alternate_allele_info_fields,
      known_args.allow_malformed_records, known_args.annotation_fields,
      known_args.use_allele_num, known_args.minimal_vep_alt_matching,
      known_args.infer_annotation_types, counter_factory)
  schema = schema_converter.generate_schema_from_header_fields(
      header_fields, processed_variant_factory, variant_merger,
      known_args.use_1_based_coordinate, known_args.include_call_name)
  sharding = variant_sharding.VariantSharding(
      known_args.sharding_config_path)
  # If the config has no residual shard to keep, skip the trailing dummy
  # residual when iterating output tables below.
  if sharding.should_keep_shard(sharding.get_residual_index()):
    num_shards = sharding.get_num_shards()
  else:
    num_shards = sharding.get_num_shards() - 1
  if known_args.update_schema_on_append:
    for i in range(num_shards):
      table_suffix = sharding.get_output_table_suffix(i)
      table_name = bigquery_util.compose_table_name(
          known_args.output_table, table_suffix)
      bigquery_util.update_bigquery_schema_on_append(
          schema.fields, table_name)
  pipeline = beam.Pipeline(options=beam_pipeline_options)
  variants = _read_variants(
      all_patterns, pipeline, known_args, pipeline_mode,
      use_1_based_coordinate=known_args.use_1_based_coordinate)
  if known_args.allow_malformed_records:
    variants |= 'DropMalformedRecords' >> filter_variants.FilterVariants()
  # Partition always uses the full shard count; when num_shards was reduced
  # above, the unkept residual partition is simply never consumed.
  sharded_variants = variants | 'ShardVariants' >> beam.Partition(
      shard_variants.ShardVariants(sharding), sharding.get_num_shards())
  variants = []
  for i in range(num_shards):
    suffix = sharding.get_output_table_suffix(i)
    # Collect each shard's PCollection so the transforms below can rebind
    # variants[i] in place.
    variants.append(sharded_variants[i])
    if variant_merger:
      variants[i] |= ('MergeVariants' + suffix >>
                      merge_variants.MergeVariants(variant_merger))
    variants[i] |= (
        'ProcessVariants' + suffix >>
        beam.Map(processed_variant_factory.create_processed_variant).\
            with_output_types(processed_variant.ProcessedVariant))
    _ = (variants[i]
         | 'VariantToAvro' + suffix >>
         variant_to_avro.VariantToAvroFiles(
             avro_root_path + suffix,
             schema,
             allow_incompatible_records=known_args.allow_incompatible_records,
             omit_empty_sample_calls=known_args.omit_empty_sample_calls,
             null_numeric_value_replacement=(
                 known_args.null_numeric_value_replacement),
             include_call_name=known_args.include_call_name))
  result = pipeline.run()
  try:
    state = result.wait_until_finish()
    if state != beam.runners.runner.PipelineState.DONE:
      logging.error(
          'Dataflow pipeline terminated in an unexpected state: %s', state)
      raise AssertionError(
          'Dataflow pipeline terminated in {} state'.format(state))
  except Exception as e:
    logging.error('Dataflow pipeline failed.')
    raise e
  else:
    logging.info('Dataflow pipeline finished successfully.')
    metrics_util.log_all_counters(result)
  # After pipeline is done, create output tables and load AVRO files into
  # them.
  schema_file = _write_schema_to_temp_file(schema, avro_root_path)
  suffixes = []
  try:
    for i in range(num_shards):
      suffixes.append(sharding.get_output_table_suffix(i))
      partition_range_end = sharding.get_output_table_partition_range_end(i)
      if not known_args.append:
        table_name = bigquery_util.compose_table_name(
            known_args.output_table, suffixes[i])
        partitioning.create_bq_table(
            table_name, schema_file,
            bigquery_util.ColumnKeyConstants.START_POSITION,
            partition_range_end)
        _record_newly_created_table(table_name)
        logging.info('Integer range partitioned table %s was created.',
                     table_name)
    if not known_args.append:
      _record_newly_created_table(
          sample_info_table_schema_generator.create_sample_info_table(
              known_args.output_table))
    suffixes.append(
        sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX)
    load_avro = avro_util.LoadAvro(
        avro_root_path, known_args.output_table, suffixes, False)
    not_empty_variant_suffixes = load_avro.start_loading()
    logging.info('Following tables were loaded with at least 1 row:')
    for suffix in not_empty_variant_suffixes:
      logging.info(
          bigquery_util.compose_table_name(known_args.output_table, suffix))
    # Remove sample_info table from both lists to avoid duplicating it when
    # --sample_lookup_optimized_output_table flag is set
    suffixes.remove(
        sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX)
    if sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX in\
        not_empty_variant_suffixes:
      not_empty_variant_suffixes.remove(
          sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX)
  except Exception as e:
    logging.error(
        'Something unexpected happened during the loading of AVRO '
        'files to BigQuery: %s', str(e))
    logging.info(
        'Since the write to BigQuery stage failed, we did not delete '
        'AVRO files in your GCS bucket. You can manually import them '
        'to BigQuery. To avoid extra storage charges, delete them if '
        'you do not need them, AVRO files are located at: %s',
        avro_root_path)
    raise e
  else:
    logging.warning('All AVRO files were successfully loaded to BigQuery.')
    if known_args.keep_intermediate_avro_files:
      logging.info(
          'Since "--keep_intermediate_avro_files" flag is set, the '
          'AVRO files are kept and stored at: %s', avro_root_path)
    else:
      if bigquery_util.delete_gcs_files(avro_root_path) != 0:
        logging.error(
            'Deletion of intermediate AVRO files located at "%s" has '
            'failed.', avro_root_path)
  if known_args.sample_lookup_optimized_output_table:
    flatten_call_column = partitioning.FlattenCallColumn(
        known_args.output_table, not_empty_variant_suffixes,
        known_args.append)
    try:
      flatten_schema_file = tempfile.mkstemp(
          suffix=_BQ_SCHEMA_FILE_SUFFIX)[1]
      if not flatten_call_column.get_flatten_table_schema(
          flatten_schema_file):
        raise ValueError('Failed to extract schema of flatten table')
      # Create output flatten tables if needed
      if not known_args.append:
        # Create all sample optimized tables including those that will be
        # empty.
        for suffix in suffixes:
          output_table_id = bigquery_util.compose_table_name(
              known_args.sample_lookup_optimized_output_table, suffix)
          partitioning.create_bq_table(
              output_table_id, flatten_schema_file,
              bigquery_util.ColumnKeyConstants.CALLS_SAMPLE_ID,
              partitioning.MAX_RANGE_END)
          _record_newly_created_table(output_table_id)
          logging.info(
              'Sample lookup optimized table %s was created.',
              output_table_id)
      # Copy to flatten sample lookup tables from the variant lookup tables.
      # Note: uses WRITE_TRUNCATE to overwrite the existing tables
      # (issue #607).
      flatten_call_column.copy_to_flatten_table(
          known_args.sample_lookup_optimized_output_table)
      logging.info(
          'All sample lookup optimized tables are fully loaded.')
    except Exception as e:
      logging.error(
          'Something unexpected happened during the loading rows to '
          'sample optimized table stage: %s', str(e))
      raise e
def test_config_failed_overlapping_regions(self):
  """Overlapping regions -- partial/partial, full/partial, partial/full,
  or a repeated full chromosome -- must raise ValueError."""
  tempdir = temp_dir.TempDir()
  # Two partial chr1 regions sharing positions 999,999-1,000,000.
  overlapping_regions = [
      '- output_table:',
      ' table_name_suffix: "chr01_part1"',
      ' regions:',
      ' - "chr1:0-1,000,000"',
      ' partition_range_end: 999999999',
      '- output_table:',
      ' table_name_suffix: "chr01_part2_overlapping"',
      ' regions:',
      ' - "chr1:999,999-2,000,000"',
      ' partition_range_end: 999999999',
  ]
  with self.assertRaisesRegex(
      ValueError, 'Wrong sharding config file, regions must be unique*'):
    _ = variant_sharding.VariantSharding(
        tempdir.create_temp_file(suffix='.yaml',
                                 lines='\n'.join(overlapping_regions)))
  # Whole chromosome followed by a partial region of the same chromosome.
  full_and_partial = [
      '- output_table:',
      ' table_name_suffix: "chr01_full"',
      ' regions:',
      ' - "chr1"',
      ' partition_range_end: 999999999',
      '- output_table:',
      ' table_name_suffix: "chr01_part_overlapping"',
      ' regions:',
      ' - "chr1:1,000,000-2,000,000"',
      ' partition_range_end: 999999999',
  ]
  with self.assertRaisesRegex(
      ValueError, 'Wrong sharding config file, regions must be unique*'):
    _ = variant_sharding.VariantSharding(
        tempdir.create_temp_file(suffix='.yaml',
                                 lines='\n'.join(full_and_partial)))
  # Partial region followed by the whole chromosome.
  partial_and_full = [
      '- output_table:',
      ' table_name_suffix: "chr01_part"',
      ' regions:',
      ' - "chr1:1,000,000-2,000,000"',
      ' partition_range_end: 999999999',
      '- output_table:',
      ' table_name_suffix: "chr01_full_overlapping"',
      ' regions:',
      ' - "chr1"',
      ' partition_range_end: 999999999',
  ]
  with self.assertRaisesRegex(
      ValueError, 'Wrong sharding config file, regions must be unique*'):
    _ = variant_sharding.VariantSharding(
        tempdir.create_temp_file(suffix='.yaml',
                                 lines='\n'.join(partial_and_full)))
  # The same whole chromosome listed by two output tables.
  full_and_full = [
      '- output_table:',
      ' table_name_suffix: "chr01_full"',
      ' regions:',
      ' - "chr1"',
      ' partition_range_end: 999999999',
      '- output_table:',
      ' table_name_suffix: "chr02_part"',
      ' regions:',
      ' - "chr2:1,000,000-2,000,000"',
      ' partition_range_end: 999999999',
      '- output_table:',
      ' table_name_suffix: "chr01_full_redundant"',
      ' regions:',
      ' - "chr1"',
      ' partition_range_end: 999999999',
  ]
  with self.assertRaisesRegex(
      ValueError, 'Wrong sharding config file, regions must be unique*'):
    _ = variant_sharding.VariantSharding(
        tempdir.create_temp_file(suffix='.yaml',
                                 lines='\n'.join(full_and_full)))