def test_config_failed_missing_partition_name(self):
  """A partition with a missing or blank partition_name must be rejected."""
  tempdir = temp_dir.TempDir()
  # Pairs of (invalid config lines, expected error message).
  failing_cases = [
      # No partition_name field at all.
      (['- partition:',
        ' regions:',
        ' - "chr1:0-1,000,000"'],
       'Each partition must have partition_name field.'),
      # partition_name present but contains only whitespace.
      (['- partition:',
        ' partition_name: " "',
        ' regions:',
        ' - "chr1:0-1,000,000"'],
       'Partition name can not be empty string.'),
  ]
  for config_lines, expected_message in failing_cases:
    with self.assertRaisesRegexp(ValueError, expected_message):
      _ = variant_partition.VariantPartition(
          tempdir.create_temp_file(suffix='.yaml',
                                   lines='\n'.join(config_lines)))
def test_auto_partitioning(self):
  """Checks partition assignment when no config file is given (auto mode).

  In auto mode the partitioner flattens its output, reserves one partition
  per standard human chromosome (1..22), and hashes everything else into
  the remaining partitions.
  """
  partitioner = variant_partition.VariantPartition()
  self.assertTrue(partitioner.should_flatten())
  self.assertEqual(partitioner.get_num_partitions(),
                   variant_partition._DEFAULT_NUM_PARTITIONS)
  # Checking standard reference_name formatted as: 'chr[0-9][0-9]'
  for i in xrange(variant_partition._RESERVED_AUTO_PARTITIONS):
    self.assertEqual(partitioner.get_partition('chr' + str(i + 1)), i)
  # Checking standard reference_name formatted as: '[0-9][0-9]'
  for i in xrange(variant_partition._RESERVED_AUTO_PARTITIONS):
    self.assertEqual(partitioner.get_partition(str(i + 1)), i)
  # Every other reference_name will be assigned to partitions >= 22
  self.assertGreaterEqual(partitioner.get_partition('chrY'),
                          variant_partition._RESERVED_AUTO_PARTITIONS)
  self.assertGreaterEqual(partitioner.get_partition('chrX'),
                          variant_partition._RESERVED_AUTO_PARTITIONS)
  self.assertGreaterEqual(partitioner.get_partition('chrM'),
                          variant_partition._RESERVED_AUTO_PARTITIONS)
  self.assertGreaterEqual(partitioner.get_partition('chr23'),
                          variant_partition._RESERVED_AUTO_PARTITIONS)
  self.assertGreaterEqual(partitioner.get_partition('chr30'),
                          variant_partition._RESERVED_AUTO_PARTITIONS)
  self.assertGreaterEqual(partitioner.get_partition('Unknown'),
                          variant_partition._RESERVED_AUTO_PARTITIONS)
  # In auto mode partitions have no names: get_partition_name returns None
  # for any index (the original comment claimed empty string; the asserts
  # below show None is the actual contract).
  self.assertEqual(partitioner.get_partition_name(0), None)
  self.assertEqual(partitioner.get_partition_name(100), None)
def test_config_non_existent_partition_name(self):
  """Asking for the name of an out-of-range partition index must raise."""
  partitioner = variant_partition.VariantPartition(
      'gcp_variant_transforms/testing/data/partition_configs/'
      'residual_at_end.yaml')
  self.assertFalse(partitioner.should_flatten())
  self.assertEqual(partitioner.get_num_partitions(), 8)
  # Valid indices are [0, 8); probe just below and just at the bound.
  for bad_index in (-1, 8):
    expected_message = (
        'Given partition index {} is outside of expected range*'.format(
            bad_index))
    with self.assertRaisesRegexp(ValueError, expected_message):
      partitioner.get_partition_name(bad_index)
def test_config_boundaries(self):
  """Positions at region boundaries must land in the expected partitions.

  Uses residual_at_end.yaml, whose last partition (index 7) is the
  residual partition that receives everything not covered by a region.
  """
  partitioner = variant_partition.VariantPartition(
      'gcp_variant_transforms/testing/data/partition_configs/'
      'residual_at_end.yaml')
  self.assertFalse(partitioner.should_flatten())
  self.assertEqual(partitioner.get_num_partitions(), 8)
  for index in range(partitioner.get_num_partitions()):
    self.assertTrue(partitioner.should_keep_partition(index))
  # (reference_name, position, expected partition index) triples, grouped
  # by the config region they exercise.
  expected_assignments = [
      # 'chr1:0-1,000,000'
      ('chr1', 0, 0),
      ('chr1', 999999, 0),
      # 'chr1:1,000,000-2,000,000'
      ('chr1', 1000000, 1),
      ('chr1', 1999999, 1),
      # 'chr1:2,000,000-999,999,999'
      ('chr1', 2000000, 2),
      ('chr1', 999999998, 2),
      ('chr1', 999999999, 7),
      # 'chr2' OR 'chr2_alternate_name1' OR 'chr2_alternate_name2' OR '2'.
      ('chr2', 0, 3),
      ('chr2', 999999999000, 3),
      ('chr2_alternate_name1', 0, 3),
      ('chr2_alternate_name1', 999999999000, 3),
      ('chr2_alternate_name2', 0, 3),
      ('CHR2_ALTERNATE_NAME2', 999999999000, 3),
      ('2', 0, 3),
      ('2', 999999999000, 3),
      # 'chr4' OR 'chr5' OR 'chr6:1,000,000-2,000,000'
      ('chr4', 0, 4),
      ('chr4', 999999999000, 4),
      ('chr5', 0, 4),
      ('chr5', 999999999000, 4),
      ('chr6', 1000000, 4),
      ('chr6', 2000000 - 1, 4),
      # chr6 positions outside [1,000,000, 2,000,000) go to the residual.
      ('chr6', 0, 7),
      ('chr6', 999999, 7),
      ('chr6', 2000000, 7),
      # '3:0-500,000'
      ('3', 0, 5),
      ('3', 499999, 5),
      # '3:500,000-1,000,000'
      ('3', 500000, 6),
      ('3', 999999, 6),
      ('3', 1000000, 7),
  ]
  for reference_name, position, expected_partition in expected_assignments:
    self.assertEqual(partitioner.get_partition(reference_name, position),
                     expected_partition)
def test_auto_partitioning_invalid_partitions(self):
  """Negative positions and blank reference names must raise in auto mode."""
  partitioner = variant_partition.VariantPartition()
  self.assertTrue(partitioner.should_flatten())
  self.assertEqual(partitioner.get_num_partitions(),
                   variant_partition._DEFAULT_NUM_PARTITIONS)
  # Each pair is an invalid (reference_name, position) combination.
  for reference_name, position in (('chr1', -1), ('', 1), (' ', 1)):
    with self.assertRaisesRegexp(ValueError, 'Cannot partition given input*'):
      partitioner.get_partition(reference_name, position)
def test_config_case_insensitive(self):
  """Reference-name matching against config regions must ignore case."""
  partitioner = variant_partition.VariantPartition(
      'gcp_variant_transforms/testing/data/partition_configs/'
      'residual_at_end.yaml')
  self.assertFalse(partitioner.should_flatten())
  self.assertEqual(partitioner.get_num_partitions(), 8)
  for index in range(partitioner.get_num_partitions()):
    self.assertTrue(partitioner.should_keep_partition(index))
  # Every case variant of chr1 maps to 'chr1:0-1,000,000' (partition 0).
  for name_variant in ('chr1', 'Chr1', 'CHr1', 'CHR1'):
    self.assertEqual(partitioner.get_partition(name_variant, 0), 0)
def test_config_get_partition_name(self):
  """Partition indices map, in order, to the names in the config file."""
  partitioner = variant_partition.VariantPartition(
      'gcp_variant_transforms/testing/data/partition_configs/'
      'residual_at_end.yaml')
  self.assertFalse(partitioner.should_flatten())
  self.assertEqual(partitioner.get_num_partitions(), 8)
  for index in range(partitioner.get_num_partitions()):
    self.assertTrue(partitioner.should_keep_partition(index))
  # Names in config-file order; the residual partition comes last.
  expected_names = [
      'chr01_part1',
      'chr01_part2',
      'chr01_part3',
      'chrom02',
      'chrom04_05_part_06',
      'chr3_01',
      'chr3_02',
      'all_remaining',
  ]
  for index, expected_name in enumerate(expected_names):
    self.assertEqual(partitioner.get_partition_name(index), expected_name)
def test_partition_variants(self):
  """Runs PartitionVariants in a pipeline and checks every output partition."""
  partitions_to_variants = self._get_standard_variant_partitions()
  partitions_to_variants.update(self._get_nonstandard_variant_partitions())
  # Flatten the per-partition variant lists into one input collection.
  all_variants = [variant
                  for variant_list in partitions_to_variants.values()
                  for variant in variant_list]
  partitioner = variant_partition.VariantPartition()
  num_partitions = partitioner.get_num_partitions()
  pipeline = TestPipeline()
  output_partitions = (
      pipeline
      | Create(all_variants)
      | 'PartitionVariants' >> beam.Partition(
          partition_variants.PartitionVariants(partitioner), num_partitions))
  for partition_index in xrange(num_partitions):
    assert_that(output_partitions[partition_index],
                equal_to(partitions_to_variants.get(partition_index, [])),
                label=str(partition_index))
  pipeline.run()
def test_config_failed_missing_region(self):
  """A partition whose 'regions' key has no entries must be rejected."""
  tempdir = temp_dir.TempDir()
  # Only the third partition is malformed: it declares 'regions:' with no
  # region entries beneath it.
  config_lines = [
      '- partition:',
      ' partition_name: "chr01_part1"',
      ' regions:',
      ' - "chr1:0-1,000,000"',
      '- partition:',
      ' partition_name: "all_remaining"',
      ' regions:',
      ' - "residual"',
      '- partition:',
      ' partition_name: "missing_region"',
      ' regions:',
  ]
  config_path = tempdir.create_temp_file(suffix='.yaml',
                                         lines='\n'.join(config_lines))
  with self.assertRaisesRegexp(
      ValueError, 'Each partition must have at least one region.'):
    _ = variant_partition.VariantPartition(config_path)
def test_config_failed_duplicate_table_name(self):
  """Two partitions sharing the same partition_name must be rejected."""
  tempdir = temp_dir.TempDir()
  # 'duplicate_name' appears in both the first and the last partition.
  config_lines = [
      '- partition:',
      ' partition_name: "duplicate_name"',
      ' regions:',
      ' - "chr1:0-1,000,000"',
      '- partition:',
      ' partition_name: "all_remaining"',
      ' regions:',
      ' - "residual"',
      '- partition:',
      ' partition_name: "duplicate_name"',
      ' regions:',
      ' - "chr1:1,000,000-2,000,000"',
  ]
  config_path = tempdir.create_temp_file(suffix='.yaml',
                                         lines='\n'.join(config_lines))
  with self.assertRaisesRegexp(
      ValueError, 'Partition names must be unique *'):
    _ = variant_partition.VariantPartition(config_path)
def test_config_failed_duplicate_residual_partition(self):
  """Declaring more than one 'residual' region must be rejected."""
  tempdir = temp_dir.TempDir()
  # Both the first and the last partition claim the residual region.
  config_lines = [
      '- partition:',
      ' partition_name: "all_remaining"',
      ' regions:',
      ' - "residual"',
      '- partition:',
      ' partition_name: "chr01"',
      ' regions:',
      ' - "chr1"',
      '- partition:',
      ' partition_name: "all_remaining_2"',
      ' regions:',
      ' - "residual"',
  ]
  config_path = tempdir.create_temp_file(suffix='.yaml',
                                         lines='\n'.join(config_lines))
  with self.assertRaisesRegexp(
      ValueError, 'There must be only one residual partition.'):
    _ = variant_partition.VariantPartition(config_path)
def test_config_residual_partition_absent(self):
  """When no residual partition is configured, a dummy one is appended.

  The dummy residual partition gets the last index (4 here) and is not
  kept; every (reference_name, position) not covered by a configured
  region is assigned to it.
  """
  partitioner = variant_partition.VariantPartition(
      'gcp_variant_transforms/testing/data/partition_configs/'
      'residual_missing.yaml')
  self.assertFalse(partitioner.should_flatten())
  self.assertEqual(partitioner.get_num_partitions(), 5)
  # All partitions except the last one (dummy residual) should be kept.
  for i in range(partitioner.get_num_partitions() - 1):
    self.assertTrue(partitioner.should_keep_partition(i))
  self.assertFalse(partitioner.should_keep_partition(5 - 1))
  # 'chr1:0-1,000,000'
  self.assertEqual(partitioner.get_partition('chr1', 0), 0)
  self.assertEqual(partitioner.get_partition('chr1', 999999), 0)
  # 'chr1:1,000,000-2,000,000'
  self.assertEqual(partitioner.get_partition('chr1', 1000000), 1)
  self.assertEqual(partitioner.get_partition('chr1', 1999999), 1)
  # 'chr2' (note: alternate spellings 'ch2', 'c2', and '2' are NOT matched
  # by this config -- the asserts further down show they go to the residual
  # partition).
  self.assertEqual(partitioner.get_partition('chr2', 0), 2)
  self.assertEqual(partitioner.get_partition('chr2', 999999999000), 2)
  # '3:500,000-1,000,000'
  self.assertEqual(partitioner.get_partition('3', 500000), 3)
  self.assertEqual(partitioner.get_partition('3', 999999), 3)
  # All of the following are assigned to the residual partition (index 4).
  self.assertEqual(partitioner.get_partition('chr1', 2000000), 4)
  self.assertEqual(partitioner.get_partition('chr1', 999999999), 4)
  self.assertEqual(partitioner.get_partition('3', 0), 4)
  self.assertEqual(partitioner.get_partition('3', 499999), 4)
  self.assertEqual(partitioner.get_partition('3', 1000000), 4)
  self.assertEqual(partitioner.get_partition('ch2', 0), 4)
  self.assertEqual(partitioner.get_partition('c2', 0), 4)
  self.assertEqual(partitioner.get_partition('2', 0), 4)
  self.assertEqual(partitioner.get_partition('c4', 0), 4)
  self.assertEqual(partitioner.get_partition('cr5', 0), 4)
  self.assertEqual(partitioner.get_partition('chr6', 0), 4)
def run(argv=None):
  # type: (List[str]) -> None
  """Runs VCF to BigQuery pipeline.

  Parses command-line flags, optionally runs the VEP annotation pipeline
  first, merges VCF headers to derive the BigQuery schema, then builds and
  runs a Beam pipeline that reads, filters, optionally partitions/merges
  variants, and writes each kept partition to its own BigQuery table.

  Args:
    argv: Command-line arguments; defaults to sys.argv when None.
  """
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = vcf_to_bq_common.parse_args(
      argv, _COMMAND_LINE_OPTIONS)
  # Note VepRunner creates new input files, so it should be run before any
  # other access to known_args.input_pattern.
  if known_args.run_annotation_pipeline:
    runner = vep_runner.create_runner_and_update_args(known_args,
                                                      pipeline_args)
    runner.run_on_all_files()
    runner.wait_until_done()
    logging.info('Using VEP processed files: %s', known_args.input_pattern)

  variant_merger = _get_variant_merge_strategy(known_args)
  pipeline_mode = vcf_to_bq_common.get_pipeline_mode(
      known_args.input_pattern, known_args.optimize_for_large_inputs)

  # Starts a pipeline to merge VCF headers in beam if the total files that
  # match the input pattern exceeds _SMALL_DATA_THRESHOLD
  _merge_headers(known_args, pipeline_args, pipeline_mode)

  # Retrieve merged headers prior to launching the pipeline. This is needed
  # since the BigQuery schema cannot yet be dynamically created based on
  # input. See https://issues.apache.org/jira/browse/BEAM-2801.
  header_fields = vcf_header_parser.get_vcf_headers(
      known_args.representative_header_file)
  counter_factory = metrics_util.CounterFactory()
  processed_variant_factory = processed_variant.ProcessedVariantFactory(
      header_fields,
      known_args.split_alternate_allele_info_fields,
      known_args.annotation_fields,
      known_args.use_allele_num,
      known_args.minimal_vep_alt_matching,
      counter_factory)

  partitioner = None
  if known_args.optimize_for_large_inputs or known_args.partition_config_path:
    partitioner = variant_partition.VariantPartition(
        known_args.partition_config_path)

  beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
  pipeline = beam.Pipeline(options=beam_pipeline_options)
  variants = _read_variants(pipeline, known_args)
  variants |= 'FilterVariants' >> filter_variants.FilterVariants(
      reference_names=known_args.reference_names)
  if partitioner:
    num_partitions = partitioner.get_num_partitions()
    partitioned_variants = variants | 'PartitionVariants' >> beam.Partition(
        partition_variants.PartitionVariants(partitioner), num_partitions)
    variants = []
    for i in range(num_partitions):
      if partitioner.should_keep_partition(i):
        variants.append(partitioned_variants[i])
      else:
        # Dropped partitions shrink the effective count so the loops below
        # only visit kept partitions.
        num_partitions -= 1
  else:
    # By default we don't partition the data, so we have only 1 partition.
    num_partitions = 1
    variants = [variants]

  for i in range(num_partitions):
    if variant_merger:
      variants[i] |= ('MergeVariants' + str(i)
                      >> merge_variants.MergeVariants(variant_merger))
    # Fixed step-label typo ('ProcessVaraints'); now consistent with the
    # other pipeline variants in this project.
    variants[i] |= (
        'ProcessVariants' + str(i)
        >> beam.Map(processed_variant_factory.create_processed_variant).\
            with_output_types(processed_variant.ProcessedVariant))

  if partitioner and partitioner.should_flatten():
    variants = [variants | 'FlattenPartitions' >> beam.Flatten()]
    num_partitions = 1

  for i in range(num_partitions):
    # Named partitions get their own table: <output_table>_<partition_name>.
    table_suffix = ''
    if partitioner and partitioner.get_partition_name(i):
      table_suffix = '_' + partitioner.get_partition_name(i)
    table_name = known_args.output_table + table_suffix
    _ = (
        variants[i]
        | 'VariantToBigQuery' + table_suffix >>
        variant_to_bigquery.VariantToBigQuery(
            table_name,
            header_fields,
            variant_merger,
            processed_variant_factory,
            append=known_args.append,
            allow_incompatible_records=known_args.allow_incompatible_records,
            omit_empty_sample_calls=known_args.omit_empty_sample_calls,
            num_bigquery_write_shards=known_args.num_bigquery_write_shards))

  result = pipeline.run()
  result.wait_until_finish()
  metrics_util.log_all_counters(result)
def run(argv=None):
  # type: (List[str]) -> None
  """Runs VCF to BigQuery pipeline.

  Parses command-line flags, optionally runs annotation first, merges VCF
  headers to derive the output schema, then builds and runs a Beam pipeline
  that reads, filters, optionally partitions/merges variants, and writes
  the results to BigQuery tables and/or Avro files.

  Args:
    argv: Command-line arguments; defaults to sys.argv when None.
  """
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = pipeline_common.parse_args(
      argv, _COMMAND_LINE_OPTIONS)
  if known_args.auto_flags_experiment:
    _get_input_dimensions(known_args, pipeline_args)
  # When annotation ran, its output pattern supersedes the user-provided
  # input patterns.
  annotated_vcf_pattern = _run_annotation_pipeline(known_args, pipeline_args)
  all_patterns = (
      [annotated_vcf_pattern] if annotated_vcf_pattern
      else known_args.all_patterns)

  variant_merger = _get_variant_merge_strategy(known_args)
  pipeline_mode = pipeline_common.get_pipeline_mode(
      all_patterns, known_args.optimize_for_large_inputs)

  # Starts a pipeline to merge VCF headers in beam if the total files that
  # match the input pattern exceeds _SMALL_DATA_THRESHOLD
  _merge_headers(known_args, pipeline_args, pipeline_mode,
                 annotated_vcf_pattern)

  # Retrieve merged headers prior to launching the pipeline. This is needed
  # since the BigQuery schema cannot yet be dynamically created based on
  # input. See https://issues.apache.org/jira/browse/BEAM-2801.
  header_fields = vcf_header_parser.get_vcf_headers(
      known_args.representative_header_file)
  counter_factory = metrics_util.CounterFactory()
  processed_variant_factory = processed_variant.ProcessedVariantFactory(
      header_fields,
      known_args.split_alternate_allele_info_fields,
      known_args.allow_malformed_records,
      known_args.annotation_fields,
      known_args.use_allele_num,
      known_args.minimal_vep_alt_matching,
      known_args.infer_annotation_types,
      counter_factory)

  # Partitioning is only enabled when explicitly configured, or when
  # optimizing large inputs that also merge variants.
  partitioner = None
  if ((known_args.optimize_for_large_inputs and variant_merger) or
      known_args.partition_config_path):
    partitioner = variant_partition.VariantPartition(
        known_args.partition_config_path)

  beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
  pipeline = beam.Pipeline(options=beam_pipeline_options)
  variants = _read_variants(all_patterns, pipeline, known_args, pipeline_mode)
  variants |= 'FilterVariants' >> filter_variants.FilterVariants(
      reference_names=known_args.reference_names)
  if partitioner:
    num_partitions = partitioner.get_num_partitions()
    partitioned_variants = variants | 'PartitionVariants' >> beam.Partition(
        partition_variants.PartitionVariants(partitioner), num_partitions)
    # Keep only the partitions the partitioner marks as kept; dropped ones
    # shrink the effective partition count.
    variants = []
    for i in range(num_partitions):
      if partitioner.should_keep_partition(i):
        variants.append(partitioned_variants[i])
      else:
        num_partitions -= 1
  else:
    # By default we don't partition the data, so we have only 1 partition.
    num_partitions = 1
    variants = [variants]

  for i in range(num_partitions):
    if variant_merger:
      variants[i] |= ('MergeVariants' + str(i)
                      >> merge_variants.MergeVariants(variant_merger))
    variants[i] |= (
        'ProcessVariants' + str(i)
        >> beam.Map(processed_variant_factory.create_processed_variant).\
            with_output_types(processed_variant.ProcessedVariant))
  if partitioner and partitioner.should_flatten():
    variants = [variants | 'FlattenPartitions' >> beam.Flatten()]
    num_partitions = 1

  if known_args.output_table:
    for i in range(num_partitions):
      # Named partitions each get their own table with a name suffix.
      table_suffix = ''
      if partitioner and partitioner.get_partition_name(i):
        table_suffix = '_' + partitioner.get_partition_name(i)
      table_name = known_args.output_table + table_suffix
      _ = (variants[i] | 'VariantToBigQuery' + table_suffix >>
           variant_to_bigquery.VariantToBigQuery(
               table_name,
               header_fields,
               variant_merger,
               processed_variant_factory,
               append=known_args.append,
               update_schema_on_append=known_args.update_schema_on_append,
               allow_incompatible_records=(
                   known_args.allow_incompatible_records),
               omit_empty_sample_calls=known_args.omit_empty_sample_calls,
               num_bigquery_write_shards=(
                   known_args.num_bigquery_write_shards),
               null_numeric_value_replacement=(
                   known_args.null_numeric_value_replacement)))

  if known_args.output_avro_path:
    # TODO(bashir2): Add an integration test that outputs to Avro files and
    # also imports to BigQuery. Then import those Avro outputs using the bq
    # tool and verify that the two tables are identical.
    _ = (
        variants
        | 'FlattenToOnePCollection' >> beam.Flatten()
        | 'VariantToAvro' >> variant_to_avro.VariantToAvroFiles(
            known_args.output_avro_path,
            header_fields,
            processed_variant_factory,
            variant_merger=variant_merger,
            allow_incompatible_records=known_args.allow_incompatible_records,
            omit_empty_sample_calls=known_args.omit_empty_sample_calls,
            null_numeric_value_replacement=(
                known_args.null_numeric_value_replacement)))

  result = pipeline.run()
  result.wait_until_finish()
  metrics_util.log_all_counters(result)
def test_config_failed_overlapping_regions(self):
  """Any pair of overlapping regions across partitions must be rejected."""
  tempdir = temp_dir.TempDir()
  # Each config below contains two regions that overlap in a different way.
  overlapping_configs = [
      # Partial region overlapping another partial region.
      [
          '- partition:',
          ' partition_name: "chr01_part1"',
          ' regions:',
          ' - "chr1:0-1,000,000"',
          '- partition:',
          ' partition_name: "chr01_part2_overlapping"',
          ' regions:',
          ' - "chr1:999,999-2,000,000"',
      ],
      # Full chromosome followed by a partial region of it.
      [
          '- partition:',
          ' partition_name: "chr01_full"',
          ' regions:',
          ' - "chr1"',
          '- partition:',
          ' partition_name: "chr01_part_overlapping"',
          ' regions:',
          ' - "chr1:1,000,000-2,000,000"',
      ],
      # Partial region followed by the full chromosome.
      [
          '- partition:',
          ' partition_name: "chr01_part"',
          ' regions:',
          ' - "chr1:1,000,000-2,000,000"',
          '- partition:',
          ' partition_name: "chr01_full_overlapping"',
          ' regions:',
          ' - "chr1"',
      ],
      # The same full chromosome declared twice.
      [
          '- partition:',
          ' partition_name: "chr01_full"',
          ' regions:',
          ' - "chr1"',
          '- partition:',
          ' partition_name: "chr02_part"',
          ' regions:',
          ' - "chr2:1,000,000-2,000,000"',
          '- partition:',
          ' partition_name: "chr01_full_redundant"',
          ' regions:',
          ' - "chr1"',
      ],
  ]
  for config_lines in overlapping_configs:
    with self.assertRaisesRegexp(
        ValueError, 'Cannot add overlapping region *'):
      _ = variant_partition.VariantPartition(
          tempdir.create_temp_file(suffix='.yaml',
                                   lines='\n'.join(config_lines)))