def test_dataflow_worker_jar_flag_adds_use_staged_worker_jar_experiment(self):
  remote_runner = DataflowRunner()
  self.default_properties.append('--experiment=beam_fn_api')
  self.default_properties.append('--dataflow_worker_jar=test.jar')

  with Pipeline(remote_runner,
                PipelineOptions(self.default_properties)) as p:
    p | ptransform.Create([1])  # pylint: disable=expression-not-assigned

  experiments_for_job = (
      remote_runner.job.options.view_as(DebugOptions).experiments)
  self.assertIn('beam_fn_api', experiments_for_job)
  self.assertIn('use_staged_dataflow_worker_jar', experiments_for_job)

def test_remote_runner_display_data(self):
  remote_runner = DataflowRunner()
  p = Pipeline(remote_runner,
               options=PipelineOptions(self.default_properties))

  # TODO: Should not subclass ParDo. Switch to PTransform as soon as
  # composite transforms support display data.
  class SpecialParDo(beam.ParDo):
    def __init__(self, fn, now):
      super(SpecialParDo, self).__init__(fn)
      self.fn = fn
      self.now = now

    def display_data(self):
      return {'asubcomponent': self.fn,
              'a_class': SpecialParDo,
              'a_time': self.now}

  class SpecialDoFn(beam.DoFn):
    def display_data(self):
      return {'dofn_value': 42}

    def process(self):
      pass

  now = datetime.now()
  # pylint: disable=expression-not-assigned
  (p | ptransform.Create([1, 2, 3, 4, 5])
   | 'Do' >> SpecialParDo(SpecialDoFn(), now))

  remote_runner.job = apiclient.Job(p.options)
  super(DataflowRunner, remote_runner).run(p)
  job_dict = json.loads(str(remote_runner.job))
  steps = [step
           for step in job_dict['steps']
           if len(step['properties'].get('display_data', [])) > 0]
  step = steps[0]
  disp_data = step['properties']['display_data']
  disp_data = sorted(disp_data, key=lambda x: x['namespace'] + x['key'])
  nspace = SpecialParDo.__module__ + '.'
  expected_data = [{'type': 'TIMESTAMP',
                    'namespace': nspace + 'SpecialParDo',
                    'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
                    'key': 'a_time'},
                   {'type': 'STRING',
                    'namespace': nspace + 'SpecialParDo',
                    'value': nspace + 'SpecialParDo',
                    'key': 'a_class',
                    'shortValue': 'SpecialParDo'},
                   {'type': 'INTEGER',
                    'namespace': nspace + 'SpecialDoFn',
                    'value': 42,
                    'key': 'dofn_value'}]
  expected_data = sorted(expected_data,
                         key=lambda x: x['namespace'] + x['key'])
  self.assertEqual(len(disp_data), 3)
  self.assertEqual(disp_data, expected_data)

def test_use_fastavro_experiment_is_not_added_when_use_avro_is_present(self):
  remote_runner = DataflowRunner()
  self.default_properties.append('--experiment=use_avro')

  with Pipeline(remote_runner,
                PipelineOptions(self.default_properties)) as p:
    p | ptransform.Create([1])  # pylint: disable=expression-not-assigned

  debug_options = remote_runner.job.options.view_as(DebugOptions)
  self.assertFalse(
      debug_options.lookup_experiment('use_fastavro', False))

def test_dataflow_worker_jar_flag_non_fnapi_noop(self):
  remote_runner = DataflowRunner()
  self.default_properties.append('--experiment=some_other_experiment')
  self.default_properties.append('--dataflow_worker_jar=test.jar')

  p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
  p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
  p.run()

  experiments_for_job = (
      remote_runner.job.options.view_as(DebugOptions).experiments)
  self.assertIn('some_other_experiment', experiments_for_job)
  self.assertNotIn('use_staged_dataflow_worker_jar', experiments_for_job)

def test_direct_runner_metrics(self):
  from apache_beam.metrics.metric import Metrics

  class MyDoFn(beam.DoFn):
    def start_bundle(self):
      count = Metrics.counter(self.__class__, 'bundles')
      count.inc()

    def finish_bundle(self):
      count = Metrics.counter(self.__class__, 'finished_bundles')
      count.inc()

    def process(self, element):
      count = Metrics.counter(self.__class__, 'elements')
      count.inc()
      distro = Metrics.distribution(self.__class__, 'element_dist')
      distro.update(element)
      return [element]

  runner = DirectRunner()
  p = Pipeline(runner,
               options=PipelineOptions(self.default_properties))
  # pylint: disable=expression-not-assigned
  (p | ptransform.Create([1, 2, 3, 4, 5])
   | 'Do' >> beam.ParDo(MyDoFn()))
  result = p.run()
  result.wait_until_finish()
  metrics = result.metrics().query()
  namespace = '{}.{}'.format(MyDoFn.__module__, MyDoFn.__name__)

  hc.assert_that(
      metrics['counters'],
      hc.contains_inanyorder(
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'elements')), 5, 5),
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'bundles')), 1, 1),
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'finished_bundles')),
              1, 1)))
  hc.assert_that(
      metrics['distributions'],
      hc.contains_inanyorder(
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'element_dist')),
              DistributionResult(DistributionData(15, 5, 1, 5)),
              DistributionResult(DistributionData(15, 5, 1, 5)))))

def test_streaming_create_translation(self):
  remote_runner = DataflowRunner()
  self.default_properties.append("--streaming")
  p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
  p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
  p.run()
  job_dict = json.loads(str(remote_runner.job))
  self.assertEqual(len(job_dict[u'steps']), 2)

  self.assertEqual(job_dict[u'steps'][0][u'kind'], u'ParallelRead')
  self.assertEqual(
      job_dict[u'steps'][0][u'properties'][u'pubsub_subscription'],
      '_starting_signal/')
  self.assertEqual(job_dict[u'steps'][1][u'kind'], u'ParallelDo')

def test_streaming_engine_flag_adds_windmill_experiments(self):
  remote_runner = DataflowRunner()
  self.default_properties.append('--streaming')
  self.default_properties.append('--enable_streaming_engine')
  self.default_properties.append('--experiment=some_other_experiment')

  with Pipeline(remote_runner,
                PipelineOptions(self.default_properties)) as p:
    p | ptransform.Create([1])  # pylint: disable=expression-not-assigned

  experiments_for_job = (
      remote_runner.job.options.view_as(DebugOptions).experiments)
  self.assertIn('enable_streaming_engine', experiments_for_job)
  self.assertIn('enable_windmill_service', experiments_for_job)
  self.assertIn('some_other_experiment', experiments_for_job)

def test_environment_override_translation(self):
  self.default_properties.append('--experiments=beam_fn_api')
  self.default_properties.append('--worker_harness_container_image=FOO')
  remote_runner = DataflowRunner()
  p = Pipeline(remote_runner,
               options=PipelineOptions(self.default_properties))
  (p | ptransform.Create([1, 2, 3])  # pylint: disable=expression-not-assigned
   | 'Do' >> ptransform.FlatMap(lambda x: [(x, x)])
   | ptransform.GroupByKey())
  p.run()
  self.assertEqual(
      list(remote_runner.proto_pipeline.components.environments.values()),
      [beam_runner_api_pb2.Environment(
          urn=common_urns.environments.DOCKER.urn,
          payload=beam_runner_api_pb2.DockerPayload(
              container_image='FOO').SerializeToString())])

def test_streaming_create_translation(self):
  remote_runner = DataflowRunner()
  self.default_properties.append("--streaming")
  p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
  p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
  remote_runner.job = apiclient.Job(p._options)
  # Performing configured PTransform overrides here.
  p.replace_all(DataflowRunner._PTRANSFORM_OVERRIDES)
  super(DataflowRunner, remote_runner).run(p)
  job_dict = json.loads(str(remote_runner.job))
  self.assertEqual(len(job_dict[u'steps']), 2)

  self.assertEqual(job_dict[u'steps'][0][u'kind'], u'ParallelRead')
  self.assertEqual(
      job_dict[u'steps'][0][u'properties'][u'pubsub_subscription'],
      '_starting_signal/')
  self.assertEqual(job_dict[u'steps'][1][u'kind'], u'ParallelDo')

def test_call_names_combiner_pipeline(self):
  call_names = ['sample1', 'sample2', 'sample3']
  variant_calls = [
      vcfio.VariantCall(name=call_names[0]),
      vcfio.VariantCall(name=call_names[1]),
      vcfio.VariantCall(name=call_names[2])
  ]
  variants = [
      vcfio.Variant(calls=[variant_calls[0], variant_calls[1]]),
      vcfio.Variant(calls=[variant_calls[1], variant_calls[2]])
  ]

  pipeline = TestPipeline()
  combined_call_names = (
      pipeline
      | transforms.Create(variants)
      | 'CombineCallNames' >> combine_call_names.CallNamesCombiner())
  assert_that(combined_call_names, equal_to([call_names]))
  pipeline.run()

def test_remote_runner_display_data(self):
  remote_runner = DataflowRunner()
  p = Pipeline(remote_runner,
               options=PipelineOptions(self.default_properties))
  now = datetime.now()
  # pylint: disable=expression-not-assigned
  (p | ptransform.Create([1, 2, 3, 4, 5])
   | 'Do' >> SpecialParDo(SpecialDoFn(), now))

  # TODO(https://github.com/apache/beam/issues/18012) Enable runner API on
  # this test.
  p.run(test_runner_api=False)
  job_dict = json.loads(str(remote_runner.job))
  steps = [
      step for step in job_dict['steps']
      if len(step['properties'].get('display_data', [])) > 0
  ]
  step = steps[1]
  disp_data = step['properties']['display_data']
  nspace = SpecialParDo.__module__ + '.'
  expected_data = [{
      'type': 'TIMESTAMP',
      'namespace': nspace + 'SpecialParDo',
      'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
      'key': 'a_time'
  }, {
      'type': 'STRING',
      'namespace': nspace + 'SpecialParDo',
      'value': nspace + 'SpecialParDo',
      'key': 'a_class',
      'shortValue': 'SpecialParDo'
  }, {
      'type': 'INTEGER',
      'namespace': nspace + 'SpecialDoFn',
      'value': 42,
      'key': 'dofn_value'
  }]
  self.assertUnhashableCountEqual(disp_data, expected_data)

def test_convert_sample_info_to_row(self, mocked_obj):
  vcf_header_1 = vcf_header_io.VcfHeader(
      samples=SAMPLE_LINE, file_path='gs://bucket1/dir1/file1.vcf')
  vcf_header_2 = vcf_header_io.VcfHeader(
      samples=SAMPLE_LINE, file_path='gs://bucket1/dir1/file2.vcf')
  current_minute = mocked_obj()

  expected_rows = [
      {sample_info_table_schema_generator.SAMPLE_ID: 7715696391291253656,
       sample_info_table_schema_generator.SAMPLE_NAME: (
           'gs___bucket1_dir1_file1_vcf_Sample1'),
       sample_info_table_schema_generator.FILE_PATH: (
           'gs://bucket1/dir1/file1.vcf'),
       sample_info_table_schema_generator.INGESTION_DATETIME: current_minute},
      {sample_info_table_schema_generator.SAMPLE_ID: 5682150464643626236,
       sample_info_table_schema_generator.SAMPLE_NAME: (
           'gs___bucket1_dir1_file1_vcf_Sample2'),
       sample_info_table_schema_generator.FILE_PATH: (
           'gs://bucket1/dir1/file1.vcf'),
       sample_info_table_schema_generator.INGESTION_DATETIME: current_minute},
      {sample_info_table_schema_generator.SAMPLE_ID: 668336000922978678,
       sample_info_table_schema_generator.SAMPLE_NAME: (
           'gs___bucket1_dir1_file2_vcf_Sample1'),
       sample_info_table_schema_generator.FILE_PATH: (
           'gs://bucket1/dir1/file2.vcf'),
       sample_info_table_schema_generator.INGESTION_DATETIME: current_minute},
      {sample_info_table_schema_generator.SAMPLE_ID: 5498327443813165683,
       sample_info_table_schema_generator.SAMPLE_NAME: (
           'gs___bucket1_dir1_file2_vcf_Sample2'),
       sample_info_table_schema_generator.FILE_PATH: (
           'gs://bucket1/dir1/file2.vcf'),
       sample_info_table_schema_generator.INGESTION_DATETIME: current_minute},
  ]

  pipeline = test_pipeline.TestPipeline()
  bigquery_rows = (
      pipeline
      | transforms.Create([vcf_header_1, vcf_header_2])
      | 'ConvertToRow' >> transforms.ParDo(
          sample_info_to_bigquery.ConvertSampleInfoToRow(
              SampleNameEncoding.WITH_FILE_PATH)))

  assert_that(bigquery_rows, equal_to(expected_rows))
  pipeline.run()

def test_call_names_combiner_pipeline_preserve_call_names_order_error(self):
  call_names = ['sample1', 'sample2', 'sample3']
  variant_calls = [
      vcfio.VariantCall(name=call_names[0]),
      vcfio.VariantCall(name=call_names[1]),
      vcfio.VariantCall(name=call_names[2])
  ]
  variants = [
      vcfio.Variant(calls=[variant_calls[0], variant_calls[1]]),
      vcfio.Variant(calls=[variant_calls[1], variant_calls[2]])
  ]

  pipeline = TestPipeline()
  _ = (
      pipeline
      | transforms.Create(variants)
      | 'CombineCallNames' >> combine_call_names.CallNamesCombiner(
          preserve_call_names_order=True))
  with self.assertRaises(ValueError):
    pipeline.run()

def test_sample_ids_combiner_pipeline(self):
  sample_ids = [
      hash_name('sample3'),
      hash_name('sample2'),
      hash_name('sample1')
  ]
  variant_calls = [
      vcfio.VariantCall(sample_id=sample_ids[0]),
      vcfio.VariantCall(sample_id=sample_ids[1]),
      vcfio.VariantCall(sample_id=sample_ids[2])
  ]
  variants = [
      vcfio.Variant(calls=[variant_calls[0], variant_calls[1]]),
      vcfio.Variant(calls=[variant_calls[1], variant_calls[2]])
  ]

  pipeline = TestPipeline()
  combined_sample_ids = (
      pipeline
      | transforms.Create(variants)
      | 'CombineSampleIds' >> combine_sample_ids.SampleIdsCombiner()
      | combiners.ToList())
  assert_that(combined_sample_ids, equal_to([sample_ids]))
  pipeline.run()

def test_sample_ids_combiner_pipeline_preserve_sample_order_error(self):
  sample_ids = [
      hash_name('sample1'),
      hash_name('sample2'),
      hash_name('sample3')
  ]
  variant_calls = [
      vcfio.VariantCall(sample_id=sample_ids[0]),
      vcfio.VariantCall(sample_id=sample_ids[1]),
      vcfio.VariantCall(sample_id=sample_ids[2])
  ]
  variants = [
      vcfio.Variant(calls=[variant_calls[0], variant_calls[1]]),
      vcfio.Variant(calls=[variant_calls[1], variant_calls[2]])
  ]

  pipeline = TestPipeline()
  _ = (
      pipeline
      | transforms.Create(variants)
      | 'CombineSampleIds' >> combine_sample_ids.SampleIdsCombiner(
          preserve_sample_order=True)
      | combiners.ToList())
  with self.assertRaises(ValueError):
    pipeline.run()

def _bigquery_to_vcf_shards(
    known_args,  # type: argparse.Namespace
    beam_pipeline_options,  # type: pipeline_options.PipelineOptions
    vcf_data_temp_folder,  # type: str
    header_file_path,  # type: str
    ):
  # type: (...) -> None
  """Runs the BigQuery-to-VCF-shards pipeline.

  It reads the variants from the BigQuery table, groups each collection of
  variants that falls within a contiguous region of the genome (the size of
  the collection is adjustable through the flag
  `--number_of_bases_per_shard`), sorts them, and writes each group to one
  VCF data file. All VCF data files are saved in `vcf_data_temp_folder`.

  It also writes the meta information and the data header line with the
  sample names to `header_file_path`.
  """
  schema = _get_schema(known_args.input_table)
  variant_query = _get_variant_query(known_args, schema)
  logging.info('Processing BigQuery query %s:', variant_query)
  project_id, dataset_id, table_id = bigquery_util.parse_table_reference(
      known_args.input_table)
  bq_variant_source = bigquery.BigQuerySource(query=variant_query,
                                              validate=True,
                                              use_standard_sql=True)
  annotation_names = _extract_annotation_names(schema)

  base_table_id = bigquery_util.get_table_base_name(table_id)
  sample_query = _SAMPLE_INFO_QUERY_TEMPLATE.format(
      PROJECT_ID=project_id,
      DATASET_ID=dataset_id,
      TABLE_NAME=bigquery_util.compose_table_name(base_table_id,
                                                  SAMPLE_INFO_TABLE_SUFFIX))
  bq_sample_source = bigquery.BigQuerySource(query=sample_query,
                                             validate=True,
                                             use_standard_sql=True)
  with beam.Pipeline(options=beam_pipeline_options) as p:
    variants = (p
                | 'ReadFromBigQuery' >> beam.io.Read(bq_variant_source)
                | bigquery_to_variant.BigQueryToVariant(annotation_names))
    sample_table_rows = (
        p
        | 'ReadFromSampleTable' >> beam.io.Read(bq_sample_source))
    if known_args.sample_names:
      temp_sample_names = (p
                           | transforms.Create(known_args.sample_names,
                                               reshuffle=False))
    else:
      # Get sample names from sample IDs in the variants and sort.
      id_to_name_hash_table = (
          sample_table_rows
          | 'SampleIdToNameDict' >> sample_mapping_table.SampleIdToNameDict())
      temp_sample_ids = (
          variants
          | 'CombineSampleIds' >> combine_sample_ids.SampleIdsCombiner(
              known_args.preserve_sample_order))
      temp_sample_names = (
          temp_sample_ids
          | 'GetSampleNames' >> sample_mapping_table.GetSampleNames(
              beam.pvalue.AsSingleton(id_to_name_hash_table))
          | 'CombineToList' >> beam.combiners.ToList()
          | 'SortSampleNames' >> beam.ParDo(sorted))

    name_to_id_hash_table = (
        sample_table_rows
        | 'SampleNameToIdDict' >> sample_mapping_table.SampleNameToIdDict())
    sample_ids = (
        temp_sample_names
        | 'GetSampleIds' >> sample_mapping_table.GetSampleIds(
            beam.pvalue.AsSingleton(name_to_id_hash_table))
        | 'CombineSortedSampleIds' >> beam.combiners.ToList())
    sample_names = temp_sample_names | beam.combiners.ToList()

    _ = (sample_names
         | 'GenerateVcfDataHeader' >> beam.ParDo(
             _write_vcf_header_with_sample_names,
             _VCF_FIXED_COLUMNS,
             known_args.representative_header_file,
             header_file_path))

    _ = (variants
         | densify_variants.DensifyVariants(
             beam.pvalue.AsSingleton(sample_ids))
         | 'PairVariantWithKey' >> beam.Map(
             _pair_variant_with_key,
             known_args.number_of_bases_per_shard)
         | 'GroupVariantsByKey' >> beam.GroupByKey()
         | beam.ParDo(_get_file_path_and_sorted_variants,
                      vcf_data_temp_folder)
         | vcfio.WriteVcfDataLines(known_args.bq_uses_1_based_coordinate))

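
# The docstring of `_bigquery_to_vcf_shards` explains that variants are keyed
# by contiguous genome regions whose size is set by
# `--number_of_bases_per_shard`, then grouped and written one shard per file.
# Below is a minimal, hypothetical sketch of that keying step; it assumes a
# variant object with `reference_name` and `start` attributes and is NOT the
# actual `_pair_variant_with_key` implementation.
def _pair_variant_with_region_key_sketch(variant, number_of_bases_per_shard):
  # Variants whose start positions fall inside the same window of
  # `number_of_bases_per_shard` bases receive the same key, so a subsequent
  # GroupByKey collects one contiguous region of the genome per shard.
  shard_index = variant.start // number_of_bases_per_shard
  return ((variant.reference_name, shard_index), variant)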