def test_group_by_key_input_visitor_for_non_gbk_transforms(self):
  p = TestPipeline()
  pcoll = PCollection(p)
  for transform in [beam.Flatten(), beam.Map(lambda x: x)]:
    pcoll.element_type = typehints.Any
    DataflowRunner.group_by_key_input_visitor().visit_transform(
        AppliedPTransform(None, transform, "label", [pcoll]))
    self.assertEqual(pcoll.element_type, typehints.Any)

def test_serialize_windowing_strategy(self):
  # This just tests the basic path; more complete tests
  # are in window_test.py.
  strategy = Windowing(window.FixedWindows(10))
  self.assertEqual(
      strategy,
      DataflowRunner.deserialize_windowing_strategy(
          DataflowRunner.serialize_windowing_strategy(strategy)))

def test_remote_runner_translation(self):
  remote_runner = DataflowRunner()
  p = Pipeline(
      remote_runner, options=PipelineOptions(self.default_properties))

  (p | ptransform.Create([1, 2, 3])  # pylint: disable=expression-not-assigned
   | 'Do' >> ptransform.FlatMap(lambda x: [(x, x)])
   | ptransform.GroupByKey())
  remote_runner.job = apiclient.Job(p.options)
  super(DataflowRunner, remote_runner).run(p)

def test_group_by_key_input_visitor_with_invalid_inputs(self):
  p = TestPipeline()
  pcoll1 = PCollection(p)
  pcoll2 = PCollection(p)
  for transform in [_GroupByKeyOnly(), beam.GroupByKey()]:
    pcoll1.element_type = typehints.TupleSequenceConstraint
    pcoll2.element_type = typehints.Set
    err_msg = "Input to GroupByKey must be of Tuple or Any type"
    for pcoll in [pcoll1, pcoll2]:
      with self.assertRaisesRegexp(ValueError, err_msg):
        DataflowRunner.group_by_key_input_visitor().visit_transform(
            AppliedPTransform(None, transform, "label", [pcoll]))

def test_remote_runner_display_data(self):
  remote_runner = DataflowRunner()
  p = Pipeline(
      remote_runner, options=PipelineOptions(self.default_properties))

  # TODO: Should not subclass ParDo. Switch to PTransform as soon as
  # composite transforms support display data.
  class SpecialParDo(beam.ParDo):
    def __init__(self, fn, now):
      super(SpecialParDo, self).__init__(fn)
      self.fn = fn
      self.now = now

    # Make this a list to be accessible within closure
    def display_data(self):
      return {
          'asubcomponent': self.fn,
          'a_class': SpecialParDo,
          'a_time': self.now
      }

  class SpecialDoFn(beam.DoFn):
    def display_data(self):
      return {'dofn_value': 42}

    def process(self):
      pass

  now = datetime.now()
  # pylint: disable=expression-not-assigned
  (p | ptransform.Create([1, 2, 3, 4, 5])
   | 'Do' >> SpecialParDo(SpecialDoFn(), now))

  remote_runner.job = apiclient.Job(p.options)
  super(DataflowRunner, remote_runner).run(p)
  job_dict = json.loads(str(remote_runner.job))
  steps = [
      step for step in job_dict['steps']
      if len(step['properties'].get('display_data', [])) > 0
  ]
  step = steps[0]
  disp_data = step['properties']['display_data']
  disp_data = sorted(disp_data, key=lambda x: x['namespace'] + x['key'])
  nspace = SpecialParDo.__module__ + '.'
  expected_data = [{
      'type': 'TIMESTAMP',
      'namespace': nspace + 'SpecialParDo',
      'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
      'key': 'a_time'
  }, {
      'type': 'STRING',
      'namespace': nspace + 'SpecialParDo',
      'value': nspace + 'SpecialParDo',
      'key': 'a_class',
      'shortValue': 'SpecialParDo'
  }, {
      'type': 'INTEGER',
      'namespace': nspace + 'SpecialDoFn',
      'value': 42,
      'key': 'dofn_value'
  }]
  expected_data = sorted(
      expected_data, key=lambda x: x['namespace'] + x['key'])
  self.assertEqual(len(disp_data), 3)
  self.assertEqual(disp_data, expected_data)

def test_group_by_key_input_visitor_with_invalid_inputs(self):
  p = TestPipeline()
  pcoll1 = PCollection(p)
  pcoll2 = PCollection(p)
  for transform in [_GroupByKeyOnly(), beam.GroupByKey()]:
    pcoll1.element_type = str
    pcoll2.element_type = typehints.Set
    err_msg = (
        r"Input to 'label' must be compatible with KV\[Any, Any\]. "
        "Found .*")
    for pcoll in [pcoll1, pcoll2]:
      with self.assertRaisesRegexp(ValueError, err_msg):
        DataflowRunner.group_by_key_input_visitor().visit_transform(
            AppliedPTransform(None, transform, "label", [pcoll]))

def test_group_by_key_input_visitor_with_valid_inputs(self):
  p = TestPipeline()
  pcoll1 = PCollection(p)
  pcoll2 = PCollection(p)
  pcoll3 = PCollection(p)
  for transform in [_GroupByKeyOnly(), beam.GroupByKey()]:
    pcoll1.element_type = None
    pcoll2.element_type = typehints.Any
    pcoll3.element_type = typehints.KV[typehints.Any, typehints.Any]
    for pcoll in [pcoll1, pcoll2, pcoll3]:
      DataflowRunner.group_by_key_input_visitor().visit_transform(
          AppliedPTransform(None, transform, "label", [pcoll]))
      self.assertEqual(
          pcoll.element_type, typehints.KV[typehints.Any, typehints.Any])

def test_side_input_visitor(self):
  p = TestPipeline()
  pc = p | beam.Create([])

  transform = beam.Map(
      lambda x, y, z: (x, y, z),
      beam.pvalue.AsSingleton(pc),
      beam.pvalue.AsMultiMap(pc))
  applied_transform = AppliedPTransform(None, transform, "label", [pc])
  DataflowRunner.side_input_visitor().visit_transform(applied_transform)
  self.assertEqual(2, len(applied_transform.side_inputs))
  for side_input in applied_transform.side_inputs:
    self.assertEqual(
        dataflow_runner._DataflowSideInput.DATAFLOW_MULTIMAP_URN,
        side_input._side_input_data().access_pattern)

def _test_flatten_input_visitor(self, input_type, output_type, num_inputs):
  p = TestPipeline()
  inputs = []
  for _ in range(num_inputs):
    input_pcoll = PCollection(p)
    input_pcoll.element_type = input_type
    inputs.append(input_pcoll)
  output_pcoll = PCollection(p)
  output_pcoll.element_type = output_type

  flatten = AppliedPTransform(None, beam.Flatten(), "label", inputs)
  flatten.add_output(output_pcoll, None)
  DataflowRunner.flatten_input_visitor().visit_transform(flatten)
  # Every input PCollection should have been updated to the output type.
  for input_pcoll in inputs:
    self.assertEqual(input_pcoll.element_type, output_type)

def test_streaming_create_translation(self):
  remote_runner = DataflowRunner()
  self.default_properties.append("--streaming")
  p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
  p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
  remote_runner.job = apiclient.Job(p._options)
  # Performing configured PTransform overrides here.
  p.replace_all(DataflowRunner._PTRANSFORM_OVERRIDES)
  super(DataflowRunner, remote_runner).run(p)
  job_dict = json.loads(str(remote_runner.job))
  self.assertEqual(len(job_dict[u'steps']), 2)

  self.assertEqual(job_dict[u'steps'][0][u'kind'], u'ParallelRead')
  self.assertEqual(
      job_dict[u'steps'][0][u'properties'][u'pubsub_subscription'],
      '_starting_signal/')
  self.assertEqual(job_dict[u'steps'][1][u'kind'], u'ParallelDo')

def test_gbk_then_flatten_input_visitor(self):
  p = TestPipeline(
      runner=DataflowRunner(),
      options=PipelineOptions(self.default_properties))
  none_str_pc = p | 'c1' >> beam.Create({None: 'a'})
  none_int_pc = p | 'c2' >> beam.Create({None: 3})
  flat = (none_str_pc, none_int_pc) | beam.Flatten()
  _ = flat | beam.GroupByKey()

  # This may change if type inference changes, but we assert it here
  # to make sure the check below is not vacuous.
  self.assertNotIsInstance(flat.element_type, typehints.TupleConstraint)

  p.visit(DataflowRunner.group_by_key_input_visitor())
  p.visit(DataflowRunner.flatten_input_visitor())

  # The dataflow runner requires gbk input to be tuples *and* flatten
  # inputs to be equal to their outputs. Assert both hold.
  self.assertIsInstance(flat.element_type, typehints.TupleConstraint)
  self.assertEqual(flat.element_type, none_str_pc.element_type)
  self.assertEqual(flat.element_type, none_int_pc.element_type)

def test_dataflow_worker_jar_flag_non_fnapi_noop(self):
  remote_runner = DataflowRunner()
  self.default_properties.append('--experiment=some_other_experiment')
  self.default_properties.append('--dataflow_worker_jar=test.jar')

  with Pipeline(remote_runner,
                PipelineOptions(self.default_properties)) as p:
    p | ptransform.Create([1])  # pylint: disable=expression-not-assigned

  experiments_for_job = (
      remote_runner.job.options.view_as(DebugOptions).experiments)
  self.assertIn('some_other_experiment', experiments_for_job)
  self.assertNotIn('use_staged_dataflow_worker_jar', experiments_for_job)

def test_dataflow_worker_jar_flag_adds_use_staged_worker_jar_experiment(self):
  remote_runner = DataflowRunner()
  self.default_properties.append('--experiment=beam_fn_api')
  self.default_properties.append('--dataflow_worker_jar=test.jar')

  p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
  p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
  p.run()

  experiments_for_job = (
      remote_runner.job.options.view_as(DebugOptions).experiments)
  self.assertIn('beam_fn_api', experiments_for_job)
  self.assertIn('use_staged_dataflow_worker_jar', experiments_for_job)

def test_bigquery_read_fn_api_fail(self):
  remote_runner = DataflowRunner()
  for flag in ['beam_fn_api', 'use_unified_worker', 'use_runner_v2']:
    self.default_properties.append("--experiments=%s" % flag)
  with self.assertRaisesRegex(
      ValueError,
      'The Read.BigQuerySource.*is not supported.*'
      'apache_beam.io.gcp.bigquery.ReadFromBigQuery.*'):
    with Pipeline(remote_runner,
                  PipelineOptions(self.default_properties)) as p:
      _ = p | beam.io.Read(
          beam.io.BigQuerySource(
              'some.table', use_dataflow_native_source=True))

def test_no_group_by_key_directly_after_bigquery(self):
  remote_runner = DataflowRunner()
  with self.assertRaises(ValueError,
                         msg=('Coder for the GroupByKey operation'
                              '"GroupByKey" is not a key-value coder: '
                              'RowAsDictJsonCoder')):
    with beam.Pipeline(runner=remote_runner,
                       options=PipelineOptions(
                           self.default_properties)) as p:
      # pylint: disable=expression-not-assigned
      p | beam.io.Read(
          beam.io.BigQuerySource(
              'dataset.faketable',
              use_dataflow_native_source=True)) | beam.GroupByKey()

def test_read_pubsub_translation(self):
  runner = DataflowRunner()

  self.default_properties.append("--streaming")

  with beam.Pipeline(runner=runner,
                     options=PipelineOptions(self.default_properties)) as p:
    # pylint: disable=expression-not-assigned
    p | beam.io.ReadFromPubSub(topic='projects/project/topics/topic')

  self.expect_correct_override(
      runner.job, u'ReadFromPubSub/Read', u'ParallelRead')

def test_combine_values_translation(self):
  runner = DataflowRunner()

  with beam.Pipeline(runner=runner,
                     options=PipelineOptions(self.default_properties)) as p:
    (  # pylint: disable=expression-not-assigned
        p
        | beam.Create([('a', [1, 2]), ('b', [3, 4])])
        | beam.CombineValues(lambda v, _: sum(v)))

  job_dict = json.loads(str(runner.job))
  self.assertIn(
      u'CombineValues', set(step[u'kind'] for step in job_dict[u'steps']))

def test_use_fastavro_experiment_is_not_added_when_use_avro_is_present(self):
  remote_runner = DataflowRunner()
  self.default_properties.append('--experiment=use_avro')

  with Pipeline(remote_runner,
                PipelineOptions(self.default_properties)) as p:
    p | ptransform.Create([1])  # pylint: disable=expression-not-assigned

  debug_options = remote_runner.job.options.view_as(DebugOptions)

  self.assertFalse(debug_options.lookup_experiment('use_fastavro', False))

def test_read_bigquery_translation(self):
  runner = DataflowRunner()

  with beam.Pipeline(runner=runner,
                     options=PipelineOptions(self.default_properties)) as p:
    # pylint: disable=expression-not-assigned
    p | beam.io.Read(
        beam.io.BigQuerySource(
            'some.table',
            coder=BytesCoder(),
            use_dataflow_native_source=True))

  self.expect_correct_override(runner.job, u'Read', u'ParallelRead')

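# NOTE: the two translation tests above rely on self.expect_correct_override,
# which is defined elsewhere in the test class and not included in this
# excerpt. The helper below is only an illustrative sketch of the assumed
# behaviour (find the step by user_name and check its Dataflow kind); the
# exact matching logic is an assumption, not the actual implementation.
def expect_correct_override(self, job, step_name, expected_kind):
  """Sketch: assert the named step was overridden to the expected kind."""
  job_dict = json.loads(str(job))
  maybe_step = [
      s for s in job_dict[u'steps']
      if s[u'properties'][u'user_name'] == step_name
  ]
  self.assertTrue(maybe_step, 'Could not find step %s' % step_name)
  self.assertEqual(maybe_step[0][u'kind'], expected_kind)
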
def test_remote_runner_display_data(self):
  remote_runner = DataflowRunner()
  p = Pipeline(
      remote_runner, options=PipelineOptions(self.default_properties))

  # TODO: Should not subclass ParDo. Switch to PTransform as soon as
  # composite transforms support display data.
  class SpecialParDo(beam.ParDo):
    def __init__(self, fn, now):
      super(SpecialParDo, self).__init__(fn)
      self.fn = fn
      self.now = now

    # Make this a list to be accessible within closure
    def display_data(self):
      return {
          'asubcomponent': self.fn,
          'a_class': SpecialParDo,
          'a_time': self.now
      }

  class SpecialDoFn(beam.DoFn):
    def display_data(self):
      return {'dofn_value': 42}

    def process(self):
      pass

  now = datetime.now()
  # pylint: disable=expression-not-assigned
  (p | ptransform.Create([1, 2, 3, 4, 5])
   | 'Do' >> SpecialParDo(SpecialDoFn(), now))

  p.run()
  job_dict = json.loads(str(remote_runner.job))
  steps = [
      step for step in job_dict['steps']
      if len(step['properties'].get('display_data', [])) > 0
  ]
  step = steps[1]
  disp_data = step['properties']['display_data']
  disp_data = sorted(disp_data, key=lambda x: x['namespace'] + x['key'])
  nspace = SpecialParDo.__module__ + '.'
  expected_data = [{
      'type': 'TIMESTAMP',
      'namespace': nspace + 'SpecialParDo',
      'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
      'key': 'a_time'
  }, {
      'type': 'STRING',
      'namespace': nspace + 'SpecialParDo',
      'value': nspace + 'SpecialParDo',
      'key': 'a_class',
      'shortValue': 'SpecialParDo'
  }, {
      'type': 'INTEGER',
      'namespace': nspace + 'SpecialDoFn',
      'value': 42,
      'key': 'dofn_value'
  }]
  expected_data = sorted(
      expected_data, key=lambda x: x['namespace'] + x['key'])
  self.assertEqual(len(disp_data), 3)
  self.assertEqual(disp_data, expected_data)

def test_write_bigquery_failed_translation(self):
  """Tests that WriteToBigQuery cannot have any consumers if replaced."""
  runner = DataflowRunner()

  self.default_properties.append('--experiments=use_legacy_bq_sink')
  with self.assertRaises(Exception):
    with beam.Pipeline(runner=runner,
                       options=PipelineOptions(
                           self.default_properties)) as p:
      # pylint: disable=expression-not-assigned
      out = p | beam.Create([1]) | beam.io.WriteToBigQuery('some.table')
      out['destination_file_pairs'] | 'MyTransform' >> beam.Map(lambda _: _)

def test_streaming_engine_flag_adds_windmill_experiments(self):
  remote_runner = DataflowRunner()
  self.default_properties.append('--streaming')
  self.default_properties.append('--enable_streaming_engine')
  self.default_properties.append('--experiment=some_other_experiment')

  with Pipeline(remote_runner,
                PipelineOptions(self.default_properties)) as p:
    p | ptransform.Create([1])  # pylint: disable=expression-not-assigned

  experiments_for_job = (
      remote_runner.job.options.view_as(DebugOptions).experiments)
  self.assertIn('enable_streaming_engine', experiments_for_job)
  self.assertIn('enable_windmill_service', experiments_for_job)
  self.assertIn('some_other_experiment', experiments_for_job)

def test_write_bigquery_translation(self):
  runner = DataflowRunner()

  self.default_properties.append('--experiments=use_legacy_bq_sink')
  with beam.Pipeline(runner=runner,
                     options=PipelineOptions(self.default_properties)) as p:
    # pylint: disable=expression-not-assigned
    p | beam.Create([1]) | beam.io.WriteToBigQuery('some.table')

  job_dict = json.loads(str(runner.job))

  expected_step = {
      "kind": "ParallelWrite",
      "name": "s2",
      "properties": {
          "create_disposition": "CREATE_IF_NEEDED",
          "dataset": "some",
          "display_data": [],
          "encoding": {
              "@type": "kind:windowed_value",
              "component_encodings": [{
                  "component_encodings": [],
                  "pipeline_proto_coder_id": "ref_Coder_RowAsDictJsonCoder_4"
              }, {
                  "@type": "kind:global_window"
              }],
              "is_wrapper": True
          },
          "format": "bigquery",
          "parallel_input": {
              "@type": "OutputReference",
              "output_name": "out",
              "step_name": "s1"
          },
          "table": "table",
          "user_name": "WriteToBigQuery/Write/NativeWrite",
          "write_disposition": "WRITE_APPEND"
      }
  }

  write_step = [
      s for s in job_dict[u'steps']
      if s[u'properties'][u'user_name'].startswith('WriteToBigQuery')
  ][0]

  # Delete the @type field because in this case it is a hash which may change
  # depending on the pickling version.
  step_encoding = write_step[u'properties'][u'encoding']
  del step_encoding[u'component_encodings'][0][u'@type']
  self.assertEqual(expected_step, write_step)

def test_streaming_create_translation(self):
  remote_runner = DataflowRunner()
  self.default_properties.append("--streaming")
  p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
  p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
  p.run()
  job_dict = json.loads(str(remote_runner.job))
  self.assertEqual(len(job_dict[u'steps']), 2)

  self.assertEqual(job_dict[u'steps'][0][u'kind'], u'ParallelRead')
  self.assertEqual(
      job_dict[u'steps'][0][u'properties'][u'pubsub_subscription'],
      '_starting_signal/')
  self.assertEqual(job_dict[u'steps'][1][u'kind'], u'ParallelDo')

def test_no_group_by_key_directly_after_bigquery(self):
  remote_runner = DataflowRunner()
  p = Pipeline(
      remote_runner,
      options=PipelineOptions([
          '--dataflow_endpoint=ignored',
          '--job_name=test-job',
          '--project=test-project',
          '--staging_location=ignored',
          '--temp_location=/dev/null',
          '--no_auth'
      ]))
  rows = p | beam.io.Read(beam.io.BigQuerySource('dataset.faketable'))
  with self.assertRaises(ValueError,
                         msg=('Coder for the GroupByKey operation'
                              '"GroupByKey" is not a key-value coder: '
                              'RowAsDictJsonCoder')):
    unused_invalid = rows | beam.GroupByKey()

def test_unsupported_fnapi_features(self):
  remote_runner = DataflowRunner()
  self.default_properties.append('--experiment=beam_fn_api')
  self.default_properties.append('--experiment=use_runner_v2')

  with self.assertRaisesRegex(RuntimeError, 'Unsupported merging'):
    with Pipeline(remote_runner,
                  options=PipelineOptions(self.default_properties)) as p:
      # pylint: disable=expression-not-assigned
      p | beam.Create([]) | beam.WindowInto(CustomMergingWindowFn())

  with self.assertRaisesRegex(RuntimeError, 'Unsupported window coder'):
    with Pipeline(remote_runner,
                  options=PipelineOptions(self.default_properties)) as p:
      # pylint: disable=expression-not-assigned
      p | beam.Create([]) | beam.WindowInto(CustomWindowTypeWindowFn())

def test_environment_override_translation(self):
  self.default_properties.append('--experiments=beam_fn_api')
  self.default_properties.append('--worker_harness_container_image=FOO')
  remote_runner = DataflowRunner()
  p = Pipeline(
      remote_runner, options=PipelineOptions(self.default_properties))
  (p | ptransform.Create([1, 2, 3])  # pylint: disable=expression-not-assigned
   | 'Do' >> ptransform.FlatMap(lambda x: [(x, x)])
   | ptransform.GroupByKey())
  p.run()
  self.assertEqual(
      list(remote_runner.proto_pipeline.components.environments.values()),
      [
          beam_runner_api_pb2.Environment(
              urn=common_urns.environments.DOCKER.urn,
              payload=beam_runner_api_pb2.DockerPayload(
                  container_image='FOO').SerializeToString())
      ])

def test_unsupported_combinefn_fail(self):
  class CombinerWithNonDefaultSetupTeardown(combiners.CountCombineFn):
    def setup(self, *args, **kwargs):
      pass

    def teardown(self, *args, **kwargs):
      pass

  runner = DataflowRunner()
  with self.assertRaisesRegex(ValueError,
                              'CombineFn.setup and CombineFn.'
                              'teardown are not supported'):
    with beam.Pipeline(runner=runner,
                       options=PipelineOptions(
                           self.default_properties)) as p:
      _ = (
          p | beam.Create([1])
          | beam.CombineGlobally(CombinerWithNonDefaultSetupTeardown()))

def test_remote_runner_display_data(self):
  remote_runner = DataflowRunner()
  p = Pipeline(
      remote_runner, options=PipelineOptions(self.default_properties))
  now = datetime.now()
  # pylint: disable=expression-not-assigned
  (p | ptransform.Create([1, 2, 3, 4, 5])
   | 'Do' >> SpecialParDo(SpecialDoFn(), now))

  # TODO(https://github.com/apache/beam/issues/18012) Enable runner API on
  # this test.
  p.run(test_runner_api=False)
  job_dict = json.loads(str(remote_runner.job))
  steps = [
      step for step in job_dict['steps']
      if len(step['properties'].get('display_data', [])) > 0
  ]
  step = steps[1]
  disp_data = step['properties']['display_data']
  nspace = SpecialParDo.__module__ + '.'
  expected_data = [{
      'type': 'TIMESTAMP',
      'namespace': nspace + 'SpecialParDo',
      'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
      'key': 'a_time'
  }, {
      'type': 'STRING',
      'namespace': nspace + 'SpecialParDo',
      'value': nspace + 'SpecialParDo',
      'key': 'a_class',
      'shortValue': 'SpecialParDo'
  }, {
      'type': 'INTEGER',
      'namespace': nspace + 'SpecialDoFn',
      'value': 42,
      'key': 'dofn_value'
  }]
  self.assertUnhashableCountEqual(disp_data, expected_data)

def _run_group_into_batches_and_get_step_properties(
    self, with_sharded_key, additional_properties):
  self.default_properties.append('--streaming')
  for property in additional_properties:
    self.default_properties.append(property)

  runner = DataflowRunner()
  with beam.Pipeline(runner=runner,
                     options=PipelineOptions(self.default_properties)) as p:
    # pylint: disable=expression-not-assigned
    input = p | beam.Create([('a', 1), ('a', 1), ('b', 3), ('b', 4)])
    if with_sharded_key:
      (
          input
          | beam.GroupIntoBatches.WithShardedKey(2)
          | beam.Map(lambda key_values: (key_values[0].key, key_values[1])))
      step_name = (
          u'WithShardedKey/GroupIntoBatches/ParDo(_GroupIntoBatchesDoFn)')
    else:
      input | beam.GroupIntoBatches(2)
      step_name = u'GroupIntoBatches/ParDo(_GroupIntoBatchesDoFn)'

  return self._find_step(runner.job, step_name)['properties']

def _test_pack_combiners(self, pipeline_options, expect_packed):
  runner = DataflowRunner()
  with beam.Pipeline(runner=runner, options=pipeline_options) as p:
    data = p | beam.Create([10, 20, 30])
    _ = data | 'PackableMin' >> beam.CombineGlobally(min)
    _ = data | 'PackableMax' >> beam.CombineGlobally(max)

  unpacked_minimum_step_name = 'PackableMin/CombinePerKey/Combine'
  unpacked_maximum_step_name = 'PackableMax/CombinePerKey/Combine'
  packed_step_name = (
      'Packed[PackableMin/CombinePerKey, PackableMax/CombinePerKey]/Pack/'
      'CombinePerKey(SingleInputTupleCombineFn)/Combine')
  job_dict = json.loads(str(runner.job))
  step_names = set(
      s[u'properties'][u'user_name'] for s in job_dict[u'steps'])
  if expect_packed:
    self.assertNotIn(unpacked_minimum_step_name, step_names)
    self.assertNotIn(unpacked_maximum_step_name, step_names)
    self.assertIn(packed_step_name, step_names)
  else:
    self.assertIn(unpacked_minimum_step_name, step_names)
    self.assertIn(unpacked_maximum_step_name, step_names)
    self.assertNotIn(packed_step_name, step_names)

def test_resource_hints_translation(self, memory_hint):
  runner = DataflowRunner()
  self.default_properties.append('--resource_hint=accelerator=some_gpu')
  self.default_properties.append(f'--resource_hint={memory_hint}=20GB')
  with beam.Pipeline(runner=runner,
                     options=PipelineOptions(self.default_properties)) as p:
    # pylint: disable=expression-not-assigned
    (
        p
        | beam.Create([1])
        | 'MapWithHints' >> beam.Map(lambda x: x + 1).with_resource_hints(
            min_ram='10GB',
            accelerator=(
                'type:nvidia-tesla-k80;count:1;install-nvidia-drivers')))

  step = self._find_step(runner.job, 'MapWithHints')
  self.assertEqual(
      step['properties']['resource_hints'],
      {
          'beam:resources:min_ram_bytes:v1': '20000000000',
          'beam:resources:accelerator:v1': (
              'type%3Anvidia-tesla-k80%3Bcount%3A1%3Binstall-nvidia-drivers')
      })

def test_gbk_translation(self):
  runner = DataflowRunner()
  with beam.Pipeline(runner=runner,
                     options=PipelineOptions(self.default_properties)) as p:
    # pylint: disable=expression-not-assigned
    p | beam.Create([(1, 2)]) | beam.GroupByKey()

  expected_output_info = [{
      "encoding": {
          "@type": "kind:windowed_value",
          "component_encodings": [{
              "@type": "kind:pair",
              "component_encodings": [{
                  "@type": "kind:varint"
              }, {
                  "@type": "kind:stream",
                  "component_encodings": [{
                      "@type": "kind:varint"
                  }],
                  "is_stream_like": True
              }],
              "is_pair_like": True
          }, {
              "@type": "kind:global_window"
          }],
          "is_wrapper": True
      },
      "output_name": "out",
      "user_name": "GroupByKey.out"
  }]  # yapf: disable

  gbk_step = self._find_step(runner.job, u'GroupByKey')
  self.assertEqual(gbk_step[u'kind'], u'GroupByKey')
  self.assertEqual(
      gbk_step[u'properties']['output_info'], expected_output_info)

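# NOTE: several tests above (GroupIntoBatches, resource hints, GBK
# translation) call self._find_step, which is defined elsewhere in the test
# class and not included in this excerpt. The helper below is only a sketch
# of the assumed behaviour (look up a single job step by its user_name); it
# is not the actual implementation.
def _find_step(self, job, step_name):
  """Sketch: return the job step whose user_name starts with step_name."""
  job_dict = json.loads(str(job))
  steps = [
      s for s in job_dict[u'steps']
      if s[u'properties'][u'user_name'].startswith(step_name)
  ]
  self.assertEqual(
      len(steps), 1, 'Expected exactly one step named %s' % step_name)
  return steps[0]
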
def run_ParDo(self, transform_node):
  transform = transform_node.transform
  output = transform_node.outputs[None]
  element_coder = self._get_coder(output)
  map_task_index, producer_index, output_index = self.outputs[
      transform_node.inputs[0]]

  # If any of this ParDo's side inputs depend on outputs from this map_task,
  # we can't continue growing this map task.
  def is_reachable(leaf, root):
    if leaf == root:
      return True
    else:
      return any(is_reachable(x, root) for x in self.dependencies[leaf])

  if any(is_reachable(self.outputs[side_input.pvalue][0], map_task_index)
         for side_input in transform_node.side_inputs):
    # Start a new map tasks.
    input_element_coder = self._get_coder(transform_node.inputs[0])

    output_buffer = OutputBuffer(input_element_coder)

    fusion_break_write = operation_specs.WorkerInMemoryWrite(
        output_buffer=output_buffer,
        write_windowed_values=True,
        input=(producer_index, output_index),
        output_coders=[input_element_coder])
    self.map_tasks[map_task_index].append(
        (transform_node.full_label + '/Write', fusion_break_write))

    original_map_task_index = map_task_index
    map_task_index, producer_index, output_index = len(self.map_tasks), 0, 0

    fusion_break_read = operation_specs.WorkerRead(
        output_buffer.source_bundle(),
        output_coders=[input_element_coder])
    self.map_tasks.append(
        [(transform_node.full_label + '/Read', fusion_break_read)])

    self.dependencies[map_task_index].add(original_map_task_index)

  def create_side_read(side_input):
    label = self.side_input_labels[side_input]
    output_buffer = self.run_side_write(
        side_input.pvalue, '%s/%s' % (transform_node.full_label, label))
    return operation_specs.WorkerSideInputSource(
        output_buffer.source(), label)

  do_op = operation_specs.WorkerDoFn(  #
      serialized_fn=pickler.dumps(
          DataflowRunner._pardo_fn_data(
              transform_node,
              lambda side_input: self.side_input_labels[side_input])),
      output_tags=[PropertyNames.OUT] + [
          '%s_%s' % (PropertyNames.OUT, tag)
          for tag in transform.output_tags
      ],
      # Same assumption that DataflowRunner has about coders being compatible
      # across outputs.
      output_coders=[element_coder] * (len(transform.output_tags) + 1),
      input=(producer_index, output_index),
      side_inputs=[
          create_side_read(side_input)
          for side_input in transform_node.side_inputs
      ])

  producer_index = len(self.map_tasks[map_task_index])
  self.outputs[transform_node.outputs[None]] = (
      map_task_index, producer_index, 0)
  for ix, tag in enumerate(transform.output_tags):
    self.outputs[transform_node.outputs[tag]] = (
        map_task_index, producer_index, ix + 1)
  self.map_tasks[map_task_index].append((transform_node.full_label, do_op))

  for side_input in transform_node.side_inputs:
    self.dependencies[map_task_index].add(self.outputs[side_input.pvalue][0])

def test_get_default_gcp_region_ignores_error(
    self, patched_environ, patched_processes):
  runner = DataflowRunner()
  result = runner.get_default_gcp_region()
  self.assertIsNone(result)

def test_get_default_gcp_region_from_gcloud(
    self, patched_environ, patched_processes):
  runner = DataflowRunner()
  result = runner.get_default_gcp_region()
  self.assertEqual(result, 'some-region2')

def test_get_default_gcp_region_no_default_returns_none(
    self, patched_environ, patched_processes):
  runner = DataflowRunner()
  result = runner.get_default_gcp_region()
  self.assertIsNone(result)

def _get_coder(self, pvalue, windowed=True):
  # TODO(robertwb): This should be an attribute of the pvalue itself.
  return DataflowRunner._get_coder(
      pvalue.element_type or typehints.Any,
      pvalue.windowing.windowfn.get_window_coder() if windowed else None)

windowed_avg = (
    windowed_data
    | "avg1" >> beam.CombinePerKey(beam.combiners.MeanCombineFn()))


class PrintWindowResults(beam.DoFn):
  def process(self, element, window=beam.DoFn.WindowParam):
    new_element = element
    yield new_element


(
    windowed_sum
    | "sum4" >> beam.ParDo(PrintWindowResults())
    | "sum5" >> beam.Map(
        lambda st: '{{"id": {}, "total_steps": {}}}'.format(st[0], st[1]))
    | "sum6" >> beam.Map(lambda z: bytes(z, "utf-8"))
    | "sum7" >> beam.io.WriteToPubSub(
        topic="projects/data228/topics/data228-hw8-out"))

(
    windowed_avg
    | "avg4" >> beam.ParDo(PrintWindowResults())
    | "avg5" >> beam.Map(
        lambda av: '{{"id": {}, "average_steps": {}}}'.format(av[0], av[1]))
    | "avg6" >> beam.Map(lambda po: bytes(po, "utf-8"))
    | "avg7" >> beam.io.WriteToPubSub(
        topic="projects/data228/topics/data228-hw8-out"))

DataflowRunner().run_pipeline(pipeline, options=options)
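
# The fragment above assumes that `pipeline`, `options`, `windowed_data` and
# `windowed_sum` were built earlier. A minimal sketch of what that upstream
# section might look like is given below; the input topic name, window size
# and parsed field names are hypothetical, not taken from the original.
import json

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.transforms import window

options = PipelineOptions(streaming=True)
pipeline = beam.Pipeline(options=options)

windowed_data = (
    pipeline
    | "read" >> beam.io.ReadFromPubSub(
        topic="projects/data228/topics/data228-hw8-in")  # hypothetical topic
    | "parse" >> beam.Map(lambda msg: json.loads(msg.decode("utf-8")))
    | "kv" >> beam.Map(lambda d: (d["id"], d["steps"]))  # hypothetical fields
    | "window" >> beam.WindowInto(window.FixedWindows(60)))

windowed_sum = (
    windowed_data
    | "sum1" >> beam.CombinePerKey(sum))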