def expand(self, p):
    """Expand into a recursive Fibonacci-style sum over external transforms.

    For levels up to 2 the result is a single-element PCollection holding 1.
    For higher levels the 'fib' external transform is applied twice, with
    payloads ``level - 1`` and ``level - 2`` (ASCII-encoded), and the two
    results are flattened and summed globally.
    """
    if self._level <= 2:
        # Base case: no external expansion needed.
        return p | beam.Create([1])

    # Branch 'A' covers level - 1, branch 'B' covers level - 2; each branch
    # is expanded by its own ExpansionServiceServicer instance.
    branches = tuple(
        p | label >> beam.ExternalTransform(
            'fib',
            str(self._level - offset).encode('ascii'),
            expansion_service.ExpansionServiceServicer())
        for label, offset in (('A', 1), ('B', 2)))

    return (
        branches
        | beam.Flatten()
        | beam.CombineGlobally(sum).without_defaults())
def run(pipeline_args, input_file, output_file):
    """Run a word-count pipeline whose counting step is an external transform.

    Args:
      pipeline_args: arguments used to build the ``PipelineOptions``.
      input_file: file (or file pattern) read with ``ReadFromText``.
      output_file: destination handed to ``WriteToText``.
    """
    def format_result(word_count):
        # Each element is a (word, count) pair.
        word, count = word_count
        return '%s: %d' % (word, count)

    options = PipelineOptions(pipeline_args)
    # save_main_session is enabled because one or more DoFns in this workflow
    # rely on global context (e.g. a module imported at module level).
    options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=options)

    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(input_file)

    # Split lines into words, then count them via the external transform.
    words = lines | 'split' >> (
        beam.ParDo(WordExtractingDoFn()).with_output_types(bytes))
    counts = words | 'count' >> beam.ExternalTransform(
        'pytest:beam:transforms:count', None, EXPANSION_SERVICE_ADDR)

    # Render every count as a string.
    output = counts | 'format' >> beam.Map(format_result)

    # The write transform is applied purely for its side effects.
    # pylint: disable=expression-not-assigned
    output | 'write' >> WriteToText(output_file)

    p.run().wait_until_finish()
def test_pipeline_generation(self):
    """Check that a proto round trip yields the expanded external transform.

    The original pipeline keeps the external transform un-expanded (no
    sub-parts), while the pipeline rebuilt from the runner API proto contains
    the expanded 'TestLabel' step.
    """
    @ptransform.PTransform.register_urn('simple', None)
    class SimpleTransform(ptransform.PTransform):
        def expand(self, pcoll):
            return pcoll | 'TestLabel' >> beam.Map(
                lambda x: 'Simple(%s)' % x)

        def to_runner_api_parameter(self, unused_context):
            return 'simple', None

        @staticmethod
        def from_runner_api_parameter(unused_parameter, unused_context):
            return SimpleTransform()

    pipeline = beam.Pipeline()
    source = pipeline | beam.Create(['a', 'b'])
    res = source | beam.ExternalTransform(
        'simple', None, expansion_service.ExpansionServiceServicer())
    assert_that(res, equal_to(['Simple(a)', 'Simple(b)']))

    proto, _ = pipeline.to_runner_api(return_context=True)
    pipeline_from_proto = Pipeline.from_runner_api(
        proto, pipeline.runner, pipeline._options)

    # The original pipeline holds the un-expanded external transform.
    original_external = pipeline.transforms_stack[0].parts[1]
    self.assertEqual([], original_external.parts)

    # The rebuilt pipeline holds the expanded external transform.
    rebuilt_external = pipeline_from_proto.transforms_stack[0].parts[1]
    self.assertNotEqual([], rebuilt_external.parts)
    self.assertEqual(
        u'ExternalTransform(simple)/TestLabel',
        rebuilt_external.parts[0].full_label)
def run_combine_globally(self, pipeline):
    """Check that the TEST_COMGL_URN external transform maps [1, 2, 3] to [6].

    Args:
      pipeline: the pipeline to build on; it is run when the ``with`` block
        exits.
    """
    with pipeline as p:
        numbers = p | beam.Create([1, 2, 3]).with_output_types(int)
        res = numbers | beam.ExternalTransform(
            TEST_COMGL_URN, None, self.expansion_service)
        assert_that(res, equal_to([6]))
def test_external_empty_spec_translation(self):
    """Check that a cleared transform spec stays absent after translation.

    After the pipeline has run, the spec of the expanded 'TestLabel'
    transform is cleared. The pipeline is then translated to its runner API
    proto and the matching transform is expected to print without a
    ``spec {`` section.
    """
    pipeline = beam.Pipeline()
    external_transform = beam.ExternalTransform(
        'beam:transforms:xlang:test:prefix',
        ImplicitSchemaPayloadBuilder({'data': u'0'}),
        expansion_service.ExpansionServiceServicer())
    _ = (pipeline | beam.Create(['a', 'b']) | external_transform)
    pipeline.run().wait_until_finish()

    external_transform_label = (
        'ExternalTransform(beam:transforms:xlang:test:prefix)/TestLabel')
    expanded_transforms = external_transform._expanded_components.transforms
    for transform in expanded_transforms.values():
        # We clear the spec of one of the external transforms.
        if transform.unique_name == external_transform_label:
            transform.spec.Clear()

    context = pipeline_context.PipelineContext()
    proto_pipeline = pipeline.to_runner_api(context=context)

    # Look the transform up by the same label that was cleared above, so the
    # two lookups cannot drift apart.
    proto_transform = None
    for transform in proto_pipeline.components.transforms.values():
        if transform.unique_name == external_transform_label:
            proto_transform = transform

    self.assertIsNotNone(proto_transform)
    # assertNotIn prints the offending text on failure, unlike
    # assertTrue(find(...) == -1).
    self.assertNotIn('spec {', str(proto_transform))
def test_xlang_parquetio_write(self):
    """Write three Avro records through the external Parquet write transform.

    The expansion service address and jar come from the EXPANSION_PORT and
    EXPANSION_JAR environment variables. A RuntimeError mentioning the
    Parquet write URN is tolerated (reported on stdout); any other
    RuntimeError is re-raised.
    """
    # NOTE: os.environ.get returns None for an unset variable; an unset
    # EXPANSION_JAR then fails in the string concatenation below.
    expansion_jar = os.environ.get('EXPANSION_JAR')
    port = os.environ.get('EXPANSION_PORT')
    address = 'localhost:%s' % port
    try:
        with TestPipeline() as p:
            p.get_pipeline_options().view_as(
                DebugOptions).experiments.append(
                    'jar_packages=' + expansion_jar)
            p.not_use_test_runner_api = True
            _ = (
                p
                | beam.Create([
                    AvroRecord({"name": "abc"}),
                    AvroRecord({"name": "def"}),
                    AvroRecord({"name": "ghi"}),
                ])
                | beam.ExternalTransform(
                    PARQUET_WRITE_URN,
                    ImplicitSchemaPayloadBuilder(
                        {'data': u'/tmp/test.parquet'}),
                    address))
    except RuntimeError as e:
        # The URN is a literal identifier, not a regex pattern, so a plain
        # substring check is used instead of re.search.
        if PARQUET_WRITE_URN in str(e):
            print(
                "looks like URN not implemented in expansion service, skipping."
            )
        else:
            # Bare raise keeps the original traceback intact.
            raise
def test_multi(self):
    """Exercise an external transform with several inputs and outputs.

    A locally registered 'multi' transform flattens the two main inputs and
    appends the side-input value to each element, and separately doubles the
    side input. Both outputs are verified.
    """
    @ptransform.PTransform.register_urn('multi', None)
    class MutltiTransform(ptransform.PTransform):
        def expand(self, pcolls):
            merged = (pcolls['main1'], pcolls['main2']) | beam.Flatten()
            suffixed = merged | beam.Map(
                lambda x, s: x + s, beam.pvalue.AsSingleton(pcolls['side']))
            doubled = pcolls['side'] | beam.Map(lambda x: x + x)
            return {'main': suffixed, 'side': doubled}

        def to_runner_api_parameter(self, unused_context):
            return 'multi', None

        @staticmethod
        def from_runner_api_parameter(unused_parameter, unused_context):
            return MutltiTransform()

    with beam.Pipeline() as p:
        main1 = p | 'Main1' >> beam.Create(['a', 'bb'], reshuffle=False)
        main2 = p | 'Main2' >> beam.Create(
            ['x', 'yy', 'zzz'], reshuffle=False)
        side = p | 'Side' >> beam.Create(['s'])

        inputs = {'main1': main1, 'main2': main2, 'side': side}
        res = inputs | beam.ExternalTransform(
            'multi', None, expansion_service.ExpansionServiceServicer())

        assert_that(res['main'], equal_to(['as', 'bbs', 'xs', 'yys', 'zzzs']))
        assert_that(res['side'], equal_to(['ss']), label='CheckSide')
def test_job_python_from_python_it(self):
    """Run a pipeline rebuilt from the proto of one using 'simple' externally.

    The integration-test pipeline applies the locally registered 'simple'
    transform through ExternalTransform, is converted to its runner API
    proto, rebuilt from that proto, and then executed to completion.
    """
    @ptransform.PTransform.register_urn('simple', None)
    class SimpleTransform(ptransform.PTransform):
        def expand(self, pcoll):
            return pcoll | beam.Map(lambda x: 'Simple(%s)' % x)

        def to_runner_api_parameter(self, unused_context):
            return 'simple', None

        @staticmethod
        def from_runner_api_parameter(_0, _1, _2):
            return SimpleTransform()

    pipeline = TestPipeline(is_integration_test=True)
    source = pipeline | beam.Create(['a', 'b'])
    res = source | beam.ExternalTransform(
        'simple', None, expansion_service.ExpansionServiceServicer())
    assert_that(res, equal_to(['Simple(a)', 'Simple(b)']))

    # Only the proto is needed; the returned context is discarded.
    proto_pipeline, _ = pipeline.to_runner_api(return_context=True)
    rebuilt = Pipeline.from_runner_api(
        proto_pipeline, pipeline.runner, pipeline._options)
    rebuilt.run().wait_until_finish()
def test_payload(self):
    """Check that the 'payload' external transform receives payload b's'.

    With inputs 'a' and 'bb' the expected outputs are 'as' and 'bbs'.
    """
    with beam.Pipeline() as p:
        words = p | beam.Create(['a', 'bb'], reshuffle=False)
        res = words | beam.ExternalTransform(
            'payload', b's', expansion_service.ExpansionServiceServicer())
        assert_that(res, equal_to(['as', 'bbs']))
def run_flatten(self, pipeline):
    """Check that the TEST_FLATTEN_URN external transform merges two inputs.

    Args:
      pipeline: the pipeline to build on; it is run when the ``with`` block
        exits.
    """
    with pipeline as p:
        col1 = p | 'col1' >> beam.Create([1, 2, 3]).with_output_types(int)
        col2 = p | 'col2' >> beam.Create([4, 5, 6]).with_output_types(int)
        # Both collections are handed over together as a tuple.
        inputs = (col1, col2)
        res = inputs | beam.ExternalTransform(
            TEST_FLATTEN_URN, None, self.expansion_service)
        assert_that(res, equal_to([1, 2, 3, 4, 5, 6]))
def test_simple(self):
    """Check that the 'simple' external transform wraps each element.

    Inputs 'a' and 'b' are expected to become 'Simple(a)' and 'Simple(b)'.
    """
    with beam.Pipeline() as p:
        letters = p | beam.Create(['a', 'b'])
        res = letters | beam.ExternalTransform(
            'simple', None, expansion_service.ExpansionServiceServicer())
        assert_that(res, equal_to(['Simple(a)', 'Simple(b)']))
def run_partition(self, pipeline):
    """Check that the TEST_PARTITION_URN external transform splits 1..6.

    Output '0' is expected to hold the even numbers and output '1' the odd
    ones.

    Args:
      pipeline: the pipeline to build on; it is run when the ``with`` block
        exits.
    """
    with pipeline as p:
        numbers = p | beam.Create(
            [1, 2, 3, 4, 5, 6]).with_output_types(int)
        res = numbers | beam.ExternalTransform(
            TEST_PARTITION_URN, None, self.expansion_service)
        assert_that(res['0'], equal_to([2, 4, 6]), label='check_even')
        assert_that(res['1'], equal_to([1, 3, 5]), label='check_odd')
def test_java_expansion(self):
    """Run a count-filtered-letters pipeline against a Java expansion server.

    The test is skipped when no expansion service jar is configured.
    Otherwise the jar is launched as a subprocess on port 8091, the test
    waits for its gRPC channel to become ready, and a pipeline using the
    filter and count external transforms is run on the portable runner.

    Raises:
      unittest.SkipTest: if ``self.expansion_service_jar`` is not set.
    """
    if not self.expansion_service_jar:
        raise unittest.SkipTest('No expansion service jar provided.')

    # The actual definitions of these transforms is in
    # org.apache.beam.runners.core.construction.TestExpansionService.
    TEST_COUNT_URN = "pytest:beam:transforms:count"
    TEST_FILTER_URN = "pytest:beam:transforms:filter_less_than"

    # Run as cheaply as possible on the portable runner.
    # TODO(robertwb): Support this directly in the direct runner.
    options = beam.options.pipeline_options.PipelineOptions(
        runner='PortableRunner',
        experiments=['beam_fn_api'],
        environment_type=python_urns.EMBEDDED_PYTHON,
        job_endpoint='embed')

    port = '8091'
    address = 'localhost:%s' % port

    # Start the java server before entering the try block: if the launch
    # itself fails there is nothing to clean up, and the cleanup code must
    # not hide that failure behind an unbound `server` name.
    server = subprocess.Popen(
        ['java', '-jar', self.expansion_service_jar, port])
    try:
        # Wait for the server to be ready.
        with grpc.insecure_channel(address) as channel:
            grpc.channel_ready_future(channel).result()

        # Run a simple count-filtered-letters pipeline.
        with beam.Pipeline(options=options) as p:
            res = (
                p
                | beam.Create(list('aaabccxyyzzz'))
                | beam.Map(unicode)
                # TODO(BEAM-6587): Use strings directly rather than ints.
                | beam.Map(lambda x: int(ord(x)))
                | beam.ExternalTransform(TEST_FILTER_URN, b'middle', address)
                | beam.ExternalTransform(TEST_COUNT_URN, None, address)
                # TODO(BEAM-6587): Remove when above is removed.
                | beam.Map(lambda kv: (chr(kv[0]), kv[1]))
                | beam.Map(lambda kv: '%s: %s' % kv))

            assert_that(res, equal_to(['a: 3', 'b: 1', 'c: 2']))
    finally:
        server.kill()
        # Reap the killed child so it does not linger as a zombie process.
        server.wait()
def run_prefix(self, pipeline):
    """Check that the TEST_PREFIX_URN external transform prepends '0'.

    The prefix is supplied through an ImplicitSchemaPayloadBuilder payload
    with a single 'data' field.

    Args:
      pipeline: the pipeline to build on; it is run when the ``with`` block
        exits.
    """
    with pipeline as p:
        letters = p | beam.Create(['a', 'b']).with_output_types(unicode)
        res = letters | beam.ExternalTransform(
            TEST_PREFIX_URN,
            ImplicitSchemaPayloadBuilder({'data': u'0'}),
            self.expansion_service)
        assert_that(res, equal_to(['0a', '0b']))
def test_multi(self):
    """Apply the 'multi' external transform to two main inputs and a side.

    The 'main' output is expected to carry every main element with 's'
    appended, and the 'side' output the doubled side value.
    """
    with beam.Pipeline() as p:
        main1 = p | 'Main1' >> beam.Create(['a', 'bb'], reshuffle=False)
        main2 = p | 'Main2' >> beam.Create(
            ['x', 'yy', 'zzz'], reshuffle=False)
        side = p | 'Side' >> beam.Create(['s'])

        # Inputs are passed by name as a dict of PCollections.
        inputs = {'main1': main1, 'main2': main2, 'side': side}
        res = inputs | beam.ExternalTransform(
            'multi', None, expansion_service.ExpansionServiceServicer())

        assert_that(res['main'], equal_to(['as', 'bbs', 'xs', 'yys', 'zzzs']))
        assert_that(res['side'], equal_to(['ss']), label='CheckSide')
def run_combine_per_key(self, pipeline):
    """Check the TEST_COMPK_URN external transform on keyed integers.

    Input pairs ('a', 1), ('a', 2), ('b', 3) are expected to yield
    ('a', 3) and ('b', 3).

    Args:
      pipeline: the pipeline to build on; it is run when the ``with`` block
        exits.
    """
    with pipeline as p:
        pairs = p | beam.Create(
            [('a', 1), ('a', 2), ('b', 3)]
        ).with_output_types(typing.Tuple[unicode, int])
        res = pairs | beam.ExternalTransform(
            TEST_COMPK_URN, None, self.expansion_service)
        assert_that(res, equal_to([('a', 3), ('b', 3)]))
def run_pipelines(pipeline_options):
    """Run a count-filtered-letters pipeline against a Java expansion server.

    The pipeline is created from ``pipeline_options``, the dockerized job
    server is initialised, the expansion service jar configured on
    ``ExternalTransformTest`` is launched as a subprocess, and the pipeline
    is run to completion once the server's gRPC channel is ready.

    Args:
      pipeline_options: pipeline options; the configured runner must be
        the PortableRunner (checked case-insensitively).
    """
    # The actual definitions of these transforms is in
    # org.apache.beam.runners.core.construction.TestExpansionService.
    TEST_COUNT_URN = "pytest:beam:transforms:count"
    TEST_FILTER_URN = "pytest:beam:transforms:filter_less_than"

    assert (pipeline_options.view_as(StandardOptions).runner.lower() ==
            "portablerunner"), "Only PortableRunner is supported."

    # Stays None until the subprocess has actually been started, so the
    # cleanup below cannot fail on an unbound name and mask an earlier error
    # (pipeline creation and job-server setup both run before the launch).
    server = None
    try:
        # Run a simple count-filtered-letters pipeline.
        p = beam.Pipeline(options=pipeline_options)
        p.runner.init_dockerized_job_server()

        # Start the java server and wait for it to be ready.
        port = str(ExternalTransformTest.expansion_service_port)
        address = 'localhost:%s' % port
        server = subprocess.Popen([
            'java', '-jar', ExternalTransformTest.expansion_service_jar, port
        ])

        with grpc.insecure_channel(address) as channel:
            grpc.channel_ready_future(channel).result()

        res = (
            p
            | beam.Create(list('aaabccxyyzzz'))
            | beam.Map(unicode)
            # TODO(BEAM-6587): Use strings directly rather than ints.
            | beam.Map(lambda x: int(ord(x)))
            | beam.ExternalTransform(TEST_FILTER_URN, b'middle', address)
            | beam.ExternalTransform(TEST_COUNT_URN, None, address)
            # TODO(BEAM-6587): Remove when above is removed.
            | beam.Map(lambda kv: (chr(kv[0]), kv[1]))
            | beam.Map(lambda kv: '%s: %s' % kv))

        assert_that(res, equal_to(['a: 3', 'b: 1', 'c: 2']))

        p.run().wait_until_finish()
    finally:
        if server is not None:
            server.kill()
            # Reap the killed child so it does not linger as a zombie.
            server.wait()
def run_group_by_key(self, pipeline):
    """Check the TEST_GBK_URN external transform on (int, str) pairs.

    Each output element is rendered as ``key:v1,v2,...`` with the values
    sorted, and the rendering is compared against the expected strings.

    Args:
      pipeline: the pipeline to build on; it is run when the ``with`` block
        exits.
    """
    with pipeline as p:
        pairs = p | beam.Create(
            [(0, "1"), (0, "2"), (1, "3")], reshuffle=False
        ).with_output_types(typing.Tuple[int, unicode])
        grouped = pairs | beam.ExternalTransform(
            TEST_GBK_URN, None, self.expansion_service)
        # Values are sorted so the rendered string is order-independent.
        res = grouped | beam.Map(
            lambda x: "{}:{}".format(x[0], ','.join(sorted(x[1]))))
        assert_that(res, equal_to(['0:1,2', '1:3']))
def test_external_transform_finder_leaf(self):
    """Check that a pipeline reports containing external transforms.

    The 'nooutput' external transform is applied, the pipeline is run to
    completion, and ``contains_external_transforms`` must then be truthy.
    """
    pipeline = beam.Pipeline()
    letters = pipeline | beam.Create(['a', 'b'])
    _ = letters | beam.ExternalTransform(
        'beam:transforms:xlang:test:nooutput',
        ImplicitSchemaPayloadBuilder({'data': u'0'}),
        expansion_service.ExpansionServiceServicer())
    pipeline.run().wait_until_finish()

    self.assertTrue(pipeline.contains_external_transforms)
def run_multi_input_output_with_sideinput(self, pipeline):
    """Check the TEST_MULTI_URN external transform with named inputs.

    Two main inputs and one side input are passed by name; the 'main'
    output is expected to carry every main element with 's' appended, and
    the 'side' output the doubled side value.

    Args:
      pipeline: the pipeline to build on; it is run when the ``with`` block
        exits.
    """
    with pipeline as p:
        main1 = p | 'Main1' >> beam.Create(
            ['a', 'bb'], reshuffle=False).with_output_types(unicode)
        main2 = p | 'Main2' >> beam.Create(
            ['x', 'yy', 'zzz'], reshuffle=False).with_output_types(unicode)
        side = p | 'Side' >> beam.Create(['s']).with_output_types(unicode)

        inputs = {'main1': main1, 'main2': main2, 'side': side}
        res = inputs | beam.ExternalTransform(
            TEST_MULTI_URN, None, self.expansion_service)

        assert_that(res['main'], equal_to(['as', 'bbs', 'xs', 'yys', 'zzzs']))
        assert_that(res['side'], equal_to(['ss']), label='CheckSide')
def test_combine_globally(self):
    """Check that the TEST_COMGL_URN external transform maps [1, 2, 3] to [6].

    The test pipeline gets the expansion jar added as a 'jar_packages'
    experiment and has ``not_use_test_runner_api`` set before it is built.
    """
    test_pipeline = TestPipeline()
    debug_options = test_pipeline.get_pipeline_options().view_as(DebugOptions)
    debug_options.experiments.append(
        'jar_packages=' + ValidateRunnerXlangTest.expansion_jar)
    test_pipeline.not_use_test_runner_api = True

    with test_pipeline as p:
        numbers = p | beam.Create([1, 2, 3]).with_output_types(int)
        res = numbers | beam.ExternalTransform(
            TEST_COMGL_URN, None, ValidateRunnerXlangTest.expansion_service)
        assert_that(res, equal_to([6]))
def run_cogroup_by_key(self, pipeline):
    """Check the TEST_CGBK_URN external transform on two keyed inputs.

    The two collections are passed by name ('col1', 'col2'). Each output
    element is rendered as ``key:v1,v2,...`` with the values sorted.

    Args:
      pipeline: the pipeline to build on; it is run when the ``with`` block
        exits.
    """
    with pipeline as p:
        col1 = p | 'create_col1' >> beam.Create(
            [(0, "1"), (0, "2"), (1, "3")], reshuffle=False
        ).with_output_types(typing.Tuple[int, unicode])
        col2 = p | 'create_col2' >> beam.Create(
            [(0, "4"), (1, "5"), (1, "6")], reshuffle=False
        ).with_output_types(typing.Tuple[int, unicode])

        inputs = {'col1': col1, 'col2': col2}
        grouped = inputs | beam.ExternalTransform(
            TEST_CGBK_URN, None, self.expansion_service)
        # Values are sorted so the rendered string is order-independent.
        res = grouped | beam.Map(
            lambda x: "{}:{}".format(x[0], ','.join(sorted(x[1]))))
        assert_that(res, equal_to(['0:1,2,4', '1:3,5,6']))
def test_flatten(self):
    """Check that the TEST_FLATTEN_URN external transform merges two inputs.

    The test pipeline gets the expansion jar added as a 'jar_packages'
    experiment and has ``not_use_test_runner_api`` set before it is built.
    """
    test_pipeline = TestPipeline()
    debug_options = test_pipeline.get_pipeline_options().view_as(DebugOptions)
    debug_options.experiments.append(
        'jar_packages=' + ValidateRunnerXlangTest.expansion_jar)
    test_pipeline.not_use_test_runner_api = True

    with test_pipeline as p:
        col1 = p | 'col1' >> beam.Create([1, 2, 3]).with_output_types(int)
        col2 = p | 'col2' >> beam.Create([4, 5, 6]).with_output_types(int)
        # Both collections are handed over together as a tuple.
        inputs = (col1, col2)
        res = inputs | beam.ExternalTransform(
            TEST_FLATTEN_URN, None, ValidateRunnerXlangTest.expansion_service)
        assert_that(res, equal_to([1, 2, 3, 4, 5, 6]))
def test_partition(self):
    """Check that the TEST_PARTITION_URN external transform splits 1..6.

    Output '0' is expected to hold the even numbers and output '1' the odd
    ones. The test pipeline gets the expansion jar added as a 'jar_packages'
    experiment and has ``not_use_test_runner_api`` set before it is built.
    """
    test_pipeline = TestPipeline()
    debug_options = test_pipeline.get_pipeline_options().view_as(DebugOptions)
    debug_options.experiments.append(
        'jar_packages=' + ValidateRunnerXlangTest.expansion_jar)
    test_pipeline.not_use_test_runner_api = True

    with test_pipeline as p:
        numbers = p | beam.Create(
            [1, 2, 3, 4, 5, 6]).with_output_types(int)
        res = numbers | beam.ExternalTransform(
            TEST_PARTITION_URN, None,
            ValidateRunnerXlangTest.expansion_service)
        assert_that(res['0'], equal_to([2, 4, 6]), label='check_even')
        assert_that(res['1'], equal_to([1, 3, 5]), label='check_odd')
def test_prefix(self):
    """Check that the TEST_PREFIX_URN external transform prepends '0'.

    The prefix is supplied through an ImplicitSchemaPayloadBuilder payload
    with a single 'data' field. The test pipeline gets the expansion jar
    added as a 'jar_packages' experiment and has
    ``not_use_test_runner_api`` set before it is built.
    """
    test_pipeline = TestPipeline()
    debug_options = test_pipeline.get_pipeline_options().view_as(DebugOptions)
    debug_options.experiments.append(
        'jar_packages=' + ValidateRunnerXlangTest.expansion_jar)
    test_pipeline.not_use_test_runner_api = True

    with test_pipeline as p:
        letters = p | beam.Create(['a', 'b']).with_output_types(unicode)
        res = letters | beam.ExternalTransform(
            TEST_PREFIX_URN,
            ImplicitSchemaPayloadBuilder({'data': u'0'}),
            ValidateRunnerXlangTest.expansion_service)
        assert_that(res, equal_to(['0a', '0b']))
def test_combine_per_key(self):
    """Check the TEST_COMPK_URN external transform on keyed integers.

    Input pairs ('a', 1), ('a', 2), ('b', 3) are expected to yield
    ('a', 3) and ('b', 3). The test pipeline gets the expansion jar added
    as a 'jar_packages' experiment and has ``not_use_test_runner_api`` set
    before it is built.
    """
    test_pipeline = TestPipeline()
    debug_options = test_pipeline.get_pipeline_options().view_as(DebugOptions)
    debug_options.experiments.append(
        'jar_packages=' + ValidateRunnerXlangTest.expansion_jar)
    test_pipeline.not_use_test_runner_api = True

    with test_pipeline as p:
        pairs = p | beam.Create(
            [('a', 1), ('a', 2), ('b', 3)]
        ).with_output_types(typing.Tuple[unicode, int])
        res = pairs | beam.ExternalTransform(
            TEST_COMPK_URN, None, ValidateRunnerXlangTest.expansion_service)
        assert_that(res, equal_to([('a', 3), ('b', 3)]))
def test_as_external_transform_no_kwargs(self):
    """Use a fully qualified named transform with positional args only.

    With the filter set to '*', ``_TestTransform`` is referenced by its
    fully qualified name and given the positional args 'x' and 'y' (no
    kwargs); inputs 'a', 'b', 'c' are expected to become 'xay', 'xby',
    'xcy'.
    """
    with FullyQualifiedNamedTransform.with_filter('*'), \
            beam.Pipeline() as p:
        letters = p | beam.Create(['a', 'b', 'c'])
        named_transform = beam.ExternalTransform(
            PYTHON_FULLY_QUALIFIED_NAMED_TRANSFORM_URN,
            ImplicitSchemaPayloadBuilder({
                'constructor': 'apache_beam.transforms'
                '.fully_qualified_named_transform_test._TestTransform',
                'args': beam.Row(arg0='x', arg1='y'),
            }),
            expansion_service.ExpansionServiceServicer())
        res = letters | named_transform
        assert_that(res, equal_to(['xay', 'xby', 'xcy']))
def run_flatten(self, pipeline):
    """Validate Flatten across the language boundary.

    Target transform: Flatten
    (https://beam.apache.org/documentation/programming-guide/#flatten)

    Scenario: several collections are merged into a single collection.

    Boundary conditions covered:
      - PCollectionList<?> passed to an external transform
      - PCollection<?> returned from an external transform
    """
    with pipeline as p:
        col1 = p | 'col1' >> beam.Create([1, 2, 3]).with_output_types(int)
        col2 = p | 'col2' >> beam.Create([4, 5, 6]).with_output_types(int)
        # Both collections are handed over together as a tuple.
        inputs = (col1, col2)
        res = inputs | beam.ExternalTransform(
            TEST_FLATTEN_URN, None, self.expansion_service)
        assert_that(res, equal_to([1, 2, 3, 4, 5, 6]))
def test_no_output_coder(self):
    """Check the element type of the 'map_to_union_types' external output.

    After the pipeline has run, the expanded transform must have exactly
    one output, and the PCollection behind it must have element type
    ``typehints.Any``.
    """
    external_transform = beam.ExternalTransform(
        'map_to_union_types',
        None,
        expansion_service.ExpansionServiceServicer())

    with beam.Pipeline() as p:
        numbers = p | beam.Create([2, 2], reshuffle=False)
        res = numbers | external_transform
        assert_that(res, equal_to([2, 2]))

    context = pipeline_context.PipelineContext(
        external_transform._expanded_components)
    outputs = external_transform._expanded_transform.outputs
    self.assertEqual(len(outputs), 1)

    # Only the PCollection ids matter here; the output tags are not used.
    for pcol_id in outputs.values():
        pcol = context.pcollections.get_by_id(pcol_id)
        self.assertEqual(pcol.element_type, typehints.Any)
def run_combine_globally(self, pipeline):
    """Validate a global Combine across the language boundary.

    Target transform: Combine
    (https://beam.apache.org/documentation/programming-guide/#combine)

    Scenario: elements are combined globally with a predefined simple
    CombineFn.

    Boundary conditions covered:
      - PCollection<?> passed to an external transform
      - PCollection<?> returned from an external transform
    """
    with pipeline as p:
        numbers = p | beam.Create([1, 2, 3]).with_output_types(int)
        res = numbers | beam.ExternalTransform(
            TEST_COMGL_URN, None, self.expansion_service)
        assert_that(res, equal_to([6]))