def expand(self, p):
  if self._level <= 2:
    return p | beam.Create([1])
  else:
    a = p | 'A' >> beam.ExternalTransform(
        'fib',
        str(self._level - 1).encode('ascii'),
        expansion_service.ExpansionServiceServicer())
    b = p | 'B' >> beam.ExternalTransform(
        'fib',
        str(self._level - 2).encode('ascii'),
        expansion_service.ExpansionServiceServicer())
    return ((a, b)
            | beam.Flatten()
            | beam.CombineGlobally(sum).without_defaults())

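# A hypothetical driver for the recursive transform above, assuming the
# enclosing PTransform is registered with the expansion service under the
# 'fib' URN and takes the level as an ASCII bytes payload (as the recursive
# ExternalTransform calls imply). The level 6 and the expected value are
# illustrative.
import apache_beam as beam
from apache_beam.runners.portability import expansion_service
from apache_beam.testing.util import assert_that, equal_to

with beam.Pipeline() as p:
  res = p | beam.ExternalTransform(
      'fib',
      str(6).encode('ascii'),
      expansion_service.ExpansionServiceServicer())
  # With fib(1) == fib(2) == 1, the pipeline computes fib(6) == 8.
  assert_that(res, equal_to([8]))
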
def test_external_empty_spec_translation(self):
  pipeline = beam.Pipeline()
  external_transform = beam.ExternalTransform(
      'beam:transforms:xlang:test:prefix',
      ImplicitSchemaPayloadBuilder({'data': u'0'}),
      expansion_service.ExpansionServiceServicer())
  _ = (pipeline | beam.Create(['a', 'b']) | external_transform)
  pipeline.run().wait_until_finish()

  external_transform_label = (
      'ExternalTransform(beam:transforms:xlang:test:prefix)/TestLabel')
  for transform in external_transform._expanded_components.transforms.values():
    # We clear the spec of one of the external transforms.
    if transform.unique_name == external_transform_label:
      transform.spec.Clear()

  context = pipeline_context.PipelineContext()
  proto_pipeline = pipeline.to_runner_api(context=context)

  proto_transform = None
  for transform in proto_pipeline.components.transforms.values():
    if (transform.unique_name ==
        'ExternalTransform(beam:transforms:xlang:test:prefix)/TestLabel'):
      proto_transform = transform

  self.assertIsNotNone(proto_transform)
  self.assertTrue(str(proto_transform).strip().find('spec {') == -1)

def test_job_python_from_python_it(self):
  @ptransform.PTransform.register_urn('simple', None)
  class SimpleTransform(ptransform.PTransform):
    def expand(self, pcoll):
      return pcoll | beam.Map(lambda x: 'Simple(%s)' % x)

    def to_runner_api_parameter(self, unused_context):
      return 'simple', None

    @staticmethod
    def from_runner_api_parameter(_0, _1, _2):
      return SimpleTransform()

  pipeline = TestPipeline(is_integration_test=True)

  res = (
      pipeline
      | beam.Create(['a', 'b'])
      | beam.ExternalTransform(
          'simple', None, expansion_service.ExpansionServiceServicer()))
  assert_that(res, equal_to(['Simple(a)', 'Simple(b)']))

  proto_pipeline, _ = pipeline.to_runner_api(return_context=True)
  pipeline_from_proto = Pipeline.from_runner_api(
      proto_pipeline, pipeline.runner, pipeline._options)
  pipeline_from_proto.run().wait_until_finish()

def main(argv):
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '-p', '--port', type=int, help='port on which to serve the job api')
  parser.add_argument('--fully_qualified_name_glob', default=None)
  known_args, pipeline_args = parser.parse_known_args(argv)
  pipeline_options = PipelineOptions(
      pipeline_args +
      ["--experiments=beam_fn_api", "--sdk_location=container"])

  with fully_qualified_named_transform.FullyQualifiedNamedTransform.with_filter(
      known_args.fully_qualified_name_glob):
    server = grpc.server(thread_pool_executor.shared_unbounded_instance())
    beam_expansion_api_pb2_grpc.add_ExpansionServiceServicer_to_server(
        expansion_service.ExpansionServiceServicer(pipeline_options), server)
    beam_artifact_api_pb2_grpc.add_ArtifactRetrievalServiceServicer_to_server(
        artifact_service.ArtifactRetrievalService(
            artifact_service.BeamFilesystemHandler(None).file_reader),
        server)
    server.add_insecure_port('localhost:{}'.format(known_args.port))
    server.start()
    _LOGGER.info('Listening for expansion requests at %d', known_args.port)

    def cleanup(unused_signum, unused_frame):
      _LOGGER.info('Shutting down expansion service.')
      server.stop(None)

    signal.signal(signal.SIGTERM, cleanup)
    signal.signal(signal.SIGINT, cleanup)
    # Block the main thread forever.
    signal.pause()

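# For reference, a client pipeline can reach a server started by main() above
# through its network address rather than an in-process servicer; a sketch,
# with the port and the test URN/payload (taken from other snippets in this
# section) used purely for illustration.
import apache_beam as beam
from apache_beam.transforms.external import ImplicitSchemaPayloadBuilder

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create(['a', 'b'])
      | beam.ExternalTransform(
          'beam:transforms:xlang:test:prefix',
          ImplicitSchemaPayloadBuilder({'data': u'0'}),
          'localhost:8097'))  # address of the running expansion service
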
def test_multi(self):
  @ptransform.PTransform.register_urn('multi', None)
  class MultiTransform(ptransform.PTransform):
    def expand(self, pcolls):
      return {
          'main': (pcolls['main1'], pcolls['main2'])
          | beam.Flatten()
          | beam.Map(
              lambda x, s: x + s, beam.pvalue.AsSingleton(pcolls['side'])),
          'side': pcolls['side'] | beam.Map(lambda x: x + x),
      }

    def to_runner_api_parameter(self, unused_context):
      return 'multi', None

    @staticmethod
    def from_runner_api_parameter(unused_parameter, unused_context):
      return MultiTransform()

  with beam.Pipeline() as p:
    main1 = p | 'Main1' >> beam.Create(['a', 'bb'], reshuffle=False)
    main2 = p | 'Main2' >> beam.Create(['x', 'yy', 'zzz'], reshuffle=False)
    side = p | 'Side' >> beam.Create(['s'])
    res = dict(main1=main1, main2=main2, side=side) | beam.ExternalTransform(
        'multi', None, expansion_service.ExpansionServiceServicer())
    assert_that(res['main'], equal_to(['as', 'bbs', 'xs', 'yys', 'zzzs']))
    assert_that(res['side'], equal_to(['ss']), label='CheckSide')

def __init__(
    self,
    transform,
    pkgs=DEFAULT_PKGS,
    container=DEFAULT_CONTAINER_NAME,
    *args,
    **kwargs):
  transform_urn = '{}.{}'.format(transform.__module__, transform.__name__)
  beam.PTransform.register_urn(
      transform_urn,
      bytes,
      constructor=functools.partial(self.constructor, transform))
  payload = pickle.dumps((args, kwargs))

  options = beam.pipeline.PipelineOptions(
      environment_type='DOCKER',
      # environment_config=(
      #     '-v /tmp/rillbeam/multiexternal '
      #     '-e DOCKER_ENTRYPOINT_SETPKG="{}" '
      #     '{}'.format(':'.join(pkgs), container)),
      environment_config=container,
  )

  # from apache_beam.portability import python_urns
  # options = beam.pipeline.PipelineOptions(
  #     environment_type=python_urns.SUBPROCESS_SDK,
  #     environment_config=(
  #         b'{} -m apache_beam.runners.worker.sdk_worker_main'.format(
  #             sys.executable.encode('ascii')))
  # )

  endpoint = expansion_service.ExpansionServiceServicer(options=options)
  super(EnvTransform, self).__init__(transform_urn, payload, endpoint)

def test_pipeline_generation(self):
  @ptransform.PTransform.register_urn('simple', None)
  class SimpleTransform(ptransform.PTransform):
    def expand(self, pcoll):
      return pcoll | 'TestLabel' >> beam.Map(lambda x: 'Simple(%s)' % x)

    def to_runner_api_parameter(self, unused_context):
      return 'simple', None

    @staticmethod
    def from_runner_api_parameter(unused_parameter, unused_context):
      return SimpleTransform()

  pipeline = beam.Pipeline()
  res = (
      pipeline
      | beam.Create(['a', 'b'])
      | beam.ExternalTransform(
          'simple', None, expansion_service.ExpansionServiceServicer()))
  assert_that(res, equal_to(['Simple(a)', 'Simple(b)']))

  proto, _ = pipeline.to_runner_api(return_context=True)
  pipeline_from_proto = Pipeline.from_runner_api(
      proto, pipeline.runner, pipeline._options)

  # Original pipeline has the un-expanded external transform
  self.assertEqual([], pipeline.transforms_stack[0].parts[1].parts)

  # new pipeline has the expanded external transform
  self.assertNotEqual(
      [], pipeline_from_proto.transforms_stack[0].parts[1].parts)
  self.assertEqual(
      u'ExternalTransform(simple)/TestLabel',
      pipeline_from_proto.transforms_stack[0].parts[1].parts[0].full_label)

def main(unused_argv):
  PyPIArtifactRegistry.register_artifact('beautifulsoup4', '>=4.9,<5.0')
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '-p', '--port', type=int, help='port on which to serve the job api')
  parser.add_argument('--fully_qualified_name_glob', default=None)
  options = parser.parse_args()

  global server
  with fully_qualified_named_transform.FullyQualifiedNamedTransform.with_filter(
      options.fully_qualified_name_glob):
    server = grpc.server(thread_pool_executor.shared_unbounded_instance())
    beam_expansion_api_pb2_grpc.add_ExpansionServiceServicer_to_server(
        expansion_service.ExpansionServiceServicer(
            PipelineOptions(
                ["--experiments", "beam_fn_api", "--sdk_location",
                 "container"])),
        server)
    beam_artifact_api_pb2_grpc.add_ArtifactRetrievalServiceServicer_to_server(
        artifact_service.ArtifactRetrievalService(
            artifact_service.BeamFilesystemHandler(None).file_reader),
        server)
    server.add_insecure_port('localhost:{}'.format(options.port))
    server.start()
    _LOGGER.info('Listening for expansion requests at %d', options.port)
    signal.signal(signal.SIGTERM, cleanup)
    signal.signal(signal.SIGINT, cleanup)
    # Block the main thread forever.
    signal.pause()

def __init__(self, transform, envvars, *args, **kwargs):
  urn = '{}.{}'.format(transform.__module__, transform.__name__)
  if urn not in beam.PTransform._known_urns:
    beam.PTransform.register_urn(
        urn,
        bytes,
        constructor=functools.partial(self.constructor, transform))

  # options = beam.pipeline.PipelineOptions(
  #     environment_type='DOCKER',
  #     environment_config=(
  #         '{env} '
  #         '{container}'.format(
  #             env=' '.join('-e {}={}'.format(k, v)
  #                          for k, v in envvars.items()),
  #             container='localhost:5000/beam/python:latest',
  #         )),
  # )

  import sys
  from apache_beam.portability import python_urns
  env = ' '.join('{}={}'.format(k, v) for k, v in envvars.items())
  # Format the command as a str, then encode: bytes objects have no .format
  # method in Python 3.
  options = beam.pipeline.PipelineOptions(
      environment_type=python_urns.SUBPROCESS_SDK,
      environment_config=(
          '{} {} -m apache_beam.runners.worker.sdk_worker_main'.format(
              env, sys.executable).encode('ascii')))

  payload = pickle.dumps((args, kwargs))
  endpoint = expansion_service.ExpansionServiceServicer(options=options)
  super(EnvTransform, self).__init__(urn, payload, endpoint)

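# A sketch of how EnvTransform might be applied, assuming its constructor
# classmethod (not shown here) rebuilds the wrapped transform from the
# pickled (args, kwargs) payload. MyTransform and the DEBUG variable are
# illustrative.
import apache_beam as beam

class MyTransform(beam.PTransform):
  def expand(self, pcoll):
    return pcoll | beam.Map(lambda x: x.upper())

with beam.Pipeline() as p:
  # Expansion runs in a subprocess SDK harness with DEBUG=1 in its env.
  _ = p | beam.Create(['a', 'b']) | EnvTransform(MyTransform, {'DEBUG': '1'})
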
def test_simple(self):
  with beam.Pipeline() as p:
    res = (
        p
        | beam.Create(['a', 'b'])
        | beam.ExternalTransform(
            'simple', None, expansion_service.ExpansionServiceServicer()))
    assert_that(res, equal_to(['Simple(a)', 'Simple(b)']))

def test_payload(self):
  with beam.Pipeline() as p:
    res = (
        p
        | beam.Create(['a', 'bb'], reshuffle=False)
        | beam.ExternalTransform(
            'payload', b's', expansion_service.ExpansionServiceServicer()))
    assert_that(res, equal_to(['as', 'bbs']))

def test_multi(self):
  with beam.Pipeline() as p:
    main1 = p | 'Main1' >> beam.Create(['a', 'bb'], reshuffle=False)
    main2 = p | 'Main2' >> beam.Create(['x', 'yy', 'zzz'], reshuffle=False)
    side = p | 'Side' >> beam.Create(['s'])
    res = dict(main1=main1, main2=main2, side=side) | beam.ExternalTransform(
        'multi', None, expansion_service.ExpansionServiceServicer())
    assert_that(res['main'], equal_to(['as', 'bbs', 'xs', 'yys', 'zzzs']))
    assert_that(res['side'], equal_to(['ss']), label='CheckSide')

def test_external_transform_finder_leaf(self):
  pipeline = beam.Pipeline()
  _ = (
      pipeline
      | beam.Create(['a', 'b'])
      | beam.ExternalTransform(
          'beam:transforms:xlang:test:nooutput',
          ImplicitSchemaPayloadBuilder({'data': u'0'}),
          expansion_service.ExpansionServiceServicer()))
  pipeline.run().wait_until_finish()

  self.assertTrue(pipeline.contains_external_transforms)

def test_as_external_transform_no_kwargs(self):
  with FullyQualifiedNamedTransform.with_filter('*'):
    with beam.Pipeline() as p:
      assert_that(
          p | beam.Create(['a', 'b', 'c'])
          | beam.ExternalTransform(
              PYTHON_FULLY_QUALIFIED_NAMED_TRANSFORM_URN,
              ImplicitSchemaPayloadBuilder({
                  'constructor': 'apache_beam.transforms'
                  '.fully_qualified_named_transform_test._TestTransform',
                  'args': beam.Row(arg0='x', arg1='y'),
              }),
              expansion_service.ExpansionServiceServicer()),
          equal_to(['xay', 'xby', 'xcy']))

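# Presumably the same constructor can also be driven by keyword arguments via
# a 'kwargs' row; a sketch, where the parameter names prefix/suffix are an
# assumption about _TestTransform's signature rather than something shown in
# this section.
def test_as_external_transform_no_args(self):
  with FullyQualifiedNamedTransform.with_filter('*'):
    with beam.Pipeline() as p:
      assert_that(
          p | beam.Create(['a', 'b', 'c'])
          | beam.ExternalTransform(
              PYTHON_FULLY_QUALIFIED_NAMED_TRANSFORM_URN,
              ImplicitSchemaPayloadBuilder({
                  'constructor': 'apache_beam.transforms'
                  '.fully_qualified_named_transform_test._TestTransform',
                  'kwargs': beam.Row(prefix='x', suffix='y'),
              }),
              expansion_service.ExpansionServiceServicer()),
          equal_to(['xay', 'xby', 'xcy']))
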
def test_no_output_coder(self):
  external_transform = beam.ExternalTransform(
      'map_to_union_types',
      None,
      expansion_service.ExpansionServiceServicer())
  with beam.Pipeline() as p:
    res = (p | beam.Create([2, 2], reshuffle=False) | external_transform)
    assert_that(res, equal_to([2, 2]))

  context = pipeline_context.PipelineContext(
      external_transform._expanded_components)
  self.assertEqual(len(external_transform._expanded_transform.outputs), 1)
  for _, pcol_id in external_transform._expanded_transform.outputs.items():
    pcol = context.pcollections.get_by_id(pcol_id)
    self.assertEqual(pcol.element_type, typehints.Any)

def main(unused_argv):
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '-p', '--port', type=int, help='port on which to serve the job api')
  options = parser.parse_args()

  global server
  server = grpc.server(futures.ThreadPoolExecutor(max_workers=2))
  beam_expansion_api_pb2_grpc.add_ExpansionServiceServicer_to_server(
      expansion_service.ExpansionServiceServicer(PipelineOptions()), server)
  server.add_insecure_port('localhost:{}'.format(options.port))
  server.start()
  logging.info('Listening for expansion requests at %d', options.port)
  # Block the main thread forever.
  signal.pause()

def test_pipeline_generation_with_runner_overrides(self):
  pipeline_properties = [
      '--dataflow_endpoint=ignored',
      '--job_name=test-job',
      '--project=test-project',
      '--staging_location=ignored',
      '--temp_location=/dev/null',
      '--no_auth',
      '--dry_run=True',
      '--sdk_location=container',
      '--runner=DataflowRunner',
      '--streaming'
  ]
  with beam.Pipeline(options=PipelineOptions(pipeline_properties)) as p:
    _ = (
        p
        | beam.io.ReadFromPubSub(
            subscription='projects/dummy-project/subscriptions/'
            'dummy-subscription')
        | beam.ExternalTransform(
            'beam:transforms:xlang:test:prefix',
            ImplicitSchemaPayloadBuilder({'data': u'0'}),
            expansion_service.ExpansionServiceServicer()))

  pipeline_proto, _ = p.to_runner_api(return_context=True)
  pubsub_read_transform = None
  external_transform = None
  proto_transforms = pipeline_proto.components.transforms
  for transform_id in proto_transforms:
    if ('beam:transforms:xlang:test:prefix' in
        proto_transforms[transform_id].unique_name):
      external_transform = proto_transforms[transform_id]
    if 'ReadFromPubSub' in proto_transforms[transform_id].unique_name:
      pubsub_read_transform = proto_transforms[transform_id]

  if not (pubsub_read_transform and external_transform):
    raise ValueError(
        'Could not find an external transform and the PubSub read transform '
        'in the pipeline')

  self.assertEqual(1, len(list(pubsub_read_transform.outputs.values())))
  self.assertEqual(
      list(pubsub_read_transform.outputs.values()),
      list(external_transform.inputs.values()))

def run(port=8197, options=None):
  global server
  if options is None:
    import argparse
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args()
    options = PipelineOptions(pipeline_args)

  server = grpc.server(futures.ThreadPoolExecutor(max_workers=2))
  beam_expansion_api_pb2_grpc.add_ExpansionServiceServicer_to_server(
      expansion_service.ExpansionServiceServicer(options), server)
  url = 'localhost:{}'.format(port)
  server.add_insecure_port(url)
  server.start()
  print('Listening for expansion requests at {!r}'.format(url))
  # Block the main thread forever.
  signal.pause()

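# Calling the helper above with explicit options instead of letting it parse
# sys.argv; a minimal sketch, where the experiment flag is illustrative.
from apache_beam.options.pipeline_options import PipelineOptions

run(port=8197, options=PipelineOptions(['--experiments=beam_fn_api']))
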
def main(unused_argv):
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '-p', '--port', type=int, help='port on which to serve the job api')
  options = parser.parse_args()

  global server
  server = grpc.server(UnboundedThreadPoolExecutor())
  beam_expansion_api_pb2_grpc.add_ExpansionServiceServicer_to_server(
      expansion_service.ExpansionServiceServicer(
          PipelineOptions(["--experiments", "beam_fn_api"])),
      server)
  server.add_insecure_port('localhost:{}'.format(options.port))
  server.start()
  _LOGGER.info('Listening for expansion requests at %d', options.port)
  signal.signal(signal.SIGTERM, cleanup)
  signal.signal(signal.SIGINT, cleanup)
  # Block the main thread forever.
  signal.pause()

def test_simple(self):
  @ptransform.PTransform.register_urn('simple', None)
  class SimpleTransform(ptransform.PTransform):
    def expand(self, pcoll):
      return pcoll | beam.Map(lambda x: 'Simple(%s)' % x)

    def to_runner_api_parameter(self, unused_context):
      return 'simple', None

    @staticmethod
    def from_runner_api_parameter(unused_parameter, unused_context):
      return SimpleTransform()

  with beam.Pipeline() as p:
    res = (
        p
        | beam.Create(['a', 'b'])
        | beam.ExternalTransform(
            'simple', None, expansion_service.ExpansionServiceServicer()))
    assert_that(res, equal_to(['Simple(a)', 'Simple(b)']))

def test_pipeline_generation(self):
  pipeline = beam.Pipeline()
  res = (
      pipeline
      | beam.Create(['a', 'b'])
      | beam.ExternalTransform(
          'simple', None, expansion_service.ExpansionServiceServicer()))
  assert_that(res, equal_to(['Simple(a)', 'Simple(b)']))

  proto, _ = pipeline.to_runner_api(return_context=True)
  pipeline_from_proto = Pipeline.from_runner_api(
      proto, pipeline.runner, pipeline._options)

  # Original pipeline has the un-expanded external transform
  self.assertEqual([], pipeline.transforms_stack[0].parts[1].parts)

  # new pipeline has the expanded external transform
  self.assertNotEqual(
      [], pipeline_from_proto.transforms_stack[0].parts[1].parts)
  self.assertEqual(
      u'ExternalTransform(simple)/TestLabel',
      pipeline_from_proto.transforms_stack[0].parts[1].parts[0].full_label)

def test_payload(self):
  @ptransform.PTransform.register_urn('payload', bytes)
  class PayloadTransform(ptransform.PTransform):
    def __init__(self, payload):
      self._payload = payload

    def expand(self, pcoll):
      return pcoll | beam.Map(lambda x, s: x + s, self._payload)

    def to_runner_api_parameter(self, unused_context):
      return b'payload', self._payload.encode('ascii')

    @staticmethod
    def from_runner_api_parameter(payload, unused_context):
      return PayloadTransform(payload.decode('ascii'))

  with beam.Pipeline() as p:
    res = (
        p
        | beam.Create(['a', 'bb'], reshuffle=False)
        | beam.ExternalTransform(
            'payload', b's', expansion_service.ExpansionServiceServicer()))
    assert_that(res, equal_to(['as', 'bbs']))

def test_pipeline_generation(self):
  pipeline = beam.Pipeline()
  _ = (
      pipeline
      | beam.Create(['a', 'b'])
      | beam.ExternalTransform(
          'beam:transforms:xlang:test:prefix',
          ImplicitSchemaPayloadBuilder({'data': u'0'}),
          expansion_service.ExpansionServiceServicer()))

  proto, _ = pipeline.to_runner_api(return_context=True)
  pipeline_from_proto = Pipeline.from_runner_api(
      proto, pipeline.runner, pipeline._options)

  # Original pipeline has the un-expanded external transform
  self.assertEqual([], pipeline.transforms_stack[0].parts[1].parts)

  # new pipeline has the expanded external transform
  self.assertNotEqual(
      [], pipeline_from_proto.transforms_stack[0].parts[1].parts)
  self.assertEqual(
      u'ExternalTransform(beam:transforms:xlang:test:prefix)/TestLabel',
      pipeline_from_proto.transforms_stack[0].parts[1].parts[0].full_label)

def main(unused_argv):
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '-p', '--port', type=int, help='port on which to serve the job api')
  options = parser.parse_args()

  global server
  server = grpc.server(UnboundedThreadPoolExecutor())

  # DOCKER SDK Harness
  beam_expansion_api_pb2_grpc.add_ExpansionServiceServicer_to_server(
      expansion_service.ExpansionServiceServicer(
          PipelineOptions(
              ["--experiments", "beam_fn_api", "--sdk_location",
               "container"])),
      server)

  # PROCESS SDK Harness
  # beam_expansion_api_pb2_grpc.add_ExpansionServiceServicer_to_server(
  #     expansion_service.ExpansionServiceServicer(
  #         PipelineOptions.from_dictionary({
  #             'environment_type': 'PROCESS',
  #             'environment_config':
  #                 '{"command": "sdks/python/container/build/target/'
  #                 'launcher/darwin_amd64/boot"}',
  #             'experiments': 'beam_fn_api',
  #             'sdk_location': 'container',
  #         })),
  #     server)

  server.add_insecure_port('localhost:{}'.format(options.port))
  server.start()
  _LOGGER.info('Listening for expansion requests at %d', options.port)
  signal.signal(signal.SIGTERM, cleanup)
  signal.signal(signal.SIGINT, cleanup)
  # Block the main thread forever.
  signal.pause()