def expand(self, p):
    """Expand into a recursive tree of external 'fib' transforms.

    When ``self._level`` is 2 or less, a one-element PCollection ``[1]`` is
    created.  Otherwise two external 'fib' transforms are applied to ``p``,
    with ``level - 1`` and ``level - 2`` ASCII-encoded as their payloads, and
    their outputs are flattened and summed globally.
    """
    level = self._level
    if level <= 2:
        return p | beam.Create([1])

    # Every branch is handed a freshly constructed ExpansionServiceServicer.
    left = p | 'A' >> beam.ExternalTransform(
        'fib',
        str(level - 1).encode('ascii'),
        expansion_service.ExpansionServiceServicer())
    right = p | 'B' >> beam.ExternalTransform(
        'fib',
        str(level - 2).encode('ascii'),
        expansion_service.ExpansionServiceServicer())
    merged = (left, right) | beam.Flatten()
    return merged | beam.CombineGlobally(sum).without_defaults()
示例#2
0
def run(pipeline_args, input_file, output_file):
    """Run a word-count pipeline whose counting step is an external transform.

    Args:
      pipeline_args: arguments forwarded to ``PipelineOptions``.
      input_file: file (or pattern) handed to ``ReadFromText``.
      output_file: destination handed to ``WriteToText``.

    The call blocks until the pipeline run has finished.
    """

    def format_result(word_count):
        # Each element is a (word, count) pair.
        word, count = word_count
        return '%s: %d' % (word, count)

    # save_main_session is switched on because one or more DoFns in this
    # workflow rely on global context (e.g., a module imported at module
    # level).
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    pipeline = beam.Pipeline(options=options)

    # Read the text file[pattern] into a PCollection.
    lines = pipeline | 'read' >> ReadFromText(input_file)

    # Split lines into words, then count them via the expansion service.
    words = lines | 'split' >> (
        beam.ParDo(WordExtractingDoFn()).with_output_types(bytes))
    counts = words | 'count' >> beam.ExternalTransform(
        'pytest:beam:transforms:count', None, EXPANSION_SERVICE_ADDR)

    # Format the counts into a PCollection of strings.
    formatted = counts | 'format' >> beam.Map(format_result)

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    formatted | 'write' >> WriteToText(output_file)

    pipeline.run().wait_until_finish()
示例#3
0
    def test_pipeline_generation(self):
        """Verify external-transform expansion across a proto round trip.

        The original pipeline should still hold the un-expanded external
        transform, while the pipeline rebuilt from its runner API proto
        should contain the expanded sub-transform labelled 'TestLabel'.
        """
        @ptransform.PTransform.register_urn('simple', None)
        class SimpleTransform(ptransform.PTransform):
            def expand(self, pcoll):
                wrap = beam.Map(lambda x: 'Simple(%s)' % x)
                return pcoll | 'TestLabel' >> wrap

            def to_runner_api_parameter(self, unused_context):
                return 'simple', None

            @staticmethod
            def from_runner_api_parameter(unused_parameter, unused_context):
                return SimpleTransform()

        pipeline = beam.Pipeline()
        source = pipeline | beam.Create(['a', 'b'])
        res = source | beam.ExternalTransform(
            'simple', None, expansion_service.ExpansionServiceServicer())
        assert_that(res, equal_to(['Simple(a)', 'Simple(b)']))

        proto, _ = pipeline.to_runner_api(return_context=True)
        pipeline_from_proto = Pipeline.from_runner_api(
            proto, pipeline.runner, pipeline._options)

        # Original pipeline has the un-expanded external transform.
        original_external = pipeline.transforms_stack[0].parts[1]
        self.assertEqual([], original_external.parts)

        # New pipeline has the expanded external transform.
        rebuilt_external = pipeline_from_proto.transforms_stack[0].parts[1]
        self.assertNotEqual([], rebuilt_external.parts)
        self.assertEqual(
            u'ExternalTransform(simple)/TestLabel',
            rebuilt_external.parts[0].full_label)
示例#4
0
 def run_combine_globally(self, pipeline):
     """Check that the TEST_COMGL_URN external transform turns [1, 2, 3] into [6].

     The pipeline is used as a context manager, so it runs when the ``with``
     block exits.
     """
     with pipeline as p:
         numbers = p | beam.Create([1, 2, 3]).with_output_types(int)
         total = numbers | beam.ExternalTransform(
             TEST_COMGL_URN, None, self.expansion_service)
         assert_that(total, equal_to([6]))
示例#5
0
    def test_external_empty_spec_translation(self):
        """Check translation of an expanded sub-transform whose spec is empty.

        After running a pipeline containing the external 'prefix' transform,
        the spec of the expanded '.../TestLabel' sub-transform is cleared.
        The pipeline is then converted to its runner API proto, and the
        matching transform must be present and must not print a ``spec``
        field.
        """
        pipeline = beam.Pipeline()
        external_transform = beam.ExternalTransform(
            'beam:transforms:xlang:test:prefix',
            ImplicitSchemaPayloadBuilder({'data': u'0'}),
            expansion_service.ExpansionServiceServicer())
        _ = (pipeline | beam.Create(['a', 'b']) | external_transform)
        pipeline.run().wait_until_finish()

        external_transform_label = (
            'ExternalTransform(beam:transforms:xlang:test:prefix)/TestLabel')
        expanded = external_transform._expanded_components.transforms
        for transform in expanded.values():
            # We clear the spec of one of the external transforms.
            if transform.unique_name == external_transform_label:
                transform.spec.Clear()

        context = pipeline_context.PipelineContext()
        proto_pipeline = pipeline.to_runner_api(context=context)

        # Look the transform up by the same label that was cleared above
        # (the last match wins, as before).
        proto_transform = None
        for transform in proto_pipeline.components.transforms.values():
            if transform.unique_name == external_transform_label:
                proto_transform = transform

        self.assertIsNotNone(proto_transform)
        # assertNotIn reports the offending text on failure, unlike a bare
        # assertTrue on a find() result.
        self.assertNotIn('spec {', str(proto_transform))
 def test_xlang_parquetio_write(self):
     """Write three AvroRecords through the PARQUET_WRITE_URN transform.

     The expansion jar and port are read from the ``EXPANSION_JAR`` and
     ``EXPANSION_PORT`` environment variables; the expansion service is
     addressed as ``localhost:<port>``.

     A ``RuntimeError`` whose message contains the URN is reported with a
     printed notice and otherwise ignored; any other ``RuntimeError`` is
     re-raised.
     """
     expansion_jar = os.environ.get('EXPANSION_JAR')
     port = os.environ.get('EXPANSION_PORT')
     address = 'localhost:%s' % port
     try:
         with TestPipeline() as p:
             p.get_pipeline_options().view_as(
                 DebugOptions).experiments.append('jar_packages=' +
                                                  expansion_jar)
             p.not_use_test_runner_api = True
             _ = (
                 p
                 | beam.Create([
                     AvroRecord({"name": "abc"}),
                     AvroRecord({"name": "def"}),
                     AvroRecord({"name": "ghi"})])
                 | beam.ExternalTransform(
                     PARQUET_WRITE_URN,
                     ImplicitSchemaPayloadBuilder(
                         {'data': u'/tmp/test.parquet'}),
                     address))
     except RuntimeError as e:
         # Match the URN literally: escaping keeps characters such as '.'
         # from being interpreted as regex metacharacters.
         if re.search(re.escape(PARQUET_WRITE_URN), str(e)):
             print(
                 "looks like URN not implemented in expansion service, skipping."
             )
         else:
             # Bare raise keeps the original traceback untouched.
             raise
示例#7
0
    def test_multi(self):
        """Exercise an external transform with named inputs and outputs.

        A locally registered 'multi' transform flattens 'main1' and 'main2',
        appends the 'side' singleton to each element for the 'main' output,
        and doubles each 'side' element for the 'side' output.
        """
        @ptransform.PTransform.register_urn('multi', None)
        class MutltiTransform(ptransform.PTransform):
            def expand(self, pcolls):
                merged = (pcolls['main1'], pcolls['main2']) | beam.Flatten()
                suffix = beam.pvalue.AsSingleton(pcolls['side'])
                return {
                    'main': merged | beam.Map(lambda x, s: x + s, suffix),
                    'side': pcolls['side'] | beam.Map(lambda x: x + x),
                }

            def to_runner_api_parameter(self, unused_context):
                return 'multi', None

            @staticmethod
            def from_runner_api_parameter(unused_parameter, unused_context):
                return MutltiTransform()

        with beam.Pipeline() as p:
            main1 = p | 'Main1' >> beam.Create(['a', 'bb'], reshuffle=False)
            main2 = p | 'Main2' >> beam.Create(
                ['x', 'yy', 'zzz'], reshuffle=False)
            side = p | 'Side' >> beam.Create(['s'])
            inputs = {'main1': main1, 'main2': main2, 'side': side}
            res = inputs | beam.ExternalTransform(
                'multi', None, expansion_service.ExpansionServiceServicer())
            assert_that(
                res['main'], equal_to(['as', 'bbs', 'xs', 'yys', 'zzzs']))
            assert_that(res['side'], equal_to(['ss']), label='CheckSide')
    def test_job_python_from_python_it(self):
        """Run a pipeline rebuilt from its proto with an external transform.

        A locally registered 'simple' transform is applied through
        ``ExternalTransform`` on an integration ``TestPipeline``; the
        pipeline is converted to its runner API proto, reconstructed, and
        the reconstructed pipeline is run to completion.
        """
        @ptransform.PTransform.register_urn('simple', None)
        class SimpleTransform(ptransform.PTransform):
            def expand(self, pcoll):
                wrap = beam.Map(lambda x: 'Simple(%s)' % x)
                return pcoll | wrap

            def to_runner_api_parameter(self, unused_context):
                return 'simple', None

            @staticmethod
            def from_runner_api_parameter(_0, _1, _2):
                return SimpleTransform()

        pipeline = TestPipeline(is_integration_test=True)

        source = pipeline | beam.Create(['a', 'b'])
        res = source | beam.ExternalTransform(
            'simple', None, expansion_service.ExpansionServiceServicer())
        assert_that(res, equal_to(['Simple(a)', 'Simple(b)']))

        proto_pipeline, _ = pipeline.to_runner_api(return_context=True)
        rebuilt = Pipeline.from_runner_api(
            proto_pipeline, pipeline.runner, pipeline._options)
        rebuilt.run().wait_until_finish()
示例#9
0
 def test_payload(self):
     """Check the 'payload' external transform with the payload ``b's'``.

     The inputs 'a' and 'bb' are expected to come back as 'as' and 'bbs'.
     """
     with beam.Pipeline() as p:
         source = p | beam.Create(['a', 'bb'], reshuffle=False)
         res = source | beam.ExternalTransform(
             'payload', b's', expansion_service.ExpansionServiceServicer())
         assert_that(res, equal_to(['as', 'bbs']))
示例#10
0
 def run_flatten(self, pipeline):
     """Check that TEST_FLATTEN_URN merges two int collections into one.

     A tuple of two PCollections ([1, 2, 3] and [4, 5, 6]) is fed to the
     external transform; the output must contain all six elements.
     """
     with pipeline as p:
         first = p | 'col1' >> beam.Create([1, 2, 3]).with_output_types(int)
         second = p | 'col2' >> beam.Create([4, 5, 6]).with_output_types(int)
         flatten = beam.ExternalTransform(
             TEST_FLATTEN_URN, None, self.expansion_service)
         res = (first, second) | flatten
         assert_that(res, equal_to([1, 2, 3, 4, 5, 6]))
示例#11
0
 def test_simple(self):
     """Check that the 'simple' external transform wraps each element.

     The inputs 'a' and 'b' are expected to become 'Simple(a)' and
     'Simple(b)'.
     """
     with beam.Pipeline() as p:
         source = p | beam.Create(['a', 'b'])
         res = source | beam.ExternalTransform(
             'simple', None, expansion_service.ExpansionServiceServicer())
         assert_that(res, equal_to(['Simple(a)', 'Simple(b)']))
示例#12
0
 def run_partition(self, pipeline):
     """Check the TEST_PARTITION_URN external transform on the ints 1..6.

     Output '0' is expected to hold the even numbers and output '1' the odd
     numbers.
     """
     with pipeline as p:
         numbers = p | beam.Create(
             [1, 2, 3, 4, 5, 6]).with_output_types(int)
         res = numbers | beam.ExternalTransform(
             TEST_PARTITION_URN, None, self.expansion_service)
         assert_that(res['0'], equal_to([2, 4, 6]), label='check_even')
         assert_that(res['1'], equal_to([1, 3, 5]), label='check_odd')
示例#13
0
    def test_java_expansion(self):
        """Run a count-filtered-letters pipeline against a Java expansion jar.

        A Java expansion service is launched from
        ``self.expansion_service_jar`` on port 8091, the test waits for its
        gRPC channel to become ready, and the pipeline applies the external
        filter and count transforms served by it.  The server process is
        always terminated afterwards.

        Raises:
          unittest.SkipTest: if no expansion service jar is configured.
        """
        if not self.expansion_service_jar:
            raise unittest.SkipTest('No expansion service jar provided.')

        # The actual definitions of these transforms is in
        # org.apache.beam.runners.core.construction.TestExpansionService.
        TEST_COUNT_URN = "pytest:beam:transforms:count"
        TEST_FILTER_URN = "pytest:beam:transforms:filter_less_than"

        # Run as cheaply as possible on the portable runner.
        # TODO(robertwb): Support this directly in the direct runner.
        options = beam.options.pipeline_options.PipelineOptions(
            runner='PortableRunner',
            experiments=['beam_fn_api'],
            environment_type=python_urns.EMBEDDED_PYTHON,
            job_endpoint='embed')

        # Start the java server before entering the try block: if the launch
        # itself fails there is nothing to clean up, and the cleanup below
        # must never run against an unbound ``server``.
        port = '8091'
        address = 'localhost:%s' % port
        server = subprocess.Popen(
            ['java', '-jar', self.expansion_service_jar, port])
        try:
            # Wait for the server to be ready.
            with grpc.insecure_channel(address) as channel:
                grpc.channel_ready_future(channel).result()

            # Run a simple count-filtered-letters pipeline.
            with beam.Pipeline(options=options) as p:
                res = (
                    p
                    | beam.Create(list('aaabccxyyzzz'))
                    | beam.Map(unicode)
                    # TODO(BEAM-6587): Use strings directly rather than ints.
                    | beam.Map(lambda x: int(ord(x)))
                    | beam.ExternalTransform(TEST_FILTER_URN, b'middle',
                                             address)
                    | beam.ExternalTransform(TEST_COUNT_URN, None, address)
                    # TODO(BEAM-6587): Remove when above is removed.
                    | beam.Map(lambda kv: (chr(kv[0]), kv[1]))
                    | beam.Map(lambda kv: '%s: %s' % kv))

                assert_that(res, equal_to(['a: 3', 'b: 1', 'c: 2']))

        finally:
            server.kill()
            # Reap the killed child so it does not linger as a zombie.
            server.wait()
示例#14
0
 def run_prefix(self, pipeline):
     """Check the TEST_PREFIX_URN external transform with payload data '0'.

     The inputs 'a' and 'b' are expected to come back as '0a' and '0b'.
     """
     with pipeline as p:
         words = p | beam.Create(['a', 'b']).with_output_types(unicode)
         payload = ImplicitSchemaPayloadBuilder({'data': u'0'})
         res = words | beam.ExternalTransform(
             TEST_PREFIX_URN, payload, self.expansion_service)
         assert_that(res, equal_to(['0a', '0b']))
示例#15
0
 def test_multi(self):
   """Check the 'multi' external transform with named inputs and outputs.

   Inputs 'main1', 'main2' and 'side' are passed as a dict; the 'main' and
   'side' outputs are checked separately.
   """
   with beam.Pipeline() as p:
     inputs = {
         'main1': p | 'Main1' >> beam.Create(['a', 'bb'], reshuffle=False),
         'main2': p | 'Main2' >> beam.Create(
             ['x', 'yy', 'zzz'], reshuffle=False),
         'side': p | 'Side' >> beam.Create(['s']),
     }
     multi = beam.ExternalTransform(
         'multi', None, expansion_service.ExpansionServiceServicer())
     res = inputs | multi
     assert_that(res['main'], equal_to(['as', 'bbs', 'xs', 'yys', 'zzzs']))
     assert_that(res['side'], equal_to(['ss']), label='CheckSide')
示例#16
0
 def run_combine_per_key(self, pipeline):
     """Check the TEST_COMPK_URN external transform on keyed integers.

     The pairs ('a', 1), ('a', 2), ('b', 3) are expected to combine to
     ('a', 3) and ('b', 3).
     """
     with pipeline as p:
         pairs = p | beam.Create(
             [('a', 1), ('a', 2), ('b', 3)]).with_output_types(
                 typing.Tuple[unicode, int])
         res = pairs | beam.ExternalTransform(
             TEST_COMPK_URN, None, self.expansion_service)
         assert_that(res, equal_to([('a', 3), ('b', 3)]))
示例#17
0
    def run_pipelines(pipeline_options):
        """Run a count-filtered-letters pipeline on the PortableRunner.

        A Java expansion service is launched from
        ``ExternalTransformTest.expansion_service_jar`` on
        ``ExternalTransformTest.expansion_service_port``; the pipeline then
        applies the external filter and count transforms it serves.  The
        server process is terminated afterwards if it was started.

        Args:
          pipeline_options: pipeline options; the runner in its
            ``StandardOptions`` view must be the PortableRunner.
        """
        # The actual definitions of these transforms is in
        # org.apache.beam.runners.core.construction.TestExpansionService.
        TEST_COUNT_URN = "pytest:beam:transforms:count"
        TEST_FILTER_URN = "pytest:beam:transforms:filter_less_than"

        assert (pipeline_options.view_as(StandardOptions).runner.lower() ==
                "portablerunner"), "Only PortableRunner is supported."

        # Bound up front so the cleanup in ``finally`` stays valid even when
        # pipeline creation or job-server startup fails before the launch.
        server = None
        try:

            # Run a simple count-filtered-letters pipeline.
            p = beam.Pipeline(options=pipeline_options)
            p.runner.init_dockerized_job_server()

            # Start the java server and wait for it to be ready.
            port = str(ExternalTransformTest.expansion_service_port)
            address = 'localhost:%s' % port
            server = subprocess.Popen([
                'java', '-jar', ExternalTransformTest.expansion_service_jar,
                port
            ])

            with grpc.insecure_channel(address) as channel:
                grpc.channel_ready_future(channel).result()

            res = (
                p
                | beam.Create(list('aaabccxyyzzz'))
                | beam.Map(unicode)
                # TODO(BEAM-6587): Use strings directly rather than ints.
                | beam.Map(lambda x: int(ord(x)))
                | beam.ExternalTransform(TEST_FILTER_URN, b'middle', address)
                | beam.ExternalTransform(TEST_COUNT_URN, None, address)
                # TODO(BEAM-6587): Remove when above is removed.
                | beam.Map(lambda kv: (chr(kv[0]), kv[1]))
                | beam.Map(lambda kv: '%s: %s' % kv))

            assert_that(res, equal_to(['a: 3', 'b: 1', 'c: 2']))

            p.run().wait_until_finish()
        finally:
            if server is not None:
                server.kill()
                # Reap the killed child so it does not linger as a zombie.
                server.wait()
示例#18
0
 def run_group_by_key(self, pipeline):
   """Check the TEST_GBK_URN external transform on (int, str) pairs.

   Each grouped result is rendered as '<key>:<sorted values joined by ,>'
   before being compared with the expected strings.
   """
   with pipeline as p:
     pairs = p | beam.Create(
         [(0, "1"), (0, "2"), (1, "3")],
         reshuffle=False).with_output_types(typing.Tuple[int, unicode])
     grouped = pairs | beam.ExternalTransform(
         TEST_GBK_URN, None, self.expansion_service)
     res = grouped | beam.Map(
         lambda x: "{}:{}".format(x[0], ','.join(sorted(x[1]))))
     assert_that(res, equal_to(['0:1,2', '1:3']))
示例#19
0
    def test_external_transform_finder_leaf(self):
        """Check that ``contains_external_transforms`` is set on the pipeline.

        The pipeline applies the 'beam:transforms:xlang:test:nooutput'
        external transform and is run to completion before the flag is
        inspected.
        """
        pipeline = beam.Pipeline()
        source = pipeline | beam.Create(['a', 'b'])
        _ = source | beam.ExternalTransform(
            'beam:transforms:xlang:test:nooutput',
            ImplicitSchemaPayloadBuilder({'data': u'0'}),
            expansion_service.ExpansionServiceServicer())
        pipeline.run().wait_until_finish()

        self.assertTrue(pipeline.contains_external_transforms)
示例#20
0
 def run_multi_input_output_with_sideinput(self, pipeline):
   """Check the TEST_MULTI_URN external transform with named inputs/outputs.

   Inputs 'main1', 'main2' and 'side' are passed as a dict; the 'main' and
   'side' outputs are checked separately.
   """
   with pipeline as p:
     main1 = p | 'Main1' >> beam.Create(
         ['a', 'bb'], reshuffle=False).with_output_types(unicode)
     main2 = p | 'Main2' >> beam.Create(
         ['x', 'yy', 'zzz'], reshuffle=False).with_output_types(unicode)
     side = p | 'Side' >> beam.Create(['s']).with_output_types(unicode)
     inputs = {'main1': main1, 'main2': main2, 'side': side}
     res = inputs | beam.ExternalTransform(
         TEST_MULTI_URN, None, self.expansion_service)
     assert_that(res['main'], equal_to(['as', 'bbs', 'xs', 'yys', 'zzzs']))
     assert_that(res['side'], equal_to(['ss']), label='CheckSide')
 def test_combine_globally(self):
     """Check that the TEST_COMGL_URN external transform sums [1, 2, 3] to [6].

     The expansion jar is registered through a 'jar_packages=' experiment
     and ``not_use_test_runner_api`` is set on the test pipeline.
     """
     test_pipeline = TestPipeline()
     debug_options = test_pipeline.get_pipeline_options().view_as(
         DebugOptions)
     debug_options.experiments.append(
         'jar_packages=' + ValidateRunnerXlangTest.expansion_jar)
     test_pipeline.not_use_test_runner_api = True
     with test_pipeline as p:
         numbers = p | beam.Create([1, 2, 3]).with_output_types(int)
         res = numbers | beam.ExternalTransform(
             TEST_COMGL_URN, None,
             ValidateRunnerXlangTest.expansion_service)
         assert_that(res, equal_to([6]))
示例#22
0
 def run_cogroup_by_key(self, pipeline):
   """Check the TEST_CGBK_URN external transform on two keyed collections.

   Collections 'col1' and 'col2' are passed as a dict.  Each result is
   rendered as '<key>:<sorted values joined by ,>' before being compared
   with the expected strings.
   """
   with pipeline as p:
     pair_type = typing.Tuple[int, unicode]
     col1 = p | 'create_col1' >> beam.Create(
         [(0, "1"), (0, "2"), (1, "3")],
         reshuffle=False).with_output_types(pair_type)
     col2 = p | 'create_col2' >> beam.Create(
         [(0, "4"), (1, "5"), (1, "6")],
         reshuffle=False).with_output_types(pair_type)
     grouped = {'col1': col1, 'col2': col2} | beam.ExternalTransform(
         TEST_CGBK_URN, None, self.expansion_service)
     res = grouped | beam.Map(
         lambda x: "{}:{}".format(x[0], ','.join(sorted(x[1]))))
     assert_that(res, equal_to(['0:1,2,4', '1:3,5,6']))
 def test_flatten(self):
     """Check that TEST_FLATTEN_URN merges two int collections into one.

     The expansion jar is registered through a 'jar_packages=' experiment
     and ``not_use_test_runner_api`` is set on the test pipeline.
     """
     test_pipeline = TestPipeline()
     debug_options = test_pipeline.get_pipeline_options().view_as(
         DebugOptions)
     debug_options.experiments.append(
         'jar_packages=' + ValidateRunnerXlangTest.expansion_jar)
     test_pipeline.not_use_test_runner_api = True
     with test_pipeline as p:
         first = p | 'col1' >> beam.Create([1, 2, 3]).with_output_types(int)
         second = p | 'col2' >> beam.Create([4, 5, 6]).with_output_types(int)
         flatten = beam.ExternalTransform(
             TEST_FLATTEN_URN, None,
             ValidateRunnerXlangTest.expansion_service)
         res = (first, second) | flatten
         assert_that(res, equal_to([1, 2, 3, 4, 5, 6]))
 def test_partition(self):
     """Check the TEST_PARTITION_URN external transform on the ints 1..6.

     Output '0' is expected to hold the even numbers and output '1' the odd
     numbers.  The expansion jar is registered through a 'jar_packages='
     experiment and ``not_use_test_runner_api`` is set on the test pipeline.
     """
     test_pipeline = TestPipeline()
     debug_options = test_pipeline.get_pipeline_options().view_as(
         DebugOptions)
     debug_options.experiments.append(
         'jar_packages=' + ValidateRunnerXlangTest.expansion_jar)
     test_pipeline.not_use_test_runner_api = True
     with test_pipeline as p:
         numbers = p | beam.Create(
             [1, 2, 3, 4, 5, 6]).with_output_types(int)
         res = numbers | beam.ExternalTransform(
             TEST_PARTITION_URN, None,
             ValidateRunnerXlangTest.expansion_service)
         assert_that(res['0'], equal_to([2, 4, 6]), label='check_even')
         assert_that(res['1'], equal_to([1, 3, 5]), label='check_odd')
 def test_prefix(self):
     """Check the TEST_PREFIX_URN external transform with payload data '0'.

     The inputs 'a' and 'b' are expected to come back as '0a' and '0b'.
     The expansion jar is registered through a 'jar_packages=' experiment
     and ``not_use_test_runner_api`` is set on the test pipeline.
     """
     test_pipeline = TestPipeline()
     debug_options = test_pipeline.get_pipeline_options().view_as(
         DebugOptions)
     debug_options.experiments.append(
         'jar_packages=' + ValidateRunnerXlangTest.expansion_jar)
     test_pipeline.not_use_test_runner_api = True
     with test_pipeline as p:
         words = p | beam.Create(['a', 'b']).with_output_types(unicode)
         payload = ImplicitSchemaPayloadBuilder({'data': u'0'})
         res = words | beam.ExternalTransform(
             TEST_PREFIX_URN, payload,
             ValidateRunnerXlangTest.expansion_service)
         assert_that(res, equal_to(['0a', '0b']))
示例#26
0
 def test_combine_per_key(self):
   """Check the TEST_COMPK_URN external transform on keyed integers.

   The pairs ('a', 1), ('a', 2), ('b', 3) are expected to combine to
   ('a', 3) and ('b', 3).  The expansion jar is registered through a
   'jar_packages=' experiment and ``not_use_test_runner_api`` is set on the
   test pipeline.
   """
   test_pipeline = TestPipeline()
   debug_options = test_pipeline.get_pipeline_options().view_as(DebugOptions)
   debug_options.experiments.append(
       'jar_packages=' + ValidateRunnerXlangTest.expansion_jar)
   test_pipeline.not_use_test_runner_api = True
   with test_pipeline as p:
     pairs = p | beam.Create(
         [('a', 1), ('a', 2), ('b', 3)]).with_output_types(
             typing.Tuple[unicode, int])
     res = pairs | beam.ExternalTransform(
         TEST_COMPK_URN, None, ValidateRunnerXlangTest.expansion_service)
     assert_that(res, equal_to([('a', 3), ('b', 3)]))
示例#27
0
 def test_as_external_transform_no_kwargs(self):
   """Invoke ``_TestTransform`` by fully qualified name with positional args.

   The payload names the constructor and supplies ``args`` as a Row
   (arg0='x', arg1='y'), with no kwargs entry.  The inputs 'a', 'b', 'c'
   are expected to come back as 'xay', 'xby', 'xcy'.
   """
   with FullyQualifiedNamedTransform.with_filter('*'), beam.Pipeline() as p:
     source = p | beam.Create(['a', 'b', 'c'])
     payload = ImplicitSchemaPayloadBuilder({
         'constructor': (
             'apache_beam.transforms'
             '.fully_qualified_named_transform_test._TestTransform'),
         'args': beam.Row(arg0='x', arg1='y'),
     })
     res = source | beam.ExternalTransform(
         PYTHON_FULLY_QUALIFIED_NAMED_TRANSFORM_URN,
         payload,
         expansion_service.ExpansionServiceServicer())
     assert_that(res, equal_to(['xay', 'xby', 'xcy']))
示例#28
0
 def run_flatten(self, pipeline):
   """Flatten two collections through an external transform.

   Target transform - Flatten
   (https://beam.apache.org/documentation/programming-guide/#flatten)

   Test scenario - merging multiple collections into a single collection.

   Boundary conditions checked:
    - PCollectionList<?> to external transforms
    - PCollection<?> from external transforms
   """
   with pipeline as p:
     first = p | 'col1' >> beam.Create([1, 2, 3]).with_output_types(int)
     second = p | 'col2' >> beam.Create([4, 5, 6]).with_output_types(int)
     flatten = beam.ExternalTransform(
         TEST_FLATTEN_URN, None, self.expansion_service)
     res = (first, second) | flatten
     assert_that(res, equal_to([1, 2, 3, 4, 5, 6]))
示例#29
0
 def test_no_output_coder(self):
     """Check the element type of the 'map_to_union_types' transform output.

     After the pipeline has run, the expanded transform must have exactly
     one output, and the PCollection looked up for it in a context built
     from the expanded components must have element type ``typehints.Any``.
     """
     external_transform = beam.ExternalTransform(
         'map_to_union_types', None,
         expansion_service.ExpansionServiceServicer())
     with beam.Pipeline() as p:
         source = p | beam.Create([2, 2], reshuffle=False)
         res = source | external_transform
         assert_that(res, equal_to([2, 2]))

     context = pipeline_context.PipelineContext(
         external_transform._expanded_components)
     outputs = external_transform._expanded_transform.outputs
     self.assertEqual(len(outputs), 1)
     # Only the PCollection ids are needed, not the output tags.
     for pcol_id in outputs.values():
         pcol = context.pcollections.get_by_id(pcol_id)
         self.assertEqual(pcol.element_type, typehints.Any)
 def run_combine_globally(self, pipeline):
     """Combine elements globally through an external transform.

     Target transform - Combine
     (https://beam.apache.org/documentation/programming-guide/#combine)

     Test scenario - combining elements globally with a predefined simple
     CombineFn.

     Boundary conditions checked:
      - PCollection<?> to external transforms
      - PCollection<?> from external transforms
     """
     with pipeline as p:
         numbers = p | beam.Create([1, 2, 3]).with_output_types(int)
         res = numbers | beam.ExternalTransform(
             TEST_COMGL_URN, None, self.expansion_service)
         assert_that(res, equal_to([6]))