def test_cacheables(self):
        p = beam.Pipeline(interactive_runner.InteractiveRunner())
        ie.current_env().set_cache_manager(InMemoryCache(), p)
        # pylint: disable=range-builtin-not-iterating
        init_pcoll = p | 'Init Create' >> beam.Create(range(10))
        squares = init_pcoll | 'Square' >> beam.Map(lambda x: x * x)
        cubes = init_pcoll | 'Cube' >> beam.Map(lambda x: x**3)
        ib.watch(locals())

        pipeline_instrument = instr.build_pipeline_instrument(p)

        # TODO(BEAM-7760): The PipelineInstrument cacheables attribute maintains a
        # global collection of cacheable PCollections across all pipelines. Here we
        # take the subset of cacheables that pertain only to this test's pipeline.
        cacheables = {
            k: c
            for k, c in pipeline_instrument.cacheables.items()
            if c.pcoll.pipeline is p
        }

        self.assertEqual(
            cacheables, {
                pipeline_instrument._cacheable_key(init_pcoll):
                instr.Cacheable(var='init_pcoll',
                                version=str(id(init_pcoll)),
                                pcoll_id='ref_PCollection_PCollection_8',
                                producer_version=str(id(init_pcoll.producer)),
                                pcoll=init_pcoll),
                pipeline_instrument._cacheable_key(squares):
                instr.Cacheable(var='squares',
                                version=str(id(squares)),
                                pcoll_id='ref_PCollection_PCollection_9',
                                producer_version=str(id(squares.producer)),
                                pcoll=squares),
                pipeline_instrument._cacheable_key(cubes):
                instr.Cacheable(var='cubes',
                                version=str(id(cubes)),
                                pcoll_id='ref_PCollection_PCollection_10',
                                producer_version=str(id(cubes.producer)),
                                pcoll=cubes)
            })
def test_get_master_url_no_flink_master_and_master_url_exists(self):
    from apache_beam.runners.portability.flink_runner import FlinkRunner
    runner = interactive_runner.InteractiveRunner(
        underlying_runner=FlinkRunner())
    p = beam.Pipeline(options=PipelineOptions(
        project='test-project',
        region='test-region',
    ))
    cluster_name = ie.current_env().clusters.default_cluster_name
    cluster_metadata = MasterURLIdentifier(
        project_id='test-project',
        region='test-region',
        cluster_name=cluster_name)
    ie.current_env().clusters.master_urls['test-url'] = cluster_metadata
    ie.current_env().clusters.master_urls_to_dashboards[
        'test-url'] = 'test-dashboard'
    flink_master = runner._get_dataproc_cluster_master_url_if_applicable(p)
    self.assertEqual(
        ie.current_env().clusters.describe(p)['cluster_metadata'].project_id,
        'test-project')
    self.assertEqual(
        flink_master, ie.current_env().clusters.describe(p)['master_url'])
    def test_mark_pcollection_completed_after_successful_run(self, cell):
        with cell:  # Cell 1
            p = beam.Pipeline(interactive_runner.InteractiveRunner())
            ib.watch({'p': p})

        with cell:  # Cell 2
            # pylint: disable=range-builtin-not-iterating
            init = p | 'Init' >> beam.Create(range(5))

        with cell:  # Cell 3
            square = init | 'Square' >> beam.Map(lambda x: x * x)
            cube = init | 'Cube' >> beam.Map(lambda x: x**3)

        ib.watch(locals())
        result = p.run()
        self.assertTrue(init in ie.current_env().computed_pcollections)
        self.assertEqual({0, 1, 2, 3, 4}, set(result.get(init)))
        self.assertTrue(square in ie.current_env().computed_pcollections)
        self.assertEqual({0, 1, 4, 9, 16}, set(result.get(square)))
        self.assertTrue(cube in ie.current_env().computed_pcollections)
        self.assertEqual({0, 1, 8, 27, 64}, set(result.get(cube)))
Example #4
  def test_wordcount(self):

    class WordExtractingDoFn(beam.DoFn):

      def process(self, element):
        text_line = element.strip()
        words = text_line.split()
        return words

    p = beam.Pipeline(
        runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))

    # Count the occurrences of each word.
    counts = (
        p
        | beam.Create(['to be or not to be that is the question'])
        | 'split' >> beam.ParDo(WordExtractingDoFn())
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(lambda wordones: (wordones[0], sum(wordones[1]))))

    # Watch the local scope for Interactive Beam so that counts will be cached.
    ib.watch(locals())

    result = p.run()
    result.wait_until_finish()

    actual = dict(result.get(counts))
    self.assertDictEqual(
        actual, {
            'to': 2,
            'be': 2,
            'or': 1,
            'not': 1,
            'that': 1,
            'is': 1,
            'the': 1,
            'question': 1
        })
Example #5
  def test_get_pcoll_data(self):
    pipeline = beam.Pipeline(ir.InteractiveRunner())
    # pylint: disable=bad-option-value
    pcoll = pipeline | 'Create' >> beam.Create(list(range(10)))
    counts = pcoll | beam.combiners.Count.PerElement()

    ib.watch(locals())
    ie.current_env().track_user_pipelines()
    counts_identifier = obfuscate(inspector.meta('counts', counts))
    ins = inspector.InteractiveEnvironmentInspector()
    _ = ins.list_inspectables()

    actual_counts_pcoll_data = ins.get_pcoll_data(counts_identifier)
    expected_counts_pcoll_data = ib.collect(
        counts, n=10).to_json(orient='table')
    self.assertEqual(actual_counts_pcoll_data, expected_counts_pcoll_data)

    actual_counts_with_window_info = ins.get_pcoll_data(counts_identifier, True)
    expected_counts_with_window_info = ib.collect(
        counts, include_window_info=True).to_json(orient='table')
    self.assertEqual(
        actual_counts_with_window_info, expected_counts_with_window_info)
Example #6
  def test_wordcount(self):

    class WordExtractingDoFn(beam.DoFn):

      def process(self, element):
        text_line = element.strip()
        words = text_line.split()
        return words

    # TODO(qinyeli, BEAM-4755) remove explicitly overriding underlying runner
    # once interactive_runner works with FnAPI mode
    p = beam.Pipeline(
        runner=interactive_runner.InteractiveRunner(
            direct_runner.BundleBasedDirectRunner()))

    # Count the occurrences of each word.
    counts = (
        p
        | beam.Create(['to be or not to be that is the question'])
        | 'split' >> beam.ParDo(WordExtractingDoFn())
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(lambda wordones: (wordones[0], sum(wordones[1]))))

    result = p.run()
    result.wait_until_finish()

    actual = dict(result.get(counts))
    self.assertDictEqual(
        actual, {
            'to': 2,
            'be': 2,
            'or': 1,
            'not': 1,
            'that': 1,
            'is': 1,
            'the': 1,
            'question': 1
        })
Example #7
    def test_dataframes_same_cell_twice(self):
        p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))
        data = p | beam.Create([
            1, 2, 3
        ]) | beam.Map(lambda x: beam.Row(square=x * x, cube=x * x * x))
        df = to_dataframe(data)

        # Watch the local scope for Interactive Beam so that values will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        df_expected = pd.DataFrame({'square': [1, 4, 9], 'cube': [1, 8, 27]})
        pd.testing.assert_series_equal(
            df_expected['square'],
            ib.collect(df['square'], n=10).reset_index(drop=True))
        pd.testing.assert_series_equal(
            df_expected['cube'],
            ib.collect(df['cube'], n=10).reset_index(drop=True))
    def test_get_dot(self):
        p = beam.Pipeline(ir.InteractiveRunner())
        init_pcoll = p | 'Init' >> beam.Create(range(10))
        squares = init_pcoll | 'Square' >> beam.Map(lambda x: x * x)
        cubes = init_pcoll | 'Cube' >> beam.Map(lambda x: x**3)
        ib.watch(locals())

        self.assertEqual(('digraph G {\n'
                          'node [color=blue, fontcolor=blue, shape=box];\n'
                          '"Init";\n'
                          'init_pcoll [shape=circle];\n'
                          '"Square";\n'
                          'squares [shape=circle];\n'
                          '"Cube";\n'
                          'cubes [shape=circle];\n'
                          '"Init" -> init_pcoll;\n'
                          'init_pcoll -> "Square";\n'
                          'init_pcoll -> "Cube";\n'
                          '"Square" -> squares;\n'
                          '"Cube" -> cubes;\n'
                          '}\n'),
                         pipeline_graph.PipelineGraph(p).get_dot())
Example #9
  def test_get_dot_within_notebook(self, cell):
    # Assume a mocked ipython kernel and notebook frontend have been set up.
    ie.current_env()._is_in_ipython = True
    ie.current_env()._is_in_notebook = True
    with cell:  # Cell 1
      p = beam.Pipeline(ir.InteractiveRunner())
      # Immediately track this local pipeline so that ipython prompts when
      # applying transforms will be tracked and used for labels.
      ib.watch(locals())

    with cell:  # Cell 2
      init_pcoll = p | 'Init' >> beam.Create(range(10))

    with cell:  # Cell 3
      squares = init_pcoll | 'Square' >> beam.Map(lambda x: x * x)

    with cell:  # Cell 4
      cubes = init_pcoll | 'Cube' >> beam.Map(lambda x: x**3)

    # Tracks all PCollections defined so far.
    ib.watch(locals())
    self.assertEqual((
        'digraph G {\n'
        'node [color=blue, fontcolor=blue, shape=box];\n'
        '"[2]: Init";\n'
        'init_pcoll [shape=circle];\n'
        '"[3]: Square";\n'
        'squares [shape=circle];\n'
        '"[4]: Cube";\n'
        'cubes [shape=circle];\n'
        '"[2]: Init" -> init_pcoll;\n'
        'init_pcoll -> "[3]: Square";\n'
        'init_pcoll -> "[4]: Cube";\n'
        '"[3]: Square" -> squares;\n'
        '"[4]: Cube" -> cubes;\n'
        '}\n'),
                     pipeline_graph.PipelineGraph(p).get_dot())
    def test_instrument_example_unbounded_pipeline_to_multiple_read_cache(
            self):
        """Tests that the instrumenter works for multiple unbounded sources.
    """
        # Create the pipeline that will be instrumented.
        p_original = beam.Pipeline(interactive_runner.InteractiveRunner())
        ie.current_env().set_cache_manager(StreamingCache(cache_dir=None),
                                           p_original)
        source_1 = p_original | 'source1' >> beam.io.ReadFromPubSub(
            subscription='projects/fake-project/subscriptions/fake_sub')
        source_2 = p_original | 'source2' >> beam.io.ReadFromPubSub(
            subscription='projects/fake-project/subscriptions/fake_sub')
        # pylint: disable=possibly-unused-variable
        pcoll_1 = source_1 | 'square1' >> beam.Map(lambda x: x * x)
        # pylint: disable=possibly-unused-variable
        pcoll_2 = source_2 | 'square2' >> beam.Map(lambda x: x * x)

        # Mock as if cacheable PCollections are cached.
        ib.watch(locals())
        # This should be a no-op.
        utils.watch_sources(p_original)
        for name, pcoll in locals().items():
            if not isinstance(pcoll, beam.pvalue.PCollection):
                continue
            cache_key = self.cache_key_of(name, pcoll)
            self._mock_write_cache(p_original, [], cache_key)

        # Instrument the original pipeline to create the pipeline the user will see.
        instrumenter = instr.build_pipeline_instrument(p_original)
        actual_pipeline = beam.Pipeline.from_runner_api(
            proto=instrumenter.instrumented_pipeline_proto(),
            runner=interactive_runner.InteractiveRunner(),
            options=None)

        # Now, build the expected pipeline which replaces the unbounded source with
        # a TestStream.
        source_1_cache_key = self.cache_key_of('source_1', source_1)
        source_2_cache_key = self.cache_key_of('source_2', source_2)
        p_expected = beam.Pipeline()
        test_stream = (p_expected
                       | TestStream(output_tags=[
                           self.cache_key_of('source_1', source_1),
                           self.cache_key_of('source_2', source_2)
                       ]))
        # pylint: disable=expression-not-assigned
        test_stream[source_1_cache_key] | 'square1' >> beam.Map(
            lambda x: x * x)
        # pylint: disable=expression-not-assigned
        test_stream[source_2_cache_key] | 'square2' >> beam.Map(
            lambda x: x * x)

        # Test that the TestStream is outputting to the correct PCollection.
        class TestStreamVisitor(PipelineVisitor):
            def __init__(self):
                self.output_tags = set()

            def enter_composite_transform(self, transform_node):
                self.visit_transform(transform_node)

            def visit_transform(self, transform_node):
                transform = transform_node.transform
                if isinstance(transform, TestStream):
                    self.output_tags = transform.output_tags

        v = TestStreamVisitor()
        actual_pipeline.visit(v)
        expected_output_tags = set([source_1_cache_key, source_2_cache_key])
        actual_output_tags = v.output_tags
        self.assertSetEqual(expected_output_tags, actual_output_tags)

        # Test that the pipeline is as expected.
        assert_pipeline_proto_equal(self, p_expected.to_runner_api(),
                                    instrumenter.instrumented_pipeline_proto())
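
Note: the instrumenter tests above and below call cache_key_of and _mock_write_cache helpers that are defined on the test class but not shown in these excerpts. As a rough sketch only (an assumption modeled on the local cache_key_of function in Example #15 below), the key helper could look like this:

def cache_key_of(self, name, pcoll):
    # Assumed shape of the helper (not the authoritative implementation): build a
    # cache key from the variable name plus the identities of the PCollection and
    # its producer, mirroring the local helper shown in Example #15.
    return name + '_' + str(id(pcoll)) + '_' + str(id(pcoll.producer))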
    def test_instrument_example_unbounded_pipeline_to_read_cache_not_cached(
            self):
        """Tests that the instrumenter works when the PCollection is not cached.
    """
        # Create the pipeline that will be instrumented.
        from apache_beam.options.pipeline_options import StandardOptions
        options = StandardOptions(streaming=True)
        p_original_read_cache = beam.Pipeline(
            interactive_runner.InteractiveRunner(), options)
        ie.current_env().set_cache_manager(StreamingCache(cache_dir=None),
                                           p_original_read_cache)
        source_1 = p_original_read_cache | 'source1' >> beam.io.ReadFromPubSub(
            subscription='projects/fake-project/subscriptions/fake_sub')
        # pylint: disable=possibly-unused-variable
        pcoll_1 = source_1 | 'square1' >> beam.Map(lambda x: x * x)

        # Watch but do not cache the PCollections.
        ib.watch(locals())
        # This should be a no-op.
        utils.watch_sources(p_original_read_cache)
        # Instrument the original pipeline to create the pipeline the user will see.
        p_copy = beam.Pipeline.from_runner_api(
            p_original_read_cache.to_runner_api(),
            runner=interactive_runner.InteractiveRunner(),
            options=options)
        ie.current_env().add_derived_pipeline(p_original_read_cache, p_copy)
        instrumenter = instr.build_pipeline_instrument(p_copy)
        actual_pipeline = beam.Pipeline.from_runner_api(
            proto=instrumenter.instrumented_pipeline_proto(),
            runner=interactive_runner.InteractiveRunner(),
            options=options)

        # Now, build the expected pipeline which replaces the unbounded source with
        # a TestStream.
        source_1_cache_key = self.cache_key_of('source_1', source_1)
        p_expected = beam.Pipeline()
        ie.current_env().set_cache_manager(StreamingCache(cache_dir=None),
                                           p_expected)
        test_stream = (p_expected
                       | TestStream(output_tags=[source_1_cache_key]))
        # pylint: disable=expression-not-assigned
        (test_stream[source_1_cache_key]
         | 'square1' >> beam.Map(lambda x: x * x)
         | 'reify' >> beam.Map(lambda _: _)
         | cache.WriteCache(ie.current_env().get_cache_manager(p_expected),
                            'unused'))

        # Test that the TestStream is outputting to the correct PCollection.
        class TestStreamVisitor(PipelineVisitor):
            def __init__(self):
                self.output_tags = set()

            def enter_composite_transform(self, transform_node):
                self.visit_transform(transform_node)

            def visit_transform(self, transform_node):
                transform = transform_node.transform
                if isinstance(transform, TestStream):
                    self.output_tags = transform.output_tags

        v = TestStreamVisitor()
        actual_pipeline.visit(v)
        expected_output_tags = set([source_1_cache_key])
        actual_output_tags = v.output_tags
        self.assertSetEqual(expected_output_tags, actual_output_tags)

        # Test that the pipeline is as expected.
        assert_pipeline_proto_equal(self, p_expected.to_runner_api(),
                                    instrumenter.instrumented_pipeline_proto())
    def test_instrument_mixed_streaming_batch(self):
        """Tests caching for both batch and streaming sources in the same pipeline.

    This ensures that cached bounded and unbounded sources are read from the
    TestStream.
    """
        # Create the pipeline that will be instrumented.
        from apache_beam.options.pipeline_options import StandardOptions
        options = StandardOptions(streaming=True)
        p_original = beam.Pipeline(interactive_runner.InteractiveRunner(),
                                   options)
        streaming_cache_manager = StreamingCache(cache_dir=None)
        ie.current_env().set_cache_manager(streaming_cache_manager, p_original)
        source_1 = p_original | 'source1' >> beam.io.ReadFromPubSub(
            subscription='projects/fake-project/subscriptions/fake_sub')
        source_2 = p_original | 'source2' >> beam.Create([1, 2, 3, 4, 5])

        # pylint: disable=possibly-unused-variable
        pcoll_1 = ((source_1, source_2)
                   | beam.Flatten()
                   | 'square1' >> beam.Map(lambda x: x * x))

        # Watch but do not cache the PCollections.
        ib.watch(locals())
        # This should be a no-op.
        utils.watch_sources(p_original)
        self._mock_write_cache(p_original, [],
                               self.cache_key_of('source_2', source_2))
        ie.current_env().mark_pcollection_computed([source_2])

        # Instrument the original pipeline to create the pipeline the user will see.
        p_copy = beam.Pipeline.from_runner_api(
            p_original.to_runner_api(),
            runner=interactive_runner.InteractiveRunner(),
            options=options)
        ie.current_env().add_derived_pipeline(p_original, p_copy)
        instrumenter = instr.build_pipeline_instrument(p_copy)
        actual_pipeline = beam.Pipeline.from_runner_api(
            proto=instrumenter.instrumented_pipeline_proto(),
            runner=interactive_runner.InteractiveRunner(),
            options=options)

        # Now, build the expected pipeline which replaces the unbounded source with
        # a TestStream.
        source_1_cache_key = self.cache_key_of('source_1', source_1)
        source_2_cache_key = self.cache_key_of('source_2', source_2)
        p_expected = beam.Pipeline()
        ie.current_env().set_cache_manager(streaming_cache_manager, p_expected)
        test_stream = (
            p_expected
            | TestStream(output_tags=[source_1_cache_key, source_2_cache_key]))
        # pylint: disable=expression-not-assigned
        ((test_stream[self.cache_key_of('source_1', source_1)],
          test_stream[self.cache_key_of('source_2', source_2)])
         | beam.Flatten()
         | 'square1' >> beam.Map(lambda x: x * x)
         | 'reify' >> beam.Map(lambda _: _)
         | cache.WriteCache(ie.current_env().get_cache_manager(p_expected),
                            'unused'))

        # Test that the TestStream is outputting to the correct PCollection.
        class TestStreamVisitor(PipelineVisitor):
            def __init__(self):
                self.output_tags = set()

            def enter_composite_transform(self, transform_node):
                self.visit_transform(transform_node)

            def visit_transform(self, transform_node):
                transform = transform_node.transform
                if isinstance(transform, TestStream):
                    self.output_tags = transform.output_tags

        v = TestStreamVisitor()
        actual_pipeline.visit(v)
        expected_output_tags = set([source_1_cache_key, source_2_cache_key])
        actual_output_tags = v.output_tags
        self.assertSetEqual(expected_output_tags, actual_output_tags)

        # Test that the pipeline is as expected.
        assert_pipeline_proto_equal(self, p_expected.to_runner_api(),
                                    instrumenter.instrumented_pipeline_proto())
    def test_able_to_cache_intermediate_unbounded_source_pcollection(self):
        """Tests being able to cache an intermediate source PCollection.

    In the following pipeline, the source doesn't have a reference and so is
    not automatically cached in the watch() command. This tests that this case
    is taken care of.
    """
        # Create the pipeline that will be instrumented.
        from apache_beam.options.pipeline_options import StandardOptions
        options = StandardOptions(streaming=True)
        streaming_cache_manager = StreamingCache(cache_dir=None)
        p_original_cache_source = beam.Pipeline(
            interactive_runner.InteractiveRunner(), options)
        ie.current_env().set_cache_manager(streaming_cache_manager,
                                           p_original_cache_source)

        # pylint: disable=possibly-unused-variable
        source_1 = (
            p_original_cache_source
            | 'source1' >> beam.io.ReadFromPubSub(
                subscription='projects/fake-project/subscriptions/fake_sub')
            | beam.Map(lambda e: e))

        # Watch but do not cache the PCollections.
        ib.watch(locals())
        # Make sure that sources without a user reference are still cached.
        utils.watch_sources(p_original_cache_source)

        intermediate_source_pcoll = None
        for watching in ie.current_env().watching():
            watching = list(watching)
            for var, watchable in watching:
                if 'synthetic' in var:
                    intermediate_source_pcoll = watchable
                    break

        # Instrument the original pipeline to create the pipeline the user will see.
        p_copy = beam.Pipeline.from_runner_api(
            p_original_cache_source.to_runner_api(),
            runner=interactive_runner.InteractiveRunner(),
            options=options)
        ie.current_env().add_derived_pipeline(p_original_cache_source, p_copy)
        instrumenter = instr.build_pipeline_instrument(p_copy)
        actual_pipeline = beam.Pipeline.from_runner_api(
            proto=instrumenter.instrumented_pipeline_proto(),
            runner=interactive_runner.InteractiveRunner(),
            options=options)
        ie.current_env().add_derived_pipeline(p_original_cache_source,
                                              actual_pipeline)

        # Now, build the expected pipeline which replaces the unbounded source with
        # a TestStream.
        intermediate_source_pcoll_cache_key = self.cache_key_of(
            'synthetic_var_' + str(id(intermediate_source_pcoll)),
            intermediate_source_pcoll)
        p_expected = beam.Pipeline()
        ie.current_env().set_cache_manager(streaming_cache_manager, p_expected)
        test_stream = (
            p_expected
            | TestStream(output_tags=[intermediate_source_pcoll_cache_key]))
        # pylint: disable=expression-not-assigned
        (test_stream[intermediate_source_pcoll_cache_key]
         | 'square1' >> beam.Map(lambda e: e)
         | 'reify' >> beam.Map(lambda _: _)
         | cache.WriteCache(ie.current_env().get_cache_manager(p_expected),
                            'unused'))

        # Test that the TestStream is outputting to the correct PCollection.
        class TestStreamVisitor(PipelineVisitor):
            def __init__(self):
                self.output_tags = set()

            def enter_composite_transform(self, transform_node):
                self.visit_transform(transform_node)

            def visit_transform(self, transform_node):
                transform = transform_node.transform
                if isinstance(transform, TestStream):
                    self.output_tags = transform.output_tags

        v = TestStreamVisitor()
        actual_pipeline.visit(v)
        expected_output_tags = set([intermediate_source_pcoll_cache_key])
        actual_output_tags = v.output_tags
        self.assertSetEqual(expected_output_tags, actual_output_tags)

        # Test that the pipeline is as expected.
        assert_pipeline_proto_equal(self, p_expected.to_runner_api(),
                                    instrumenter.instrumented_pipeline_proto())
Example #14
    def test_streaming_wordcount(self):
        self.skipTest('[BEAM-9601] Test is breaking PreCommits')

        class WordExtractingDoFn(beam.DoFn):
            def process(self, element):
                text_line = element.strip()
                words = text_line.split()
                return words

        # Add the TestStream so that it can be cached.
        ib.options.capturable_sources.add(TestStream)
        ib.options.capture_duration = timedelta(seconds=1)

        p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(),
                          options=StandardOptions(streaming=True))

        data = (
            p
            | TestStream()
                .advance_watermark_to(0)
                .advance_processing_time(1)
                .add_elements(['to', 'be', 'or', 'not', 'to', 'be'])
                .advance_watermark_to(20)
                .advance_processing_time(1)
                .add_elements(['that', 'is', 'the', 'question'])
            | beam.WindowInto(beam.window.FixedWindows(10))) # yapf: disable

        counts = (data
                  | 'split' >> beam.ParDo(WordExtractingDoFn())
                  | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
                  | 'group' >> beam.GroupByKey()
                  | 'count' >> beam.Map(lambda wordones:
                                        (wordones[0], sum(wordones[1]))))

        # Watch the local scope for Interactive Beam so that referenced PCollections
        # will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        # This tests that the data was correctly cached.
        pane_info = PaneInfo(True, True, PaneInfoTiming.UNKNOWN, 0, 0)
        expected_data_df = pd.DataFrame(
            [('to', 0, [beam.window.IntervalWindow(0, 10)], pane_info),
             ('be', 0, [beam.window.IntervalWindow(0, 10)], pane_info),
             ('or', 0, [beam.window.IntervalWindow(0, 10)], pane_info),
             ('not', 0, [beam.window.IntervalWindow(0, 10)], pane_info),
             ('to', 0, [beam.window.IntervalWindow(0, 10)], pane_info),
             ('be', 0, [beam.window.IntervalWindow(0, 10)], pane_info),
             ('that', 20000000, [beam.window.IntervalWindow(20, 30)
                                 ], pane_info),
             ('is', 20000000, [beam.window.IntervalWindow(20, 30)], pane_info),
             ('the', 20000000, [beam.window.IntervalWindow(20, 30)
                                ], pane_info),
             ('question', 20000000, [beam.window.IntervalWindow(20, 30)
                                     ], pane_info)],
            columns=[0, 'event_time', 'windows', 'pane_info'])

        data_df = ib.collect(data, include_window_info=True)
        pd.testing.assert_frame_equal(expected_data_df, data_df)

        # This tests that the windowing was passed correctly so that all the data
        # is also aggregated correctly.
        pane_info = PaneInfo(True, False, PaneInfoTiming.ON_TIME, 0, 0)
        expected_counts_df = pd.DataFrame([
            ('to', 2, 9999999, [beam.window.IntervalWindow(0, 10)], pane_info),
            ('be', 2, 9999999, [beam.window.IntervalWindow(0, 10)], pane_info),
            ('or', 1, 9999999, [beam.window.IntervalWindow(0, 10)], pane_info),
            ('not', 1, 9999999, [beam.window.IntervalWindow(0, 10)
                                 ], pane_info),
            ('that', 1, 29999999, [beam.window.IntervalWindow(20, 30)
                                   ], pane_info),
            ('is', 1, 29999999, [beam.window.IntervalWindow(20, 30)
                                 ], pane_info),
            ('the', 1, 29999999, [beam.window.IntervalWindow(20, 30)
                                  ], pane_info),
            ('question', 1, 29999999, [beam.window.IntervalWindow(20, 30)
                                       ], pane_info)
        ],
                                          columns=[
                                              0, 1, 'event_time', 'windows',
                                              'pane_info'
                                          ])

        counts_df = ib.collect(counts, include_window_info=True)
        pd.testing.assert_frame_equal(expected_counts_df, counts_df)
Example #15
    def test_instrument_example_unbounded_pipeline_direct_from_source(self):
        """Tests that the it caches PCollections from a source.
        """
        # Create a new interactive environment to make the test idempotent.
        ie.new_env(cache_manager=streaming_cache.StreamingCache(
            cache_dir=None))

        # Create the pipeline that will be instrumented.
        from apache_beam.options.pipeline_options import StandardOptions
        options = StandardOptions(streaming=True)
        p_original = beam.Pipeline(interactive_runner.InteractiveRunner(),
                                   options)
        source_1 = p_original | 'source1' >> beam.io.ReadFromPubSub(
            subscription='projects/fake-project/subscriptions/fake_sub')
        # pylint: disable=possibly-unused-variable

        # Watch but do not cache the PCollections.
        ib.watch(locals())

        def cache_key_of(name, pcoll):
            return name + '_' + str(id(pcoll)) + '_' + str(id(pcoll.producer))

        # Instrument the original pipeline to create the pipeline the user will see.
        p_copy = beam.Pipeline.from_runner_api(
            p_original.to_runner_api(),
            runner=interactive_runner.InteractiveRunner(),
            options=options)
        instrumenter = instr.build_pipeline_instrument(p_copy)
        actual_pipeline = beam.Pipeline.from_runner_api(
            proto=instrumenter.instrumented_pipeline_proto(),
            runner=interactive_runner.InteractiveRunner(),
            options=options)

        # Now, build the expected pipeline which replaces the unbounded source with
        # a TestStream.
        source_1_cache_key = cache_key_of('source_1', source_1)
        p_expected = beam.Pipeline()

        # pylint: disable=unused-variable
        test_stream = (
            p_expected
            | TestStream(output_tags=[cache_key_of('source_1', source_1)]))

        # Test that the TestStream is outputting to the correct PCollection.
        class TestStreamVisitor(PipelineVisitor):
            def __init__(self):
                self.output_tags = set()

            def enter_composite_transform(self, transform_node):
                self.visit_transform(transform_node)

            def visit_transform(self, transform_node):
                transform = transform_node.transform
                if isinstance(transform, TestStream):
                    self.output_tags = transform.output_tags

        v = TestStreamVisitor()
        actual_pipeline.visit(v)
        expected_output_tags = set([source_1_cache_key])
        actual_output_tags = v.output_tags
        self.assertSetEqual(expected_output_tags, actual_output_tags)

        # Test that the pipeline is as expected.
        assert_pipeline_proto_equal(
            self, p_expected.to_runner_api(use_fake_coders=True),
            instrumenter.instrumented_pipeline_proto())
def _build_an_empty_stream_pipeline():
    pipeline_options = PipelineOptions(streaming=True)
    p = beam.Pipeline(interactive_runner.InteractiveRunner(),
                      options=pipeline_options)
    ib.watch({'pipeline': p})
    return p
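
A minimal usage sketch for this helper (the snippet below is illustrative; only ib.watch and track_user_pipelines appear in the surrounding examples):

# Illustrative usage: the helper already watches the pipeline via ib.watch,
# so tracking user pipelines registers it with the interactive environment.
p = _build_an_empty_stream_pipeline()
ie.current_env().track_user_pipelines()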
Example #17
def test_has_unbounded_source(self):
    p = beam.Pipeline(interactive_runner.InteractiveRunner())
    ie.current_env().set_cache_manager(InMemoryCache(), p)
    _ = p | 'ReadUnboundedSource' >> beam.io.ReadFromPubSub(
        subscription='projects/fake-project/subscriptions/fake_sub')
    self.assertTrue(instr.has_unbounded_sources(p))
Example #18
    def test_dataframe_caching(self, cell):

        # Create a pipeline that exercises the DataFrame API. This will also use
        # caching in the background.
        with cell:  # Cell 1
            p = beam.Pipeline(interactive_runner.InteractiveRunner())
            ib.watch({'p': p})

        with cell:  # Cell 2
            data = p | beam.Create([
                1, 2, 3
            ]) | beam.Map(lambda x: beam.Row(square=x * x, cube=x * x * x))

            with beam.dataframe.allow_non_parallel_operations():
                df = to_dataframe(data).reset_index(drop=True)

            ib.collect(df)

        with cell:  # Cell 3
            df['output'] = df['square'] * df['cube']
            ib.collect(df)

        with cell:  # Cell 4
            df['output'] = 0
            ib.collect(df)

        # We use a trace through the graph to perform an isomorphism test. The end
        # output should look like a linear graph. This indicates that the dataframe
        # transform was correctly broken into separate pieces to cache. If caching
        # isn't enabled, all the dataframe computation nodes are connected to a
        # single shared node.
        trace = []

        # Only look at the top-level transforms for the isomorphism. The test
        # doesn't care about the transform implementations, just the overall shape.
        class TopLevelTracer(beam.pipeline.PipelineVisitor):
            def _find_root_producer(self,
                                    node: beam.pipeline.AppliedPTransform):
                if node is None or not node.full_label:
                    return None

                parent = self._find_root_producer(node.parent)
                if parent is None:
                    return node

                return parent

            def _add_to_trace(self, node, trace):
                if '/' not in str(node):
                    if node.inputs:
                        producer = self._find_root_producer(
                            node.inputs[0].producer)
                        producer_name = producer.full_label if producer else ''
                        trace.append((producer_name, node.full_label))

            def visit_transform(self, node: beam.pipeline.AppliedPTransform):
                self._add_to_trace(node, trace)

            def enter_composite_transform(
                    self, node: beam.pipeline.AppliedPTransform):
                self._add_to_trace(node, trace)

        p.visit(TopLevelTracer())

        # Do the isomorphism test which states that the topological sort of the
        # graph yields a linear graph.
        trace_string = '\n'.join(str(t) for t in trace)
        prev_producer = ''
        for producer, consumer in trace:
            self.assertEqual(producer, prev_producer, trace_string)
            prev_producer = consumer
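
For intuition, the check above accepts a trace only when each consumer becomes the producer of the next edge; a hand-built illustration with invented labels:

# Illustrative only: a linear (chain-shaped) trace passes the check.
trace = [('', 'Create'), ('Create', 'Square'), ('Square', 'Collect')]
prev_producer = ''
for producer, consumer in trace:
    assert producer == prev_producer
    prev_producer = consumer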
Example #19
    def test_recordings_record(self):
        """Tests that recording pipeline succeeds."""

        # Add the TestStream so that it can be cached.
        ib.options.recordable_sources.add(TestStream)

        # Create a pipeline with an arbitrary number of elements.
        p = beam.Pipeline(ir.InteractiveRunner(),
                          options=PipelineOptions(streaming=True))
        # pylint: disable=unused-variable
        _ = (p
             | TestStream()
                 .advance_watermark_to(0)
                 .advance_processing_time(1)
                 .add_elements(list(range(10)))
                 .advance_processing_time(1))  # yapf: disable
        ib.watch(locals())
        ie.current_env().track_user_pipelines()

        # Assert that the pipeline starts in a good state.
        self.assertEqual(
            ib.recordings.describe(p)['state'], PipelineState.STOPPED)
        self.assertEqual(ib.recordings.describe(p)['size'], 0)

        # Create a limiter that stops the background caching job when something is
        # written to the cache. This ensures that the pipeline is functioning
        # properly and that there are no data races with the test.
        class SizeLimiter(Limiter):
            def __init__(self, pipeline):
                self.pipeline = pipeline
                self.should_trigger = False

            def is_triggered(self):
                return (ib.recordings.describe(self.pipeline)['size'] > 0
                        and self.should_trigger)

        limiter = SizeLimiter(p)
        ib.options.capture_control.set_limiters_for_test([limiter])

        # Assert that a recording can be started only once.
        self.assertTrue(ib.recordings.record(p))
        self.assertFalse(ib.recordings.record(p))
        self.assertEqual(
            ib.recordings.describe(p)['state'], PipelineState.RUNNING)

        # Wait for the pipeline to start and write something to cache.
        limiter.should_trigger = True
        for _ in range(60):
            if limiter.is_triggered():
                break
            time.sleep(1)
        self.assertTrue(
            limiter.is_triggered(),
            'Test timed out waiting for limiter to be triggered. This indicates '
            'that the BackgroundCachingJob did not cache anything.')

        # Assert that a recording can be stopped and can't be started again until
        # after the cache is cleared.
        ib.recordings.stop(p)
        self.assertEqual(
            ib.recordings.describe(p)['state'], PipelineState.STOPPED)
        self.assertFalse(ib.recordings.record(p))
        ib.recordings.clear(p)
        self.assertTrue(ib.recordings.record(p))
        ib.recordings.stop(p)
    def test_streaming_wordcount(self):
        class WordExtractingDoFn(beam.DoFn):
            def process(self, element):
                text_line = element.strip()
                words = text_line.split()
                return words

        # Add the TestStream so that it can be cached.
        ib.options.capturable_sources.add(TestStream)

        p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(),
                          options=StandardOptions(streaming=True))

        data = (
            p
            | TestStream()
                .advance_watermark_to(0)
                .advance_processing_time(1)
                .add_elements(['to', 'be', 'or', 'not', 'to', 'be'])
                .advance_watermark_to(20)
                .advance_processing_time(1)
                .add_elements(['that', 'is', 'the', 'question'])
            | beam.WindowInto(beam.window.FixedWindows(10))) # yapf: disable

        counts = (data
                  | 'split' >> beam.ParDo(WordExtractingDoFn())
                  | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
                  | 'group' >> beam.GroupByKey()
                  | 'count' >> beam.Map(lambda wordones:
                                        (wordones[0], sum(wordones[1]))))

        # Watch the local scope for Interactive Beam so that referenced PCollections
        # will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        # Create a fake limiter that cancels the BCJ once the main job receives the
        # expected amount of results.
        class FakeLimiter:
            def __init__(self, p, pcoll):
                self.p = p
                self.pcoll = pcoll

            def is_triggered(self):
                result = ie.current_env().pipeline_result(self.p)
                if result:
                    try:
                        results = result.get(self.pcoll)
                    except ValueError:
                        return False
                    return len(results) >= 10
                return False

        # This sets the limiters to stop reading when the test receives 10 elements.
        ie.current_env().options.capture_control.set_limiters_for_test(
            [FakeLimiter(p, data)])

        # This tests that the data was correctly cached.
        pane_info = PaneInfo(True, True, PaneInfoTiming.UNKNOWN, 0, 0)
        expected_data_df = pd.DataFrame([
            ('to', 0, [IntervalWindow(0, 10)], pane_info),
            ('be', 0, [IntervalWindow(0, 10)], pane_info),
            ('or', 0, [IntervalWindow(0, 10)], pane_info),
            ('not', 0, [IntervalWindow(0, 10)], pane_info),
            ('to', 0, [IntervalWindow(0, 10)], pane_info),
            ('be', 0, [IntervalWindow(0, 10)], pane_info),
            ('that', 20000000, [IntervalWindow(20, 30)], pane_info),
            ('is', 20000000, [IntervalWindow(20, 30)], pane_info),
            ('the', 20000000, [IntervalWindow(20, 30)], pane_info),
            ('question', 20000000, [IntervalWindow(20, 30)], pane_info)
        ], columns=[0, 'event_time', 'windows', 'pane_info']) # yapf: disable

        data_df = ib.collect(data, include_window_info=True)
        pd.testing.assert_frame_equal(expected_data_df, data_df)

        # This tests that the windowing was passed correctly so that all the data
        # is also aggregated correctly.
        pane_info = PaneInfo(True, False, PaneInfoTiming.ON_TIME, 0, 0)
        expected_counts_df = pd.DataFrame([
            ('be', 2, 9999999, [IntervalWindow(0, 10)], pane_info),
            ('not', 1, 9999999, [IntervalWindow(0, 10)], pane_info),
            ('or', 1, 9999999, [IntervalWindow(0, 10)], pane_info),
            ('to', 2, 9999999, [IntervalWindow(0, 10)], pane_info),
            ('is', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
            ('question', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
            ('that', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
            ('the', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
        ], columns=[0, 1, 'event_time', 'windows', 'pane_info']) # yapf: disable

        counts_df = ib.collect(counts, include_window_info=True)

        # The group by key has no guarantee of order. So we post-process the DF by
        # sorting so we can test equality.
        sorted_counts_df = (counts_df
                            .sort_values(['event_time', 0], ascending=True)
                            .reset_index(drop=True)) # yapf: disable
        pd.testing.assert_frame_equal(expected_counts_df, sorted_counts_df)
Example #21
def test_has_unbounded_source(self):
    p = beam.Pipeline(interactive_runner.InteractiveRunner())
    _ = p | 'ReadUnboundedSource' >> beam.io.ReadFromPubSub(
        subscription='projects/fake-project/subscriptions/fake_sub')
    self.assertTrue(instr.has_unbounded_sources(p))
    def test_wordcount(self):
        class WordExtractingDoFn(beam.DoFn):
            def process(self, element):
                text_line = element.strip()
                words = text_line.split()
                return words

        p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))

        # Count the occurrences of each word.
        counts = (p
                  | beam.Create(['to be or not to be that is the question'])
                  | 'split' >> beam.ParDo(WordExtractingDoFn())
                  | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
                  | 'group' >> beam.GroupByKey()
                  | 'count' >> beam.Map(lambda wordones:
                                        (wordones[0], sum(wordones[1]))))

        # Watch the local scope for Interactive Beam so that counts will be cached.
        ib.watch(locals())

        result = p.run()
        result.wait_until_finish()

        actual = list(result.get(counts))
        self.assertSetEqual(
            set(actual),
            set([
                ('or', 1),
                ('that', 1),
                ('be', 2),
                ('is', 1),
                ('question', 1),
                ('to', 2),
                ('the', 1),
                ('not', 1),
            ]))

        # Truncate the precision to millis because the window coder uses millis
        # as its unit, which then gets upcast to micros.
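        # For example, with an illustrative value: (1234567 // 1000) * 1000 == 1234000,
        # i.e. the sub-millisecond remainder of the timestamp is dropped.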
        end_of_window = (GlobalWindow().max_timestamp().micros // 1000) * 1000
        df_counts = ib.collect(counts, include_window_info=True)
        df_expected = pd.DataFrame(
            {
                0: [e[0] for e in actual],
                1: [e[1] for e in actual],
                'event_time': [end_of_window for _ in actual],
                'windows': [[GlobalWindow()] for _ in actual],
                'pane_info': [
                    PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0)
                    for _ in actual
                ]
            },
            columns=[0, 1, 'event_time', 'windows', 'pane_info'])

        pd.testing.assert_frame_equal(df_expected, df_counts)

        actual_reified = result.get(counts, include_window_info=True)
        expected_reified = [
            WindowedValue(e, Timestamp(micros=end_of_window), [GlobalWindow()],
                          PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0))
            for e in actual
        ]
        self.assertEqual(actual_reified, expected_reified)
Example #23
def test_not_has_unbounded_source(self):
    p = beam.Pipeline(interactive_runner.InteractiveRunner())
    with tempfile.NamedTemporaryFile(delete=False) as f:
        f.write(b'test')
    _ = p | 'ReadBoundedSource' >> beam.io.ReadFromText(f.name)
    self.assertFalse(instr.has_unbounded_sources(p))