Exemplo n.º 1
0
    def test_parse_windowedvalue_with_dicts(self):
        """Checks that dict payloads in WindowedValues become named columns.

        The expected frame has one column per dict key, NaN where an element
        lacks a key, and the window info appended as extra columns.
        """
        first = windowed_value({'b': 2, 'd': 4})
        second = windowed_value({'a': 1, 'b': 2, 'c': 3})
        elements = [first, second]

        actual = utils.elements_to_df(elements, include_window_info=True)

        column_names = [
            'a', 'b', 'c', 'd', 'event_time', 'windows', 'pane_info'
        ]
        event_time = int(1e6)
        rows = [
            [np.nan, 2, np.nan, 4, event_time, first.windows, first.pane_info],
            [1, 2, 3, np.nan, event_time, second.windows, second.pane_info],
        ]
        expected = pd.DataFrame(rows, columns=column_names)

        # check_like makes the comparison ignore index/column ordering.
        pd.testing.assert_frame_equal(actual, expected, check_like=True)
Exemplo n.º 2
0
    def test_parse_windowedvalue_with_dicts(self):
        """Checks that dict payloads in WindowedValues become named columns.

        The expected frame has one column per dict key, NaN where an element
        lacks a key, and the window info appended as extra columns.
        """
        from apache_beam.transforms.window import GlobalWindow

        payloads = [{'b': 2, 'd': 4}, {'a': 1, 'b': 2, 'c': 3}]
        elements = [
            WindowedValue(payload, 1, [GlobalWindow()]) for payload in payloads
        ]
        first, second = elements

        actual = utils.elements_to_df(elements, include_window_info=True)

        column_names = [
            'a', 'b', 'c', 'd', 'event_time', 'windows', 'pane_info'
        ]
        event_time = int(1e6)
        rows = [
            [np.nan, 2, np.nan, 4, event_time, first.windows, first.pane_info],
            [1, 2, 3, np.nan, event_time, second.windows, second.pane_info],
        ]
        expected = pd.DataFrame(rows, columns=column_names)

        pd.testing.assert_frame_equal(actual, expected)
Exemplo n.º 3
0
def collect(pcoll, n='inf', duration='inf', include_window_info=False):
    """Materializes the elements from a PCollection into a Dataframe.

    The PCollection is recorded through the recording manager of its pipeline
    and the recorded stream is read back into memory. Recording can be bounded
    by a maximum number of elements, a maximum duration, or both; a limiter
    left at 'inf' is treated as unbounded.

    Args:
      pcoll: the apache_beam.pvalue.PCollection to materialize.
      n: (optional) max number of elements to read. Default 'inf'.
      duration: (optional) max duration of elements to read. Default 'inf'.
      include_window_info: (optional) forwarded to elements_to_df.
        Default False.

    Returns:
      The DataFrame produced by elements_to_df for the recorded elements, or
      an empty DataFrame when reading is stopped by a KeyboardInterrupt (the
      recording is cancelled in that case).

    For example::

      p = beam.Pipeline(InteractiveRunner())
      init = p | 'Init' >> beam.Create(range(10))
      square = init | 'Square' >> beam.Map(lambda x: x * x)

      # Run the pipeline and bring the PCollection into memory as a Dataframe.
      in_memory_square = collect(square, n=5)
    """
    assert isinstance(pcoll, beam.pvalue.PCollection), (
        '{} is not an apache_beam.pvalue.PCollection.'.format(pcoll))

    # Both limiters reject every string except 'inf' with the same message.
    only_inf_supported = (
        'Currently only the string \'inf\' is supported. This denotes reading '
        'elements until the recording is stopped via a kernel interrupt.')

    if isinstance(n, str):
        assert n == 'inf', only_inf_supported
    elif isinstance(n, int):
        assert n > 0, 'n needs to be positive or the string \'inf\''

    if isinstance(duration, str):
        assert duration == 'inf', only_inf_supported
    elif isinstance(duration, int):
        assert duration > 0, 'duration needs to be positive or the string \'inf\''

    # The recording manager is handed a float infinity instead of 'inf'.
    max_n = float('inf') if n == 'inf' else n
    max_duration_secs = float('inf') if duration == 'inf' else duration

    pipeline = pcoll.pipeline
    manager = ie.current_env().get_recording_manager(
        pipeline, create_if_absent=True)
    recording = manager.record(
        [pcoll], max_n=max_n, max_duration_secs=max_duration_secs)

    try:
        recorded = list(recording.stream(pcoll).read())
    except KeyboardInterrupt:
        # The user interrupted the read: stop recording and return nothing.
        recording.cancel()
        return pd.DataFrame()
    else:
        return elements_to_df(
            recorded, include_window_info=include_window_info)
Exemplo n.º 4
0
  def _to_dataframe(self):
    """Builds a DataFrame from the 'full' cache entry for this cache key.

    When the cache holds no entry for the key, elements_to_df is called with
    an empty element list.
    """
    cache = ie.current_env().cache_manager()
    if not cache.exists('full', self._cache_key):
      return elements_to_df([], self._include_window_info)

    pcoder = cache.load_pcoder('full', self._cache_key)
    cache_reader, _ = cache.read('full', self._cache_key)
    # Elements are always decoded with window info; whether it is shown is
    # decided by the flag passed to elements_to_df.
    elements = list(
        to_element_list(cache_reader, pcoder, include_window_info=True))
    return elements_to_df(elements, self._include_window_info)
Exemplo n.º 5
0
    def test_parse_windowedvalue(self):
        """Checks that WindowedValues are unwrapped without window columns.

        With include_window_info=False only the tuple payloads are expected
        in the resulting frame.
        """
        pairs = [('a', 2), ('b', 3)]
        elements = [windowed_value(pair) for pair in pairs]

        actual = utils.elements_to_df(elements, include_window_info=False)
        expected = pd.DataFrame([list(pair) for pair in pairs], columns=[0, 1])

        # check_like makes the comparison ignore index/column ordering.
        pd.testing.assert_frame_equal(actual, expected, check_like=True)
Exemplo n.º 6
0
  def test_parse_dataframes(self):
    """Checks that DataFrame-valued elements are combined into one DataFrame.

    The expected result is the concatenation of every element's value.
    """
    source = beam.Pipeline() | beam.Create([Record(0, 0, 0)])
    deferred = to_dataframe(source)

    elements = [
        windowed_value(pd.DataFrame(Record(n, 0, 0))) for n in range(10)
    ]

    # The proxy of the deferred frame is passed as the element type.
    parsed = utils.elements_to_df(
        elements, element_type=deferred._expr.proxy())
    actual = parsed.reset_index(drop=True)

    frames = [element.value for element in elements]
    expected = pd.concat(frames, ignore_index=True)

    pd.testing.assert_frame_equal(actual, expected)
Exemplo n.º 7
0
    def test_parse_windowedvalue_with_window_info(self):
        """Checks that window info is appended as its own columns.

        With include_window_info=True the expected frame carries the payload
        columns followed by 'event_time', 'windows' and 'pane_info'.
        """
        first = windowed_value(('a', 2))
        second = windowed_value(('b', 3))
        elements = [first, second]

        actual = utils.elements_to_df(elements, include_window_info=True)

        event_time = int(1e6)
        rows = [
            ['a', 2, event_time, first.windows, first.pane_info],
            ['b', 3, event_time, second.windows, second.pane_info],
        ]
        expected = pd.DataFrame(
            rows, columns=[0, 1, 'event_time', 'windows', 'pane_info'])

        # check_like makes the comparison ignore index/column ordering.
        pd.testing.assert_frame_equal(actual, expected, check_like=True)
Exemplo n.º 8
0
    def test_parse_windowedvalue(self):
        """Checks that WindowedValues are unwrapped without window columns.

        With include_window_info=False only the tuple payloads are expected
        in the resulting frame.
        """
        from apache_beam.transforms.window import GlobalWindow

        pairs = [('a', 2), ('b', 3)]
        elements = [WindowedValue(pair, 1, [GlobalWindow()]) for pair in pairs]

        actual = utils.elements_to_df(elements, include_window_info=False)
        expected = pd.DataFrame([list(pair) for pair in pairs], columns=[0, 1])

        pd.testing.assert_frame_equal(actual, expected)
Exemplo n.º 9
0
    def test_parse_windowedvalue_with_window_info(self):
        """Checks that window info is appended as its own columns.

        With include_window_info=True the expected frame carries the payload
        columns followed by 'event_time', 'windows' and 'pane_info'.
        """
        from apache_beam.transforms.window import GlobalWindow

        first = WindowedValue(('a', 2), 1, [GlobalWindow()])
        second = WindowedValue(('b', 3), 1, [GlobalWindow()])
        elements = [first, second]

        actual = utils.elements_to_df(elements, include_window_info=True)

        event_time = int(1e6)
        rows = [
            ['a', 2, event_time, first.windows, first.pane_info],
            ['b', 3, event_time, second.windows, second.pane_info],
        ]
        expected = pd.DataFrame(
            rows, columns=[0, 1, 'event_time', 'windows', 'pane_info'])

        pd.testing.assert_frame_equal(actual, expected)
Exemplo n.º 10
0
 def _to_dataframe(self):
     """Reads the stream with tail=False and converts it via elements_to_df."""
     elements = list(self._stream.read(tail=False))
     return elements_to_df(
         elements, self._include_window_info, element_type=self._element_type)
Exemplo n.º 11
0
def collect(pcoll, n='inf', duration='inf', include_window_info=False):
    """Materializes the elements from a PCollection into a Dataframe.

    The PCollection is recorded through the recording manager of its pipeline
    and the recorded stream is read back into memory. Recording can be bounded
    by a maximum number of elements, a maximum duration, or both; a limiter
    left at 'inf' is treated as unbounded. A DeferredBase input is first
    converted to a PCollection with to_pcollection and watched.

    Args:
      pcoll: the PCollection (or DeferredBase) to materialize.
      n: (optional) max number of elements to visualize. Default 'inf'.
      duration: (optional) max duration of elements to read in integer seconds
        or a string duration. Default 'inf'.
      include_window_info: (optional) if True, appends the windowing
        information to each row. Default False.

    Returns:
      The DataFrame produced by elements_to_df, sliced to its first n rows, or
      an empty DataFrame when reading is stopped by a KeyboardInterrupt (the
      recording is cancelled in that case).

    For example::

      p = beam.Pipeline(InteractiveRunner())
      init = p | 'Init' >> beam.Create(range(10))
      square = init | 'Square' >> beam.Map(lambda x: x * x)

      # Run the pipeline and bring the PCollection into memory as a Dataframe.
      in_memory_square = collect(square, n=5)
    """
    # The element type is remembered so that elements_to_df can make an
    # informed decision on how to assemble the result.
    if not isinstance(pcoll, DeferredBase):
        element_type = pcoll.element_type
    else:
        # The proxy carries the output shape of the DataFrame.
        # TODO(BEAM-11064): Once type hints are implemented for pandas, use
        # those instead of the proxy.
        element_type = pcoll._expr.proxy()
        pcoll = to_pcollection(
            pcoll, yield_elements='pandas', label=str(pcoll._expr))
        watch({'anonymous_pcollection_{}'.format(id(pcoll)): pcoll})

    assert isinstance(pcoll, beam.pvalue.PCollection), (
        '{} is not an apache_beam.pvalue.PCollection.'.format(pcoll))

    if isinstance(n, str):
        assert n == 'inf', (
            'Currently only the string \'inf\' is supported. This denotes reading '
            'elements until the recording is stopped via a kernel interrupt.')
    elif isinstance(n, int):
        assert n > 0, 'n needs to be positive or the string \'inf\''

    if isinstance(duration, int):
        assert duration > 0, (
            'duration needs to be positive, a duration string, '
            'or the string \'inf\'')

    # The recording manager is handed a float infinity instead of 'inf'.
    max_n = float('inf') if n == 'inf' else n
    max_duration = float('inf') if duration == 'inf' else duration

    pipeline = pcoll.pipeline
    manager = ie.current_env().get_recording_manager(
        pipeline, create_if_absent=True)
    recording = manager.record(
        [pcoll], max_n=max_n, max_duration=max_duration)

    try:
        recorded = list(recording.stream(pcoll).read())
    except KeyboardInterrupt:
        # The user interrupted the read: stop recording and return nothing.
        recording.cancel()
        return pd.DataFrame()

    # Collected DataFrames may hold more than n rows, so the result is sliced
    # once more. An unbounded n becomes None because frame[:None] keeps
    # every row.
    row_limit = None if max_n == float('inf') else max_n
    frame = elements_to_df(
        recorded,
        include_window_info=include_window_info,
        element_type=element_type)
    return frame[:row_limit]
Exemplo n.º 12
0
def head(pcoll, n=5, include_window_info=False):
    """Materializes the first n elements from a PCollection into a Dataframe.

    Elements come from the cache when the PCollection is already marked as
    computed; otherwise a pipeline fragment for it is run to completion first.

    Args:
      pcoll: the apache_beam.pvalue.PCollection to read from.
      n: (optional) number of elements to keep. Reading stops once n elements
        are collected; when n is not positive every element is read.
        Default 5.
      include_window_info: (optional) forwarded to elements_to_df.
        Default False.

    Returns:
      The DataFrame produced by elements_to_df for the collected elements.

    For example::

      p = beam.Pipeline(InteractiveRunner())
      init = p | 'Init' >> beam.Create(range(10))
      square = init | 'Square' >> beam.Map(lambda x: x * x)

      # Run the pipeline and bring the PCollection into memory as a Dataframe.
      in_memory_square = head(square, n=5)
    """
    assert isinstance(pcoll, beam.pvalue.PCollection), (
        '{} is not an apache_beam.pvalue.PCollection.'.format(pcoll))

    user_pipeline = pcoll.pipeline
    runner = user_pipeline.runner
    # Work with the runner wrapped by an InteractiveRunner, if there is one.
    if isinstance(runner, ir.InteractiveRunner):
        runner = runner._underlying_runner

    # Sources without a user reference still need to be cached.
    pi.watch_sources(user_pipeline)

    # Gather every PCollection that is already watched. An unwatched pcoll is
    # registered under a made-up variable name; the watch logic accepts
    # arbitrary variables, so no validation is required.
    already_watched = {
        val
        for watching in ie.current_env().watching()
        for _, val in watching
        if hasattr(val, '__class__')
        and isinstance(val, beam.pvalue.PCollection)
    }
    if pcoll not in already_watched:
        watch({'anonymous_pcollection_{}'.format(id(pcoll)): pcoll})

    warnings.filterwarnings('ignore', category=DeprecationWarning)
    # With the user-defined pipeline at hand, try to start the background
    # caching job.
    bcj.attempt_to_run_background_caching_job(
        runner, user_pipeline, user_pipeline.options)

    if pcoll not in ie.current_env().computed_pcollections:
        # Not computed yet: build and run a fragment for this PCollection.
        result = pf.PipelineFragment([pcoll], user_pipeline.options).run()
        ie.current_env().set_pipeline_result(user_pipeline, result)

        # wait_until_finish keeps this API blocking even if run() is not.
        result.wait_until_finish()

        # A successful run marks the PCollection as computed so that later
        # invocations do not need to re-compute it.
        if result.state is beam.runners.runner.PipelineState.DONE:
            ie.current_env().mark_pcollection_computed([pcoll])

        elements = result.read(pcoll, include_window_info=True)
    else:
        # Already computed: read the elements back from the cache.
        cache_key = pi.PipelineInstrument(pcoll.pipeline).cache_key(pcoll)
        cache_manager = ie.current_env().cache_manager()

        pcoder = cache_manager.load_pcoder('full', cache_key)
        cache_reader, _ = cache_manager.read('full', cache_key)
        elements = to_element_list(
            cache_reader, pcoder, include_window_info=True)

    # Consume lazily, stopping as soon as n elements have been collected.
    collected = []
    for element in elements:
        collected.append(element)
        if n > 0 and len(collected) >= n:
            break

    return elements_to_df(collected, include_window_info=include_window_info)