Example #1
File: utils_test.py Project: mszb/beam
    def test_test_stream_payload_events(self):
        """Tests that the to_element_list can limit the count in a single bundle."""

        coder = coders.FastPrimitivesCoder()

        def reader():
            element_payload = [
                TestStreamPayload.TimestampedElement(
                    encoded_element=coder.encode(
                        WindowedValueHolder(WindowedValue(e, 0, []))),
                    timestamp=Timestamp.of(0).micros) for e in range(10)
            ]

            event = TestStreamPayload.Event(
                element_event=TestStreamPayload.Event.AddElements(
                    elements=element_payload))
            yield event

        # The reader creates 10 elements in a single TestStreamPayload, but we
        # limit the number of elements read to 5 here. This tests that
        # to_element_list can limit the number of elements in a single bundle.
        elements = utils.to_element_list(reader(),
                                         coder,
                                         include_window_info=False,
                                         n=5)
        self.assertSequenceEqual(list(elements), list(range(5)))
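The behavior under test can be summarized with a small standalone sketch. The generator below is a hypothetical simplification of to_element_list's count limiting, not Beam's actual implementation; the point it shows is that a single event can carry many elements, so the limit must be enforced mid-bundle.

# Hypothetical simplification of the count limiting exercised above; Beam's
# real to_element_list also decodes elements and handles window metadata.
def limited_element_list(bundles, n):
    count = 0
    for bundle in bundles:
        # A single bundle may carry many elements; stop mid-bundle once n
        # elements have been yielded.
        for element in bundle:
            if count >= n:
                return
            yield element
            count += 1

# Mirrors the assertion in the test: one bundle of 10 elements, limited to 5.
assert list(limited_element_list([range(10)], 5)) == [0, 1, 2, 3, 4]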
Example #2
File: utils_test.py Project: mszb/beam
    def test_element_limit_count(self):
        """Tests that the to_element_list can limit the count."""

        elements = utils.to_element_list(iter(range(10)),
                                         None,
                                         include_window_info=False,
                                         n=5)
        self.assertSequenceEqual(list(elements), list(range(5)))
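Taken together, the two tests suggest that to_element_list accepts either a stream of TestStreamPayload events or a plain iterator of already-decoded values, in which case no coder is needed and None is passed. Below is a hypothetical sketch of that dual handling; the decode chain through WindowedValueHolder is an assumption based on how Example #1 encodes its elements, not the real implementation.

# Hypothetical sketch of the dual input handling the two tests imply; the
# real to_element_list lives in apache_beam.runners.interactive.utils.
from apache_beam.portability.api.beam_runner_api_pb2 import TestStreamPayload

def sketch_to_element_list(reader, coder, n=None):
    count = 0
    for item in reader:
        if isinstance(item, TestStreamPayload.Event):
            # Unpack and decode every element carried by the event (assumed
            # decode chain, mirroring the encoding in Example #1).
            values = (coder.decode(e.encoded_element).windowed_value.value
                      for e in item.element_event.elements)
        else:
            # Already-decoded values pass straight through; coder may be None.
            values = (item, )
        for value in values:
            if n is not None and count >= n:
                return
            yield value
            count += 1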
Example #3
  def _to_dataframe(self):
    results = []
    cache_manager = ie.current_env().cache_manager()
    if cache_manager.exists('full', self._cache_key):
      coder = cache_manager.load_pcoder('full', self._cache_key)
      reader, _ = cache_manager.read('full', self._cache_key)
      results = list(to_element_list(reader, coder, include_window_info=True))

    return elements_to_df(results, self._include_window_info)
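_to_dataframe shows the recurring cache-read idiom in these examples: check existence, load the PCollection's coder, open a reader, and decode with to_element_list. The same idiom as a standalone hedged sketch; read_cached_elements is a made-up name, and 'full' is the cache label used throughout these excerpts.

# Hypothetical helper distilled from the idiom above; not a Beam API.
def read_cached_elements(cache_manager, cache_key):
    if not cache_manager.exists('full', cache_key):
        return []
    coder = cache_manager.load_pcoder('full', cache_key)
    reader, _ = cache_manager.read('full', cache_key)
    return list(to_element_list(reader, coder, include_window_info=True))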
Example #4
  def read(self, tail=True):
    # type: (bool) -> Any

    """Reads the elements currently recorded."""

    # Get the cache manager and wait until the file exists.
    cache_manager = ie.current_env().get_cache_manager(self._pipeline)

    # Retrieve the coder for the particular PCollection which will be used to
    # decode elements read from cache.
    coder = cache_manager.load_pcoder('full', self._cache_key)

    # Read the elements from the cache.
    # Import limiters here to prevent a circular import.
    from apache_beam.runners.interactive.options.capture_limiters import CountLimiter
    from apache_beam.runners.interactive.options.capture_limiters import ProcessingTimeLimiter
    reader, _ = cache_manager.read('full', self._cache_key, tail=tail)

    # Because a single TestStreamFileRecord can yield multiple elements, we
    # limit the count again here in the to_element_list call.
    #
    # There are two ways of exiting this loop: either a limiter was triggered,
    # or all elements from the cache were read. In the latter case, the
    # pipeline may still be running, so another invocation of `read` will
    # yield new elements.
    count_limiter = CountLimiter(self._n)
    time_limiter = ProcessingTimeLimiter(self._duration_secs)
    limiters = (count_limiter, time_limiter)
    for e in utils.to_element_list(reader,
                                   coder,
                                   include_window_info=True,
                                   n=self._n,
                                   include_time_events=True):

      # From to_element_list we get either TestStreamPayload.Events (when
      # include_time_events is set) or decoded elements from the reader. Make
      # sure we only count the decoded elements toward the early break.
      if isinstance(e, TestStreamPayload.Event):
        time_limiter.update(e)
      else:
        count_limiter.update(e)
        yield e

      if any(l.is_triggered() for l in limiters):
        break

    # A limiter being triggered means that we have fulfilled the user's
    # request. This implies that reading from the cache again won't yield any
    # new elements. The same holds when the user pipeline has terminated.
    if any(l.is_triggered()
           for l in limiters) or ie.current_env().is_terminated(self._pipeline):
      self._done = True
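The limiters used above need only two methods: update, which observes an element or event, and is_triggered. A hypothetical minimal count limiter with that shape; the real CountLimiter lives in apache_beam.runners.interactive.options.capture_limiters and is not reproduced here.

# Hypothetical minimal limiter with the same surface as the ones used above;
# not the real CountLimiter implementation.
class SketchCountLimiter:
    def __init__(self, max_count):
        self._max_count = max_count
        self._count = 0

    def update(self, element):
        # Called once per decoded element; time events are routed elsewhere.
        self._count += 1

    def is_triggered(self):
        return self._count >= self._max_count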
Example #5
  def read(self, pcoll, include_window_info=False):
    """Reads the PCollection one element at a time from cache.

    If include_window_info is True, returns the elements as WindowedValues.
    Otherwise, returns each element as-is.
    """
    key = self._pipeline_instrument.cache_key(pcoll)
    cache_manager = ie.current_env().cache_manager()
    if cache_manager.exists('full', key):
      coder = cache_manager.load_pcoder('full', key)
      reader, _ = cache_manager.read('full', key)
      return to_element_list(reader, coder, include_window_info)
    else:
      raise ValueError('PCollection not available, please run the pipeline.')
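The include_window_info flag described in the docstring decides whether callers see WindowedValues or bare values. A hedged illustration of that contract, constructing a WindowedValue by hand rather than reading one from the cache:

# Hedged illustration of the include_window_info contract.
from apache_beam.transforms.window import GlobalWindow
from apache_beam.utils.windowed_value import WindowedValue

wv = WindowedValue('hello', 0, [GlobalWindow()])
# include_window_info=True yields objects shaped like `wv`, carrying the
# timestamp and windows; include_window_info=False yields just `wv.value`.
assert wv.value == 'hello'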
Example #6
    def read(self, tail=True):
        # type: (bool) -> Any
        """Reads the elements currently recorded."""

        # Get the cache manager and wait until the file exists.
        cache_manager = ie.current_env().get_cache_manager(self._pipeline)

        # Retrieve the coder for the particular PCollection which will be used to
        # decode elements read from cache.
        coder = cache_manager.load_pcoder('full', self._cache_key)

        # Read the elements from the cache.
        limiters = [
            CountLimiter(self._n),
            ProcessingTimeLimiter(self._duration_secs)
        ]
        reader, _ = cache_manager.read('full', self._cache_key, tail=tail)

        # Because a single TestStreamFileRecord can yield multiple elements, we
        # limit the count again here in the to_element_list call.
        #
        # There are two ways of exiting this loop: either a limiter was
        # triggered, or all elements from the cache were read. In the latter
        # case, the pipeline may still be running, so another invocation of
        # `read` will yield new elements.
        for e in utils.to_element_list(reader,
                                       coder,
                                       include_window_info=True,
                                       n=self._n):
            for l in limiters:
                l.update(e)

            yield e

            if any(l.is_triggered() for l in limiters):
                break

        # A limiter being triggered means that we have fulfilled the user's
        # request. This implies that reading from the cache again won't yield
        # any new elements. The same holds when the user pipeline has
        # terminated.
        if any(l.is_triggered()
               for l in limiters) or ie.current_env().is_terminated(
                   self._pipeline):
            self._done = True
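Unlike Example #4, this variant feeds every yielded element to all limiters instead of routing time events separately. The loop itself is a generic "take until any limiter fires" pattern; distilled into a standalone hedged sketch:

# Generic take-until-triggered loop distilled from the example above; the
# limiters need only update() and is_triggered().
def take_until_triggered(elements, limiters):
    for e in elements:
        for l in limiters:
            l.update(e)
        yield e
        if any(l.is_triggered() for l in limiters):
            break

# Usage, reusing SketchCountLimiter from the sketch after Example #4: the
# triggering element is still yielded before the loop exits.
taken = list(take_until_triggered(range(10), [SketchCountLimiter(3)]))
assert taken == [0, 1, 2]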
Example #7
def head(pcoll, n=5, include_window_info=False):
    """Materializes the first n elements from a PCollection into a Dataframe.

  This reads each element from file and reads only the amount that it needs
  into memory.
  For example::

    p = beam.Pipeline(InteractiveRunner())
    init = p | 'Init' >> beam.Create(range(10))
    square = init | 'Square' >> beam.Map(lambda x: x * x)

    # Run the pipeline and bring the PCollection into memory as a Dataframe.
    in_memory_square = head(square, n=5)
  """
    assert isinstance(pcoll, beam.pvalue.PCollection), (
        '{} is not an apache_beam.pvalue.PCollection.'.format(pcoll))

    user_pipeline = pcoll.pipeline
    runner = user_pipeline.runner
    if isinstance(runner, ir.InteractiveRunner):
        runner = runner._underlying_runner

    # Make sure that sources without a user reference are still cached.
    pi.watch_sources(user_pipeline)

    # Make sure that all PCollections to be shown are watched. If a PCollection
    # has not been watched, make up a variable name for that PCollection and watch
    # it. No validation is needed here because the watch logic can handle
    # arbitrary variables.
    watched_pcollections = set()
    for watching in ie.current_env().watching():
        for _, val in watching:
            if hasattr(val, '__class__') and isinstance(
                    val, beam.pvalue.PCollection):
                watched_pcollections.add(val)
    if pcoll not in watched_pcollections:
        watch({'anonymous_pcollection_{}'.format(id(pcoll)): pcoll})

    warnings.filterwarnings('ignore', category=DeprecationWarning)
    # Attempt to run background caching job since we have the reference to the
    # user-defined pipeline.
    bcj.attempt_to_run_background_caching_job(runner, user_pipeline,
                                              user_pipeline.options)

    if pcoll in ie.current_env().computed_pcollections:
        # Read from pcoll cache, then convert to DF
        pipeline_instrument = pi.PipelineInstrument(pcoll.pipeline)
        key = pipeline_instrument.cache_key(pcoll)
        cache_manager = ie.current_env().cache_manager()

        coder = cache_manager.load_pcoder('full', key)
        reader, _ = cache_manager.read('full', key)
        elements = to_element_list(reader, coder, include_window_info=True)
    else:
        # Build a pipeline fragment for the PCollections and run it.
        result = pf.PipelineFragment([pcoll], user_pipeline.options).run()
        ie.current_env().set_pipeline_result(user_pipeline, result)

        # Invoke wait_until_finish to make this call blocking, without relying
        # on the run itself to block.
        result.wait_until_finish()

        # If the pipeline execution is successful at this stage, mark the
        # computation completeness for the given PCollections so that when
        # further `show` invocations occur, Interactive Beam won't need to
        # re-compute them.
        if result.state is beam.runners.runner.PipelineState.DONE:
            ie.current_env().mark_pcollection_computed([pcoll])

        elements = result.read(pcoll, include_window_info=True)

    results = []
    for e in elements:
        results.append(e)
        if len(results) >= n and n > 0:
            break

    return elements_to_df(results, include_window_info=include_window_info)
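The docstring's example, written out as a hedged end-to-end usage. It assumes an interactive (e.g. notebook) environment with apache_beam installed and head importable from wherever this snippet is defined.

# Hedged end-to-end usage following the docstring above.
import apache_beam as beam
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner

p = beam.Pipeline(InteractiveRunner())
init = p | 'Init' >> beam.Create(range(10))
square = init | 'Square' >> beam.Map(lambda x: x * x)

# Runs a pipeline fragment (or reads the cache if already computed) and
# returns a pandas DataFrame with the first 5 squares.
df = head(square, n=5)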