def finish_bundle(self):
    """Close the writer, if there is one, and emit what closing returned.

    Yields nothing when ``self.writer`` is None. Otherwise yields a single
    WindowedValue holding the result of ``self.writer.close()``, stamped with
    the global window's maximum timestamp and placed in a GlobalWindow.
    """
    if self.writer is None:
        return
    close_result = self.writer.close()
    end_of_window = window.GlobalWindow().max_timestamp()
    yield WindowedValue(close_result, end_of_window, [window.GlobalWindow()])
def windowed_value(cls, value, timestamp=MIN_TIMESTAMP):
    """Wrap ``value`` in a WindowedValue whose only window is a GlobalWindow.

    Args:
        value: The payload to wrap.
        timestamp: Timestamp passed through to the WindowedValue; defaults to
            MIN_TIMESTAMP.

    Returns:
        The newly built WindowedValue.
    """
    global_only = (GlobalWindow(), )
    return WindowedValue(value, timestamp, global_only)
def invoke_user_timer(self, timer_spec, key, window, timestamp):
    """Run the user timer callback for ``timer_spec`` and forward its outputs.

    The callback registered in ``self.signature.timer_methods`` is invoked with
    the user state context, key, window and timestamp. Its result is handed to
    the output processor together with a value-less WindowedValue that carries
    the timer's timestamp and window.
    """
    # self.output_processor is Optional, but in practice it won't be None here
    emit = self.output_processor.process_outputs  # type: ignore[union-attr]
    timer_element = WindowedValue(None, timestamp, (window, ))
    timer_method = self.signature.timer_methods[timer_spec]
    callback_outputs = timer_method.invoke_timer_callback(
        self.user_state_context, key, window, timestamp)
    emit(timer_element, callback_outputs)
def finish_bundle(self):
    """Emit one compacted accumulator per cached (key, window) pair.

    Each entry of ``self._cache`` is yielded as a WindowedValue of
    ``(key, compacted_accumulator)``, timestamped at the window's ``end`` and
    assigned to that single window.
    """
    for cache_key, accumulator in self._cache.items():
        key, win = cache_key
        # We compact the accumulator since a GBK (which necessitates encoding)
        # will follow.
        compacted = self._combine_fn.compact(accumulator)
        yield WindowedValue((key, compacted), win.end, (win, ))
def test_wordcount(self):
    """Run a word count on the interactive runner and check every read path.

    The pipeline result is checked three ways: as plain elements from
    ``result.get``, as a DataFrame from ``ib.collect`` with window info, and
    as reified WindowedValues from ``result.get`` with window info.
    """
    class WordExtractingDoFn(beam.DoFn):
        # Splits each input line on whitespace and returns the list of words.
        def process(self, element):
            text_line = element.strip()
            words = text_line.split()
            return words

    p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(
        direct_runner.DirectRunner()))

    # Count the occurrences of each word.
    counts = (p | beam.Create(['to be or not to be that is the question']) | 'split' >> beam.ParDo(WordExtractingDoFn()) | 'pair_with_one' >> beam.Map(lambda x: (x, 1)) | 'group' >> beam.GroupByKey() | 'count' >> beam.Map(lambda wordones: (wordones[0], sum(wordones[1]))))

    # Watch the local scope for Interactive Beam so that counts will be cached.
    # NOTE: locals() is captured here, so local variable names in this test
    # are visible to Interactive Beam; rename them with care.
    ib.watch(locals())
    result = p.run()
    result.wait_until_finish()
    actual = list(result.get(counts))
    # Compared as sets, so the order of the pipeline output is not checked.
    self.assertSetEqual(
        set(actual),
        set([
            ('or', 1),
            ('that', 1),
            ('be', 2),
            ('is', 1),
            ('question', 1),
            ('to', 2),
            ('the', 1),
            ('not', 1),
        ]))

    # Truncate the precision to millis because the window coder uses millis
    # as units then gets upcast to micros.
    end_of_window = (GlobalWindow().max_timestamp().micros // 1000) * 1000
    df_counts = ib.collect(counts, include_window_info=True)
    # The expected frame is built from `actual`, so rows follow the same order
    # as the elements returned by result.get above.
    df_expected = pd.DataFrame(
        {
            0: [e[0] for e in actual],
            1: [e[1] for e in actual],
            'event_time': [end_of_window for _ in actual],
            'windows': [[GlobalWindow()] for _ in actual],
            'pane_info': [
                PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0)
                for _ in actual
            ]
        },
        columns=[0, 1, 'event_time', 'windows', 'pane_info'])

    pd.testing.assert_frame_equal(df_expected, df_counts)

    # Same elements again, this time reified as WindowedValues.
    actual_reified = result.get(counts, include_window_info=True)
    expected_reified = [
        WindowedValue(e, Timestamp(micros=end_of_window), [GlobalWindow()],
                      PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0))
        for e in actual
    ]
    self.assertEqual(actual_reified, expected_reified)
def process(self, element):
    """Re-emit ``element`` with the expected timestamp and window attached.

    ``expected_timestamp`` and ``expected_window`` are taken from the
    enclosing scope.
    """
    target_windows = [expected_window]
    yield WindowedValue(element, expected_timestamp, target_windows)
def as_windowed_value(element):
    """Wrap ``element`` in a WindowedValueHolder.

    The inner WindowedValue is built with timestamp 0 and an empty window
    list.
    """
    unwindowed = WindowedValue(element, 0, [])
    return WindowedValueHolder(unwindowed)
def process(self, element, timestamp=beam.DoFn.TimestampParam, window=beam.DoFn.WindowParam, *args, **kwargs):
    """Process one keyed input or timer firing for a splittable DoFn.

    Outputs of the SDF process invocation are yielded as they are produced.
    When the invocation reports a residual restriction, the element and the
    residual are saved in keyed state, the output watermark is held and a
    timer is set so this method is called again. Otherwise the saved state is
    cleared and the watermark hold is released.

    Args:
        element: Either a KeyedWorkItem (timer firing) or a ``(key, values)``
            pair whose ``values`` expands to exactly one item.
        timestamp: Timestamp used when the windowed element is rebuilt here.
        window: Window used for all state and timer access.
    """
    if isinstance(element, KeyedWorkItem):
        # Must be a timer firing.
        key = element.encoded_key
    else:
        key, values = element
        values = list(values)
        # NOTE(review): this bare assert duplicates the one with a message
        # just below.
        assert len(values) == 1

        # Value here will either be a WindowedValue or an ElementAndRestriction
        # object.
        # TODO: handle key collisions here.
        assert len(values) == 1, 'Internal error. Processing of splittable ' \
                                 'DoFn cannot continue since elements did not ' \
                                 'have unique keys.'
        value = values[0]
        # NOTE(review): unreachable while asserts are enabled; it only takes
        # effect when asserts are stripped (python -O), and then only after
        # values[0] has already been read.
        if len(values) != 1:
            raise ValueError('')

    state = self._step_context.get_keyed_state(key)
    element_state = state.get_state(window, self._element_tag)
    # Initially element_state is an empty list.
    is_seed_call = not element_state

    if not is_seed_call:
        # Resuming: take the element and restriction saved by an earlier call.
        element = state.get_state(window, self._element_tag)
        restriction = state.get_state(window, self._restriction_tag)
        windowed_element = WindowedValue(element, timestamp, [window])
    else:
        # After values iterator is expanded above we should have gotten a list
        # with a single ElementAndRestriction object.
        # NOTE(review): `value` is only bound in the non-timer branch above, so
        # this path presumably never runs for a timer firing -- confirm.
        assert isinstance(value, ElementAndRestriction)
        element_and_restriction = value
        element = element_and_restriction.element
        restriction = element_and_restriction.restriction

        # NOTE(review): `value` was just asserted to be an
        # ElementAndRestriction; whether it can also be a WindowedValue
        # depends on those types -- confirm.
        if isinstance(value, WindowedValue):
            windowed_element = WindowedValue(element, value.timestamp, value.windows)
        else:
            windowed_element = WindowedValue(element, timestamp, [window])

    tracker = self.sdf_invoker.invoke_create_tracker(restriction)
    assert self._process_element_invoker
    assert isinstance(self._process_element_invoker, SDFProcessElementInvoker)

    output_values = self._process_element_invoker.invoke_process_element(
        self.sdf_invoker, windowed_element, tracker)

    sdf_result = None
    for output in output_values:
        if isinstance(output, SDFProcessElementInvoker.Result):
            # SDFProcessElementInvoker.Result should be the last item yielded.
            sdf_result = output
            break
        yield output

    assert sdf_result, (
        'SDFProcessElementInvoker must return a '
        'SDFProcessElementInvoker.Result object as the last '
        'value of a SDF invoke_process_element() invocation.')

    if not sdf_result.residual_restriction:
        # All work for current residual and restriction pair is complete.
        state.clear_state(window, self._element_tag)
        state.clear_state(window, self._restriction_tag)
        # Releasing output watermark by setting it to positive infinity.
        state.add_state(window, self.watermark_hold_tag, WatermarkManager.WATERMARK_POS_INF)
    else:
        # Save what is needed to resume on the next invocation.
        state.add_state(window, self._element_tag, element)
        state.add_state(window, self._restriction_tag, sdf_result.residual_restriction)
        # Holding output watermark by setting it to negative infinity.
        state.add_state(window, self.watermark_hold_tag, WatermarkManager.WATERMARK_NEG_INF)

        # Setting a timer to be reinvoked to continue processing the element.
        # Currently Python SDK only supports setting timers based on watermark. So
        # forcing a reinvocation by setting a timer for watermark negative
        # infinity.
        # TODO(chamikara): update this by setting a timer for the proper
        # processing time when Python SDK supports that.
        state.set_timer(window, '', TimeDomain.WATERMARK, WatermarkManager.WATERMARK_NEG_INF)
def finish_bundle(self, context=None):
    """Emit the cached elements, if any, as one globally windowed value.

    ``context`` is accepted but not used. When ``self._cached`` is non-empty a
    single WindowedValue holding it is yielded with timestamp -1 in a
    GlobalWindow.
    """
    from apache_beam.transforms import window
    from apache_beam.utils.windowed_value import WindowedValue
    if len(self._cached) == 0:
        return
    global_windows = [window.GlobalWindow()]
    yield WindowedValue(self._cached, -1, global_windows)
def finish_bundle(self):
    """Emit every cached (key, window) entry as a WindowedValue.

    Each item of ``self._cache`` is yielded as ``(key, accumulator)``,
    timestamped at the window's ``end`` and assigned to that single window.
    """
    for cache_key, accumulator in self._cache.items():
        key, win = cache_key
        yield WindowedValue((key, accumulator), win.end, (win, ))
def invoke_process(self, windowed_value, restriction_tracker=None, output_processor=None, additional_args=None, additional_kwargs=None):
    """Invoke the DoFn's process method for ``windowed_value``.

    Args:
        windowed_value: The element to process.
        restriction_tracker: Optional tracker. When the DoFn is splittable and
            none is given, one is created from the initial restriction.
        output_processor: Receiver for outputs; falls back to
            ``self.output_processor`` when falsy.
        additional_args: Extra positional arguments; defaults to a new list.
        additional_kwargs: Extra keyword arguments; defaults to a new dict.

    Returns:
        The result of ``_invoke_process_per_window`` on the restriction-tracker
        path. The other two paths discard that result and return None.

    Raises:
        NotImplementedError: A tracker is in use, the value is in more than
            one window, and the DoFn has windowed inputs.
        ValueError: A tracker is in use but the process method declares no
            restriction tracker parameter.
    """
    if not additional_args:
        additional_args = []
    if not additional_kwargs:
        additional_kwargs = {}

    if not output_processor:
        output_processor = self.output_processor
    self.context.set_element(windowed_value)
    # Call for the process function for each window if has windowed side inputs
    # or if the process accesses the window parameter. We can just call it once
    # otherwise as none of the arguments are changing

    if self.is_splittable and not restriction_tracker:
        restriction = self.invoke_initial_restriction(windowed_value.value)
        restriction_tracker = self.invoke_create_tracker(restriction)

    if restriction_tracker:
        if len(windowed_value.windows) > 1 and self.has_windowed_inputs:
            # Should never get here due to window explosion in
            # the upstream pair-with-restriction.
            raise NotImplementedError(
                'SDFs in multiply-windowed values with windowed arguments.' )
        restriction_tracker_param = (
            self.signature.process_method.restriction_provider_arg_name)
        if not restriction_tracker_param:
            raise ValueError(
                'A RestrictionTracker %r was provided but DoFn does not have a '
                'RestrictionTrackerParam defined' % restriction_tracker)
        # Imported locally rather than at module level.
        from apache_beam.io import iobase
        self.threadsafe_restriction_tracker = iobase.ThreadsafeRestrictionTracker(
            restriction_tracker)
        # The DoFn is handed a RestrictionTrackerView over the thread-safe
        # wrapper, not the tracker itself.
        additional_kwargs[restriction_tracker_param] = (
            iobase.RestrictionTrackerView(
                self.threadsafe_restriction_tracker))

        if self.watermark_estimator:
            # The watermark estimator needs to be reset for every element.
            self.watermark_estimator.reset()
            additional_kwargs[self.watermark_estimator_param] = (
                self.watermark_estimator)
        try:
            self.current_windowed_value = windowed_value
            return self._invoke_process_per_window(windowed_value, additional_args, additional_kwargs, output_processor)
        finally:
            self.threadsafe_restriction_tracker = None
            # NOTE(review): this re-assigns the value already set in the try
            # body; if a reset was intended here, confirm.
            self.current_windowed_value = windowed_value

    elif self.has_windowed_inputs and len(windowed_value.windows) != 1:
        # One invocation per window, each with a single-window copy of the
        # value.
        for w in windowed_value.windows:
            self._invoke_process_per_window(
                WindowedValue(windowed_value.value, windowed_value.timestamp, (w, )),
                additional_args, additional_kwargs, output_processor)
    else:
        self._invoke_process_per_window(windowed_value, additional_args, additional_kwargs, output_processor)
def test_invoker_normal(init_beam, fn):
    """Build an invoker for ``fn`` and push one element through it.

    Prints a banner naming ``fn`` and the invoker, then calls
    ``invoke_process`` with a WindowedValue of False, timestamp 0 and a single
    ``None`` window.
    """
    invoker = init_beam(fn)
    banner = "Normal testing {} with {} invoker.".format(fn, invoker)
    print(banner)
    element = WindowedValue(False, 0, [None])
    invoker.invoke_process(element)
def finish_bundle(self):
    """Serialize the accumulated XML tree and emit it in the global window.

    ``self._root`` is pretty-printed with ``etree.tostring`` and then reset to
    None. The serialized document is yielded as a single WindowedValue stamped
    with the global window's maximum timestamp.
    """
    serialized = etree.tostring(self._root, pretty_print=True)
    self._root = None
    end_of_window = GlobalWindow().max_timestamp()
    yield WindowedValue(serialized, end_of_window, [GlobalWindow()])
def invoke_user_timer(self, timer_spec, key, window, timestamp):
    """Run the user timer callback for ``timer_spec`` and forward its outputs.

    The callback registered in ``self.signature.timer_methods`` is invoked with
    the user state context, key, window and timestamp. Its result is handed to
    the output processor together with a value-less WindowedValue that carries
    the timer's timestamp and window.
    """
    emit = self.output_processor.process_outputs
    timer_element = WindowedValue(None, timestamp, (window, ))
    timer_method = self.signature.timer_methods[timer_spec]
    callback_outputs = timer_method.invoke_timer_callback(
        self.user_state_context, key, window, timestamp)
    emit(timer_element, callback_outputs)
def finish_bundle(self, context=None):
    """Emit the cached elements, if any, as one globally windowed value.

    ``context`` is accepted but not used. When ``self._cached`` is non-empty a
    single WindowedValue holding it is yielded with timestamp -1 in a
    GlobalWindow.
    """
    if len(self._cached) == 0:  # pylint: disable=g-explicit-length-test
        return
    global_windows = [window.GlobalWindow()]
    yield WindowedValue(self._cached, -1, global_windows)
def finish_bundle(self):
    """Close the writer, if there is one, and emit what closing returned.

    Yields nothing when ``self.writer`` is None. Otherwise yields a single
    WindowedValue holding the result of ``self.writer.close()``, stamped with
    ``window.MAX_TIMESTAMP`` and placed in a GlobalWindow.
    """
    if self.writer is None:
        return
    close_result = self.writer.close()
    yield WindowedValue(
        close_result, window.MAX_TIMESTAMP, [window.GlobalWindow()])
def windowed_value(e):
    """Wrap ``e`` in a WindowedValue with timestamp 1 in a GlobalWindow."""
    from apache_beam.transforms.window import GlobalWindow
    global_windows = [GlobalWindow()]
    return WindowedValue(e, 1, global_windows)
def finish_bundle(self, context=None):
    """Flush the pending batch, if any, as one globally windowed value.

    ``context`` is accepted but not used. When ``self._batch`` is truthy the
    result of ``self._flush_batch()`` is yielded as a single WindowedValue
    with timestamp -1 in a GlobalWindow.
    """
    if not self._batch:
        return
    flushed = self._flush_batch()
    yield WindowedValue(flushed, -1, [window.GlobalWindow()])
def finish_bundle(self, element=None):
    """Emit the cached elements, if any, as one globally windowed value.

    ``element`` is accepted but not used. When ``self._cached`` is non-empty a
    single WindowedValue holding it is yielded with timestamp -1 in a
    GlobalWindow.
    """
    from apache_beam.transforms import window
    from apache_beam.utils.windowed_value import WindowedValue
    if len(self._cached) == 0:  # pylint: disable=g-explicit-length-test
        return
    global_windows = [window.GlobalWindow()]
    yield WindowedValue(self._cached, -1, global_windows)
def output_key(self, wkey, value):
    """Output ``(key, value)`` in the windows encoded in ``wkey``.

    Args:
        wkey: A ``(windows, key)`` pair. ``windows`` is either the integer 0,
            which selects the shared globally windowed value, or a sequence
            of windows.
        value: The value to pair with the key.

    When ``windows`` is a sequence, the output is timestamped at the ``end``
    of its first window and assigned to all of the windows.
    """
    windows, key = wkey
    # Compare by value: `is 0` depends on CPython's small-int caching and
    # triggers a SyntaxWarning on Python 3.8+.
    if windows == 0:
        self.output(_globally_windowed_value.with_value((key, value)))
    else:
        self.output(WindowedValue((key, value), windows[0].end, windows))
def finish_bundle(self, *args, **kwargs):
    """Emit the pending batch, if any, as one globally windowed value.

    Extra positional and keyword arguments are accepted but not used. When
    ``self._batch`` is truthy it is yielded as a single WindowedValue with
    timestamp -1 in a GlobalWindow.
    """
    if not self._batch:
        return
    global_windows = [window.GlobalWindow()]
    yield WindowedValue(self._batch, -1, global_windows)
def windowed_value(cls, value, timestamp=MIN_TIMESTAMP, pane_info=windowed_value.PANE_INFO_UNKNOWN):
    """Wrap ``value`` in a WindowedValue whose only window is a GlobalWindow.

    Args:
        value: The payload to wrap.
        timestamp: Timestamp passed through to the WindowedValue; defaults to
            MIN_TIMESTAMP.
        pane_info: Pane info passed through to the WindowedValue; defaults to
            ``windowed_value.PANE_INFO_UNKNOWN``.

    Returns:
        The newly built WindowedValue.
    """
    global_only = (GlobalWindow(), )
    return WindowedValue(value, timestamp, global_only, pane_info)
def invoke_process(self,
                   windowed_value,  # type: WindowedValue
                   restriction_tracker=None,  # type: Optional[RestrictionTracker]
                   watermark_estimator=None,  # type: Optional[WatermarkEstimator]
                   additional_args=None,
                   additional_kwargs=None
                  ):
    # type: (...) -> Optional[SplitResultResidual]
    """Invoke the DoFn's process method for ``windowed_value``.

    Args:
        windowed_value: The element to process.
        restriction_tracker: Optional tracker. When the DoFn is splittable and
            none is given, one is created from the initial restriction.
        watermark_estimator: Optional estimator, wrapped in a
            ThreadsafeWatermarkEstimator on the restriction-tracker path.
        additional_args: Extra positional arguments; defaults to a new list.
        additional_kwargs: Extra keyword arguments; defaults to a new dict.

    Returns:
        The result of ``_invoke_process_per_window`` on the restriction-tracker
        path; None on the other two paths.

    Raises:
        NotImplementedError: A tracker is in use, the value is in more than
            one window, and the DoFn has windowed inputs.
        ValueError: A tracker is in use but the process method declares no
            restriction tracker parameter.
    """
    if not additional_args:
        additional_args = []
    if not additional_kwargs:
        additional_kwargs = {}

    self.context.set_element(windowed_value)
    # Call for the process function for each window if has windowed side inputs
    # or if the process accesses the window parameter. We can just call it once
    # otherwise as none of the arguments are changing

    if self.is_splittable and not restriction_tracker:
        restriction = self.invoke_initial_restriction(windowed_value.value)
        restriction_tracker = self.invoke_create_tracker(restriction)

    if restriction_tracker is not None:
        if len(windowed_value.windows) > 1 and self.has_windowed_inputs:
            # Should never get here due to window explosion in
            # the upstream pair-with-restriction.
            raise NotImplementedError(
                'SDFs in multiply-windowed values with windowed arguments.')
        restriction_tracker_param = (
            self.signature.process_method.restriction_provider_arg_name)
        if not restriction_tracker_param:
            raise ValueError(
                'A RestrictionTracker %r was provided but DoFn does not have a '
                'RestrictionTrackerParam defined' % restriction_tracker)
        self.threadsafe_restriction_tracker = ThreadsafeRestrictionTracker(
            restriction_tracker)
        # The DoFn is handed a RestrictionTrackerView over the thread-safe
        # wrapper, not the tracker itself.
        additional_kwargs[restriction_tracker_param] = (
            RestrictionTrackerView(self.threadsafe_restriction_tracker))

        # NOTE(review): watermark_estimator defaults to None and is wrapped
        # without a check -- confirm callers always supply one on this path.
        self.threadsafe_watermark_estimator = (
            ThreadsafeWatermarkEstimator(watermark_estimator))
        watermark_param = (
            self.signature.process_method.watermark_estimator_provider_arg_name)
        # When the watermark_estimator is a NoOpWatermarkEstimator, the system
        # will not add watermark_param into the DoFn param list.
        if watermark_param is not None:
            additional_kwargs[watermark_param] = self.threadsafe_watermark_estimator
        try:
            self.current_windowed_value = windowed_value
            return self._invoke_process_per_window(
                windowed_value, additional_args, additional_kwargs)
        finally:
            self.threadsafe_restriction_tracker = None
            self.threadsafe_watermark_estimator = None
            # NOTE(review): this re-assigns the value already set in the try
            # body; if a reset was intended here, confirm.
            self.current_windowed_value = windowed_value

    elif self.has_windowed_inputs and len(windowed_value.windows) != 1:
        # One invocation per window, each with a single-window copy of the
        # value.
        for w in windowed_value.windows:
            self._invoke_process_per_window(
                WindowedValue(
                    windowed_value.value, windowed_value.timestamp, (w, )),
                additional_args,
                additional_kwargs)
    else:
        self._invoke_process_per_window(
            windowed_value, additional_args, additional_kwargs)
    return None