Пример #1
0
 def try_split(self, fraction):
     # type: (...) -> Optional[Tuple[SplitResultPrimary, SplitResultResidual]]
     """Ask the restriction tracker to split the element being processed.

     Args:
       fraction: forwarded unchanged to the tracker's ``try_split``.

     Returns:
       A ``(SplitResultPrimary, SplitResultResidual)`` pair whose payloads
       are ``((element, restriction), size)`` tuples wrapped with
       ``current_windowed_value.with_value``. ``None`` is returned when
       there is no tracker, no current windowed value, or the tracker
       hands back a falsy split.
     """
     if not (self.threadsafe_restriction_tracker
             and self.current_windowed_value):
         return None

     # Temporary workaround for [BEAM-7473]: read the watermark *before*
     # splitting, in case it advances before the split results are
     # available. Worst case the reported watermark is stale, which is ok.
     watermark_before_split = (
         self.watermark_estimator.current_watermark()
         if self.watermark_estimator else None)

     split_result = self.threadsafe_restriction_tracker.try_split(fraction)
     if not split_result:
         return None

     primary_restriction, residual_restriction = split_result
     element = self.current_windowed_value.value
     provider = self.signature.get_restriction_provider()

     # Each side is emitted as ((element, restriction), size).
     sized_primary = (
         (element, primary_restriction),
         provider.restriction_size(element, primary_restriction))
     sized_residual = (
         (element, residual_restriction),
         provider.restriction_size(element, residual_restriction))

     primary_part = SplitResultPrimary(
         primary_value=self.current_windowed_value.with_value(sized_primary))
     residual_part = SplitResultResidual(
         residual_value=self.current_windowed_value.with_value(
             sized_residual),
         current_watermark=watermark_before_split,
         deferred_timestamp=None)
     return (primary_part, residual_part)
Пример #2
0
 def create_split_across_windows(self, primary_windows, residual_windows):
     """Build the split results for a split made between windows.

     Both sides carry the same payload: element ``'a'`` with
     ``OffsetRange(0, 100)`` and ``self.watermark_estimator_state``, sized
     100, wrapped in a ``WindowedValue`` whose second argument is 57.

     Args:
       primary_windows: windows for the primary; a falsy value yields
         ``None`` for the primary.
       residual_windows: windows for the residual; a falsy value yields
         ``None`` for the residual.

     Returns:
       A ``(primary, residual)`` tuple where either entry may be ``None``.
     """
     primary = None
     if primary_windows:
         primary_payload = (
             ('a', (OffsetRange(0, 100), self.watermark_estimator_state)),
             100)
         primary = SplitResultPrimary(
             primary_value=WindowedValue(
                 primary_payload, 57, primary_windows))

     residual = None
     if residual_windows:
         residual_payload = (
             ('a', (OffsetRange(0, 100), self.watermark_estimator_state)),
             100)
         residual = SplitResultResidual(
             residual_value=WindowedValue(
                 residual_payload, 57, residual_windows),
             current_watermark=None,
             deferred_timestamp=None)

     return primary, residual
Пример #3
0
 def create_split_in_window(self, offset_index, windows):
     """Build the (primary, residual) pair for a split at ``offset_index``.

     The primary covers ``OffsetRange(0, offset_index)`` with size
     ``offset_index`` and carries ``self.watermark_estimator_state``. The
     residual covers ``OffsetRange(offset_index, 100)`` with size
     ``100 - offset_index`` and carries the state and watermark read from
     ``self.watermark_estimator`` at call time.

     Args:
       offset_index: offset at which the range 0..100 is divided.
       windows: windows attached to both resulting ``WindowedValue``s.

     Returns:
       A ``(SplitResultPrimary, SplitResultResidual)`` tuple.
     """
     primary_element = (
         'a', (OffsetRange(0, offset_index), self.watermark_estimator_state))
     primary = SplitResultPrimary(
         primary_value=WindowedValue(
             (primary_element, offset_index), 57, windows))

     residual_element = (
         'a',
         (OffsetRange(offset_index, 100),
          self.watermark_estimator.get_estimator_state()))
     residual = SplitResultResidual(
         residual_value=WindowedValue(
             (residual_element, 100 - offset_index), 57, windows),
         current_watermark=self.watermark_estimator.current_watermark(),
         deferred_timestamp=None)

     return (primary, residual)
Пример #4
0
    def _invoke_process_per_window(
        self,
        windowed_value,  # type: WindowedValue
        additional_args,
        additional_kwargs,
    ):
        # type: (...) -> Optional[SplitResultResidual]
        """Run the process method for one windowed value and emit its outputs.

        Side inputs are resolved for the value's window (or once for the
        global window and cached on ``self``), every placeholder slot listed
        in ``self.placeholders`` is filled in ``args_for_process``, and the
        result of ``self.process_method`` is passed to
        ``self.output_processor.process_outputs``.

        Args:
          windowed_value: the value to process; when
            ``self.has_windowed_inputs`` is set it must carry exactly one
            window.
          additional_args: extra values appended after the per-window side
            inputs (only read when ``self.has_windowed_inputs`` is set).
          additional_kwargs: extra keyword arguments merged into the keyword
            arguments of the call; may be falsy.

        Returns:
          A ``SplitResultResidual`` when the invoker is splittable and the
          restriction tracker reports a deferred status; otherwise ``None``.

        Raises:
          ValueError: if a key is required (user state or key param) and the
            input value does not unpack into a two-element pair.
        """
        if self.has_windowed_inputs:
            (window,) = windowed_value.windows
            side_inputs = [view[window] for view in self.side_inputs]
            side_inputs.extend(additional_args)
            args_for_process, kwargs_for_process = (
                util.insert_values_in_args(
                    self.args_for_process, self.kwargs_for_process,
                    side_inputs))
        else:
            if self.cache_globally_windowed_args:
                # First element with only globally windowed inputs: resolve
                # the side inputs once and keep the filled-in arguments on
                # self so later calls skip this step.
                self.cache_globally_windowed_args = False
                global_window = GlobalWindow()
                self.args_for_process, self.kwargs_for_process = (
                    util.insert_values_in_args(
                        self.args_for_process, self.kwargs_for_process,
                        [view[global_window] for view in self.side_inputs]))
            # NOTE: these are the objects stored on self, not copies; the
            # writes below therefore modify them in place.
            args_for_process = self.args_for_process
            kwargs_for_process = self.kwargs_for_process

        # Extract key in the case of a stateful DoFn. Note that in the case
        # of a stateful DoFn, self.has_windowed_inputs is set to True during
        # __init__. Therefore, windows will be exploded coming into this
        # method, and the window variable bound above can be relied upon.
        if self.user_state_context or self.is_key_param_required:
            try:
                key, _ = windowed_value.value
            except (TypeError, ValueError):
                raise ValueError(
                    "Input value to a stateful DoFn or KeyParam must be a KV "
                    "tuple; instead, got '%s'." % (windowed_value.value, ))

        for index, placeholder in self.placeholders:
            if core.DoFn.ElementParam == placeholder:
                bound = windowed_value.value
            elif core.DoFn.KeyParam == placeholder:
                bound = key
            elif core.DoFn.WindowParam == placeholder:
                bound = window
            elif core.DoFn.TimestampParam == placeholder:
                bound = windowed_value.timestamp
            elif core.DoFn.PaneInfoParam == placeholder:
                bound = windowed_value.pane_info
            elif isinstance(placeholder, core.DoFn.StateParam):
                assert self.user_state_context is not None
                bound = self.user_state_context.get_state(
                    placeholder.state_spec, key, window)
            elif isinstance(placeholder, core.DoFn.TimerParam):
                assert self.user_state_context is not None
                bound = self.user_state_context.get_timer(
                    placeholder.timer_spec, key, window)
            elif core.DoFn.BundleFinalizerParam == placeholder:
                bound = self.bundle_finalizer_param
            else:
                # Unrecognised placeholders leave their slot untouched.
                continue
            args_for_process[index] = bound

        if additional_kwargs:
            if kwargs_for_process is None:
                kwargs_for_process = additional_kwargs
            else:
                for name in additional_kwargs:
                    kwargs_for_process[name] = additional_kwargs[name]

        process_outputs = self.output_processor.process_outputs
        if kwargs_for_process:
            outputs = self.process_method(
                *args_for_process, **kwargs_for_process)
        else:
            outputs = self.process_method(*args_for_process)
        process_outputs(windowed_value, outputs)

        if not self.is_splittable:
            return None

        assert self.threadsafe_restriction_tracker is not None
        # TODO: Consider calling check_done right after SDF.Process()
        # finishing. In order to do this, we need to know that current
        # invoking dofn is ProcessSizedElementAndRestriction.
        self.threadsafe_restriction_tracker.check_done()
        deferred_status = (
            self.threadsafe_restriction_tracker.deferred_status())
        current_watermark = (
            self.watermark_estimator.current_watermark()
            if self.watermark_estimator else None)
        if not deferred_status:
            return None

        deferred_restriction, deferred_timestamp = deferred_status
        element = windowed_value.value
        provider = self.signature.get_restriction_provider()
        size = provider.restriction_size(element, deferred_restriction)
        # The residual is emitted as ((element, restriction), size).
        sized_residual = ((element, deferred_restriction), size)
        return SplitResultResidual(
            residual_value=windowed_value.with_value(sized_residual),
            current_watermark=current_watermark,
            deferred_timestamp=deferred_timestamp)