Exemplo n.º 1
0
class _NativeWriteEvaluator(_TransformEvaluator):
    """TransformEvaluator for _NativeWrite transform.

    Incoming elements are buffered in the step's keyed state (under the
    ``None`` key) and are written to the sink in a single pass once the input
    watermark reaches ``WatermarkManager.WATERMARK_POS_INF``.
    """

    # State tag under which the buffered input elements are stored.
    ELEMENTS_TAG = _ListStateTag('elements')

    def __init__(self, evaluation_context, applied_ptransform,
                 input_committed_bundle, side_inputs):
        """Set up the evaluator and remember the transform's sink.

        Side inputs are not supported, and the applied transform must carry a
        truthy ``sink``; both conditions are checked with assertions.
        """
        assert not side_inputs
        super().__init__(evaluation_context, applied_ptransform,
                         input_committed_bundle, side_inputs)

        sink = applied_ptransform.transform.sink
        assert sink
        self._sink = sink

    @property
    def _is_final_bundle(self):
        """True when the input watermark has reached WATERMARK_POS_INF."""
        watermarks = self._execution_context.watermarks
        return watermarks.input_watermark == WatermarkManager.WATERMARK_POS_INF

    @property
    def _has_already_produced_output(self):
        """True when the output watermark has reached WATERMARK_POS_INF."""
        watermarks = self._execution_context.watermarks
        return watermarks.output_watermark == WatermarkManager.WATERMARK_POS_INF

    def start_bundle(self):
        """Fetch the keyed state stored under the ``None`` key."""
        self.global_state = self._step_context.get_keyed_state(None)

    def process_timer(self, timer_firing):
        """Ignore timer firings.

        We do not need to emit a KeyedWorkItem to process_element().
        """

    def process_element(self, element):
        """Append ``element`` to the buffered elements state."""
        tag = _NativeWriteEvaluator.ELEMENTS_TAG
        self.global_state.add_state(None, tag, element)

    def finish_bundle(self):
        """Write the buffered elements on the final bundle, else keep holding.

        Bundles are accumulated in memory until every bundle carrying data has
        been processed, so that only a single output shard is produced (some
        tests depend on this behavior). Empty bundles may still arrive after
        the output has been produced; they are ignored and do not generate
        additional output files.

        Returns:
          A TransformResult with no output bundles whose watermark hold is
          WATERMARK_POS_INF on the final bundle and WATERMARK_NEG_INF otherwise.
        """
        # TODO(altay): Do not wait until the last bundle to write in a single
        # shard.
        pos_inf = WatermarkManager.WATERMARK_POS_INF

        if not self._is_final_bundle:
            # More data may still arrive: schedule a watermark timer for the
            # end of time and hold the output watermark back.
            self.global_state.set_timer(None, '', TimeDomain.WATERMARK, pos_inf)
            return TransformResult(
                self, [], [], None,
                {None: WatermarkManager.WATERMARK_NEG_INF})

        buffered = self.global_state.get_state(
            None, _NativeWriteEvaluator.ELEMENTS_TAG)
        if self._has_already_produced_output:
            # Ignore empty bundles that arrive after the output is produced.
            assert buffered == []
        else:
            options = self._evaluation_context.pipeline_options
            self._sink.pipeline_options = options
            with self._sink.writer() as sink_writer:
                for windowed_value in buffered:
                    sink_writer.Write(windowed_value.value)

        return TransformResult(self, [], [], None, {None: pos_inf})
  def __init__(self, step_context, dofn, key_coder):
    """Record the context of a stateful DoFn and map its state specs to tags.

    Args:
      step_context: stored as-is on ``self.step_context``.
      dofn: the DoFn whose state and timer specs are collected through
        ``userstate.get_dofn_specs``.
      key_coder: stored as-is on ``self.key_coder``.

    Raises:
      ValueError: if a state spec is neither a BagStateSpec nor a
        CombiningValueStateSpec.
    """
    self.step_context = step_context
    self.dofn = dofn
    self.key_coder = key_coder

    state_specs, timer_specs = userstate.get_dofn_specs(dofn)
    self.all_state_specs = state_specs
    self.all_timer_specs = timer_specs

    # Bag and combining-value specs are both stored under a list state tag.
    list_backed_spec_types = (
        userstate.BagStateSpec, userstate.CombiningValueStateSpec)

    self.state_tags = {}
    for spec in state_specs:
      # The key is built before validation, as in the original ordering.
      tag_name = 'user/%s' % spec.name
      if not isinstance(spec, list_backed_spec_types):
        raise ValueError('Invalid state spec: %s' % spec)
      self.state_tags[spec] = _ListStateTag(tag_name)

    # Both caches start out empty.
    self.cached_states = {}
    self.cached_timers = {}
Exemplo n.º 3
0
    def __init__(self, step_context, dofn, key_coder):
        """Record the context of a stateful DoFn and map its state specs to tags.

        Args:
          step_context: stored as-is on ``self.step_context``.
          dofn: the DoFn whose state and timer specs are collected through
            ``userstate.get_dofn_specs``.
          key_coder: stored as-is on ``self.key_coder``.

        Raises:
          ValueError: if a state spec is neither a BagStateSpec nor a
            CombiningValueStateSpec.
        """
        self.step_context = step_context
        self.dofn = dofn
        self.key_coder = key_coder

        state_specs, timer_specs = userstate.get_dofn_specs(dofn)
        self.all_state_specs = state_specs
        self.all_timer_specs = timer_specs

        # Bag and combining-value specs are both stored under a list state tag.
        list_backed_spec_types = (
            userstate.BagStateSpec, userstate.CombiningValueStateSpec)

        self.state_tags = {}
        for spec in state_specs:
            # The key is built before validation, as in the original ordering.
            tag_name = 'user/%s' % spec.name
            if not isinstance(spec, list_backed_spec_types):
                raise ValueError('Invalid state spec: %s' % spec)
            self.state_tags[spec] = _ListStateTag(tag_name)

        # Both caches start out empty.
        self.cached_states = {}
        self.cached_timers = {}
Exemplo n.º 4
0
class _GroupByKeyOnlyEvaluator(_TransformEvaluator):
  """TransformEvaluator for _GroupByKeyOnly transform.

  Values are buffered in per-key state as elements arrive. The grouped output
  is only built once the input watermark reaches
  ``WatermarkManager.WATERMARK_POS_INF``.
  """

  # Passed to _split_list_into_bundles when emitting the grouped result.
  # NOTE(review): presumably None means "no per-bundle cap" -- confirm against
  # _split_list_into_bundles.
  MAX_ELEMENT_PER_BUNDLE = None
  # Per-key state tag holding the values seen for that key.
  ELEMENTS_TAG = _ListStateTag('elements')
  # Global state tag recording that this evaluator has reached its final bundle.
  COMPLETION_TAG = _CombiningValueStateTag('completed', any)

  def __init__(self, evaluation_context, applied_ptransform,
               input_committed_bundle, side_inputs, scoped_metrics_container):
    """Set up the evaluator; side inputs are not supported (asserted)."""
    assert not side_inputs
    super(_GroupByKeyOnlyEvaluator, self).__init__(
        evaluation_context, applied_ptransform, input_committed_bundle,
        side_inputs, scoped_metrics_container)

  def _is_final_bundle(self):
    """Return True when the input watermark has reached WATERMARK_POS_INF."""
    return (self._execution_context.watermarks.input_watermark
            == WatermarkManager.WATERMARK_POS_INF)

  def start_bundle(self):
    """Fetch the step context, global state, output PCollection and key coder."""
    self.step_context = self._execution_context.get_step_context()
    self.global_state = self.step_context.get_keyed_state(None)

    assert len(self._outputs) == 1
    self.output_pcollection = list(self._outputs)[0]

    # The input type of a GroupByKey will be KV[Any, Any] or more specific.
    kv_type_hint = (
        self._applied_ptransform.transform.get_type_hints().input_types[0])
    self.key_coder = coders.registry.get_coder(kv_type_hint[0].tuple_types[0])

  def process_timer(self, timer_firing):
    """Ignore timer firings."""
    # We do not need to emit a KeyedWorkItem to process_element().
    pass

  def process_element(self, element):
    """Buffer the value of a windowed (key, value) element under its key.

    Args:
      element: expected to be a WindowedValue whose ``value`` is an iterable
        of length two, unpacked as ``(key, value)``.

    Raises:
      TypeCheckError: if ``element`` does not have that shape.
    """
    # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc``. Imported locally so this block does not depend on
    # the module-level imports, which are not visible here.
    from collections.abc import Iterable

    # No element may arrive once the output has been marked as completed.
    assert not self.global_state.get_state(
        None, _GroupByKeyOnlyEvaluator.COMPLETION_TAG)
    if (isinstance(element, WindowedValue)
        and isinstance(element.value, Iterable)
        and len(element.value) == 2):
      k, v = element.value
      # State is keyed by the encoded key rather than the key object itself.
      encoded_k = self.key_coder.encode(k)
      state = self.step_context.get_keyed_state(encoded_k)
      state.add_state(None, _GroupByKeyOnlyEvaluator.ELEMENTS_TAG, v)
    else:
      # Wrap the element in a 1-tuple: a bare tuple on the right-hand side of
      # ``%`` would be unpacked as several format arguments and raise
      # TypeError instead of the intended TypeCheckError.
      raise TypeCheckError('Input to _GroupByKeyOnly must be a PCollection of '
                           'windowed key-value pairs. Instead received: %r.'
                           % (element,))

  def finish_bundle(self):
    """Emit the grouped result on the final bundle, otherwise keep holding.

    Returns:
      A TransformResult. On the final bundle it carries the grouped output
      bundles (none if the output was already emitted) and a watermark hold of
      WATERMARK_POS_INF. Otherwise it carries no bundles and a hold of
      WATERMARK_NEG_INF, and a watermark timer is set for WATERMARK_POS_INF.
    """
    if self._is_final_bundle():
      if self.global_state.get_state(
          None, _GroupByKeyOnlyEvaluator.COMPLETION_TAG):
        # Ignore empty bundles after emitting output. (This may happen because
        # empty bundles do not affect input watermarks.)
        bundles = []
      else:
        gbk_result = []
        # TODO(ccy): perhaps we can clean this up to not use this
        # internal attribute of the DirectStepContext.
        for encoded_k in self.step_context.keyed_existing_state:
          # Ignore global state.
          if encoded_k is None:
            continue
          k = self.key_coder.decode(encoded_k)
          state = self.step_context.get_keyed_state(encoded_k)
          vs = state.get_state(None, _GroupByKeyOnlyEvaluator.ELEMENTS_TAG)
          gbk_result.append(GlobalWindows.windowed_value((k, vs)))

        def len_element_fn(element):
          # Size of a grouped element is the number of values for its key.
          _, v = element.value
          return len(v)

        bundles = self._split_list_into_bundles(
            self.output_pcollection, gbk_result,
            _GroupByKeyOnlyEvaluator.MAX_ELEMENT_PER_BUNDLE, len_element_fn)

      # Mark completion so later (empty) final bundles emit nothing.
      self.global_state.add_state(
          None, _GroupByKeyOnlyEvaluator.COMPLETION_TAG, True)
      hold = WatermarkManager.WATERMARK_POS_INF
    else:
      bundles = []
      hold = WatermarkManager.WATERMARK_NEG_INF
      # Schedule a watermark timer for the end of time.
      self.global_state.set_timer(
          None, '', TimeDomain.WATERMARK, WatermarkManager.WATERMARK_POS_INF)

    return TransformResult(
        self._applied_ptransform, bundles, [], None, {None: hold})