class _NativeWriteEvaluator(_TransformEvaluator):
  """TransformEvaluator for _NativeWrite transform.

  Incoming elements are accumulated in the state keyed by ``None`` and are
  handed to the sink's writer in a single pass once the input watermark equals
  ``WatermarkManager.WATERMARK_POS_INF``.
  """

  ELEMENTS_TAG = _ListStateTag('elements')

  def __init__(self, evaluation_context, applied_ptransform,
               input_committed_bundle, side_inputs):
    """Stores the sink of the applied transform; side inputs must be empty."""
    assert not side_inputs
    super().__init__(
        evaluation_context, applied_ptransform, input_committed_bundle,
        side_inputs)

    sink = applied_ptransform.transform.sink
    assert sink
    self._sink = sink

  @property
  def _is_final_bundle(self):
    """True when the input watermark equals WATERMARK_POS_INF."""
    input_watermark = self._execution_context.watermarks.input_watermark
    return input_watermark == WatermarkManager.WATERMARK_POS_INF

  @property
  def _has_already_produced_output(self):
    """True when the output watermark equals WATERMARK_POS_INF."""
    output_watermark = self._execution_context.watermarks.output_watermark
    return output_watermark == WatermarkManager.WATERMARK_POS_INF

  def start_bundle(self):
    """Looks up the state keyed by ``None`` used to buffer elements."""
    self.global_state = self._step_context.get_keyed_state(None)

  def process_timer(self, timer_firing):
    """Does nothing with the timer firing.

    We do not need to emit a KeyedWorkItem to process_element().
    """

  def process_element(self, element):
    """Appends ``element`` to the buffered list state."""
    self.global_state.add_state(
        None, _NativeWriteEvaluator.ELEMENTS_TAG, element)

  def finish_bundle(self):
    """Writes the buffered elements on the final bundle.

    Returns:
      A TransformResult with no output bundles whose keyed watermark hold
      (under key ``None``) is WATERMARK_POS_INF on a final bundle and
      WATERMARK_NEG_INF otherwise.
    """
    # finish_bundle will append incoming bundles in memory until all the
    # bundles carrying data are processed. This is done to produce only a
    # single output shard (some tests depend on this behavior). It is possible
    # to have incoming empty bundles after the output is produced; these
    # bundles will be ignored and would not generate additional output files.
    # TODO(altay): Do not wait until the last bundle to write in a single shard.
    if not self._is_final_bundle:
      # More input may still arrive: keep holding the watermark back and ask
      # for a watermark timer at WATERMARK_POS_INF.
      self.global_state.set_timer(
          None, '', TimeDomain.WATERMARK, WatermarkManager.WATERMARK_POS_INF)
      return TransformResult(
          self, [], [], None, {None: WatermarkManager.WATERMARK_NEG_INF})

    buffered = self.global_state.get_state(
        None, _NativeWriteEvaluator.ELEMENTS_TAG)
    if self._has_already_produced_output:
      # Ignore empty bundles that arrive after the output is produced.
      assert buffered == []
    else:
      self._sink.pipeline_options = self._evaluation_context.pipeline_options
      with self._sink.writer() as writer:
        for windowed_value in buffered:
          writer.Write(windowed_value.value)

    return TransformResult(
        self, [], [], None, {None: WatermarkManager.WATERMARK_POS_INF})
def __init__(self, step_context, dofn, key_coder):
  """Collects the DoFn's state and timer specs and maps state specs to tags.

  Args:
    step_context: stored as ``self.step_context``.
    dofn: the DoFn whose specs are read via ``userstate.get_dofn_specs``.
    key_coder: stored as ``self.key_coder``.

  Raises:
    ValueError: if a state spec is neither a BagStateSpec nor a
      CombiningValueStateSpec.
  """
  self.step_context = step_context
  self.dofn = dofn
  self.key_coder = key_coder
  self.cached_states = {}
  self.cached_timers = {}

  self.all_state_specs, self.all_timer_specs = (
      userstate.get_dofn_specs(dofn))

  self.state_tags = {}
  for spec in self.all_state_specs:
    tag_name = 'user/%s' % spec.name
    # Bag and combining-value specs are both given a list state tag.
    if not isinstance(
        spec, (userstate.BagStateSpec, userstate.CombiningValueStateSpec)):
      raise ValueError('Invalid state spec: %s' % spec)
    self.state_tags[spec] = _ListStateTag(tag_name)
def __init__(self, step_context, dofn, key_coder):
  """Collects the DoFn's state and timer specs and maps state specs to tags.

  Args:
    step_context: stored as ``self.step_context``.
    dofn: the DoFn whose specs are read via ``userstate.get_dofn_specs``.
    key_coder: stored as ``self.key_coder``.

  Raises:
    ValueError: if a state spec is neither a BagStateSpec nor a
      CombiningValueStateSpec.
  """
  self.step_context = step_context
  self.dofn = dofn
  self.key_coder = key_coder
  self.cached_states = {}
  self.cached_timers = {}

  self.all_state_specs, self.all_timer_specs = (
      userstate.get_dofn_specs(dofn))

  self.state_tags = {}
  for spec in self.all_state_specs:
    tag_name = 'user/%s' % spec.name
    # Bag and combining-value specs are both given a list state tag.
    if not isinstance(
        spec, (userstate.BagStateSpec, userstate.CombiningValueStateSpec)):
      raise ValueError('Invalid state spec: %s' % spec)
    self.state_tags[spec] = _ListStateTag(tag_name)
class _GroupByKeyOnlyEvaluator(_TransformEvaluator):
  """TransformEvaluator for _GroupByKeyOnly transform.

  Values are buffered per encoded key in the step context's keyed state. The
  grouped output is produced by the first finish_bundle() call that observes
  the input watermark at WatermarkManager.WATERMARK_POS_INF; a completion flag
  in the ``None``-keyed state prevents producing it again.
  """

  # Passed through to _split_list_into_bundles() as the per-bundle limit.
  MAX_ELEMENT_PER_BUNDLE = None
  # Per-key list state holding the buffered values.
  ELEMENTS_TAG = _ListStateTag('elements')
  # Flag in the ``None``-keyed state, set once the output has been produced.
  COMPLETION_TAG = _CombiningValueStateTag('completed', any)

  def __init__(self, evaluation_context, applied_ptransform,
               input_committed_bundle, side_inputs, scoped_metrics_container):
    """Forwards all arguments to the base class; side inputs must be empty."""
    assert not side_inputs
    super(_GroupByKeyOnlyEvaluator, self).__init__(
        evaluation_context, applied_ptransform, input_committed_bundle,
        side_inputs, scoped_metrics_container)

  def _is_final_bundle(self):
    """Returns True when the input watermark equals WATERMARK_POS_INF."""
    return (self._execution_context.watermarks.input_watermark ==
            WatermarkManager.WATERMARK_POS_INF)

  def start_bundle(self):
    """Fetches step state, the single output and the key coder."""
    self.step_context = self._execution_context.get_step_context()
    self.global_state = self.step_context.get_keyed_state(None)

    assert len(self._outputs) == 1
    self.output_pcollection = list(self._outputs)[0]

    # The input type of a GroupByKey will be KV[Any, Any] or more specific.
    kv_type_hint = (
        self._applied_ptransform.transform.get_type_hints().input_types[0])
    self.key_coder = coders.registry.get_coder(kv_type_hint[0].tuple_types[0])

  def process_timer(self, timer_firing):
    """Does nothing with the timer firing."""
    # We do not need to emit a KeyedWorkItem to process_element().
    pass

  def process_element(self, element):
    """Buffers the value of a windowed key-value pair under its encoded key.

    Args:
      element: expected to be a WindowedValue whose ``value`` is an iterable
        of length 2, unpacked as ``(key, value)``.

    Raises:
      TypeCheckError: if ``element`` does not have that shape.
    """
    # The `collections.Iterable` alias was removed in Python 3.10; the ABC
    # lives in `collections.abc`. Imported here so the block does not depend
    # on the module-level import list.
    from collections.abc import Iterable

    # No element may arrive once the grouped output has been produced.
    assert not self.global_state.get_state(
        None, _GroupByKeyOnlyEvaluator.COMPLETION_TAG)
    if (isinstance(element, WindowedValue) and
        isinstance(element.value, Iterable) and len(element.value) == 2):
      k, v = element.value
      encoded_k = self.key_coder.encode(k)
      state = self.step_context.get_keyed_state(encoded_k)
      state.add_state(None, _GroupByKeyOnlyEvaluator.ELEMENTS_TAG, v)
    else:
      raise TypeCheckError('Input to _GroupByKeyOnly must be a PCollection of '
                           'windowed key-value pairs. Instead received: %r.' %
                           element)

  def finish_bundle(self):
    """Produces the grouped output on the first final bundle.

    Returns:
      A TransformResult for the applied transform. On the first final bundle
      it carries the grouped (key, values) bundles; otherwise it carries no
      bundles. The keyed watermark hold (under key ``None``) is
      WATERMARK_POS_INF on a final bundle and WATERMARK_NEG_INF otherwise.
    """
    if self._is_final_bundle():
      if self.global_state.get_state(
          None, _GroupByKeyOnlyEvaluator.COMPLETION_TAG):
        # Ignore empty bundles after emitting output. (This may happen because
        # empty bundles do not affect input watermarks.)
        bundles = []
      else:
        gbk_result = []
        # TODO(ccy): perhaps we can clean this up to not use this
        # internal attribute of the DirectStepContext.
        for encoded_k in self.step_context.keyed_existing_state:
          # Ignore global state.
          if encoded_k is None:
            continue
          k = self.key_coder.decode(encoded_k)
          state = self.step_context.get_keyed_state(encoded_k)
          vs = state.get_state(None, _GroupByKeyOnlyEvaluator.ELEMENTS_TAG)
          gbk_result.append(GlobalWindows.windowed_value((k, vs)))

        def len_element_fn(element):
          # Size of a grouped element is the number of values under its key.
          _, v = element.value
          return len(v)

        bundles = self._split_list_into_bundles(
            self.output_pcollection, gbk_result,
            _GroupByKeyOnlyEvaluator.MAX_ELEMENT_PER_BUNDLE, len_element_fn)

      # Record completion so later final bundles take the "ignore" branch.
      self.global_state.add_state(
          None, _GroupByKeyOnlyEvaluator.COMPLETION_TAG, True)
      hold = WatermarkManager.WATERMARK_POS_INF
    else:
      bundles = []
      hold = WatermarkManager.WATERMARK_NEG_INF
      self.global_state.set_timer(
          None, '', TimeDomain.WATERMARK, WatermarkManager.WATERMARK_POS_INF)

    return TransformResult(
        self._applied_ptransform, bundles, [], None, {None: hold})