def run_trigger_simple(self, window_fn, trigger_fn, accumulation_mode,
                       timestamped_data, expected_panes, *groupings,
                       **kwargs):
    """Call self.run_trigger once for each bundle size in `groupings`.

    Args:
      window_fn: object whose assign() is used to compute the windows of each
        element; also forwarded to self.run_trigger.
      trigger_fn: forwarded unchanged to self.run_trigger.
      accumulation_mode: forwarded unchanged to self.run_trigger.
      timestamped_data: iterable of (timestamp, element) pairs.
      expected_panes: forwarded unchanged to self.run_trigger.
      *groupings: bundle sizes to try; defaults to a single run with size 1.
      **kwargs: only 'late_data' is accepted, an iterable of
        (timestamp, element) pairs (default: empty list).
    """
    late_data = kwargs.pop('late_data', [])
    assert not kwargs, 'Unexpected keyword arguments: %s' % sorted(kwargs)

    def bundle_data(data, size):
        """Lazily yield WindowedValues for `data` in lists of `size` items.

        The final list may be shorter; an empty trailing list is not yielded.
        """
        bundle = []
        for timestamp, elem in data:
            windows = window_fn.assign(
                WindowFn.AssignContext(timestamp, elem))
            bundle.append(WindowedValue(elem, timestamp, windows))
            if len(bundle) == size:
                yield bundle
                bundle = []
        if bundle:
            yield bundle

    if not groupings:
        groupings = [1]
    # The bundles are produced only by bundle_data(); nothing iterates
    # timestamped_data before it, so the data reaches run_trigger untouched.
    for group_by in groupings:
        self.run_trigger(window_fn, trigger_fn, accumulation_mode,
                         bundle_data(timestamped_data, group_by),
                         bundle_data(late_data, group_by),
                         expected_panes)
def _process_outputs(self, element, results):
    """Dispatch the result of computation to the appropriate receivers.

    A value wrapped in a SideOutputValue object will be unwrapped and then
    dispatched to the appropriate indexed output.

    Args:
      element: the input the results were computed from, or None when there
        is no such element (start and finish).
      results: iterable of outputs, or None for no output at all.

    Raises:
      TypeError: if a SideOutputValue carries a tag that is not a string.
    """
    if results is None:
        return

    for raw in results:
        # Peel off the side-output wrapper, remembering where it should go.
        if isinstance(raw, SideOutputValue):
            tag = raw.tag
            if not isinstance(tag, basestring):
                raise TypeError('In %s, tag %s is not a string' % (self, tag))
            payload = raw.value
        else:
            tag = None
            payload = raw

        if isinstance(payload, WindowedValue):
            # Already windowed: pass through untouched.
            windowed_value = payload
        else:
            has_timestamp = isinstance(payload, TimestampedValue)
            if element is None:
                # Start and finish have no element from which to grab context,
                # but may emit elements.
                if has_timestamp:
                    value, timestamp = payload.value, payload.timestamp
                    assign_context = NoContext(value, timestamp)
                else:
                    value, timestamp = payload, -1
                    assign_context = NoContext(value)
                windows = self.window_fn.assign(assign_context)
                windowed_value = WindowedValue(value, timestamp, windows)
            elif has_timestamp:
                # Explicit timestamp: windows are re-assigned, with the input
                # element's windows supplied to the assign context.
                value, timestamp = payload.value, payload.timestamp
                assign_context = WindowFn.AssignContext(
                    timestamp, value, element.windows)
                windows = self.window_fn.assign(assign_context)
                windowed_value = WindowedValue(value, timestamp, windows)
            else:
                # Plain value: derived from the input element via with_value().
                windowed_value = element.with_value(payload)

        if tag is None:
            receiver = self.main_receivers
        else:
            receiver = self.tagged_receivers[tag]
        receiver.output(windowed_value)
def flush(self, target):
    """Remove entries from self.table and emit each one via self.output.

    At most `self.size - target` entries are removed. Each removed entry,
    keyed by a (key, windows) pair, is emitted as a single WindowedValue whose
    value is (key, [second component of each buffered value]), stamped with
    the timestamp of the first buffered value. self.size is not modified here.

    Args:
      target: subtracted from self.size to obtain the number of entries to
        remove.
    """
    limit = self.size - target
    # Iterate over a snapshot: entries are deleted from the table inside the
    # loop, which is an error when items() is a live view of the dict.
    for ix, (kw, vs) in enumerate(list(self.table.items())):
        if ix >= limit:
            break
        del self.table[kw]
        key, windows = kw
        output_value = [v.value[1] for v in vs]
        windowed_value = WindowedValue(
            (key, output_value), vs[0].timestamp, windows)
        self.output(windowed_value)
def bundle_data(data, size):
    """Lazily group `data` into lists of WindowedValues of length `size`.

    `data` is an iterable of (timestamp, element) pairs. Windows come from
    `window_fn.assign`; `window_fn` is not a parameter and must be provided by
    an outer scope. The last list may hold fewer than `size` items, and an
    empty trailing list is never yielded.
    """
    batch = []
    for timestamp, elem in data:
        assigned = window_fn.assign(WindowFn.AssignContext(timestamp, elem))
        batch.append(WindowedValue(elem, timestamp, assigned))
        if len(batch) != size:
            continue
        # Batch is full: hand it over and start a fresh list.
        yield batch
        batch = []
    if batch:
        yield batch
def _output(self, window, finished, state):
    """Output window and clean up if appropriate.

    Reads the buffered ELEMENTS for `window`, clears them when the window is
    finished or the accumulation mode is DISCARDING, and records a TOMBSTONE
    when finished. The result is stamped with the stored WATERMARK_HOLD
    (which is then cleared) or, if none was set, with the end of the window.

    Returns:
      A WindowedValue holding the buffered values in the single window.
    """
    pane_values = state.get_state(window, self.ELEMENTS)

    # TODO(robertwb): allowed lateness
    if finished or self.accumulation_mode == AccumulationMode.DISCARDING:
        state.clear_state(window, self.ELEMENTS)
    if finished:
        state.add_state(window, self.TOMBSTONE, 1)

    hold = state.get_state(window, self.WATERMARK_HOLD)
    if hold is not None:
        state.clear_state(window, self.WATERMARK_HOLD)
        output_timestamp = hold
    else:
        # If no watermark hold was set, output at end of window.
        output_timestamp = window.end

    return WindowedValue(pane_values, output_timestamp, (window,))
def process_elements(self, state, windowed_values, unused_output_watermark):
    """Yield one WindowedValue wrapping all the unwindowed input values.

    A list input is unwrapped eagerly into a list of values. Any other
    iterable is wrapped in an observable object that unwraps lazily and
    notifies its observers of each value as it is iterated. The result is
    stamped with MIN_TIMESTAMP in self.GLOBAL_WINDOW_TUPLE.

    Args:
      state: not used by this implementation.
      windowed_values: list or other iterable of objects with a `.value`.
      unused_output_watermark: not used by this implementation.
    """
    if isinstance(windowed_values, list):
        unwindowed = [wv.value for wv in windowed_values]
    else:
        class UnwindowedValues(observable.ObservableMixin):

            def __iter__(self):
                for wv in windowed_values:
                    unwindowed_value = wv.value
                    self.notify_observers(unwindowed_value)
                    yield unwindowed_value

            def __repr__(self):
                # Wrap in a 1-tuple so that a tuple input is formatted as a
                # single operand instead of being unpacked by '%'.
                return '<UnwindowedValues of %s>' % (windowed_values,)

        unwindowed = UnwindowedValues()
    yield WindowedValue(unwindowed, MIN_TIMESTAMP, self.GLOBAL_WINDOW_TUPLE)
def Write(self, windowed_kv):
    """Encode one windowed (key, value) pair and queue it under its key.

    WindmillWriter takes windowed values, reifies the windows and writes the
    resulting windowed value to Windmill. Note that in this streaming case,
    the service does not add a ReifyWindows step, so we do that here.

    The encoded message is added to the bundle stored in self.keyed_output
    for the encoded key; the bundle is created on first use.
    """
    key, payload = windowed_kv.value
    event_time = windowed_kv.timestamp
    windmill_time = harness_to_windmill_timestamp(event_time)

    # Reify: the value alone is re-wrapped with its timestamp and windows.
    reified = WindowedValue(payload, event_time, windowed_kv.windows)
    key_bytes = self.key_coder.encode(key)
    value_bytes = self.wv_coder.encode(reified)

    # Add to output for key.
    bundle = self.keyed_output.get(key_bytes)
    if bundle is None:
        bundle = self.windmill_pb2.KeyedMessageBundle(key=key_bytes)
        self.keyed_output[key_bytes] = bundle

    # TODO(ccy): In the future, we will populate metadata with PaneInfo
    # details.
    bundle.messages.add(
        timestamp=windmill_time, data=value_bytes, metadata='')
def output_key(self, wkey, value):
    """Emit (key, value) via self.output, stamped at the first window's end.

    Args:
      wkey: a (windows, key) pair; `windows` must be indexable and non-empty.
      value: the value to pair with the key.
    """
    windows, key = wkey
    first_window_end = windows[0].end
    keyed_value = (key, value)
    self.output(WindowedValue(keyed_value, first_window_end, windows))
def decode_from_stream(self, in_stream, nested):
    """Read a WindowedValue from `in_stream`.

    The value, timestamp and windows are read in that order, each with its
    own component coder and always in nested mode; the `nested` argument
    itself is not consulted.
    """
    value = self._value_coder.decode_from_stream(in_stream, True)
    timestamp = self._timestamp_coder.decode_from_stream(in_stream, True)
    windows = self._windows_coder.decode_from_stream(in_stream, True)
    return WindowedValue(value, timestamp, windows)
def _run_log(self, spec):
    """Run the transcript-style trigger test described by `spec`.

    `spec` is a mapping. The optional keys 'window_fn', 'trigger_fn',
    'accumulation_mode', 'output_time_fn' and 'allowed_lateness' configure
    the driver. The required key 'transcript' is a sequence of mappings whose
    first entry maps an action name to its parameters:

      input: list of values; each is used as both element and timestamp.
      watermark: new watermark value; fires any timers that became ready.
      expect: list of dicts, each of which must match one pending output on
        every key the two have in common; matched outputs are consumed.
      state: accepted but currently ignored.

    Any output left pending when a non-'expect' action starts, or when the
    transcript ends, fails the test.
    """

    def parse_int_list(s):
        """Parses strings like '[1, 2, 3]'."""
        s = s.strip()
        assert s.startswith('[') and s.endswith(']'), s
        inner = s[1:-1]
        if not inner.strip():
            return []
        return [int(x) for x in inner.split(',')]

    def split_args(s):
        """Splits 'a, b, [c, d]' into ['a', 'b', '[c, d]']."""
        if not s.strip():
            # Nothing between the parentheses, e.g. 'Foo()': no arguments.
            return []
        args = []
        start = 0
        depth = 0
        for ix, c in enumerate(s):
            if c in '({[':
                depth += 1
            elif c in ')}]':
                depth -= 1
            elif c == ',' and depth == 0:
                args.append(s[start:ix].strip())
                start = ix + 1
        assert depth == 0, s
        args.append(s[start:].strip())
        return args

    def parse(s, names):
        """Parse (recursive) 'Foo(arg, kw=arg)' for Foo in the names dict."""
        s = s.strip()
        if s in names:
            return names[s]
        elif s.startswith('['):
            return parse_int_list(s)
        elif '(' in s:
            assert s[-1] == ')', s
            open_paren = s.index('(')
            callee = parse(s[:open_paren], names)
            posargs = []
            kwargs = {}
            for arg in split_args(s[open_paren + 1:-1]):
                kw, equals, value = arg.partition('=')
                # Only an '=' that precedes every bracket marks a keyword
                # argument; one inside a nested call such as 'Foo(x=1)'
                # belongs to that nested call.
                if equals and not any(c in kw for c in '({['):
                    kwargs[kw.strip()] = parse(value, names)
                else:
                    posargs.append(parse(arg, names))
            return callee(*posargs, **kwargs)
        else:
            try:
                return int(s)
            except ValueError:
                raise ValueError('Unknown function: %s' % s)

    def parse_fn(s, names):
        """Like parse(), but implicitly calls no-arg constructors."""
        fn = parse(s, names)
        if isinstance(fn, type):
            return fn()
        else:
            return fn

    def pane_record(wvalue, window):
        """Describe one output pane in the form the transcript compares."""
        return {
            'window': [window.start, window.end - 1],
            'values': sorted(wvalue.value),
            'timestamp': wvalue.timestamp,
        }

    # pylint: disable=g-import-not-at-top
    from google.cloud.dataflow.transforms import window as window_module
    from google.cloud.dataflow.transforms import trigger as trigger_module
    # pylint: enable=g-import-not-at-top
    window_fn_names = dict(window_module.__dict__)
    window_fn_names.update({
        'CustomTimestampingFixedWindowsWindowFn':
            CustomTimestampingFixedWindowsWindowFn
    })
    trigger_names = {'Default': DefaultTrigger}
    trigger_names.update(trigger_module.__dict__)

    window_fn = parse_fn(
        spec.get('window_fn', 'GlobalWindows'), window_fn_names)
    trigger_fn = parse_fn(spec.get('trigger_fn', 'Default'), trigger_names)
    accumulation_mode = getattr(
        AccumulationMode,
        spec.get('accumulation_mode', 'ACCUMULATING').upper())
    output_time_fn = getattr(
        OutputTimeFn, spec.get('output_time_fn', 'OUTPUT_AT_EOW').upper())
    # Converted to float (so a malformed value still raises) but not used
    # anywhere below.
    allowed_lateness = float(spec.get('allowed_lateness', '-inf'))  # pylint: disable=unused-variable

    driver = GeneralTriggerDriver(
        Windowing(window_fn, trigger_fn, accumulation_mode, output_time_fn))
    state = InMemoryUnmergedState()
    output = []
    watermark = MIN_TIMESTAMP

    def fire_timers():
        """Fire ready timers until none remain, appending panes to output."""
        to_fire = state.get_and_clear_timers(watermark)
        while to_fire:
            for timer_window, (name, time_domain, t_timestamp) in to_fire:
                for wvalue in driver.process_timer(
                        timer_window, name, time_domain, t_timestamp, state):
                    window, = wvalue.windows
                    output.append(pane_record(wvalue, window))
            # Firing may have made further timers ready; poll again.
            to_fire = state.get_and_clear_timers(watermark)

    for line in spec['transcript']:
        # list() keeps the indexing valid where items() returns a view.
        action, params = list(line.items())[0]

        if action != 'expect':
            # Fail if we have output that was not expected in the transcript.
            self.assertEqual(
                [], output,
                msg='Unexpected output: %s before %s' % (output, line))

        if action == 'input':
            bundle = [
                WindowedValue(
                    t, t, window_fn.assign(WindowFn.AssignContext(t, t)))
                for t in params
            ]
            output = [
                pane_record(wvalue, wvalue.windows[0])
                for wvalue in driver.process_elements(
                    state, bundle, watermark)
            ]
            fire_timers()

        elif action == 'watermark':
            watermark = params
            fire_timers()

        elif action == 'expect':
            for expected_output in params:
                for candidate in output:
                    if all(candidate[k] == expected_output[k]
                           for k in candidate if k in expected_output):
                        output.remove(candidate)
                        break
                else:
                    self.fail('Unmatched output %s in %s' %
                              (expected_output, output))

        elif action == 'state':
            # TODO(robertwb): Implement once we support allowed lateness.
            pass

        else:
            self.fail('Unknown action: ' + action)

    # Fail if we have output that was not expected in the transcript.
    self.assertEqual([], output, msg='Unexpected output: %s' % output)
def process(self, context):
    """Re-window the context's element and yield it as a WindowedValue.

    An assign context is built from the incoming context's timestamp,
    element and windows, and handed to self.windowing.windowfn.assign. The
    yielded element and timestamp are read back from that assign context.
    """
    assign_context = WindowFn.AssignContext(
        context.timestamp,
        element=context.element,
        existing_windows=context.windows)
    assigned_windows = self.windowing.windowfn.assign(assign_context)
    yield WindowedValue(
        assign_context.element, assign_context.timestamp, assigned_windows)
def timestamped_key_values(self, pipeline, key, *timestamps):
    """Apply Create over `timestamps`, then map each to a WindowedValue.

    Each timestamp x becomes WindowedValue((key, x), x, []): it serves as
    both the paired value and the timestamp, with an empty window list.

    Returns:
      The result of applying the Create and Map steps to `pipeline`.
    """
    created = pipeline | Create('start', timestamps)
    return created | Map(lambda x: WindowedValue((key, x), x, []))