def test_global_window(self):
    """Verify GlobalWindow equality semantics and its maximum timestamp."""
    whole_range = IntervalWindow(MIN_TIMESTAMP, MAX_TIMESTAMP)

    # Separately constructed global windows must compare equal.
    self.assertEqual(GlobalWindow(), GlobalWindow())

    # An IntervalWindow from MIN_TIMESTAMP to MAX_TIMESTAMP is still a
    # different window, whichever side of the comparison it appears on.
    self.assertNotEqual(GlobalWindow(), whole_range)
    self.assertNotEqual(whole_range, GlobalWindow())

    latest = GlobalWindow().max_timestamp()
    self.assertTrue(latest < MAX_TIMESTAMP)
def test_windowedvalue_coder_paneinfo(self):
    """Round-trip WindowedValues that carry a variety of PaneInfo values."""
    timing = windowed_value.PaneInfoTiming
    coder = coders.WindowedValueCoder(
        coders.VarIntCoder(), coders.GlobalWindowCoder())

    # Positional PaneInfo arguments for every explicit case exercised below.
    pane_specs = [
        (True, True, timing.EARLY, 0, -1),
        (True, False, timing.ON_TIME, 0, 0),
        (True, False, timing.ON_TIME, 10, 0),
        (False, True, timing.ON_TIME, 0, 23),
        (False, True, timing.ON_TIME, 12, 23),
        (False, False, timing.LATE, 0, 123),
    ]
    panes = [windowed_value.PANE_INFO_UNKNOWN]
    panes.extend(windowed_value.PaneInfo(*spec) for spec in pane_specs)

    samples = []
    for pane in panes:
        samples.append(
            windowed_value.WindowedValue(123, 234, (GlobalWindow(), ), pane))

    # Test unnested.
    self.check_coder(
        coder,
        windowed_value.WindowedValue(
            123, 234, (GlobalWindow(), ), windowed_value.PANE_INFO_UNKNOWN))
    for sample in samples:
        self.check_coder(coder, sample)

    # Test nested: every ordered pair of samples inside a tuple coder.
    for first in samples:
        for second in samples:
            self.check_coder(
                coders.TupleCoder((coder, coder)), (first, second))
def test_parse_windowedvalue_with_dicts(self):
    """Check that dict elements wrapped in WindowedValues convert cleanly."""
    from apache_beam.transforms.window import GlobalWindow

    first = WindowedValue({'b': 2, 'd': 4}, 1, [GlobalWindow()])
    second = WindowedValue({'a': 1, 'b': 2, 'c': 3}, 1, [GlobalWindow()])
    els = [first, second]

    actual_df = utils.elements_to_df(els, include_window_info=True)

    # A key missing from an element is expected as NaN in that element's row.
    expected_rows = [
        [np.nan, 2, np.nan, 4, int(1e6), first.windows, first.pane_info],
        [1, 2, 3, np.nan, int(1e6), second.windows, second.pane_info],
    ]
    expected_df = pd.DataFrame(
        expected_rows,
        columns=['a', 'b', 'c', 'd', 'event_time', 'windows', 'pane_info'])

    pd.testing.assert_frame_equal(actual_df, expected_df)
def test_shard_naming(self):
    """Check the file names produced by the default file naming function."""
    namer = fileio.default_file_naming(prefix='/path/to/file', suffix='.txt')

    # Each case pairs the positional arguments handed to the namer with the
    # file name it is expected to return.
    cases = [
        (
            (GlobalWindow(), None, None, None, None, None),
            '/path/to/file.txt',
        ),
        (
            (GlobalWindow(), None, 1, 5, None, None),
            '/path/to/file-00001-of-00005.txt',
        ),
        (
            (GlobalWindow(), None, 1, 5, 'gz', None),
            '/path/to/file-00001-of-00005.txt.gz',
        ),
        (
            (IntervalWindow(0, 100), None, 1, 5, None, None),
            '/path/to/file'
            '-1970-01-01T00:00:00-1970-01-01T00:01:40-00001-of-00005.txt',
        ),
    ]
    for namer_args, expected_name in cases:
        self.assertEqual(namer(*namer_args), expected_name)
def test_parse_windowedvalue(self):
    """WindowedValues are accepted; window columns are left out on request."""
    from apache_beam.transforms.window import GlobalWindow

    payloads = [('a', 2), ('b', 3)]
    els = [WindowedValue(payload, 1, [GlobalWindow()]) for payload in payloads]

    actual_df = utils.elements_to_df(els, include_window_info=False)

    # Only the element values are expected, under positional column labels.
    expected_df = pd.DataFrame(
        [list(payload) for payload in payloads], columns=[0, 1])
    pd.testing.assert_frame_equal(actual_df, expected_df)
def test_timestamp_in_value(self):
    """Reify.TimestampInValue should wrap each KV's value with its timestamp."""
    # (key, value, timestamp) triples driving both the input and expectation.
    records = [('a', 1, 100), ('b', 2, 200), ('c', 3, 300)]

    timestamped = [
        TimestampedValue((key, value), ts) for key, value, ts in records
    ]
    expected = [
        TestWindowedValue(
            (key, TimestampedValue(value, ts)), ts, [GlobalWindow()])
        for key, value, ts in records
    ]

    with TestPipeline() as p:
        pc = p | beam.Create(timestamped) | beam.Map(lambda x: x)
        reified_pc = pc | util.Reify.TimestampInValue()
        assert_that(reified_pc, equal_to(expected), reify_windows=True)
def test_basic_wordcount(self):
    """Smoke test: record a tiny PCollection and read it back."""
    # Create the pipeline that will emit 0, 1, 2.
    # NOTE: `p` and `elems` keep their names on purpose -- `locals()` is
    # passed to `ib.watch` below, so the local names are part of its input.
    p = beam.Pipeline(InteractiveRunner())
    elems = p | beam.Create([0, 1, 2])

    # Watch the pipeline and PCollections. This is normally done in a notebook
    # environment automatically, but we have to do it manually here.
    ib.watch(locals())
    ie.current_env().track_user_pipelines()

    # Create the recording objects. By calling `record` a new PipelineFragment
    # is started to compute the given PCollections and cache to disk.
    manager = RecordingManager(p)
    recording = manager.record([elems], max_n=3, max_duration_secs=500)
    stream = recording.stream(elems)
    recording.wait_until_finish()

    # Once the pipeline fragment completes, we can read from the stream and
    # know that all elements were written to cache.
    recorded = list(stream.read())
    expected = [
        WindowedValue(value, MIN_TIMESTAMP, [GlobalWindow()])
        for value in range(3)
    ]
    self.assertListEqual(recorded, expected)
def test_reified_value_assert_fail_unmatched_timestamp(self):
    """assert_that must fail when expected timestamps differ from actual."""
    # The expectation uses timestamp 1 for every element.
    expected = [TestWindowedValue(v, 1, [GlobalWindow()]) for v in (1, 2, 3)]

    with self.assertRaises(Exception), TestPipeline() as p:
        created = p | Create([2, 3, 1])
        assert_that(created, equal_to(expected), reify_windows=True)
def test_parse_windowedvalue_with_window_info(self):
    """WindowedValues are accepted and window info gets its own columns."""
    from apache_beam.transforms.window import GlobalWindow

    payloads = [('a', 2), ('b', 3)]
    els = [WindowedValue(payload, 1, [GlobalWindow()]) for payload in payloads]

    actual_df = utils.elements_to_df(els, include_window_info=True)

    # Each row: the element's fields, then event time, windows and pane info.
    expected_rows = [
        [*payload, int(1e6), el.windows, el.pane_info]
        for payload, el in zip(payloads, els)
    ]
    expected_df = pd.DataFrame(
        expected_rows, columns=[0, 1, 'event_time', 'windows', 'pane_info'])
    pd.testing.assert_frame_equal(actual_df, expected_df)
class BatchGlobalTriggerDriver(TriggerDriver):
    """Trigger driver that emits all received values as a single group."""

    # Windows tuple attached to every emitted value.
    GLOBAL_WINDOW_TUPLE = (GlobalWindow(), )

    # Pane info attached to every emitted value: a first-and-last, on-time
    # pane.
    ONLY_FIRING = windowed_value.PaneInfo(
        is_first=True,
        is_last=True,
        timing=windowed_value.PaneInfoTiming.ON_TIME,
        index=0,
        nonspeculative_index=0)

    def process_elements(
            self,
            state,
            windowed_values,
            unused_output_watermark,
            unused_input_watermark=MIN_TIMESTAMP):
        """Yield one WindowedValue wrapping all of `windowed_values`.

        The output is stamped with MIN_TIMESTAMP, placed in the global window
        and marked with `ONLY_FIRING`. `state` and both watermarks are ignored.
        """
        grouped = _UnwindowedValues(windowed_values)
        yield WindowedValue(
            grouped, MIN_TIMESTAMP, self.GLOBAL_WINDOW_TUPLE, self.ONLY_FIRING)

    def process_timer(
            self,
            window_id,
            name,
            time_domain,
            timestamp,
            state,
            input_watermark=None):
        """Always raise TypeError; this driver does not handle timers."""
        raise TypeError(
            'Triggers never set or called for batch default windowing.')
def _inner(window, pane, shard_index, total_shards, compression, destination):
    """Render a file name from the default template for one shard.

    `prefix` and `suffix` are read from the enclosing scope. `pane` and
    `destination` are accepted but currently do not affect the result.
    """
    fields = dict(
        prefix=prefix,
        start='',
        end='',
        pane='',
        shard=0,
        total_shards=0,
        suffix='',
        compression='')

    # Shard numbers are only filled in when both pieces are known.
    if total_shards is not None and shard_index is not None:
        fields['shard'] = int(shard_index)
        fields['total_shards'] = int(total_shards)

    # Only non-global windows contribute their bounds to the name.
    if window != GlobalWindow():
        fields['start'] = window.start.to_utc_datetime().isoformat()
        fields['end'] = window.end.to_utc_datetime().isoformat()

    # TODO(pabloem): Add support for PaneInfo
    # If the PANE is the ONLY firing in the window, we don't add it.
    # if pane and not (pane.is_first and pane.is_last):
    #   kwargs['pane'] = pane.index

    if compression:
        fields['compression'] = '.%s' % compression
    if suffix:
        fields['suffix'] = suffix

    return _DEFAULT_FILE_NAME_TEMPLATE.format(**fields)
def finish_bundle(self):
    """Emit anything still buffered as one value in the global window.

    The buffered object itself is the emitted value, stamped with
    MIN_TIMESTAMP; afterwards the buffer is replaced by a fresh empty list.
    """
    from apache_beam.transforms.window import GlobalWindow, WindowedValue
    from apache_beam.utils import timestamp

    leftover = self._buffer
    if len(leftover) != 0:
        logging.info("Final Buffer Length: {}".format(len(leftover)))
        yield WindowedValue(
            leftover, timestamp.MIN_TIMESTAMP, [GlobalWindow()])
    self._buffer = []
def __init__(self, keyed_state_backend):
    """Keep the keyed state backend and set up empty per-instance fields."""
    from apache_beam.transforms.window import GlobalWindow

    self._keyed_state_backend = keyed_state_backend
    # These start out unset; nothing in this constructor populates them.
    self._current_watermark = None
    self._timer_coder_impl = None
    self._output_stream = None
    # A single GlobalWindow instance kept on the object.
    self._global_window = GlobalWindow()
def test_timestamp(self):
    """Reify.Timestamp should keep each element's assigned timestamp."""
    # (value, timestamp) pairs driving both the input and the expectation.
    stamped = [('a', 100), ('b', 200), ('c', 300)]

    inputs = [TimestampedValue(value, ts) for value, ts in stamped]
    expected = [
        TestWindowedValue(value, ts, [GlobalWindow()]) for value, ts in stamped
    ]

    with TestPipeline() as p:
        # An identity Map follows Create because Create on its own does not
        # assign the timestamps carried by TimestampedValue elements. Passing
        # the collection through a DoFn yields elements that have their
        # timestamps applied, rather than a PCollection of
        # TimestampedValue(element, timestamp) objects.
        pc = p | beam.Create(inputs) | beam.Map(lambda x: x)
        reified_pc = pc | util.Reify.Timestamp()
        assert_that(reified_pc, equal_to(expected), reify_windows=True)
def __init__(self):
    """Initialise the instance with empty/default settings."""
    self.window = GlobalWindow()
    self.batch = []
    self.engagement_range = 10
    # Time range bounds; both start at zero.
    self.from_ts = self.to_ts = 0
    # Connection settings start out as empty strings.
    self.giap_es_username = self.giap_es_password = self.giap_es_index = ''
def test_reified_value_passes(self):
    """assert_that succeeds when reified values match, in any order."""
    expected = []
    for v in (1, 2, 3):
        expected.append(TestWindowedValue(v, MIN_TIMESTAMP, [GlobalWindow()]))

    with TestPipeline() as p:
        # Input order differs from the expectation's order on purpose.
        created = p | Create([2, 3, 1])
        assert_that(created, equal_to(expected), reify_windows=True)
def test_timer_coder(self):
    """Round-trip a cleared timer and a fully populated timer."""
    timer_coder = coders._TimerCoder(
        coders.StrUtf8Coder(), coders.GlobalWindowCoder())

    # Timer with the clear bit set and no timestamps or pane info.
    cleared_timer = userstate.Timer(
        user_key="key",
        dynamic_timer_tag="tag",
        windows=(GlobalWindow(), ),
        clear_bit=True,
        fire_timestamp=None,
        hold_timestamp=None,
        paneinfo=None)

    # Timer that carries fire/hold timestamps and a pane info.
    populated_timer = userstate.Timer(
        user_key="key",
        dynamic_timer_tag="tag",
        windows=(GlobalWindow(), ),
        clear_bit=False,
        fire_timestamp=timestamp.Timestamp.of(123),
        hold_timestamp=timestamp.Timestamp.of(456),
        paneinfo=windowed_value.PANE_INFO_UNKNOWN)

    self.check_coder(timer_coder, cleared_timer, populated_timer)
class DiscardingGlobalTriggerDriver(TriggerDriver):
    """Trigger driver that emits all received values as a single group."""

    # Windows tuple attached to every emitted value.
    GLOBAL_WINDOW_TUPLE = (GlobalWindow(), )

    def process_elements(self, state, windowed_values, unused_output_watermark):
        """Yield one WindowedValue wrapping all of `windowed_values`.

        The output is stamped with MIN_TIMESTAMP and placed in the global
        window. `state` and the watermark are ignored.
        """
        grouped = _UnwindowedValues(windowed_values)
        yield WindowedValue(grouped, MIN_TIMESTAMP, self.GLOBAL_WINDOW_TUPLE)

    def process_timer(self, window_id, name, time_domain, timestamp, state):
        """Always raise TypeError; this driver does not handle timers."""
        raise TypeError(
            'Triggers never set or called for batch default windowing.')
def _invoke_per_window(
        self,
        windowed_value,
        additional_args,
        additional_kwargs,
        output_processor):
    """Call the process method for one windowed value and forward its output.

    Side inputs are looked up for the value's single window when
    `has_windowed_inputs` is set. Otherwise the arguments stored on `self` are
    used, after a one-time fill-in of side inputs looked up with a
    GlobalWindow. Placeholder slots are then overwritten with the element,
    window or timestamp, `additional_kwargs` are merged in, and the result of
    the process method is handed to `output_processor.process_outputs`.
    """
    if self.has_windowed_inputs:
        # Exactly one window is expected here; unpacking enforces that.
        window, = windowed_value.windows
        side_inputs = [si[window] for si in self.side_inputs]
        side_inputs.extend(additional_args)
        args_for_process, kwargs_for_process = util.insert_values_in_args(
            self.args_for_process, self.kwargs_for_process, side_inputs)
    else:
        if self.cache_globally_windowed_args:
            # Attempt to cache additional args if all inputs are globally
            # windowed inputs when processing the first element. The flag is
            # cleared first so this happens only once.
            self.cache_globally_windowed_args = False

            # Fill in sideInputs if they are globally windowed
            global_window = GlobalWindow()
            self.args_for_process, self.kwargs_for_process = (
                util.insert_values_in_args(
                    self.args_for_process,
                    self.kwargs_for_process,
                    [si[global_window] for si in self.side_inputs]))
        args_for_process = self.args_for_process
        kwargs_for_process = self.kwargs_for_process

    # TODO(sourabhbajaj): Investigate why we can't use `is` instead of ==
    for slot, placeholder in self.placeholders:
        if placeholder == core.DoFn.ElementParam:
            args_for_process[slot] = windowed_value.value
        elif placeholder == core.DoFn.WindowParam:
            args_for_process[slot] = window
        elif placeholder == core.DoFn.TimestampParam:
            args_for_process[slot] = windowed_value.timestamp

    if additional_kwargs:
        if kwargs_for_process is None:
            kwargs_for_process = additional_kwargs
        else:
            for name in additional_kwargs:
                kwargs_for_process[name] = additional_kwargs[name]

    # Keyword arguments are only passed along when there are any.
    if kwargs_for_process:
        output_processor.process_outputs(
            windowed_value,
            self.process_method(*args_for_process, **kwargs_for_process))
    else:
        output_processor.process_outputs(
            windowed_value, self.process_method(*args_for_process))
def test_find_orphaned_files(self):
    """Only the orphan written for 'other-dest' is expected to remain.

    Two stray files are created in the transform's temp directory inside the
    pipeline's `with` block; afterwards the directory listing is compared
    against the second file alone.
    """
    base_dir = self._new_tempdir()
    write_transform = beam.io.fileio.WriteToFiles(path=base_dir)

    def write_orphaned_file(temp_dir, writer_key):
        """Create a stray file whose name starts with writer_key's hash."""
        temp_dir_path = FileSystems.join(base_dir, temp_dir)
        file_prefix = FileSystems.join(
            temp_dir_path, str(abs(hash(writer_key))))
        file_name = '%s_%s' % (file_prefix, uuid.uuid4())
        with FileSystems.create(file_name) as f:
            f.write(b'Hello y\'all')
        return file_name

    with TestPipeline() as p:
        _ = (
            p
            | beam.Create(WriteFilesTest.SIMPLE_COLLECTION)
            | "Serialize" >> beam.Map(json.dumps)
            | write_transform)

        # Pre-create the temp directory.
        FileSystems.mkdirs(
            FileSystems.join(base_dir, write_transform._temp_directory.get()))
        write_orphaned_file(
            write_transform._temp_directory.get(), (None, GlobalWindow()))
        f2 = write_orphaned_file(
            write_transform._temp_directory.get(),
            ('other-dest', GlobalWindow()))

    temp_dir_path = FileSystems.join(
        base_dir, write_transform._temp_directory.get())
    leftovers = FileSystems.match(['%s%s*' % (temp_dir_path, os.sep)])
    found_files = [m.path for m in leftovers[0].metadata_list]
    self.assertListEqual(found_files, [f2])
class DefaultGlobalBatchTriggerDriver(TriggerDriver):
    """Trigger driver for the default triggering in batch.

    Emits everything it receives as a single value in the global window.
    """

    # Windows tuple attached to every emitted value.
    GLOBAL_WINDOW_TUPLE = (GlobalWindow(), )

    def __init__(self):
        """No per-instance state is needed."""
        pass

    def process_elements(self, state, windowed_values, unused_output_watermark):
        """Yield one WindowedValue wrapping all of `windowed_values`.

        The output is stamped with MIN_TIMESTAMP and placed in the global
        window. `state` and the watermark are ignored.
        """
        grouped = _UnwindowedValues(windowed_values)
        yield WindowedValue(grouped, MIN_TIMESTAMP, self.GLOBAL_WINDOW_TUPLE)

    def process_timer(self, window_id, name, time_domain, timestamp, state):
        """Always raise TypeError; this driver does not handle timers."""
        raise TypeError(
            'Triggers never set or called for batch default windowing.')
def test_reshuffle_timestamps_unchanged(self):
    """Element timestamps must be identical before and after Reshuffle."""
    pipeline = TestPipeline()
    fixed_ts = 5
    data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 3)]
    expected_result = [
        TestWindowedValue(pair, fixed_ts, [GlobalWindow()]) for pair in data
    ]

    def stamp(v):
        # Attach the same timestamp to every element.
        return beam.window.TimestampedValue(v, fixed_ts)

    before_reshuffle = (
        pipeline
        | 'start' >> beam.Create(data)
        | 'add_timestamp' >> beam.Map(stamp))
    assert_that(
        before_reshuffle,
        equal_to(expected_result),
        label='before_reshuffle',
        reify_windows=True)

    after_reshuffle = before_reshuffle | beam.Reshuffle()
    assert_that(
        after_reshuffle,
        equal_to(expected_result),
        label='after_reshuffle',
        reify_windows=True)

    pipeline.run()
def _format_shard(
        window, pane, shard_index, total_shards, compression, prefix, suffix):
    """Build a shard file name from the default file-name template.

    Window bounds are included only for non-global windows, and shard numbers
    only when both `shard_index` and `total_shards` are given. Template parts
    whose value is empty are removed together with their leading '-'.
    `pane` is accepted but not used yet.
    """
    fields = dict(
        prefix=prefix,
        start='',
        end='',
        pane='',
        shard=0,
        total_shards=0,
        suffix='',
        compression='')

    if total_shards is not None and shard_index is not None:
        fields['shard'] = int(shard_index)
        fields['total_shards'] = int(total_shards)

    if window != GlobalWindow():
        fields['start'] = window.start.to_utc_datetime().isoformat()
        fields['end'] = window.end.to_utc_datetime().isoformat()

    # TODO(BEAM-3759): Add support for PaneInfo
    # If the PANE is the ONLY firing in the window, we don't add it.
    # if pane and not (pane.is_first and pane.is_last):
    #   kwargs['pane'] = pane.index

    if suffix:
        fields['suffix'] = suffix
    if compression:
        fields['compression'] = '.%s' % compression

    # Remove separators for unused template parts.
    template = _DEFAULT_FILE_NAME_TEMPLATE
    if shard_index is None:
        template = template.replace('-{shard:05d}', '')
    if total_shards is None:
        template = template.replace('-of-{total_shards:05d}', '')
    blank_fields = [
        name for name, value in fields.items() if value in (None, '')
    ]
    for name in blank_fields:
        template = template.replace('-{%s}' % name, '')

    return template.format(**fields)
def partition(self, n):
    # type: (int) -> List[List[bytes]]
    """Split the buffered, grouped data into `n` encoded parts.

    The result is computed once and cached on the instance; later calls
    return the cached parts even if a different `n` is passed, because
    re-partitioning is not supported.
    """
    if self._grouped_output:
        return self._grouped_output

    if self._windowing.is_default():
        only_pane = windowed_value.PaneInfo(
            is_first=True,
            is_last=True,
            timing=windowed_value.PaneInfoTiming.ON_TIME,
            index=0,
            nonspeculative_index=0)
        # Template value in the global window; `with_value` swaps in the
        # payload.
        globally_window = GlobalWindows.windowed_value(
            None,
            timestamp=GlobalWindow().max_timestamp(),
            pane_info=only_pane).with_value

        def windowed_key_values(key, values):
            return [globally_window((key, values))]
    else:
        # TODO(pabloem, BEAM-7514): Trigger driver needs access to the clock
        #   note that this only comes through if windowing is default - but
        #   what about having multiple firings on the global window.
        #   May need to revise.
        trigger_driver = trigger.create_trigger_driver(self._windowing, True)
        windowed_key_values = trigger_driver.process_entire_key

    coder_impl = self._post_grouped_coder.get_impl()
    key_coder_impl = self._key_coder.get_impl()

    self._grouped_output = [[] for _ in range(n)]
    streams = [create_OutputStream() for _ in range(n)]

    # Keys are spread over the output streams round-robin by their position
    # in the table.
    for position, item in enumerate(self._table.items()):
        encoded_key, windowed_values = item
        key = key_coder_impl.decode(encoded_key)
        for wkvs in windowed_key_values(key, windowed_values):
            coder_impl.encode_to_stream(wkvs, streams[position % n], True)

    for slot, stream in enumerate(streams):
        self._grouped_output[slot] = [stream.get()]

    # The raw table is emptied once its contents have been encoded.
    self._table.clear()
    return self._grouped_output
def test_windowed_value_coder(self):
    """Check WindowedValueCoder's cloud object, wire format and round trips."""
    coder = coders.WindowedValueCoder(
        coders.VarIntCoder(), coders.GlobalWindowCoder())

    # Verify cloud object representation
    expected_cloud_object = {
        '@type': 'kind:windowed_value',
        'is_wrapper': True,
        'component_encodings': [
            coders.VarIntCoder().as_cloud_object(),
            coders.GlobalWindowCoder().as_cloud_object(),
        ],
    }
    self.assertEqual(expected_cloud_object, coder.as_cloud_object())

    # Test binary representation
    encoded_one = b'\x7f\xdf;dZ\x1c\xac\t\x00\x00\x00\x01\x0f\x01'
    self.assertEqual(
        encoded_one, coder.encode(window.GlobalWindows.windowed_value(1)))

    # Test decoding large timestamp
    encoded_zero = b'\x7f\xdf;dZ\x1c\xac\x08\x00\x00\x00\x01\x0f\x00'
    self.assertEqual(
        coder.decode(encoded_zero),
        windowed_value.create(0, MIN_TIMESTAMP.micros, (GlobalWindow(), )))

    # Test unnested
    self.check_coder(
        coders.WindowedValueCoder(coders.VarIntCoder()),
        windowed_value.WindowedValue(3, -100, ()),
        windowed_value.WindowedValue(-1, 100, (1, 2, 3)))

    # Test Global Window
    self.check_coder(
        coders.WindowedValueCoder(
            coders.VarIntCoder(), coders.GlobalWindowCoder()),
        window.GlobalWindows.windowed_value(1))

    # Test nested
    nested_coder = coders.TupleCoder((
        coders.WindowedValueCoder(coders.FloatCoder()),
        coders.WindowedValueCoder(coders.StrUtf8Coder()),
    ))
    nested_value = (
        windowed_value.WindowedValue(1.5, 0, ()),
        windowed_value.WindowedValue("abc", 10, ('window', )),
    )
    self.check_coder(nested_coder, nested_value)
def test_window(self):
    """Reify.Window should expose value, timestamp and window as a tuple."""
    # (value, timestamp) pairs driving both the input and the expectation.
    stamped = [('a', 100), ('b', 200), ('c', 300)]

    inputs = [GlobalWindows.windowed_value(value, ts) for value, ts in stamped]
    expected = [
        TestWindowedValue((value, ts, GlobalWindow()), ts, [GlobalWindow()])
        for value, ts in stamped
    ]

    with TestPipeline() as p:
        pc = p | beam.Create(inputs)
        reified_pc = pc | util.Reify.Window()
        assert_that(reified_pc, equal_to(expected), reify_windows=True)
def test_basic_execution(self):
    """Smoke test: record PCollections, read one back, check the sizes."""
    # Create the pipeline that will emit 0, 1, 2.
    # NOTE: `p`, `numbers` and `letters` keep their names on purpose --
    # `locals()` is passed to `ib.watch` below, so the local names are part
    # of its input.
    p = beam.Pipeline(InteractiveRunner())
    numbers = p | 'numbers' >> beam.Create([0, 1, 2])
    letters = p | 'letters' >> beam.Create(['a', 'b', 'c'])

    # Watch the pipeline and PCollections. This is normally done in a notebook
    # environment automatically, but we have to do it manually here.
    ib.watch(locals())
    ie.current_env().track_user_pipelines()

    # Create the recording objects. By calling `record` a new PipelineFragment
    # is started to compute the given PCollections and cache to disk.
    manager = RecordingManager(p)
    numbers_recording = manager.record(
        [numbers], max_n=3, max_duration_secs=500)
    numbers_stream = numbers_recording.stream(numbers)
    numbers_recording.wait_until_finish()

    # Once the pipeline fragment completes, we can read from the stream and
    # know that all elements were written to cache.
    recorded = list(numbers_stream.read())
    expected = [
        WindowedValue(value, MIN_TIMESTAMP, [GlobalWindow()])
        for value in range(3)
    ]
    self.assertListEqual(recorded, expected)

    # Make an extra recording and test the description: the manager's size
    # must equal the sum of the two recordings' sizes.
    letters_recording = manager.record(
        [letters], max_n=3, max_duration_secs=500)
    letters_recording.wait_until_finish()

    self.assertEqual(
        manager.describe()['size'],
        numbers_recording.describe()['size'] +
        letters_recording.describe()['size'])

    manager.cancel()
def test_window_in_value(self):
    """Reify.WindowInValue should nest the window info inside each KV value."""
    # (key, value, timestamp) triples driving both the input and expectation.
    records = [('a', 1, 100), ('b', 2, 200), ('c', 3, 300)]

    inputs = [
        GlobalWindows.windowed_value((key, value), ts)
        for key, value, ts in records
    ]
    expected = [
        TestWindowedValue(
            (key, (value, ts, GlobalWindow())), ts, [GlobalWindow()])
        for key, value, ts in records
    ]

    with TestPipeline() as p:
        # The identity Map is a deliberate workaround; it also makes the
        # typehint on Reify.WindowInValue work.
        pc = p | beam.Create(inputs) | beam.Map(lambda x: x)
        reified_pc = pc | util.Reify.WindowInValue()
        assert_that(reified_pc, equal_to(expected), reify_windows=True)
def process(
        self,
        element,
        batch=DoFn.StateParam(BATCH),
        batchSize=DoFn.StateParam(BATCH_SIZE),
        flushTimer=DoFn.TimerParam(FLUSH_TIMER),
        endOfTime=DoFn.TimerParam(EOW_TIMER)):
    """Buffer `element[1]` and flush once the batch is large enough.

    On the first element of a batch (no stored size yet) both timers are set:
    `flushTimer` relative to now and `endOfTime` at the global window's
    maximum timestamp. Returns the result of `self.flush` when the size
    reaches `self.batchSize`, otherwise None.
    """
    from apache_beam.transforms.window import GlobalWindow
    from apache_beam.utils.timestamp import Duration, Timestamp

    count = batchSize.read()
    if count:
        count += 1
    else:
        count = 1
        # maxWaitTime is scaled by 1000 into micros -- presumably it is given
        # in milliseconds; confirm against where it is configured.
        wait = Duration(micros=self.maxWaitTime * 1000)
        flushTimer.set(Timestamp.now() + wait)
        endOfTime.set(GlobalWindow().max_timestamp())

    batchSize.write(count)
    batch.add(element[1])

    if count >= self.batchSize:
        return self.flush(batch, batchSize)
    return None
def _invoke_process_per_window(
        self,
        windowed_value,  # type: WindowedValue
        additional_args,
        additional_kwargs,
):
    # type: (...) -> Optional[SplitResultResidual]
    """Call the process method for one windowed value and forward its output.

    Resolves side inputs (per window, or cached once using a GlobalWindow),
    fills every placeholder argument, merges `additional_kwargs`, and passes
    the process method's result to `self.output_processor`. When
    `is_splittable` is set, the restriction tracker is checked afterwards and
    a SplitResultResidual is returned if it reports deferred work; otherwise
    None is returned.

    Raises:
      ValueError: if a key is required but the value is not a 2-item tuple.
    """
    if self.has_windowed_inputs:
        # Exactly one window is expected here; unpacking enforces that.
        window, = windowed_value.windows
        side_inputs = [si[window] for si in self.side_inputs]
        side_inputs.extend(additional_args)
        args_for_process, kwargs_for_process = util.insert_values_in_args(
            self.args_for_process, self.kwargs_for_process, side_inputs)
    else:
        if self.cache_globally_windowed_args:
            # Attempt to cache additional args if all inputs are globally
            # windowed inputs when processing the first element. The flag is
            # cleared first so this happens only once.
            self.cache_globally_windowed_args = False

            # Fill in sideInputs if they are globally windowed
            global_window = GlobalWindow()
            self.args_for_process, self.kwargs_for_process = (
                util.insert_values_in_args(
                    self.args_for_process,
                    self.kwargs_for_process,
                    [si[global_window] for si in self.side_inputs]))
        args_for_process = self.args_for_process
        kwargs_for_process = self.kwargs_for_process

    # Extract key in the case of a stateful DoFn. Note that in the case of a
    # stateful DoFn, we set during __init__ self.has_windowed_inputs to be
    # True. Therefore, windows will be exploded coming into this method, and
    # we can rely on the window variable being set above.
    if self.user_state_context or self.is_key_param_required:
        try:
            key, unused_value = windowed_value.value
        except (TypeError, ValueError):
            raise ValueError((
                'Input value to a stateful DoFn or KeyParam must be a KV tuple; '
                'instead, got \'%s\'.') % (windowed_value.value, ))

    for slot, placeholder in self.placeholders:
        if core.DoFn.ElementParam == placeholder:
            args_for_process[slot] = windowed_value.value
        elif core.DoFn.KeyParam == placeholder:
            args_for_process[slot] = key
        elif core.DoFn.WindowParam == placeholder:
            args_for_process[slot] = window
        elif core.DoFn.TimestampParam == placeholder:
            args_for_process[slot] = windowed_value.timestamp
        elif core.DoFn.PaneInfoParam == placeholder:
            args_for_process[slot] = windowed_value.pane_info
        elif isinstance(placeholder, core.DoFn.StateParam):
            assert self.user_state_context is not None
            args_for_process[slot] = self.user_state_context.get_state(
                placeholder.state_spec, key, window)
        elif isinstance(placeholder, core.DoFn.TimerParam):
            assert self.user_state_context is not None
            args_for_process[slot] = self.user_state_context.get_timer(
                placeholder.timer_spec, key, window)
        elif core.DoFn.BundleFinalizerParam == placeholder:
            args_for_process[slot] = self.bundle_finalizer_param

    if additional_kwargs:
        if kwargs_for_process is None:
            kwargs_for_process = additional_kwargs
        else:
            for name in additional_kwargs:
                kwargs_for_process[name] = additional_kwargs[name]

    # Keyword arguments are only passed along when there are any.
    if kwargs_for_process:
        self.output_processor.process_outputs(
            windowed_value,
            self.process_method(*args_for_process, **kwargs_for_process))
    else:
        self.output_processor.process_outputs(
            windowed_value, self.process_method(*args_for_process))

    if not self.is_splittable:
        return None

    assert self.threadsafe_restriction_tracker is not None
    # TODO: Consider calling check_done right after SDF.Process() finishing.
    # In order to do this, we need to know that current invoking dofn is
    # ProcessSizedElementAndRestriction.
    self.threadsafe_restriction_tracker.check_done()
    deferred_status = self.threadsafe_restriction_tracker.deferred_status()

    current_watermark = (
        self.watermark_estimator.current_watermark()
        if self.watermark_estimator else None)

    if not deferred_status:
        return None

    # Package the deferred restriction, with its size, as the residual.
    deferred_restriction, deferred_timestamp = deferred_status
    element = windowed_value.value
    size = self.signature.get_restriction_provider().restriction_size(
        element, deferred_restriction)
    residual_value = ((element, deferred_restriction), size)
    return SplitResultResidual(
        residual_value=windowed_value.with_value(residual_value),
        current_watermark=current_watermark,
        deferred_timestamp=deferred_timestamp)