def test_window(self):
    """Check that ``util.Reify.Window`` yields (value, timestamp, window).

    Three values stamped at 100, 200 and 300 in the global window are
    created, passed through an identity ``Map`` and reified; the output is
    compared with the expected ``TestWindowedValue`` objects using
    ``reify_windows=True``.
    """
    samples = (('a', 100), ('b', 200), ('c', 300))
    inputs = [
        GlobalWindows.windowed_value(value, timestamp)
        for value, timestamp in samples
    ]
    expected = [
        TestWindowedValue(
            (value, timestamp, GlobalWindow()), timestamp, [GlobalWindow()])
        for value, timestamp in samples
    ]
    with TestPipeline() as p:
        created = p | beam.Create(inputs)
        # Create alone does not assign windows to a PCollection built from
        # WindowedValues. The identity Map forces the elements through a
        # DoFn so the PCollection holds timestamped elements rather than
        # WindowedValue(element, timestamp, window) objects.
        mapped = created | beam.Map(lambda x: x)
        reified_pc = mapped | util.Reify.Window()
        assert_that(reified_pc, equal_to(expected), reify_windows=True)
def test_window(self):
    """Check ``util.Reify.Window`` directly on the output of ``Create``.

    The inputs are globally-windowed values stamped at 100, 200 and 300;
    each is expected to come out as ``(value, timestamp, GlobalWindow())``
    with the same timestamp and window.
    """
    samples = (('a', 100), ('b', 200), ('c', 300))
    inputs = []
    expected = []
    for value, timestamp in samples:
        inputs.append(GlobalWindows.windowed_value(value, timestamp))
    for value, timestamp in samples:
        expected.append(
            TestWindowedValue(
                (value, timestamp, GlobalWindow()),
                timestamp,
                [GlobalWindow()]))
    with TestPipeline() as p:
        reified_pc = p | beam.Create(inputs) | util.Reify.Window()
        assert_that(reified_pc, equal_to(expected), reify_windows=True)
def __iter__(self):
    """Iterate over one encoded blob holding all grouped entries.

    Every ``(encoded_key, values)`` entry of ``self._table`` is decoded to
    its key, wrapped as a globally-windowed ``(key, values)`` pair and
    encoded into one shared output stream; the returned iterator yields
    that stream's contents as a single item.
    """
    stream = create_OutputStream()
    for encoded_key, values in self._table.items():
        key = self._key_coder.decode(encoded_key)
        encoder = self._post_grouped_coder.get_impl()
        windowed_pair = GlobalWindows.windowed_value((key, values))
        # The trailing True is forwarded as-is; presumably it selects the
        # nested encoding context - confirm against the coder impl.
        encoder.encode_to_stream(windowed_pair, stream, True)
    return iter([stream.get()])
def _read_values_to_bundles(reader):
    """Wrap each element read from ``reader`` and split them into bundles.

    NOTE(review): ``self`` and ``output_pcollection`` are not parameters;
    they are expected to come from the enclosing scope, which is not
    visible here.

    Args:
      reader: iterable of raw elements.

    Returns:
      Whatever ``self._split_list_into_bundles`` returns for the wrapped
      elements, with every element counted as size 1.
    """
    windowed_elements = []
    for element in reader:
        windowed_elements.append(GlobalWindows.windowed_value(element))
    return self._split_list_into_bundles(
        output_pcollection,
        windowed_elements,
        _BoundedReadEvaluator.MAX_ELEMENT_PER_BUNDLE,
        lambda _: 1)
def test_update_multiple(self):
    """Feed three string values and check the count and mean size.

    The counters are verified before any update, then after the second
    and third updates (not after the first), against the running mean of
    the coder's size estimates.
    """
    coder = coders.PickleCoder()
    size_sum = 0
    opcounts = OperationCounters(CounterFactory(), 'some-name', coder, 0)
    self.verify_counters(opcounts, 0, float('nan'))
    payloads = ('abcde', 'defghij', 'klmnop')
    for count, payload in enumerate(payloads, start=1):
        value = GlobalWindows.windowed_value(payload)
        opcounts.update_from(value)
        size_sum += coder.estimate_size(value)
        # Only the second and third updates are verified.
        if count > 1:
            self.verify_counters(opcounts, count, float(size_sum) / count)
def finish_bundle(self):
    """Emit the messages read from Pub/Sub and schedule another read.

    Each ``(timestamp, message)`` pair is output in the global window at
    its timestamp. The whole message is emitted when
    ``self.source.with_attributes`` is truthy, otherwise only
    ``message.payload``. An unprocessed bundle on the transform's input
    (or on a ``PBegin`` when it has no inputs) is always returned.

    Returns:
      A ``TransformResult`` whose hold dict maps ``None`` to the current
      wall-clock time.
    """
    messages = self._read_from_pubsub(self.source.timestamp_attribute)
    bundles = []
    if messages:
        first_output = list(self._outputs)[0]
        out_bundle = self._evaluation_context.create_bundle(first_output)
        # TODO(ccy): Respect the PubSub source's id_label field.
        for timestamp, message in messages:
            element = (
                message if self.source.with_attributes else message.payload)
            out_bundle.output(
                GlobalWindows.windowed_value(element, timestamp=timestamp))
        bundles.append(out_bundle)

    if self._applied_ptransform.inputs:
        input_pvalue = self._applied_ptransform.inputs[0]
    else:
        pipeline = self._applied_ptransform.transform.pipeline
        input_pvalue = pvalue.PBegin(pipeline)
    pending = self._evaluation_context.create_bundle(input_pvalue)
    # TODO(udim): Correct value for watermark hold.
    hold = {None: Timestamp.of(time.time())}
    return TransformResult(self, bundles, [pending], None, hold)
def test_update_int(self):
    """Check that one integer update followed by collect gives a count of 1."""
    counter_factory = CounterFactory()
    coder = coders.PickleCoder()
    opcounts = OperationCounters(counter_factory, 'some-name', coder, 0)
    self.verify_counters(opcounts, 0)

    windowed_one = GlobalWindows.windowed_value(1)
    opcounts.update_from(windowed_one)
    opcounts.update_collect()
    self.verify_counters(opcounts, 1)
def test_window_in_value(self):
    """Check that ``util.Reify.WindowInValue`` nests window info in values.

    Keyed elements ``(key, value)`` stamped at 100, 200 and 300 in the
    global window are expected to come out as
    ``(key, (value, timestamp, GlobalWindow()))``.
    """
    samples = (('a', 1, 100), ('b', 2, 200), ('c', 3, 300))
    inputs = [
        GlobalWindows.windowed_value((key, value), timestamp)
        for key, value, timestamp in samples
    ]
    expected = [
        TestWindowedValue(
            (key, (value, timestamp, GlobalWindow())),
            timestamp,
            [GlobalWindow()])
        for key, value, timestamp in samples
    ]
    with TestPipeline() as p:
        # The identity Map routes the created elements through a DoFn.
        # It also makes the typehint on Reify.WindowInValue work.
        created = p | beam.Create(inputs)
        pc = created | beam.Map(lambda x: x)
        reified_pc = pc | util.Reify.WindowInValue()
        assert_that(reified_pc, equal_to(expected), reify_windows=True)
def finish_bundle(self):
    """Output the messages pulled from Pub/Sub and re-queue the read.

    For each ``(timestamp, message)`` pair, the whole message is emitted
    when ``self.source.with_attributes`` is truthy, otherwise only
    ``message.data``. Elements go into the global window at the given
    timestamp. An unprocessed bundle on the transform's input (or on a
    ``PBegin`` when it has no inputs) is always returned.

    Returns:
      A ``TransformResult`` whose hold dict maps ``None`` to the current
      wall-clock time.
    """
    received = self._read_from_pubsub(self.source.timestamp_attribute)
    if not received:
        bundles = []
    else:
        target = list(self._outputs)[0]
        bundle = self._evaluation_context.create_bundle(target)
        # TODO(ccy): Respect the PubSub source's id_label field.
        for timestamp, message in received:
            element = message
            if not self.source.with_attributes:
                element = message.data
            windowed = GlobalWindows.windowed_value(
                element, timestamp=timestamp)
            bundle.output(windowed)
        bundles = [bundle]

    if self._applied_ptransform.inputs:
        input_pvalue = self._applied_ptransform.inputs[0]
    else:
        input_pvalue = pvalue.PBegin(
            self._applied_ptransform.transform.pipeline)
    unprocessed_bundle = self._evaluation_context.create_bundle(input_pvalue)
    # TODO(udim): Correct value for watermark hold.
    return TransformResult(
        self,
        bundles,
        [unprocessed_bundle],
        None,
        {None: Timestamp.of(time.time())})
def process_element(self, element):
    """Replay the TestStream events for the index carried by ``element``.

    ``element.value`` is stored as the current index into the TestStream
    events and ``element.timestamp`` as this transform's watermark.
    Element and watermark events are forwarded downstream in their own
    bundle, except watermark-control events, which update
    ``self.watermark``. Processing-time events advance the clock.

    Raises:
      ValueError: if an event is of none of the three known event types.
    """
    self.current_index = element.value
    self.watermark = element.timestamp

    # Events are emitted by the _WatermarkController rather than here, so
    # that each element is emitted at the correct watermark value.
    for event in self.test_stream.events(self.current_index):
        if isinstance(event, (ElementEvent, WatermarkEvent)):
            # WATERMARK_CONTROL_TAG holds the _TestStream's watermark at
            # -inf, then +inf-1, then +inf. That progression sets up the
            # holds that let the _WatermarkControllers control their own
            # output watermarks.
            if event.tag == _TestStream.WATERMARK_CONTROL_TAG:
                self.watermark = event.new_watermark
                continue
            first_output = list(self._outputs)[0]
            event_bundle = self._evaluation_context.create_bundle(
                first_output)
            event_bundle.output(GlobalWindows.windowed_value(event))
            self.bundles.append(event_bundle)
            continue

        if not isinstance(event, ProcessingTimeEvent):
            raise ValueError('Invalid TestStream event: %s.' % event)
        clock = self._evaluation_context._watermark_manager._clock
        clock.advance_time(event.advance_by)
def test_update_multiple(self):
    """Check the element count and mean size across three updates.

    The counters are verified before any update and again after the
    second and third updates, against the mean of the coder's size
    estimates so far.
    """
    coder = coders.PickleCoder()
    total_size = 0
    opcounts = OperationCounters(CounterFactory(), 'some-name', coder, 0)
    self.verify_counters(opcounts, 0, float('nan'))

    def feed(payload):
        """Update the counters with ``payload`` and return its size."""
        windowed = GlobalWindows.windowed_value(payload)
        opcounts.update_from(windowed)
        return coder.estimate_size(windowed)

    total_size += feed('abcde')
    total_size += feed('defghij')
    self.verify_counters(opcounts, 2, float(total_size) / 2)

    total_size += feed('klmnop')
    self.verify_counters(opcounts, 3, float(total_size) / 3)
def get_root_bundles(self):
    """Build the single committed root bundle that starts the TestStream.

    The bundle is created on a ``PBegin`` of the transform's pipeline and
    holds one element: ``test_stream.begin()`` in the global window,
    stamped ``MIN_TIMESTAMP``.

    Returns:
      A one-element list containing the committed bundle.
    """
    test_stream = self._applied_ptransform.transform
    root = pvalue.PBegin(self._applied_ptransform.transform.pipeline)
    root_bundle = self._evaluation_context.create_bundle(root)
    first_element = GlobalWindows.windowed_value(
        test_stream.begin(), timestamp=MIN_TIMESTAMP)
    root_bundle.add(first_element)
    root_bundle.commit(None)
    return [root_bundle]
def test_update_str(self):
    """Check that one string update gives count 1 and the coder's size."""
    pickle_coder = coders.PickleCoder()
    opcounts = OperationCounters(
        CounterFactory(), 'some-name', pickle_coder, 0)
    self.verify_counters(opcounts, 0, float('nan'))

    windowed = GlobalWindows.windowed_value('abcde')
    opcounts.update_from(windowed)
    expected_size = pickle_coder.estimate_size(windowed)
    self.verify_counters(opcounts, 1, expected_size)
def test_update_old_object(self):
    """Check the counters for an ``OldClassThatDoesNotImplementLen`` value.

    After one update the count must be 1 and the reported size must equal
    the coder's estimate for the same windowed value.
    """
    pickle_coder = coders.PickleCoder()
    opcounts = OperationCounters(
        CounterFactory(), 'some-name', pickle_coder, 0)
    self.verify_counters(opcounts, 0, float('nan'))

    windowed = GlobalWindows.windowed_value(
        OldClassThatDoesNotImplementLen())
    opcounts.update_from(windowed)
    expected_size = pickle_coder.estimate_size(windowed)
    self.verify_counters(opcounts, 1, expected_size)
def finish_bundle(self):
    """Close every open writer and report the file it wrote.

    Yields:
      One ``TaggedOutput`` per destination, tagged
      ``WriteRecordsToFile.WRITTEN_FILE_TAG``, carrying a globally-windowed
      ``(destination, (file_path, file_size))``.

    The destination-to-writer map is reset once the generator has been
    fully consumed.
    """
    writers = iteritems(self._destination_to_file_writer)
    for destination, (file_path, writer) in writers:
        # The size is read from the writer's position before closing it.
        file_size = writer.tell()
        writer.close()
        written = (destination, (file_path, file_size))
        yield pvalue.TaggedOutput(
            WriteRecordsToFile.WRITTEN_FILE_TAG,
            GlobalWindows.windowed_value(written))
    self._destination_to_file_writer = {}
def finish_bundle(self):
    """Emit one ``KeyedWorkItem`` per buffered key into a single bundle.

    The output bundle is created lazily on the first key, so no bundle is
    produced when ``self.gbk_items`` is empty.

    Returns:
      A ``TransformResult`` with the produced bundles and no unprocessed
      bundles.
    """
    produced = []
    out_bundle = None
    for encoded_key, elements in iteritems(self.gbk_items):
        if not out_bundle:
            out_bundle = self._evaluation_context.create_bundle(
                self.output_pcollection)
            produced.append(out_bundle)
        work_item = KeyedWorkItem(encoded_key, elements=elements)
        out_bundle.add(GlobalWindows.windowed_value(work_item))
    return TransformResult(self, produced, [], None, None)
def finish_bundle(self):
    """Output every value of the transform into ``self.bundle``.

    All values in ``transform.value`` are wrapped in the global window
    first and then written to the bundle.

    Returns:
      A ``TransformResult`` whose only bundle is ``self.bundle``.
    """
    transform = self._applied_ptransform.transform
    assert transform.value is not None

    # Wrap everything before emitting anything.
    windowed_values = []
    for raw in transform.value:
        windowed_values.append(GlobalWindows.windowed_value(raw))
    for windowed in windowed_values:
        self.bundle.output(windowed)

    bundles = [self.bundle]
    return TransformResult(
        self._applied_ptransform, bundles, None, None, None, None)
def finish_bundle(self):
    """Schedule the next TestStream event, if any, and report the hold.

    While the current index is before the last event, an unprocessed
    bundle carrying the next index (stamped with the current watermark) is
    created and the watermark becomes the hold. Otherwise there is no
    unprocessed bundle and the hold is ``None``.

    Returns:
      A ``TransformResult`` with ``self.bundles``, the unprocessed bundles
      and the hold passed as a bare value.
    """
    pending = []
    hold = None
    last_index = len(self.test_stream.events) - 1
    if self.current_index < last_index:
        root = pvalue.PBegin(self._applied_ptransform.transform.pipeline)
        next_bundle = self._evaluation_context.create_bundle(root)
        next_element = GlobalWindows.windowed_value(
            self.current_index + 1, timestamp=self.watermark)
        next_bundle.add(next_element)
        pending.append(next_bundle)
        hold = self.watermark
    return TransformResult(
        self._applied_ptransform, self.bundles, pending, None, hold)
def get_root_bundles(self):
    """Return the root bundle that starts replay of the TestStream events.

    Returns:
      An empty list when the TestStream has no events. Otherwise a
      one-element list holding a committed bundle whose only element is
      index ``0`` in the global window.
    """
    test_stream = self._applied_ptransform.transform
    if len(test_stream.events) == 0:
        return []

    root = pvalue.PBegin(self._applied_ptransform.transform.pipeline)
    root_bundle = self._evaluation_context.create_bundle(root)
    # The timestamp is set to MIN_TIMESTAMP explicitly so that the
    # watermark is held.
    root_bundle.add(GlobalWindows.windowed_value(0, timestamp=MIN_TIMESTAMP))
    root_bundle.commit(None)
    return [root_bundle]
def finish_bundle(self):
    """Turn the buffered per-key elements into ``KeyedWorkItem`` outputs.

    All work items go into one bundle, which is created only when the
    first key is seen. With no buffered keys the result has no bundles.

    Returns:
      A ``TransformResult`` with the produced bundles and an empty list of
      unprocessed bundles.
    """
    bundles = []
    shared_bundle = None
    for encoded_k, vs in iteritems(self.gbk_items):
        if not shared_bundle:
            target = self.output_pcollection
            shared_bundle = self._evaluation_context.create_bundle(target)
            bundles.append(shared_bundle)
        windowed_item = GlobalWindows.windowed_value(
            KeyedWorkItem(encoded_k, elements=vs))
        shared_bundle.add(windowed_item)
    return TransformResult(self, bundles, [], None, None)
def finish_bundle(self):
    """Queue the next TestStream index unless the stream has ended.

    The next index comes from ``self.test_stream.next``. Unless
    ``self.test_stream.end`` reports it as the end, it is wrapped in an
    unprocessed bundle stamped with the current watermark.

    Returns:
      A ``TransformResult`` whose hold dict maps ``None`` to
      ``self.watermark``.
    """
    pending = []
    next_index = self.test_stream.next(self.current_index)
    reached_end = self.test_stream.end(next_index)
    if not reached_end:
        root = pvalue.PBegin(self._applied_ptransform.transform.pipeline)
        next_bundle = self._evaluation_context.create_bundle(root)
        next_bundle.add(
            GlobalWindows.windowed_value(
                next_index, timestamp=self.watermark))
        pending.append(next_bundle)

    # The watermark returned in this dict is used as a watermark hold.
    hold = {None: self.watermark}
    return TransformResult(self, self.bundles, pending, None, hold)
def finish_bundle(self):
    """Schedule the following TestStream event and report the hold.

    While the current index is before the last event, the next index is
    queued in an unprocessed bundle at the current watermark, and that
    watermark becomes the hold. Otherwise nothing is queued and the hold
    stays ``None``.

    Returns:
      A ``TransformResult`` whose hold dict maps ``None`` to the hold.
    """
    hold = None
    unprocessed_bundles = []
    last_index = len(self.test_stream.events) - 1
    if self.current_index < last_index:
        begin = pvalue.PBegin(self._applied_ptransform.transform.pipeline)
        follow_up = self._evaluation_context.create_bundle(begin)
        follow_up.add(
            GlobalWindows.windowed_value(
                self.current_index + 1, timestamp=self.watermark))
        unprocessed_bundles.append(follow_up)
        hold = self.watermark
    return TransformResult(
        self._applied_ptransform,
        self.bundles,
        unprocessed_bundles,
        None,
        {None: hold})
def process_element(self, element):
    """Apply one TestStream event carried in ``element.value``.

    A ``WatermarkEvent`` updates ``self._watermark``. An ``ElementEvent``
    has each of its timestamped values output in the global window into a
    new bundle on the first output. Any other event is ignored.
    """
    # Elements are emitted here so that their order in the pipeline
    # matches their order in the script.
    event = element.value
    if isinstance(event, WatermarkEvent):
        self._watermark = event.new_watermark
        return
    if not isinstance(event, ElementEvent):
        return

    first_output = list(self._outputs)[0]
    out_bundle = self._evaluation_context.create_bundle(first_output)
    for timestamped in event.timestamped_values:
        out_bundle.output(
            GlobalWindows.windowed_value(
                timestamped.value, timestamp=timestamped.timestamp))
    self.bundles.append(out_bundle)
def finish_bundle(self):
    """Write the transform's values to ``self.bundle`` and return it.

    Every item of ``transform.value`` is wrapped in the global window
    before any of them is output.

    Returns:
      A ``TransformResult`` listing ``self.bundle`` as its only bundle.
    """
    transform = self._applied_ptransform.transform
    assert transform.value is not None

    windowed_values = list(
        map(GlobalWindows.windowed_value, transform.value))
    for windowed in windowed_values:
        self.bundle.output(windowed)

    return TransformResult(
        self._applied_ptransform, [self.bundle], None, None, None, None)
def __iter__(self):
    """Iterate over one encoded blob holding all grouped, windowed entries.

    With default windowing each ``(key, values)`` pair is wrapped in the
    global window. Otherwise a trigger driver turns each key's values into
    windowed outputs. Everything is encoded into one output stream, and the
    returned iterator yields that stream's contents as a single item.
    """
    stream = create_OutputStream()
    if self._windowing.is_default():
        wrap_globally = GlobalWindows.windowed_value(None).with_value

        def windowed_key_values(key, values):
            """Return the pair as a single globally-windowed value."""
            return [wrap_globally((key, values))]
    else:
        driver = trigger.create_trigger_driver(self._windowing, True)
        windowed_key_values = driver.process_entire_key

    value_encoder = self._post_grouped_coder.get_impl()
    key_decoder = self._key_coder.get_impl()
    for encoded_key, windowed_values in self._table.items():
        key = key_decoder.decode(encoded_key)
        for windowed_kv in windowed_key_values(key, windowed_values):
            value_encoder.encode_to_stream(windowed_kv, stream, True)
    return iter([stream.get()])
def process_element(self, element):
    """Read the next event from the event stream and dispatch it.

    ``element.timestamp`` becomes this transform's watermark. While the
    watermark is ``MIN_TIMESTAMP`` the set-up events are queued first. One
    event is then read from ``self.event_stream``; when the stream is
    exhausted, ``self.is_done`` is set and the tear-down events are queued
    instead.

    Raises:
      ValueError: if an event is of none of the three known event types.
    """
    # The watermark of the _TestStream transform itself.
    self.watermark = element.timestamp

    # The set-up events install the watermark holds in the
    # _WatermarkControllers and the TestStream, so watermarks do not jump
    # to +inf once elements start streaming. This may run several times in
    # the first bundle, but the operations are idempotent and tracking it
    # would add unnecessary code complexity.
    queued = []
    if self.watermark == MIN_TIMESTAMP:
        queued.extend(self.test_stream._set_up(self.test_stream.output_tags))

    # Read one event from the TestStream's event stream.
    try:
        queued.append(next(self.event_stream))
    except StopIteration:
        # Advance the watermarks to +inf to stop the pipeline cleanly.
        self.is_done = True
        queued.extend(
            self.test_stream._tear_down(self.test_stream.output_tags))

    for event in queued:
        # Events are emitted by the _WatermarkController rather than here,
        # so that each element is emitted at the correct watermark value.
        if isinstance(event, (ElementEvent, WatermarkEvent)):
            # WATERMARK_CONTROL_TAG holds the _TestStream's watermark at
            # -inf, then +inf-1, then +inf. That progression sets up the
            # holds that let the _WatermarkControllers control their own
            # output watermarks.
            if event.tag == _TestStream.WATERMARK_CONTROL_TAG:
                self.watermark = event.new_watermark
                continue
            first_output = list(self._outputs)[0]
            event_bundle = self._evaluation_context.create_bundle(
                first_output)
            event_bundle.output(GlobalWindows.windowed_value(event))
            self.bundles.append(event_bundle)
            continue

        if not isinstance(event, ProcessingTimeEvent):
            raise ValueError('Invalid TestStream event: %s.' % event)
        clock = self._evaluation_context._watermark_manager._clock
        clock.advance_time(event.advance_by)
def finish_bundle(self):
    """Re-queue a heartbeat element until the event stream is done.

    Returns:
      A ``TransformResult`` with ``self.bundles``, at most one unprocessed
      bundle, and a hold dict mapping ``None`` to ``self.watermark``.
    """
    pending = []
    # Until the stream is done, this transform sends itself an empty
    # element through an unprocessed bundle. This acts as a heartbeat: each
    # such element makes it read the next event from the event stream.
    if not self.is_done:
        root = pvalue.PBegin(self._applied_ptransform.transform.pipeline)
        heartbeat = self._evaluation_context.create_bundle(root)
        heartbeat.add(
            GlobalWindows.windowed_value(b'', timestamp=self.watermark))
        pending.append(heartbeat)

    # The watermark returned in this dict is used as a watermark hold.
    hold = {None: self.watermark}
    return TransformResult(self, self.bundles, pending, None, hold)
def _flush_batch(self, destination):
    """Insert the rows buffered for ``destination`` into BigQuery.

    After each insert attempt the rows reported in ``errors`` become the
    next batch. The loop sleeps for the next backoff value and retries
    while the retry strategy accepts the first error reason of at least
    one failed entry. The destination's buffer is then dropped and the
    buffered-row counter reduced.

    Args:
      destination: table reference accepted by
        ``bigquery_tools.parse_table_reference``.

    Returns:
      One ``TaggedOutput`` tagged ``BigQueryWriteFn.FAILED_ROWS`` per row
      that failed the last attempt, each carrying a globally-windowed
      ``(destination, row)``.
    """
    pending_rows = self._rows_buffer[destination]
    table_reference = bigquery_tools.parse_table_reference(destination)

    # Fall back to the runtime 'project' value if the reference has none.
    if table_reference.projectId is None:
        table_reference.projectId = vp.RuntimeValueProvider.get_value(
            'project', str, '')

    logging.debug(
        'Flushing data to %s. Total %s rows.', destination,
        len(pending_rows))

    while True:
        # TODO: Figure out an insertId to make calls idempotent.
        passed, errors = self.bigquery_wrapper.insert_rows(
            project_id=table_reference.projectId,
            dataset_id=table_reference.datasetId,
            table_id=table_reference.tableId,
            rows=pending_rows,
            skip_invalid_rows=True)
        logging.debug("Passed: %s. Errors are %s", passed, errors)

        # Error entries refer to rows by index into the submitted batch.
        failed_rows = [pending_rows[entry.index] for entry in errors]
        should_retry = any(
            bigquery_tools.RetryStrategy.should_retry(
                self._retry_strategy, entry.errors[0].reason)
            for entry in errors)
        pending_rows = failed_rows
        if not should_retry:
            break

        retry_backoff = next(self._backoff_calculator)
        logging.info(
            'Sleeping %s seconds before retrying insertion.', retry_backoff)
        time.sleep(retry_backoff)

    self._total_buffered_rows -= len(self._rows_buffer[destination])
    del self._rows_buffer[destination]

    failed_outputs = []
    for row in failed_rows:
        failed_outputs.append(
            pvalue.TaggedOutput(
                BigQueryWriteFn.FAILED_ROWS,
                GlobalWindows.windowed_value((destination, row))))
    return failed_outputs
def get_root_bundles(self):
    """Set up the shared event stream and return the initial root bundle.

    The event stream is stored on the ``_TestStreamEvaluator`` class. It is
    read over RPC when the TestStream defines an endpoint, otherwise built
    from the scripted events.

    Returns:
      A one-element list holding a committed bundle whose only element is
      an empty byte string in the global window at ``MIN_TIMESTAMP``.
    """
    test_stream = self._applied_ptransform.transform

    if test_stream.endpoint:
        # An endpoint is defined: take the events from the
        # TestStreamService.
        event_stream = _TestStream.events_from_rpc(
            test_stream.endpoint,
            test_stream.output_tags,
            test_stream.coder)
    else:
        event_stream = _TestStream.events_from_script(test_stream._events)
    _TestStreamEvaluator.event_stream = event_stream

    root = pvalue.PBegin(self._applied_ptransform.transform.pipeline)
    root_bundle = self._evaluation_context.create_bundle(root)
    root_bundle.add(
        GlobalWindows.windowed_value(b'', timestamp=MIN_TIMESTAMP))
    root_bundle.commit(None)
    return [root_bundle]
def _flush_batch(self, destination):
    """Flush the current batch of rows for ``destination`` to BigQuery.

    Rows that fail an insert attempt are resubmitted, after sleeping for
    the next backoff value, while the retry strategy accepts the first
    error reason of at least one failed entry. The destination's buffer is
    then removed and ``self._total_buffered_rows`` reduced by its length.

    Args:
      destination: table reference accepted by
        ``bigquery_tools.parse_table_reference``.

    Returns:
      A list with one ``TaggedOutput`` (tag ``BigQueryWriteFn.FAILED_ROWS``)
      per row still failing after the last attempt; the payload is a
      globally-windowed ``(destination, row)``.
    """
    batch = self._rows_buffer[destination]
    table_reference = bigquery_tools.parse_table_reference(destination)
    if table_reference.projectId is None:
        runtime_project = vp.RuntimeValueProvider.get_value(
            'project', str, '')
        table_reference.projectId = runtime_project

    logging.debug(
        'Flushing data to %s. Total %s rows.', destination, len(batch))

    retrying = True
    while retrying:
        # TODO: Figure out an insertId to make calls idempotent.
        passed, errors = self.bigquery_wrapper.insert_rows(
            project_id=table_reference.projectId,
            dataset_id=table_reference.datasetId,
            table_id=table_reference.tableId,
            rows=batch,
            skip_invalid_rows=True)
        logging.debug("Passed: %s. Errors are %s", passed, errors)

        failed_rows = [batch[entry.index] for entry in errors]
        retrying = any(
            bigquery_tools.RetryStrategy.should_retry(
                self._retry_strategy, entry.errors[0].reason)
            for entry in errors)
        # Only the failed rows are resubmitted on the next attempt.
        batch = failed_rows
        if retrying:
            retry_backoff = next(self._backoff_calculator)
            logging.info(
                'Sleeping %s seconds before retrying insertion.',
                retry_backoff)
            time.sleep(retry_backoff)

    self._total_buffered_rows -= len(self._rows_buffer[destination])
    del self._rows_buffer[destination]

    return [
        pvalue.TaggedOutput(
            BigQueryWriteFn.FAILED_ROWS,
            GlobalWindows.windowed_value((destination, failed_row)))
        for failed_row in failed_rows
    ]
def partition(self, n):
    # type: (int) -> List[List[bytes]]
    """Partition this grouping buffer into ``n`` encoded parts.

    The partitioning is computed once: if ``self._grouped_output`` is
    already populated it is returned unchanged, so a later call with a
    different ``n`` does not re-partition. Keys are assigned to parts
    round-robin by their position in ``self._table``, which is cleared
    after encoding.

    Args:
      n: number of parts to produce.

    Returns:
      A list of ``n`` single-element lists, each holding the encoded bytes
      of one part.
    """
    if self._grouped_output:
        return self._grouped_output

    if self._windowing.is_default():
        wrap_globally = GlobalWindows.windowed_value(
            None,
            timestamp=GlobalWindow().max_timestamp(),
            pane_info=windowed_value.PaneInfo(
                is_first=True,
                is_last=True,
                timing=windowed_value.PaneInfoTiming.ON_TIME,
                index=0,
                nonspeculative_index=0)).with_value

        def windowed_key_values(key, values):
            """Return the pair as a single globally-windowed value."""
            return [wrap_globally((key, values))]
    else:
        # TODO(pabloem, BEAM-7514): Trigger driver needs access to the
        # clock. Note that this only comes through if windowing is default -
        # but what about having multiple firings on the global window.
        # May need to revise.
        driver = trigger.create_trigger_driver(self._windowing, True)
        windowed_key_values = driver.process_entire_key

    value_encoder = self._post_grouped_coder.get_impl()
    key_decoder = self._key_coder.get_impl()

    self._grouped_output = [[] for _ in range(n)]
    streams = [create_OutputStream() for _ in range(n)]
    for position, entry in enumerate(self._table.items()):
        encoded_key, windowed_values = entry
        key = key_decoder.decode(encoded_key)
        # Round-robin: the entry's position picks its part.
        for windowed_kv in windowed_key_values(key, windowed_values):
            value_encoder.encode_to_stream(
                windowed_kv, streams[position % n], True)
    for part_index, stream in enumerate(streams):
        self._grouped_output[part_index] = [stream.get()]
    self._table.clear()
    return self._grouped_output
def finish_bundle(self):
    """Emit the grouped output on the final bundle, otherwise keep waiting.

    On a non-final bundle nothing is emitted: the hold is the negative
    infinity watermark and a watermark timer is set for positive infinity.
    On the final bundle the per-key elements are emitted as
    ``(key, values)`` in the global window, unless the completion tag shows
    output was already emitted. The completion tag is then added and the
    hold moves to positive infinity.

    Returns:
      A ``TransformResult`` with the output bundles and a hold dict mapping
      ``None`` to the chosen hold.
    """
    if not self._is_final_bundle():
        hold = WatermarkManager.WATERMARK_NEG_INF
        self.global_state.set_timer(
            None, '', TimeDomain.WATERMARK,
            WatermarkManager.WATERMARK_POS_INF)
        return TransformResult(
            self._applied_ptransform, [], [], None, {None: hold})

    already_emitted = self.global_state.get_state(
        None, _GroupByKeyOnlyEvaluator.COMPLETION_TAG)
    if already_emitted:
        # Ignore empty bundles after emitting output. (This may happen
        # because empty bundles do not affect input watermarks.)
        bundles = []
    else:
        grouped = []
        # TODO(ccy): perhaps we can clean this up to not use this
        # internal attribute of the DirectStepContext.
        for encoded_key in self.step_context.keyed_existing_state:
            # A None key is the global state, not a user key.
            if encoded_key is not None:
                key = self.key_coder.decode(encoded_key)
                keyed_state = self.step_context.get_keyed_state(encoded_key)
                values = keyed_state.get_state(
                    None, _GroupByKeyOnlyEvaluator.ELEMENTS_TAG)
                grouped.append(GlobalWindows.windowed_value((key, values)))

        def len_element_fn(element):
            """Size of a grouped element: how many values its key has."""
            _, values = element.value
            return len(values)

        bundles = self._split_list_into_bundles(
            self.output_pcollection,
            grouped,
            _GroupByKeyOnlyEvaluator.MAX_ELEMENT_PER_BUNDLE,
            len_element_fn)

    self.global_state.add_state(
        None, _GroupByKeyOnlyEvaluator.COMPLETION_TAG, True)
    hold = WatermarkManager.WATERMARK_POS_INF
    return TransformResult(
        self._applied_ptransform, bundles, [], None, {None: hold})
def finish_bundle(self):
    """Produce the GroupByKey output once the final bundle is reached.

    On the final bundle, each key's collected values are emitted as
    ``(key, values)`` in the global window and split into bundles, unless
    the completion tag is already set. The tag is then added and the hold
    becomes positive infinity. On any other bundle nothing is emitted: the
    hold is negative infinity and a watermark timer is set for positive
    infinity.

    Returns:
      A ``TransformResult`` whose hold dict maps ``None`` to the hold.
    """
    completion_tag = _GroupByKeyOnlyEvaluator.COMPLETION_TAG
    if self._is_final_bundle():
        if self.global_state.get_state(None, completion_tag):
            # Ignore empty bundles after emitting output. (This may happen
            # because empty bundles do not affect input watermarks.)
            bundles = []
        else:
            results = []
            # TODO(ccy): perhaps we can clean this up to not use this
            # internal attribute of the DirectStepContext.
            for encoded_k in self.step_context.keyed_existing_state:
                if encoded_k is None:
                    # This entry is the global state; skip it.
                    continue
                decoded_key = self.key_coder.decode(encoded_k)
                per_key_state = self.step_context.get_keyed_state(encoded_k)
                collected = per_key_state.get_state(
                    None, _GroupByKeyOnlyEvaluator.ELEMENTS_TAG)
                results.append(
                    GlobalWindows.windowed_value((decoded_key, collected)))

            def len_element_fn(element):
                """Return how many values the grouped element holds."""
                _, grouped_values = element.value
                return len(grouped_values)

            bundles = self._split_list_into_bundles(
                self.output_pcollection,
                results,
                _GroupByKeyOnlyEvaluator.MAX_ELEMENT_PER_BUNDLE,
                len_element_fn)

        self.global_state.add_state(None, completion_tag, True)
        hold = WatermarkManager.WATERMARK_POS_INF
    else:
        bundles = []
        hold = WatermarkManager.WATERMARK_NEG_INF
        self.global_state.set_timer(
            None,
            '',
            TimeDomain.WATERMARK,
            WatermarkManager.WATERMARK_POS_INF)

    result_hold = {None: hold}
    return TransformResult(
        self._applied_ptransform, bundles, [], None, result_hold)
def finish_bundle(self):
    """Emit the data read from Pub/Sub, stamped with the current time.

    All messages of one read share a single timestamp taken from the wall
    clock just before they are output. An unprocessed bundle on the
    transform's input (or on a ``PBegin`` when it has no inputs) is always
    returned.

    Returns:
      A ``TransformResult`` whose hold dict maps ``None`` to the current
      wall-clock time.
    """
    received = self._read_from_pubsub()
    bundles = []
    if received:
        first_output = list(self._outputs)[0]
        out_bundle = self._evaluation_context.create_bundle(first_output)
        # TODO(ccy): we currently do not use the PubSub message timestamp or
        # respect the PubSub source's id_label field.
        now = Timestamp.of(time.time())
        for message_data in received:
            out_bundle.output(
                GlobalWindows.windowed_value(message_data, timestamp=now))
        bundles.append(out_bundle)

    if self._applied_ptransform.inputs:
        input_pvalue = self._applied_ptransform.inputs[0]
    else:
        input_pvalue = pvalue.PBegin(
            self._applied_ptransform.transform.pipeline)
    pending = self._evaluation_context.create_bundle(input_pvalue)
    hold = {None: Timestamp.of(time.time())}
    return TransformResult(
        self._applied_ptransform, bundles, [pending], None, hold)
def process_element(self, element):
    """Apply the TestStream event at the index carried by ``element``.

    ``element.value`` is the index into ``self.test_stream.events`` and
    ``element.timestamp`` becomes the current watermark. An
    ``ElementEvent`` has its timestamped values output in the global
    window into a new bundle on the single output. A ``WatermarkEvent``
    moves the watermark forward. A ``ProcessingTimeEvent`` is currently a
    no-op.

    Raises:
      AssertionError: if the index is not an int within the event list, if
        there is not exactly one output for an element event, or if a
        watermark event would move the watermark backwards.
      ValueError: if the event is of none of the three known event types.
    """
    index = element.value
    self.watermark = element.timestamp
    assert isinstance(index, int)
    # The index is used to subscript the event list right below, so the
    # upper bound must be exclusive. The earlier inclusive bound accepted
    # len(events) and then failed with IndexError.
    assert 0 <= index < len(self.test_stream.events)
    self.current_index = index
    event = self.test_stream.events[self.current_index]
    if isinstance(event, ElementEvent):
        assert len(self._outputs) == 1
        output_pcollection = list(self._outputs)[0]
        bundle = self._evaluation_context.create_bundle(output_pcollection)
        for tv in event.timestamped_values:
            bundle.output(
                GlobalWindows.windowed_value(
                    tv.value, timestamp=tv.timestamp))
        self.bundles.append(bundle)
    elif isinstance(event, WatermarkEvent):
        # The watermark may only move forward.
        assert event.new_watermark >= self.watermark
        self.watermark = event.new_watermark
    elif isinstance(event, ProcessingTimeEvent):
        # TODO(ccy): advance processing time in the context's mock clock.
        pass
    else:
        raise ValueError('Invalid TestStream event: %s.' % event)
def _map_task_registration(self, map_task, state_handler,
                           data_operation_spec):
    """Translate a map task into a Fn API registration request.

    Each ``(stage_name, operation)`` entry of ``map_task`` becomes one
    ``PrimitiveTransform``. A read that is not an in-memory windowed
    source gets an extra ``inject_source`` transform that feeds the
    pickled source object to the worker. The encoded contents of each DoFn
    side input are stored through ``state_handler``.

    Args:
      map_task: sequence of ``(stage_name, operation)`` pairs.
        Operations refer to their inputs by
        ``(operation index, output index)``.
      state_handler: object with ``Clear(state_key)`` and
        ``Append(state_key, data)``, used for side input contents.
      data_operation_spec: if truthy, packed into the data of every data
        input/output function spec.

    Returns:
      A tuple ``(request, runner_sinks, input_data)``: the
      ``InstructionRequest`` registering one process bundle descriptor,
      a dict mapping ``(transform_id, 'out')`` to each in-memory write
      operation, and a dict mapping ``(transform_id, target_name)`` to
      the encoded bytes to send to the worker.

    Raises:
      TypeError: if an operation is of an unsupported type.
    """
    input_data = {}
    runner_sinks = {}
    transforms = []
    transform_index_to_id = {}

    # Maps coders to new coder objects and references.
    coders = {}

    def coder_id(coder):
        """Return the id for ``coder``, registering it on first use."""
        if coder not in coders:
            coders[coder] = beam_fn_api_pb2.Coder(
                function_spec=sdk_worker.pack_function_spec_data(
                    json.dumps(coder.as_cloud_object()),
                    sdk_worker.PYTHON_CODER_URN,
                    id=self._next_uid()))
        return coders[coder].function_spec.id

    def output_tags(op):
        """Return the operation's output tags, defaulting to ['out']."""
        return getattr(op, 'output_tags', ['out'])

    def as_target(op_input):
        """Build the inputs dict pointing at one upstream output."""
        input_op_index, input_output_index = op_input
        input_op = map_task[input_op_index][1]
        return {
            'ignored_input_tag':
                beam_fn_api_pb2.Target.List(target=[
                    beam_fn_api_pb2.Target(
                        primitive_transform_reference=transform_index_to_id[
                            input_op_index],
                        name=output_tags(input_op)[input_output_index])
                ])
        }

    def outputs(op):
        """Build the outputs dict, one PCollection per output tag."""
        return {
            tag: beam_fn_api_pb2.PCollection(coder_reference=coder_id(coder))
            for tag, coder in zip(output_tags(op), op.output_coders)
        }

    for op_ix, (stage_name, operation) in enumerate(map_task):
        transform_id = transform_index_to_id[op_ix] = self._next_uid()
        if isinstance(operation, operation_specs.WorkerInMemoryWrite):
            # Write this data back to the runner.
            fn = beam_fn_api_pb2.FunctionSpec(
                urn=sdk_worker.DATA_OUTPUT_URN, id=self._next_uid())
            if data_operation_spec:
                fn.data.Pack(data_operation_spec)
            inputs = as_target(operation.input)
            side_inputs = {}
            runner_sinks[(transform_id, 'out')] = operation

        elif isinstance(operation, operation_specs.WorkerRead):
            # A Read is either translated to a direct injection of windowed
            # values into the sdk worker, or an injection of the source
            # object into the sdk worker as data followed by an SDF that
            # reads that source.
            if (isinstance(operation.source.source,
                           maptask_executor_runner.InMemorySource) and
                    isinstance(
                        operation.source.source.default_output_coder(),
                        WindowedValueCoder)):
                output_stream = create_OutputStream()
                element_coder = (
                    operation.source.source.default_output_coder()
                    .get_impl())
                # Re-encode the elements in the nested context and
                # concatenate them together
                for element in operation.source.source.read(None):
                    element_coder.encode_to_stream(
                        element, output_stream, True)
                target_name = self._next_uid()
                input_data[(transform_id, target_name)] = (
                    output_stream.get())
                fn = beam_fn_api_pb2.FunctionSpec(
                    urn=sdk_worker.DATA_INPUT_URN, id=self._next_uid())
                if data_operation_spec:
                    fn.data.Pack(data_operation_spec)
                inputs = {target_name: beam_fn_api_pb2.Target.List()}
                side_inputs = {}
            else:
                # Read the source object from the runner.
                source_coder = beam.coders.DillCoder()
                input_transform_id = self._next_uid()
                output_stream = create_OutputStream()
                source_coder.get_impl().encode_to_stream(
                    GlobalWindows.windowed_value(operation.source),
                    output_stream,
                    True)
                target_name = self._next_uid()
                input_data[(input_transform_id, target_name)] = (
                    output_stream.get())
                input_ptransform = beam_fn_api_pb2.PrimitiveTransform(
                    id=input_transform_id,
                    function_spec=beam_fn_api_pb2.FunctionSpec(
                        urn=sdk_worker.DATA_INPUT_URN,
                        id=self._next_uid()),
                    # TODO(robertwb): Possible name collision.
                    step_name=stage_name + '/inject_source',
                    inputs={target_name: beam_fn_api_pb2.Target.List()},
                    outputs={
                        'out':
                            beam_fn_api_pb2.PCollection(
                                coder_reference=coder_id(source_coder))
                    })
                if data_operation_spec:
                    input_ptransform.function_spec.data.Pack(
                        data_operation_spec)
                transforms.append(input_ptransform)

                # Read the elements out of the source.
                fn = sdk_worker.pack_function_spec_data(
                    OLDE_SOURCE_SPLITTABLE_DOFN_DATA,
                    sdk_worker.PYTHON_DOFN_URN,
                    id=self._next_uid())
                inputs = {
                    'ignored_input_tag':
                        beam_fn_api_pb2.Target.List(target=[
                            beam_fn_api_pb2.Target(
                                primitive_transform_reference=(
                                    input_transform_id),
                                name='out')
                        ])
                }
                side_inputs = {}

        elif isinstance(operation, operation_specs.WorkerDoFn):
            fn = sdk_worker.pack_function_spec_data(
                operation.serialized_fn,
                sdk_worker.PYTHON_DOFN_URN,
                id=self._next_uid())
            inputs = as_target(operation.input)
            # Start from a fresh dict. Without this reset the DoFn reused
            # the dict left by the previous operation, so consecutive DoFns
            # leaked side inputs into each other, and a DoFn as the first
            # operation raised NameError.
            side_inputs = {}
            # Store the contents of each side input for state access.
            for si in operation.side_inputs:
                assert isinstance(si.source, iobase.BoundedSource)
                element_coder = si.source.default_output_coder()
                view_id = self._next_uid()
                # TODO(robertwb): Actually flesh out the ViewFn API.
                side_inputs[si.tag] = beam_fn_api_pb2.SideInput(
                    view_fn=sdk_worker.serialize_and_pack_py_fn(
                        element_coder,
                        urn=sdk_worker.PYTHON_ITERABLE_VIEWFN_URN,
                        id=view_id))
                # Re-encode the elements in the nested context and
                # concatenate them together
                output_stream = create_OutputStream()
                for element in si.source.read(
                        si.source.get_range_tracker(None, None)):
                    element_coder.get_impl().encode_to_stream(
                        element, output_stream, True)
                elements_data = output_stream.get()
                state_key = beam_fn_api_pb2.StateKey.MultimapSideInput(
                    key=view_id)
                state_handler.Clear(state_key)
                state_handler.Append(state_key, elements_data)

        elif isinstance(operation, operation_specs.WorkerFlatten):
            fn = sdk_worker.pack_function_spec_data(
                operation.serialized_fn,
                sdk_worker.IDENTITY_DOFN_URN,
                id=self._next_uid())
            inputs = {
                'ignored_input_tag':
                    beam_fn_api_pb2.Target.List(target=[
                        beam_fn_api_pb2.Target(
                            primitive_transform_reference=(
                                transform_index_to_id[input_op_index]),
                            name=output_tags(map_task[input_op_index][1])[
                                input_output_index])
                        for input_op_index, input_output_index in
                        operation.inputs
                    ])
            }
            side_inputs = {}

        else:
            raise TypeError(operation)

        ptransform = beam_fn_api_pb2.PrimitiveTransform(
            id=transform_id,
            function_spec=fn,
            step_name=stage_name,
            inputs=inputs,
            side_inputs=side_inputs,
            outputs=outputs(operation))
        transforms.append(ptransform)

    process_bundle_descriptor = beam_fn_api_pb2.ProcessBundleDescriptor(
        id=self._next_uid(),
        coders=coders.values(),
        primitive_transform=transforms)
    return beam_fn_api_pb2.InstructionRequest(
        instruction_id=self._next_uid(),
        register=beam_fn_api_pb2.RegisterRequest(
            process_bundle_descriptor=[process_bundle_descriptor
                                      ])), runner_sinks, input_data
def process_timer(self, timer_firing):
    """Default process_timer() impl. generating KeyedWorkItem element.

    The firing is wrapped in a ``KeyedWorkItem`` for its encoded key,
    placed in the global window and passed to ``process_element``.
    """
    work_item = KeyedWorkItem(
        timer_firing.encoded_key, timer_firings=[timer_firing])
    windowed_item = GlobalWindows.windowed_value(work_item)
    self.process_element(windowed_item)
def set(self, ts):
    """Send a timer for ``ts`` to the receiver.

    The receiver gets a globally-windowed pair of this object's key and a
    dict whose ``timestamp`` entry is ``ts`` converted with
    ``timestamp.Timestamp.of``.
    """
    # Imported here rather than at module level; presumably to avoid a
    # circular import - confirm before moving it.
    from apache_beam.transforms.window import GlobalWindows

    timer_payload = {'timestamp': timestamp.Timestamp.of(ts)}
    keyed_timer = (self._key, timer_payload)
    self._receiver.receive(GlobalWindows.windowed_value(keyed_timer))
from apache_beam.transforms.combiners import curry_combine_fn from apache_beam.transforms.window import GlobalWindows from apache_beam.utils.windowed_value import WindowedValue # Allow some "pure mode" declarations. try: import cython except ImportError: class FakeCython(object): @staticmethod def cast(type, value): return value globals()['cython'] = FakeCython() _globally_windowed_value = GlobalWindows.windowed_value(None) _global_window_type = type(_globally_windowed_value.windows[0]) class ConsumerSet(Receiver): """A ConsumerSet represents a graph edge between two Operation nodes. The ConsumerSet object collects information from the output of the Operation at one end of its edge and the input of the Operation at the other edge. ConsumerSet are attached to the outputting Operation. """ def __init__( self, counter_factory, step_name, output_index, consumers, coder): self.consumers = consumers
def finish_bundle(self):
    """Emit a single empty byte string in the global window.

    Returns:
      A ``TransformResult`` with one bundle on the transform's only
      output and no unprocessed bundles.
    """
    assert len(self._outputs) == 1
    only_output = list(self._outputs)[0]
    out_bundle = self._evaluation_context.create_bundle(only_output)
    empty_element = GlobalWindows.windowed_value(b'')
    out_bundle.output(empty_element)
    return TransformResult(self, [out_bundle], [], None, None)