def scoped_state(self,
                 name_context,
                 state_name,
                 io_target=None,
                 metrics_container=None):
  """Returns a ScopedState object associated to a Step and a State.

  Args:
    name_context: common.NameContext. It is the step name information.
    state_name: str. It is the state name (e.g. process / start / finish).
    io_target: Optional. Identifies the IO (e.g. a side input read) associated
      with this state; used to qualify the counter name.
    metrics_container: MetricsContainer. The step's metrics container.

  Returns:
    A ScopedState that keeps the execution context and is able to switch it
    for the execution thread.
  """
  if not isinstance(name_context, common.NameContext):
    name_context = common.NameContext(name_context)

  counter_name = CounterName(
      state_name + '-msecs',
      stage_name=self._prefix,
      step_name=name_context.metrics_name(),
      io_target=io_target)
  if counter_name in self._states_by_name:
    return self._states_by_name[counter_name]
  else:
    output_counter = self._counter_factory.get_counter(
        counter_name, Counter.SUM)
    self._states_by_name[counter_name] = super(
        StateSampler, self)._scoped_state(
            counter_name, name_context, output_counter, metrics_container)
    return self._states_by_name[counter_name]
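A minimal usage sketch of scoped_state (not part of the source above): the sampler prefix, step name, and counter factory are illustrative stand-ins, and in a real worker the StateSampler would also be started so its sampling thread can attribute time to the returned state.

# Hedged sketch: obtain a ScopedState for a step's 'process' state and enter
# it around element processing, so elapsed time is charged to that step's
# 'process-msecs' counter. 'counter_factory' is assumed to be a
# counters.CounterFactory created elsewhere.
sampler = StateSampler('stage-', counter_factory)
process_state = sampler.scoped_state(
    common.NameContext('MyStep'), 'process', metrics_container=None)
with process_state:
  pass  # element processing for 'MyStep' would happen here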
def __init__(self,
             name_context,  # type: Union[str, common.NameContext]
             spec,
             counter_factory,
             state_sampler  # type: StateSampler
            ):
  """Initializes a worker operation instance.

  Args:
    name_context: A NameContext instance or string (deprecated), with the
      name information for this operation.
    spec: An operation_specs.Worker* instance.
    counter_factory: The CounterFactory to use for our counters.
    state_sampler: The StateSampler for the current operation.
  """
  if isinstance(name_context, common.NameContext):
    # TODO(BEAM-4028): Clean this up once it's completely migrated.
    # We use the specific operation name that is used for metrics and state
    # sampling.
    self.name_context = name_context
  else:
    self.name_context = common.NameContext(name_context)

  self.spec = spec
  self.counter_factory = counter_factory
  self.execution_context = None  # type: Optional[ExecutionContext]
  self.consumers = collections.defaultdict(
      list)  # type: DefaultDict[int, List[Operation]]

  # These are overwritten in the legacy harness.
  self.metrics_container = MetricsContainer(self.name_context.metrics_name())

  self.state_sampler = state_sampler
  self.scoped_start_state = self.state_sampler.scoped_state(
      self.name_context, 'start', metrics_container=self.metrics_container)
  self.scoped_process_state = self.state_sampler.scoped_state(
      self.name_context, 'process', metrics_container=self.metrics_container)
  self.scoped_finish_state = self.state_sampler.scoped_state(
      self.name_context, 'finish', metrics_container=self.metrics_container)
  # TODO(ccy): the '-abort' state can be added when the abort is supported in
  # Operations.
  self.receivers = []  # type: List[ConsumerSet]

  # Legacy workers cannot call setup() until after setting additional state
  # on the operation.
  self.setup_done = False
  self.step_name = None  # type: Optional[str]
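A hedged sketch (not from the source) of how a subclass typically enters the scoped states created above; the real Operation lifecycle methods also handle setup, receiver wiring, and debugging hooks.

class SketchOperation(Operation):  # hypothetical subclass for illustration
  def start(self):
    with self.scoped_start_state:
      pass  # one-time start work for this operation

  def process(self, o):
    with self.scoped_process_state:
      # Per-element work; time spent here is sampled into the 'process' state.
      for receiver in self.receivers:
        receiver.receive(o)

  def finish(self):
    with self.scoped_finish_state:
      pass  # flush buffered output, release resources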
def _create_user_defined_function_operation(factory, transform_proto,
                                            consumers, udfs_proto,
                                            beam_operation_cls,
                                            internal_operation_cls):
    output_tags = list(transform_proto.outputs.keys())
    output_coders = factory.get_output_coders(transform_proto)
    spec = operation_specs.WorkerDoFn(
        serialized_fn=udfs_proto,
        output_tags=output_tags,
        input=None,
        side_inputs=None,
        output_coders=[output_coders[tag] for tag in output_tags])
    name = common.NameContext(transform_proto.unique_name)

    serialized_fn = spec.serialized_fn
    if hasattr(serialized_fn, "key_type"):
        # keyed operation, need to create the KeyedStateBackend.
        row_schema = serialized_fn.key_type.row_schema
        key_row_coder = FlattenRowCoder(
            [from_proto(f.type) for f in row_schema.fields])
        if serialized_fn.HasField('group_window'):
            if serialized_fn.group_window.is_time_window:
                window_coder = TimeWindowCoder()
            else:
                window_coder = CountWindowCoder()
        else:
            window_coder = None
        keyed_state_backend = RemoteKeyedStateBackend(
            factory.state_handler,
            key_row_coder,
            window_coder,
            serialized_fn.state_cache_size,
            serialized_fn.map_state_read_cache_size,
            serialized_fn.map_state_write_cache_size)
        return beam_operation_cls(
            name,
            spec,
            factory.counter_factory,
            factory.state_sampler,
            consumers,
            internal_operation_cls,
            keyed_state_backend)
    elif internal_operation_cls == datastream_operations.StatefulOperation:
        key_row_coder = from_type_info_proto(serialized_fn.key_type_info)
        keyed_state_backend = RemoteKeyedStateBackend(
            factory.state_handler,
            key_row_coder,
            None,
            serialized_fn.state_cache_size,
            serialized_fn.map_state_read_cache_size,
            serialized_fn.map_state_write_cache_size)
        return beam_operation_cls(
            name,
            spec,
            factory.counter_factory,
            factory.state_sampler,
            consumers,
            internal_operation_cls,
            keyed_state_backend)
    else:
        return beam_operation_cls(
            name,
            spec,
            factory.counter_factory,
            factory.state_sampler,
            consumers,
            internal_operation_cls)
def __init__(self, name_context, spec, counter_factory, state_sampler):
  """Initializes a worker operation instance.

  Args:
    name_context: A NameContext instance or string (deprecated), with the
      name information for this operation.
    spec: An operation_specs.Worker* instance.
    counter_factory: The CounterFactory to use for our counters.
    state_sampler: The StateSampler for the current operation.
  """
  if isinstance(name_context, common.NameContext):
    # TODO(BEAM-4028): Clean this up once it's completely migrated.
    # We use the specific operation name that is used for metrics and state
    # sampling.
    self.name_context = name_context
  else:
    self.name_context = common.NameContext(name_context)

  # TODO(BEAM-4028): Remove following two lines. Rely on name context.
  self.operation_name = self.name_context.step_name
  self.step_name = self.name_context.logging_name()

  self.spec = spec
  self.counter_factory = counter_factory
  self.consumers = collections.defaultdict(list)

  # These are overwritten in the legacy harness.
  self.metrics_container = MetricsContainer(self.name_context.metrics_name())
  # TODO(BEAM-4094): Remove ScopedMetricsContainer after Dataflow no longer
  # depends on it.
  self.scoped_metrics_container = ScopedMetricsContainer()

  self.state_sampler = state_sampler
  self.scoped_start_state = self.state_sampler.scoped_state(
      self.name_context.metrics_name(),
      'start',
      metrics_container=self.metrics_container)
  self.scoped_process_state = self.state_sampler.scoped_state(
      self.name_context.metrics_name(),
      'process',
      metrics_container=self.metrics_container)
  self.scoped_finish_state = self.state_sampler.scoped_state(
      self.name_context.metrics_name(),
      'finish',
      metrics_container=self.metrics_container)
  # TODO(ccy): the '-abort' state can be added when the abort is supported in
  # Operations.
  self.receivers = []
def create_operation(name_context,
                     spec,
                     counter_factory,
                     step_name=None,
                     state_sampler=None,
                     test_shuffle_source=None,
                     test_shuffle_sink=None,
                     is_streaming=False):
  # type: (...) -> Operation
  """Create Operation object for given operation specification."""
  # TODO(pabloem): Document arguments to this function call.

  if not isinstance(name_context, common.NameContext):
    name_context = common.NameContext(step_name=name_context)

  if isinstance(spec, operation_specs.WorkerRead):
    if isinstance(spec.source, iobase.SourceBundle):
      op = ReadOperation(
          name_context, spec, counter_factory, state_sampler)  # type: Operation
    else:
      from dataflow_worker.native_operations import NativeReadOperation
      op = NativeReadOperation(
          name_context, spec, counter_factory, state_sampler)
  elif isinstance(spec, operation_specs.WorkerWrite):
    from dataflow_worker.native_operations import NativeWriteOperation
    op = NativeWriteOperation(
        name_context, spec, counter_factory, state_sampler)
  elif isinstance(spec, operation_specs.WorkerCombineFn):
    op = CombineOperation(name_context, spec, counter_factory, state_sampler)
  elif isinstance(spec, operation_specs.WorkerPartialGroupByKey):
    op = create_pgbk_op(name_context, spec, counter_factory, state_sampler)
  elif isinstance(spec, operation_specs.WorkerDoFn):
    op = DoOperation(name_context, spec, counter_factory, state_sampler)
  elif isinstance(spec, operation_specs.WorkerGroupingShuffleRead):
    from dataflow_worker.shuffle_operations import GroupedShuffleReadOperation
    op = GroupedShuffleReadOperation(
        name_context,
        spec,
        counter_factory,
        state_sampler,
        shuffle_source=test_shuffle_source)
  elif isinstance(spec, operation_specs.WorkerUngroupedShuffleRead):
    from dataflow_worker.shuffle_operations import UngroupedShuffleReadOperation
    op = UngroupedShuffleReadOperation(
        name_context,
        spec,
        counter_factory,
        state_sampler,
        shuffle_source=test_shuffle_source)
  elif isinstance(spec, operation_specs.WorkerInMemoryWrite):
    op = InMemoryWriteOperation(
        name_context, spec, counter_factory, state_sampler)
  elif isinstance(spec, operation_specs.WorkerShuffleWrite):
    from dataflow_worker.shuffle_operations import ShuffleWriteOperation
    op = ShuffleWriteOperation(
        name_context,
        spec,
        counter_factory,
        state_sampler,
        shuffle_sink=test_shuffle_sink)
  elif isinstance(spec, operation_specs.WorkerFlatten):
    op = FlattenOperation(name_context, spec, counter_factory, state_sampler)
  elif isinstance(spec, operation_specs.WorkerMergeWindows):
    from dataflow_worker.shuffle_operations import BatchGroupAlsoByWindowsOperation
    from dataflow_worker.shuffle_operations import StreamingGroupAlsoByWindowsOperation
    if is_streaming:
      op = StreamingGroupAlsoByWindowsOperation(
          name_context, spec, counter_factory, state_sampler)
    else:
      op = BatchGroupAlsoByWindowsOperation(
          name_context, spec, counter_factory, state_sampler)
  elif isinstance(spec, operation_specs.WorkerReifyTimestampAndWindows):
    from dataflow_worker.shuffle_operations import ReifyTimestampAndWindowsOperation
    op = ReifyTimestampAndWindowsOperation(
        name_context, spec, counter_factory, state_sampler)
  else:
    raise TypeError(
        'Expected an instance of operation_specs.Worker* class '
        'instead of %s' % (spec, ))
  return op
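A hedged end-to-end sketch of calling create_operation; the spec construction is elided, and the module paths and constructor signatures below are assumptions based on the Beam worker layout rather than part of the source above.

from apache_beam.runners.worker import statesampler
from apache_beam.utils import counters

# Assumed setup: a counter factory and a state sampler for this stage.
counter_factory = counters.CounterFactory()
sampler = statesampler.StateSampler('stage-', counter_factory)

# 'do_fn_spec' is assumed to be an operation_specs.WorkerDoFn built elsewhere;
# a plain string step name is wrapped into a NameContext by create_operation.
op = create_operation(
    'MyDoFnStep', do_fn_spec, counter_factory, state_sampler=sampler)
op.start()  # Operation lifecycle: start / process / finish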