def __init__(self, control_address, worker_count, credentials=None):
    """Open the control channel and prepare the harness bookkeeping.

    The constructor waits up to 60 seconds for the control channel to
    become ready and then wraps it with a ``WorkerIdInterceptor``.

    Args:
      control_address: Target passed to gRPC when creating the control
        channel.
      worker_count: ``max_workers`` of the process thread pool; also stored
        as ``self._worker_count``.
      credentials: ``None`` selects an insecure channel. Any other value is
        used to create a secure channel and is forwarded to the data
        channel factory.
    """
    self._worker_count = worker_count
    self._worker_index = 0

    # Pick the channel flavour first; everything after is shared.
    if credentials is not None:
        logging.info('Creating secure control channel.')
        channel = grpc.secure_channel(control_address, credentials)
    else:
        logging.info('Creating insecure control channel.')
        channel = grpc.insecure_channel(control_address)
    self._control_channel = channel

    # Wait for connectivity before layering the interceptor on top.
    grpc.channel_ready_future(channel).result(timeout=60)
    logging.info('Control channel established.')
    self._control_channel = grpc.intercept_channel(
        channel, WorkerIdInterceptor())

    self._data_channel_factory = data_plane.GrpcClientDataChannelFactory(
        credentials)
    self._state_handler_factory = GrpcStateHandlerFactory()
    self.workers = queue.Queue()

    # A single thread serves progress reports. The working assumption is
    # that building a report neither performs IO nor waits on other
    # resources, so extra threads would add complexity without making it
    # faster.
    self._progress_thread_pool = futures.ThreadPoolExecutor(max_workers=1)
    self._process_thread_pool = futures.ThreadPoolExecutor(
        max_workers=worker_count)

    self._instruction_id_vs_worker = {}
    self._fns = {}
    self._responses = queue.Queue()
    self._process_bundle_queue = queue.Queue()
    self._unscheduled_process_bundle = set()
    logging.info('Initializing SDKHarness with %s workers.', worker_count)
def __init__(self, control_address):
    """Create an insecure control channel and two single-thread pools.

    Args:
      control_address: Target passed to ``grpc.insecure_channel``.
    """
    channel = grpc.insecure_channel(control_address)
    self._control_channel = channel
    self._data_channel_factory = data_plane.GrpcClientDataChannelFactory()

    # TODO: Ensure thread safety to run with more than 1 thread.
    single_thread = 1
    self._default_work_thread_pool = futures.ThreadPoolExecutor(
        max_workers=single_thread)
    self._progress_thread_pool = futures.ThreadPoolExecutor(
        max_workers=single_thread)
def test_source_split(self):
    """Check that an initial source split matches ``RangeSource.split``.

    A ``RangeSource(0, 100)`` is registered with an ``SdkWorker`` under the
    id ``"src"``. The splits returned for a desired bundle size of 30 must
    equal, in sources and in weights, what ``source.split(30)`` yields.
    """
    source = RangeSource(0, 100)
    expected_splits = list(source.split(30))

    worker = sdk_harness.SdkWorker(
        None, data_plane.GrpcClientDataChannelFactory())

    # Build the registration request from the inside out.
    packed_source = sdk_harness.serialize_and_pack_py_fn(
        SourceBundle(1.0, source, None, None),
        sdk_harness.PYTHON_SOURCE_URN,
        id="src")
    transform = beam_fn_api_pb2.PrimitiveTransform(
        function_spec=packed_source)
    descriptor = beam_fn_api_pb2.ProcessBundleDescriptor(
        primitive_transform=[transform])
    worker.register(
        beam_fn_api_pb2.RegisterRequest(
            process_bundle_descriptor=[descriptor]))

    split_request = beam_fn_api_pb2.InitialSourceSplitRequest(
        desired_bundle_size_bytes=30, source_reference="src")
    split_response = worker.initial_source_split(split_request)

    actual_sources = [
        sdk_harness.unpack_and_deserialize_py_fn(s.source)
        for s in split_response.splits
    ]
    self.assertEqual(expected_splits, actual_sources)

    expected_weights = [s.weight for s in expected_splits]
    actual_sizes = [s.relative_size for s in split_response.splits]
    self.assertEqual(expected_weights, actual_sizes)
def __init__(
        self,
        control_address,
        worker_count,
        credentials=None,
        worker_id=None,
        # Caching is disabled by default
        state_cache_size=0,
        profiler_factory=None):
    """Open the control channel and build the shared harness state.

    The constructor waits up to 60 seconds for the control channel to
    become ready and then wraps it with a ``WorkerIdInterceptor``.

    Args:
      control_address: Target passed to the channel factory; also logged.
      worker_count: ``max_workers`` of the process thread pool.
      credentials: ``None`` selects an insecure channel. Any other value is
        used for a secure channel and forwarded to the data channel and
        state handler factories.
      worker_id: Passed to the interceptor and the data channel factory.
      state_cache_size: Size argument handed to ``StateCache``.
      profiler_factory: Stored and passed on to the progress worker.
    """
    self._alive = True
    self._worker_count = worker_count
    self._worker_index = 0
    self._worker_id = worker_id
    self._state_cache = StateCache(state_cache_size)

    # Pick the channel flavour first; everything after is shared.
    if credentials is not None:
        logging.info(
            'Creating secure control channel for %s.', control_address)
        channel = GRPCChannelFactory.secure_channel(
            control_address, credentials)
    else:
        logging.info(
            'Creating insecure control channel for %s.', control_address)
        channel = GRPCChannelFactory.insecure_channel(control_address)
    self._control_channel = channel

    # Wait for connectivity before layering the interceptor on top.
    grpc.channel_ready_future(channel).result(timeout=60)
    logging.info('Control channel established.')
    self._control_channel = grpc.intercept_channel(
        channel, WorkerIdInterceptor(worker_id))

    self._data_channel_factory = data_plane.GrpcClientDataChannelFactory(
        credentials, worker_id)
    self._state_handler_factory = GrpcStateHandlerFactory(
        self._state_cache, credentials)
    self._profiler_factory = profiler_factory
    self._fns = {}

    # One BundleProcessor cache is shared by every worker.
    self._bundle_processor_cache = BundleProcessorCache(
        state_handler_factory=self._state_handler_factory,
        data_channel_factory=self._data_channel_factory,
        fns=self._fns)

    # Queue of workers for process/finalize bundle requests.
    self.workers = queue.Queue()
    # A dedicated worker for progress/split requests.
    self.progress_worker = SdkWorker(
        self._bundle_processor_cache, profiler_factory=profiler_factory)

    # A single thread serves progress reports. The working assumption is
    # that building a report neither performs IO nor waits on other
    # resources, so extra threads would add complexity without making it
    # faster.
    self._progress_thread_pool = futures.ThreadPoolExecutor(max_workers=1)
    # Finalize and process requests share this pool.
    self._process_thread_pool = futures.ThreadPoolExecutor(
        max_workers=worker_count)

    self._responses = queue.Queue()
    self._process_bundle_queue = queue.Queue()
    self._unscheduled_process_bundle = {}
    logging.info('Initializing SDKHarness with %s workers.', worker_count)
def __init__(self,
             control_address,  # type: str
             credentials=None,
             worker_id=None,  # type: Optional[str]
             # Caching is disabled by default
             state_cache_size=0,
             # time-based data buffering is disabled by default
             data_buffer_time_limit_ms=0,
             profiler_factory=None,  # type: Optional[Callable[..., Profile]]
             status_address=None,  # type: Optional[str]
             ):
    """Open the control channel and build the shared harness state.

    The constructor waits up to 60 seconds for the control channel to
    become ready and then wraps it with a ``WorkerIdInterceptor``.

    Args:
      control_address: Target passed to the channel factory; also logged.
      credentials: ``None`` selects an insecure channel. Any other value is
        used for a secure channel and forwarded to the data channel and
        state handler factories.
      worker_id: Passed to the interceptor and the data channel factory.
      state_cache_size: Size argument handed to ``StateCache``.
      data_buffer_time_limit_ms: Forwarded to the data channel factory.
      profiler_factory: Stored on the instance as ``_profiler_factory``.
      status_address: When truthy, a ``FnApiWorkerStatusHandler`` is created
        for this address. If that fails, a warning is logged and
        ``self._status_handler`` stays ``None``.
    """
    self._alive = True
    self._worker_index = 0
    self._worker_id = worker_id
    self._state_cache = StateCache(state_cache_size)
    if credentials is None:
        _LOGGER.info(
            'Creating insecure control channel for %s.', control_address)
        self._control_channel = GRPCChannelFactory.insecure_channel(
            control_address)
    else:
        _LOGGER.info(
            'Creating secure control channel for %s.', control_address)
        self._control_channel = GRPCChannelFactory.secure_channel(
            control_address, credentials)
    grpc.channel_ready_future(self._control_channel).result(timeout=60)
    _LOGGER.info('Control channel established.')

    self._control_channel = grpc.intercept_channel(
        self._control_channel, WorkerIdInterceptor(self._worker_id))
    self._data_channel_factory = data_plane.GrpcClientDataChannelFactory(
        credentials, self._worker_id, data_buffer_time_limit_ms)
    self._state_handler_factory = GrpcStateHandlerFactory(
        self._state_cache, credentials)
    self._profiler_factory = profiler_factory
    self._fns = {}  # type: Dict[str, beam_fn_api_pb2.ProcessBundleDescriptor]
    # BundleProcessor cache across all workers.
    self._bundle_processor_cache = BundleProcessorCache(
        state_handler_factory=self._state_handler_factory,
        data_channel_factory=self._data_channel_factory,
        fns=self._fns)

    # Default to None up front so the attribute is always defined, even
    # when constructing the status handler below raises.
    self._status_handler = None
    if status_address:
        try:
            self._status_handler = FnApiWorkerStatusHandler(
                status_address, self._bundle_processor_cache)
        except Exception:
            # Status reporting is best-effort: log and carry on without it.
            _LOGGER.warning(
                'Error creating worker status request handler, '
                'skipping status report. Trace back: %s',
                traceback.format_exc())

    # TODO(BEAM-8998) use common UnboundedThreadPoolExecutor to process bundle
    # progress once dataflow runner's excessive progress polling is removed.
    self._report_progress_executor = futures.ThreadPoolExecutor(max_workers=1)
    self._worker_thread_pool = UnboundedThreadPoolExecutor()
    self._responses = queue.Queue(
    )  # type: queue.Queue[beam_fn_api_pb2.InstructionResponse]
    _LOGGER.info('Initializing SDKHarness with unbounded number of workers.')
def __init__(
        self,
        control_address,
        credentials=None,
        worker_id=None,
        # Caching is disabled by default
        state_cache_size=0,
        profiler_factory=None):
    """Open the control channel and build the shared harness state.

    The constructor waits up to 60 seconds for the control channel to
    become ready and then wraps it with a ``WorkerIdInterceptor``.

    Args:
      control_address: Target passed to the channel factory; also logged.
      credentials: ``None`` selects an insecure channel. Any other value is
        used for a secure channel and forwarded to the data channel and
        state handler factories.
      worker_id: Passed to the interceptor and the data channel factory.
      state_cache_size: Size argument handed to ``StateCache``.
      profiler_factory: Stored on the instance as ``_profiler_factory``.
    """
    self._alive = True
    self._worker_index = 0
    self._worker_id = worker_id
    self._state_cache = StateCache(state_cache_size)

    # Pick the channel flavour first; everything after is shared.
    if credentials is not None:
        _LOGGER.info(
            'Creating secure control channel for %s.', control_address)
        channel = GRPCChannelFactory.secure_channel(
            control_address, credentials)
    else:
        _LOGGER.info(
            'Creating insecure control channel for %s.', control_address)
        channel = GRPCChannelFactory.insecure_channel(control_address)
    self._control_channel = channel

    # Wait for connectivity before layering the interceptor on top.
    grpc.channel_ready_future(channel).result(timeout=60)
    _LOGGER.info('Control channel established.')
    self._control_channel = grpc.intercept_channel(
        channel, WorkerIdInterceptor(worker_id))

    self._data_channel_factory = data_plane.GrpcClientDataChannelFactory(
        credentials, worker_id)
    self._state_handler_factory = GrpcStateHandlerFactory(
        self._state_cache, credentials)
    self._profiler_factory = profiler_factory
    self._fns = {}

    # One BundleProcessor cache is shared by every worker.
    self._bundle_processor_cache = BundleProcessorCache(
        state_handler_factory=self._state_handler_factory,
        data_channel_factory=self._data_channel_factory,
        fns=self._fns)

    # TODO(BEAM-8998) use common UnboundedThreadPoolExecutor to process bundle
    # progress once dataflow runner's excessive progress polling is removed.
    self._report_progress_executor = futures.ThreadPoolExecutor(
        max_workers=1)
    self._worker_thread_pool = UnboundedThreadPoolExecutor()
    self._responses = queue.Queue()
    _LOGGER.info(
        'Initializing SDKHarness with unbounded number of workers.')
def __init__(self, control_address, worker_count):
    """Create an insecure control channel and the worker bookkeeping.

    Args:
      control_address: Target passed to ``grpc.insecure_channel``.
      worker_count: ``max_workers`` of the process thread pool; also stored
        as ``self._worker_count``.
    """
    self._worker_count = worker_count
    self._worker_index = 0

    channel = grpc.insecure_channel(control_address)
    self._control_channel = channel
    self._data_channel_factory = data_plane.GrpcClientDataChannelFactory()
    self.workers = queue.Queue()

    # A single thread serves progress reports. The working assumption is
    # that building a report neither performs IO nor waits on other
    # resources, so extra threads would add complexity without making it
    # faster.
    self._progress_thread_pool = futures.ThreadPoolExecutor(max_workers=1)
    self._process_thread_pool = futures.ThreadPoolExecutor(
        max_workers=worker_count)

    self._instruction_id_vs_worker = {}
    self._fns = {}
    self._responses = queue.Queue()
    self._process_bundle_queue = queue.Queue()
    logging.info('Initializing SDKHarness with %s workers.', worker_count)
def __init__(self,
             control_address,  # type: str
             credentials=None,
             worker_id=None,  # type: Optional[str]
             # Caching is disabled by default
             state_cache_size=0,
             profiler_factory=None  # type: Optional[Callable[..., Profile]]
             ):
    """Open the control channel and build the shared harness state.

    The constructor waits up to 60 seconds for the control channel to
    become ready and then wraps it with a ``WorkerIdInterceptor``.

    Args:
      control_address: Target passed to the channel factory; also logged.
      credentials: ``None`` selects an insecure channel. Any other value is
        used for a secure channel and forwarded to the data channel and
        state handler factories.
      worker_id: Passed to the interceptor and the data channel factory.
      state_cache_size: Size argument handed to ``StateCache``.
      profiler_factory: Stored on the instance as ``_profiler_factory``.
    """
    self._alive = True
    self._worker_index = 0
    self._worker_id = worker_id
    self._state_cache = StateCache(state_cache_size)

    # Pick the channel flavour first; everything after is shared.
    if credentials is not None:
        _LOGGER.info(
            'Creating secure control channel for %s.', control_address)
        channel = GRPCChannelFactory.secure_channel(
            control_address, credentials)
    else:
        _LOGGER.info(
            'Creating insecure control channel for %s.', control_address)
        channel = GRPCChannelFactory.insecure_channel(control_address)
    self._control_channel = channel

    # Wait for connectivity before layering the interceptor on top.
    grpc.channel_ready_future(channel).result(timeout=60)
    _LOGGER.info('Control channel established.')
    self._control_channel = grpc.intercept_channel(
        channel, WorkerIdInterceptor(worker_id))

    self._data_channel_factory = data_plane.GrpcClientDataChannelFactory(
        credentials, worker_id)
    self._state_handler_factory = GrpcStateHandlerFactory(
        self._state_cache, credentials)
    self._profiler_factory = profiler_factory
    self._fns = {}  # type: Dict[str, beam_fn_api_pb2.ProcessBundleDescriptor]

    # One BundleProcessor cache is shared by every worker.
    self._bundle_processor_cache = BundleProcessorCache(
        state_handler_factory=self._state_handler_factory,
        data_channel_factory=self._data_channel_factory,
        fns=self._fns)

    self._worker_thread_pool = UnboundedThreadPoolExecutor()
    self._responses = queue.Queue(
    )  # type: queue.Queue[beam_fn_api_pb2.InstructionResponse]
    _LOGGER.info('Initializing SDKHarness with unbounded number of workers.')
def __init__(
    self,
    control_address,  # type: str
    credentials=None,  # type: Optional[grpc.ChannelCredentials]
    worker_id=None,  # type: Optional[str]
    # Caching is disabled by default
    state_cache_size=0,  # type: int
    # time-based data buffering is disabled by default
    data_buffer_time_limit_ms=0,  # type: int
    profiler_factory=None,  # type: Optional[Callable[..., Profile]]
    status_address=None,  # type: Optional[str]
    # Heap dump through status api is disabled by default
    enable_heap_dump=False,  # type: bool
):
    # type: (...) -> None
    """Open the control channel and build the shared harness state.

    The constructor waits up to 60 seconds for the control channel to
    become ready and then wraps it with a ``WorkerIdInterceptor``.

    Args:
      control_address: Target passed to the channel factory; also logged.
      credentials: ``None`` selects an insecure channel. Any other value is
        used for a secure channel and forwarded to the data channel and
        state handler factories.
      worker_id: Passed to the interceptor and the data channel factory.
      state_cache_size: Size argument handed to ``StateCache``.
      data_buffer_time_limit_ms: Forwarded to the data channel factory.
      profiler_factory: Stored on the instance as ``_profiler_factory``.
      status_address: When truthy, a ``FnApiWorkerStatusHandler`` is created
        for this address. If that fails, a warning is logged and
        ``self._status_handler`` stays ``None``.
      enable_heap_dump: Forwarded to ``FnApiWorkerStatusHandler``.
    """
    self._alive = True
    self._worker_index = 0
    self._worker_id = worker_id
    self._state_cache = StateCache(state_cache_size)
    # Both message-length options are set to -1 on the control channel.
    options = [('grpc.max_receive_message_length', -1),
               ('grpc.max_send_message_length', -1)]
    if credentials is None:
        _LOGGER.info(
            'Creating insecure control channel for %s.', control_address)
        self._control_channel = GRPCChannelFactory.insecure_channel(
            control_address, options=options)
    else:
        _LOGGER.info(
            'Creating secure control channel for %s.', control_address)
        self._control_channel = GRPCChannelFactory.secure_channel(
            control_address, credentials, options=options)
    grpc.channel_ready_future(self._control_channel).result(timeout=60)
    _LOGGER.info('Control channel established.')

    self._control_channel = grpc.intercept_channel(
        self._control_channel, WorkerIdInterceptor(self._worker_id))
    self._data_channel_factory = data_plane.GrpcClientDataChannelFactory(
        credentials, self._worker_id, data_buffer_time_limit_ms)
    self._state_handler_factory = GrpcStateHandlerFactory(
        self._state_cache, credentials)
    self._profiler_factory = profiler_factory

    def default_factory(descriptor_id):
        # type: (str) -> beam_fn_api_pb2.ProcessBundleDescriptor
        # NOTE(review): self._control_stub is not assigned in this
        # constructor; presumably it is set elsewhere before the first
        # descriptor lookup -- confirm.
        return self._control_stub.GetProcessBundleDescriptor(
            beam_fn_api_pb2.GetProcessBundleDescriptorRequest(
                process_bundle_descriptor_id=descriptor_id))

    self._fns = KeyedDefaultDict(default_factory)
    # BundleProcessor cache across all workers.
    self._bundle_processor_cache = BundleProcessorCache(
        state_handler_factory=self._state_handler_factory,
        data_channel_factory=self._data_channel_factory,
        fns=self._fns)

    # Default to None up front so the attribute is always defined, even
    # when constructing the status handler below raises.
    self._status_handler = None  # type: Optional[FnApiWorkerStatusHandler]
    if status_address:
        try:
            self._status_handler = FnApiWorkerStatusHandler(
                status_address, self._bundle_processor_cache,
                enable_heap_dump)
        except Exception:
            # Status reporting is best-effort: log and carry on without it.
            _LOGGER.warning(
                'Error creating worker status request handler, '
                'skipping status report. Trace back: %s',
                traceback.format_exc())

    # TODO(BEAM-8998) use common
    # thread_pool_executor.shared_unbounded_instance() to process bundle
    # progress once dataflow runner's excessive progress polling is removed.
    self._report_progress_executor = futures.ThreadPoolExecutor(max_workers=1)
    self._worker_thread_pool = thread_pool_executor.shared_unbounded_instance()
    self._responses = queue.Queue(
    )  # type: queue.Queue[Union[beam_fn_api_pb2.InstructionResponse, Sentinel]]
    _LOGGER.info('Initializing SDKHarness with unbounded number of workers.')
def __init__(self, control_address):
    """Create an insecure control channel and a data channel factory.

    Args:
      control_address: Target passed to ``grpc.insecure_channel``.
    """
    channel = grpc.insecure_channel(control_address)
    self._control_channel = channel
    self._data_channel_factory = data_plane.GrpcClientDataChannelFactory()
def __init__(self, control_channel):
    """Keep the supplied control channel and create a data channel factory.

    Args:
      control_channel: Channel object stored unchanged as
        ``self._control_channel``.
    """
    factory = data_plane.GrpcClientDataChannelFactory
    self._control_channel = control_channel
    self._data_channel_factory = factory()