Exemplo n.º 1
0
  def create_data_channel(self, remote_grpc_port):
    """Return the data channel for the port's URL, creating it on first use.

    Channels are kept in ``self._data_channel_cache`` keyed by URL. A missing
    entry is created while holding ``self._lock``, after re-checking the cache
    once the lock has been acquired.

    Args:
      remote_grpc_port: object exposing ``api_service_descriptor.url``.

    Returns:
      The cache entry stored for that URL.
    """
    url = remote_grpc_port.api_service_descriptor.url
    if url in self._data_channel_cache:
      # Already cached: answer without taking the lock.
      return self._data_channel_cache[url]

    with self._lock:
      # Check again now that the lock is held.
      if url not in self._data_channel_cache:
        logging.info('Creating channel for %s', url)
        # -1 means no limit on the size of the messages received or sent
        # over the data plane. The actual buffer size is controlled in a
        # layer above.
        unlimited = [("grpc.max_receive_message_length", -1),
                     ("grpc.max_send_message_length", -1)]
        if self._credentials is not None:
          raw_channel = GRPCChannelFactory.secure_channel(
              url, self._credentials, options=unlimited)
        else:
          raw_channel = GRPCChannelFactory.insecure_channel(
              url, options=unlimited)
        # Add workerId to the grpc channel
        intercepted = grpc.intercept_channel(
            raw_channel, WorkerIdInterceptor())
        stub = beam_fn_api_pb2_grpc.BeamFnDataStub(intercepted)
        self._data_channel_cache[url] = GrpcClientDataChannel(stub)

    return self._data_channel_cache[url]
Exemplo n.º 2
0
 def create_state_handler(self, api_service_descriptor):
     """Return the state handler for a descriptor, creating it on first use.

     A falsy descriptor yields ``self._throwing_state_handler``. Otherwise
     handlers are kept in ``self._state_handler_cache`` keyed by the
     descriptor's URL; a missing entry is created while holding
     ``self._lock``, after re-checking the cache once the lock is held.

     Args:
         api_service_descriptor: object exposing ``url``, or a falsy value.

     Returns:
         The cache entry for the URL, or the throwing handler.
     """
     if not api_service_descriptor:
         return self._throwing_state_handler

     url = api_service_descriptor.url
     if url in self._state_handler_cache:
         # Already cached: answer without taking the lock.
         return self._state_handler_cache[url]

     with self._lock:
         # Check again now that the lock is held.
         if url not in self._state_handler_cache:
             # -1 means no limit on the size of the messages received or
             # sent over the data plane. The actual buffer size is
             # controlled in a layer above.
             unlimited = [('grpc.max_receive_message_length', -1),
                          ('grpc.max_send_message_length', -1)]
             if self._credentials is not None:
                 _LOGGER.info('Creating secure state channel for %s.', url)
                 raw_channel = GRPCChannelFactory.secure_channel(
                     url, self._credentials, options=unlimited)
             else:
                 _LOGGER.info('Creating insecure state channel for %s.', url)
                 raw_channel = GRPCChannelFactory.insecure_channel(
                     url, options=unlimited)
             _LOGGER.info('State channel established.')
             # Add workerId to the grpc channel
             intercepted = grpc.intercept_channel(
                 raw_channel, WorkerIdInterceptor())
             stub = beam_fn_api_pb2_grpc.BeamFnStateStub(intercepted)
             self._state_handler_cache[url] = CachingStateHandler(
                 self._state_cache, GrpcStateHandler(stub))

     return self._state_handler_cache[url]
Exemplo n.º 3
0
    def create_data_channel(self, remote_grpc_port):
        # type: (beam_fn_api_pb2.RemoteGrpcPort) -> GrpcClientDataChannel
        """Return the data channel for the port's URL, creating it on first use.

        Channels are kept in ``self._data_channel_cache`` keyed by URL. A
        missing entry is created while holding ``self._lock``, after
        re-checking the cache once the lock has been acquired.

        Args:
            remote_grpc_port: port whose ``api_service_descriptor.url`` names
                the endpoint.

        Returns:
            The cache entry stored for that URL.
        """
        url = remote_grpc_port.api_service_descriptor.url
        if url in self._data_channel_cache:
            # Already cached: answer without taking the lock.
            return self._data_channel_cache[url]

        with self._lock:
            # Check again now that the lock is held.
            if url not in self._data_channel_cache:
                _LOGGER.info('Creating client data channel for %s', url)
                # -1 means no limit on the size of the messages received or
                # sent over the data plane. The actual buffer size is
                # controlled in a layer above.
                unlimited = [("grpc.max_receive_message_length", -1),
                             ("grpc.max_send_message_length", -1)]
                if self._credentials is not None:
                    raw_channel = GRPCChannelFactory.secure_channel(
                        url, self._credentials, options=unlimited)
                else:
                    raw_channel = GRPCChannelFactory.insecure_channel(
                        url, options=unlimited)
                # Add workerId to the grpc channel
                intercepted = grpc.intercept_channel(
                    raw_channel, WorkerIdInterceptor(self._worker_id))
                stub = beam_fn_api_pb2_grpc.BeamFnDataStub(intercepted)
                self._data_channel_cache[url] = GrpcClientDataChannel(
                    stub, self._data_buffer_time_limit_ms)

        return self._data_channel_cache[url]
Exemplo n.º 4
0
 def create_state_handler(self, api_service_descriptor):
   """Return the state handler for a descriptor, creating it on first use.

   A falsy descriptor yields ``self._throwing_state_handler``. Otherwise
   handlers are kept in ``self._state_handler_cache`` keyed by the
   descriptor's URL; a missing entry is created while holding ``self._lock``,
   after re-checking the cache once the lock is held.

   Args:
     api_service_descriptor: object exposing ``url``, or a falsy value.

   Returns:
     The cache entry for the URL, or the throwing handler.
   """
   if not api_service_descriptor:
     return self._throwing_state_handler

   url = api_service_descriptor.url
   if url in self._state_handler_cache:
     # Already cached: answer without taking the lock.
     return self._state_handler_cache[url]

   with self._lock:
     # Check again now that the lock is held.
     if url not in self._state_handler_cache:
       # -1 means no limit on the size of the messages received or sent
       # over the data plane. The actual buffer size is controlled in a
       # layer above.
       unlimited = [('grpc.max_receive_message_length', -1),
                    ('grpc.max_send_message_length', -1)]
       if self._credentials is not None:
         logging.info('Creating secure state channel for %s.', url)
         raw_channel = GRPCChannelFactory.secure_channel(
             url, self._credentials, options=unlimited)
       else:
         logging.info('Creating insecure state channel for %s.', url)
         raw_channel = GRPCChannelFactory.insecure_channel(
             url, options=unlimited)
       logging.info('State channel established.')
       # Add workerId to the grpc channel
       intercepted = grpc.intercept_channel(
           raw_channel, WorkerIdInterceptor())
       stub = beam_fn_api_pb2_grpc.BeamFnStateStub(intercepted)
       self._state_handler_cache[url] = GrpcStateHandler(stub)

   return self._state_handler_cache[url]
Exemplo n.º 5
0
    def __init__(
            self,
            control_address,
            worker_count,
            credentials=None,
            worker_id=None,
            # Caching is disabled by default
            state_cache_size=0,
            profiler_factory=None):
        """Open the control channel and build the harness's shared state.

        Args:
            control_address: address the control channel connects to.
            worker_count: ``max_workers`` of the process thread pool.
            credentials: channel credentials; ``None`` selects an insecure
                channel.
            worker_id: id given to ``WorkerIdInterceptor`` and to the data
                channel factory.
            state_cache_size: size handed to ``StateCache``.
            profiler_factory: stored and forwarded to the progress
                ``SdkWorker``.
        """
        self._alive = True
        self._worker_count = worker_count
        self._worker_index = 0
        self._worker_id = worker_id
        self._state_cache = StateCache(state_cache_size)

        if credentials is not None:
            logging.info('Creating secure control channel for %s.',
                         control_address)
            raw_channel = GRPCChannelFactory.secure_channel(
                control_address, credentials)
        else:
            logging.info('Creating insecure control channel for %s.',
                         control_address)
            raw_channel = GRPCChannelFactory.insecure_channel(
                control_address)
        self._control_channel = raw_channel
        # Wait (timeout=60) for the channel to report ready before going on.
        grpc.channel_ready_future(raw_channel).result(timeout=60)
        logging.info('Control channel established.')

        # Add workerId to the control channel.
        self._control_channel = grpc.intercept_channel(
            raw_channel, WorkerIdInterceptor(self._worker_id))

        self._data_channel_factory = data_factory = (
            data_plane.GrpcClientDataChannelFactory(
                credentials, self._worker_id))
        self._state_handler_factory = state_factory = GrpcStateHandlerFactory(
            self._state_cache, credentials)
        self._profiler_factory = profiler_factory
        self._fns = fns = {}
        # BundleProcessor cache across all workers.
        self._bundle_processor_cache = processor_cache = BundleProcessorCache(
            state_handler_factory=state_factory,
            data_channel_factory=data_factory,
            fns=fns)

        # workers for process/finalize bundle.
        self.workers = queue.Queue()
        # one worker for progress/split request.
        self.progress_worker = SdkWorker(
            processor_cache, profiler_factory=profiler_factory)

        # One thread is enough for getting the progress report.
        # Assumption: progress report generation should not do IO or wait on
        # other resources. Without wait, having multiple threads will not
        # improve performance and will only add complexity.
        self._progress_thread_pool = futures.ThreadPoolExecutor(max_workers=1)
        # finalize and process share one thread pool.
        self._process_thread_pool = futures.ThreadPoolExecutor(
            max_workers=worker_count)

        self._responses = queue.Queue()
        self._process_bundle_queue = queue.Queue()
        self._unscheduled_process_bundle = {}
        logging.info('Initializing SDKHarness with %s workers.',
                     worker_count)
Exemplo n.º 6
0
  def __init__(self,
               control_address,  # type: str
               credentials=None,
               worker_id=None,  # type: Optional[str]
               # Caching is disabled by default
               state_cache_size=0,
               # time-based data buffering is disabled by default
               data_buffer_time_limit_ms=0,
               profiler_factory=None,  # type: Optional[Callable[..., Profile]]
               status_address=None,  # type: Optional[str]
               ):
    """Open the control channel and build the harness's shared state.

    Args:
      control_address: address the control channel connects to.
      credentials: channel credentials; ``None`` selects an insecure channel.
      worker_id: id given to ``WorkerIdInterceptor`` and to the data channel
        factory.
      state_cache_size: size handed to ``StateCache``.
      data_buffer_time_limit_ms: forwarded to the data channel factory.
      profiler_factory: stored on the instance as ``_profiler_factory``.
      status_address: when truthy, a ``FnApiWorkerStatusHandler`` is created
        for it. A failure to create the handler is logged as a warning and
        leaves ``_status_handler`` set to ``None``.
    """
    self._alive = True
    self._worker_index = 0
    self._worker_id = worker_id
    self._state_cache = StateCache(state_cache_size)
    if credentials is None:
      _LOGGER.info('Creating insecure control channel for %s.', control_address)
      self._control_channel = GRPCChannelFactory.insecure_channel(
          control_address)
    else:
      _LOGGER.info('Creating secure control channel for %s.', control_address)
      self._control_channel = GRPCChannelFactory.secure_channel(
          control_address, credentials)
    # Wait (timeout=60) for the channel to report ready before going on.
    grpc.channel_ready_future(self._control_channel).result(timeout=60)
    _LOGGER.info('Control channel established.')

    # Add workerId to the control channel.
    self._control_channel = grpc.intercept_channel(
        self._control_channel, WorkerIdInterceptor(self._worker_id))
    self._data_channel_factory = data_plane.GrpcClientDataChannelFactory(
        credentials, self._worker_id, data_buffer_time_limit_ms)
    self._state_handler_factory = GrpcStateHandlerFactory(
        self._state_cache, credentials)
    self._profiler_factory = profiler_factory
    self._fns = {}  # type: Dict[str, beam_fn_api_pb2.ProcessBundleDescriptor]
    # BundleProcessor cache across all workers.
    self._bundle_processor_cache = BundleProcessorCache(
        state_handler_factory=self._state_handler_factory,
        data_channel_factory=self._data_channel_factory,
        fns=self._fns)

    # Assign the attribute up front so it exists even when creating the
    # handler fails; previously that path left it unset.
    self._status_handler = None
    if status_address:
      try:
        self._status_handler = FnApiWorkerStatusHandler(
            status_address, self._bundle_processor_cache)
      except Exception:
        # Status reporting is best-effort: log and carry on without it.
        traceback_string = traceback.format_exc()
        _LOGGER.warning(
            'Error creating worker status request handler, '
            'skipping status report. Trace back: %s', traceback_string)

    # TODO(BEAM-8998) use common UnboundedThreadPoolExecutor to process bundle
    #  progress once dataflow runner's excessive progress polling is removed.
    self._report_progress_executor = futures.ThreadPoolExecutor(max_workers=1)
    self._worker_thread_pool = UnboundedThreadPoolExecutor()
    self._responses = queue.Queue(
    )  # type: queue.Queue[beam_fn_api_pb2.InstructionResponse]
    _LOGGER.info('Initializing SDKHarness with unbounded number of workers.')
Exemplo n.º 7
0
    def __init__(
            self,
            control_address,
            credentials=None,
            worker_id=None,
            # Caching is disabled by default
            state_cache_size=0,
            profiler_factory=None):
        """Open the control channel and build the harness's shared state.

        Args:
            control_address: address the control channel connects to.
            credentials: channel credentials; ``None`` selects an insecure
                channel.
            worker_id: id given to ``WorkerIdInterceptor`` and to the data
                channel factory.
            state_cache_size: size handed to ``StateCache``.
            profiler_factory: stored on the instance as
                ``_profiler_factory``.
        """
        self._alive = True
        self._worker_index = 0
        self._worker_id = worker_id
        self._state_cache = StateCache(state_cache_size)

        if credentials is not None:
            _LOGGER.info('Creating secure control channel for %s.',
                         control_address)
            raw_channel = GRPCChannelFactory.secure_channel(
                control_address, credentials)
        else:
            _LOGGER.info('Creating insecure control channel for %s.',
                         control_address)
            raw_channel = GRPCChannelFactory.insecure_channel(
                control_address)
        self._control_channel = raw_channel
        # Wait (timeout=60) for the channel to report ready before going on.
        grpc.channel_ready_future(raw_channel).result(timeout=60)
        _LOGGER.info('Control channel established.')

        # Add workerId to the control channel.
        self._control_channel = grpc.intercept_channel(
            raw_channel, WorkerIdInterceptor(self._worker_id))

        self._data_channel_factory = data_factory = (
            data_plane.GrpcClientDataChannelFactory(
                credentials, self._worker_id))
        self._state_handler_factory = state_factory = GrpcStateHandlerFactory(
            self._state_cache, credentials)
        self._profiler_factory = profiler_factory
        self._fns = fns = {}
        # BundleProcessor cache across all workers.
        self._bundle_processor_cache = BundleProcessorCache(
            state_handler_factory=state_factory,
            data_channel_factory=data_factory,
            fns=fns)

        # TODO(BEAM-8998) use common UnboundedThreadPoolExecutor to process
        #  bundle progress once dataflow runner's excessive progress polling
        #  is removed.
        self._report_progress_executor = futures.ThreadPoolExecutor(
            max_workers=1)
        self._worker_thread_pool = UnboundedThreadPoolExecutor()
        self._responses = queue.Queue()
        _LOGGER.info(
            'Initializing SDKHarness with unbounded number of workers.')
0
    def __init__(self,
                 control_address,
                 worker_count,
                 credentials=None,
                 worker_id=None,
                 profiler_factory=None):
        """Open the control channel and build the harness's shared state.

        Args:
            control_address: address the control channel connects to.
            worker_count: ``max_workers`` of the process thread pool.
            credentials: channel credentials; ``None`` selects an insecure
                channel.
            worker_id: id given to ``WorkerIdInterceptor``.
            profiler_factory: stored on the instance as
                ``_profiler_factory``.
        """
        self._alive = True
        self._worker_count = worker_count
        self._worker_index = 0
        self._worker_id = worker_id

        if credentials is not None:
            logging.info('Creating secure control channel for %s.',
                         control_address)
            raw_channel = GRPCChannelFactory.secure_channel(
                control_address, credentials)
        else:
            logging.info('Creating insecure control channel for %s.',
                         control_address)
            raw_channel = GRPCChannelFactory.insecure_channel(
                control_address)
        self._control_channel = raw_channel
        # Wait (timeout=60) for the channel to report ready before going on.
        grpc.channel_ready_future(raw_channel).result(timeout=60)
        logging.info('Control channel established.')

        # Add workerId to the control channel.
        interceptor = WorkerIdInterceptor(self._worker_id)
        self._control_channel = grpc.intercept_channel(
            raw_channel, interceptor)

        self._data_channel_factory = (
            data_plane.GrpcClientDataChannelFactory(credentials))
        self._state_handler_factory = GrpcStateHandlerFactory(credentials)
        self._profiler_factory = profiler_factory
        self.workers = queue.Queue()

        # One thread is enough for getting the progress report.
        # Assumption: progress report generation should not do IO or wait on
        # other resources. Without wait, having multiple threads will not
        # improve performance and will only add complexity.
        self._progress_thread_pool = futures.ThreadPoolExecutor(max_workers=1)
        self._process_thread_pool = futures.ThreadPoolExecutor(
            max_workers=worker_count)

        self._instruction_id_vs_worker = {}
        self._fns = {}
        self._responses = queue.Queue()
        self._process_bundle_queue = queue.Queue()
        self._unscheduled_process_bundle = {}
        logging.info('Initializing SDKHarness with %s workers.',
                     worker_count)
0
  def __init__(
      self, control_address, worker_count, credentials=None, worker_id=None,
      profiler_factory=None):
    """Open the control channel and build the harness's shared state.

    Args:
      control_address: address the control channel connects to.
      worker_count: ``max_workers`` of the process thread pool.
      credentials: channel credentials; ``None`` selects an insecure channel.
      worker_id: id given to ``WorkerIdInterceptor``.
      profiler_factory: stored on the instance as ``_profiler_factory``.
    """
    self._alive = True
    self._worker_count = worker_count
    self._worker_index = 0
    self._worker_id = worker_id

    if credentials is not None:
      logging.info('Creating secure control channel for %s.', control_address)
      raw_channel = GRPCChannelFactory.secure_channel(
          control_address, credentials)
    else:
      logging.info('Creating insecure control channel for %s.', control_address)
      raw_channel = GRPCChannelFactory.insecure_channel(control_address)
    self._control_channel = raw_channel
    # Wait (timeout=60) for the channel to report ready before going on.
    grpc.channel_ready_future(raw_channel).result(timeout=60)
    logging.info('Control channel established.')

    # Add workerId to the control channel.
    interceptor = WorkerIdInterceptor(self._worker_id)
    self._control_channel = grpc.intercept_channel(raw_channel, interceptor)

    self._data_channel_factory = (
        data_plane.GrpcClientDataChannelFactory(credentials))
    self._state_handler_factory = GrpcStateHandlerFactory(credentials)
    self._profiler_factory = profiler_factory
    self.workers = queue.Queue()

    # One thread is enough for getting the progress report.
    # Assumption: progress report generation should not do IO or wait on
    # other resources. Without wait, having multiple threads will not improve
    # performance and will only add complexity.
    self._progress_thread_pool = futures.ThreadPoolExecutor(max_workers=1)
    self._process_thread_pool = futures.ThreadPoolExecutor(
        max_workers=worker_count)

    self._instruction_id_vs_worker = {}
    self._fns = {}
    self._responses = queue.Queue()
    self._process_bundle_queue = queue.Queue()
    self._unscheduled_process_bundle = {}
    logging.info('Initializing SDKHarness with %s workers.', worker_count)
Exemplo n.º 10
0
  def __init__(self,
               control_address,  # type: str
               credentials=None,
               worker_id=None,  # type: Optional[str]
               # Caching is disabled by default
               state_cache_size=0,
               profiler_factory=None  # type: Optional[Callable[..., Profile]]
              ):
    """Open the control channel and build the harness's shared state.

    Args:
      control_address: address the control channel connects to.
      credentials: channel credentials; ``None`` selects an insecure channel.
      worker_id: id given to ``WorkerIdInterceptor`` and to the data channel
        factory.
      state_cache_size: size handed to ``StateCache``.
      profiler_factory: stored on the instance as ``_profiler_factory``.
    """
    self._alive = True
    self._worker_index = 0
    self._worker_id = worker_id
    self._state_cache = StateCache(state_cache_size)

    if credentials is not None:
      _LOGGER.info('Creating secure control channel for %s.', control_address)
      raw_channel = GRPCChannelFactory.secure_channel(
          control_address, credentials)
    else:
      _LOGGER.info('Creating insecure control channel for %s.', control_address)
      raw_channel = GRPCChannelFactory.insecure_channel(control_address)
    self._control_channel = raw_channel
    # Wait (timeout=60) for the channel to report ready before going on.
    grpc.channel_ready_future(raw_channel).result(timeout=60)
    _LOGGER.info('Control channel established.')

    # Add workerId to the control channel.
    self._control_channel = grpc.intercept_channel(
        raw_channel, WorkerIdInterceptor(self._worker_id))

    self._data_channel_factory = data_factory = (
        data_plane.GrpcClientDataChannelFactory(credentials, self._worker_id))
    self._state_handler_factory = state_factory = GrpcStateHandlerFactory(
        self._state_cache, credentials)
    self._profiler_factory = profiler_factory
    self._fns = {}  # type: Dict[str, beam_fn_api_pb2.ProcessBundleDescriptor]
    # BundleProcessor cache across all workers.
    self._bundle_processor_cache = BundleProcessorCache(
        state_handler_factory=state_factory,
        data_channel_factory=data_factory,
        fns=self._fns)

    self._worker_thread_pool = UnboundedThreadPoolExecutor()
    self._responses = queue.Queue(
    )  # type: queue.Queue[beam_fn_api_pb2.InstructionResponse]
    _LOGGER.info('Initializing SDKHarness with unbounded number of workers.')
Exemplo n.º 11
0
  def __init__(
      self,
      control_address,  # type: str
      credentials=None,  # type: Optional[grpc.ChannelCredentials]
      worker_id=None,  # type: Optional[str]
      # Caching is disabled by default
      state_cache_size=0,  # type: int
      # time-based data buffering is disabled by default
      data_buffer_time_limit_ms=0,  # type: int
      profiler_factory=None,  # type: Optional[Callable[..., Profile]]
      status_address=None,  # type: Optional[str]
      # Heap dump through status api is disabled by default
      enable_heap_dump=False,  # type: bool
  ):
    # type: (...) -> None
    """Open the control channel and build the harness's shared state.

    Args:
      control_address: address the control channel connects to.
      credentials: channel credentials; ``None`` selects an insecure channel.
      worker_id: id given to ``WorkerIdInterceptor`` and to the data channel
        factory.
      state_cache_size: size handed to ``StateCache``.
      data_buffer_time_limit_ms: forwarded to the data channel factory.
      profiler_factory: stored on the instance as ``_profiler_factory``.
      status_address: when truthy, a ``FnApiWorkerStatusHandler`` is created
        for it. A failure to create the handler is logged as a warning and
        leaves ``_status_handler`` set to ``None``.
      enable_heap_dump: forwarded to ``FnApiWorkerStatusHandler``.
    """
    self._alive = True
    self._worker_index = 0
    self._worker_id = worker_id
    self._state_cache = StateCache(state_cache_size)
    # -1 means no limit on the size of messages received or sent.
    options = [('grpc.max_receive_message_length', -1),
               ('grpc.max_send_message_length', -1)]
    if credentials is None:
      _LOGGER.info('Creating insecure control channel for %s.', control_address)
      self._control_channel = GRPCChannelFactory.insecure_channel(
          control_address, options=options)
    else:
      _LOGGER.info('Creating secure control channel for %s.', control_address)
      self._control_channel = GRPCChannelFactory.secure_channel(
          control_address, credentials, options=options)
    # Wait (timeout=60) for the channel to report ready before going on.
    grpc.channel_ready_future(self._control_channel).result(timeout=60)
    _LOGGER.info('Control channel established.')

    # Add workerId to the control channel.
    self._control_channel = grpc.intercept_channel(
        self._control_channel, WorkerIdInterceptor(self._worker_id))
    self._data_channel_factory = data_plane.GrpcClientDataChannelFactory(
        credentials, self._worker_id, data_buffer_time_limit_ms)
    self._state_handler_factory = GrpcStateHandlerFactory(
        self._state_cache, credentials)
    self._profiler_factory = profiler_factory

    def default_factory(descriptor_id):
      # type: (str) -> beam_fn_api_pb2.ProcessBundleDescriptor
      # Fetches the descriptor through self._control_stub, which is not
      # assigned in this method. NOTE(review): presumably set before the
      # first lookup on self._fns -- confirm against the rest of the class.
      return self._control_stub.GetProcessBundleDescriptor(
          beam_fn_api_pb2.GetProcessBundleDescriptorRequest(
              process_bundle_descriptor_id=descriptor_id))

    self._fns = KeyedDefaultDict(default_factory)
    # BundleProcessor cache across all workers.
    self._bundle_processor_cache = BundleProcessorCache(
        state_handler_factory=self._state_handler_factory,
        data_channel_factory=self._data_channel_factory,
        fns=self._fns)

    # Assign the attribute up front so it exists even when creating the
    # handler fails; previously that path left it unset.
    self._status_handler = None  # type: Optional[FnApiWorkerStatusHandler]
    if status_address:
      try:
        self._status_handler = FnApiWorkerStatusHandler(
            status_address, self._bundle_processor_cache, enable_heap_dump)
      except Exception:
        # Status reporting is best-effort: log and carry on without it.
        traceback_string = traceback.format_exc()
        _LOGGER.warning(
            'Error creating worker status request handler, '
            'skipping status report. Trace back: %s', traceback_string)

    # TODO(BEAM-8998) use common
    # thread_pool_executor.shared_unbounded_instance() to process bundle
    # progress once dataflow runner's excessive progress polling is removed.
    self._report_progress_executor = futures.ThreadPoolExecutor(max_workers=1)
    self._worker_thread_pool = thread_pool_executor.shared_unbounded_instance()
    self._responses = queue.Queue(
    )  # type: queue.Queue[Union[beam_fn_api_pb2.InstructionResponse, Sentinel]]
    _LOGGER.info('Initializing SDKHarness with unbounded number of workers.')