def _build_remote_agent(self, zoo_manager_conns):
    """Spawn a remote worker via a randomly chosen zoo manager and wrap it.

    Args:
        zoo_manager_conns: Non-empty sequence of zoo manager connections.
            Each connection is indexed with "stub" (an object exposing
            `spawn_worker`) and "address" (whose first element is paired
            with the spawned worker's port to form the worker address).

    Raises:
        RemoteAgentException: If all spawn attempts fail with
            `grpc.RpcError`. The last RPC error is attached as the cause.

    Returns:
        RemoteAgent: A local RemoteAgent built from the manager address and
            the (manager host, worker port) pair.
    """
    # Get a random zoo manager connection.
    zoo_manager_conn = random.choice(zoo_manager_conns)

    # Spawn remote worker and get its port.
    retries = 3
    worker_port = None
    last_error = None
    for retry in range(retries):
        try:
            response = zoo_manager_conn["stub"].spawn_worker(
                manager_pb2.Machine())
        except grpc.RpcError as e:
            # Keep the most recent failure so it can be reported as the cause
            # if every attempt fails.
            last_error = e
            self._log.debug(
                f"Failed {retry+1}/{retries} times in attempt to spawn a remote worker process. {e}"
            )
        else:
            worker_port = response.num
            break

    if worker_port is None:
        raise RemoteAgentException(
            "Remote worker process could not be spawned by the zoo manager."
        ) from last_error

    # Instantiate and return a local RemoteAgent.
    return RemoteAgent(
        zoo_manager_conn["address"],
        (zoo_manager_conn["address"][0], worker_port),
    )
def acquire_remote_agent(self, retries: int = 3, timeout: Optional[float] = None) -> RemoteAgent:
    """Creates RemoteAgent objects.

    Args:
        retries (int, optional): Number of attempts in creating or connecting
            to an available RemoteAgent. Defaults to 3.
        timeout (Optional[float], optional): Time (seconds) to wait in
            acquiring a RemoteAgent. Defaults to None, in which case
            `self._timeout` is used instead.

    Raises:
        RemoteAgentException: If fail to acquire a RemoteAgent within
            `retries` attempts. The last underlying error, if any, is
            attached as the cause.

    Returns:
        RemoteAgent: A new RemoteAgent object.
    """
    if timeout is None:
        timeout = self._timeout

    last_error = None
    for retry in range(retries):
        try:
            return self._try_to_acquire_remote_agent(timeout)
        except Exception as e:
            # Best-effort retry: any failure is logged and the attempt is
            # repeated after a short pause.
            last_error = e
            self._log.debug(
                f"Failed {retry+1}/{retries} times in acquiring remote agent. {repr(e)}"
            )
            time.sleep(0.1)

    raise RemoteAgentException("Failed to acquire remote agent.") from last_error
def get_manager_channel_stub(addr, timeout: float = 30):
    """Connect to the gRPC server at `addr` and return its channel and stub.

    Args:
        addr: Server address; `addr[0]` is used as the host and `addr[1]` as
            the port.
        timeout (float, optional): Seconds to wait for the gRPC server to be
            ready. Defaults to 30.

    Raises:
        RemoteAgentException: If the channel is not ready within `timeout`.

    Returns:
        A `(channel, stub)` tuple: the insecure gRPC channel and a
        `manager_pb2_grpc.ManagerStub` bound to it.
    """
    channel = grpc.insecure_channel(f"{addr[0]}:{addr[1]}")
    ready_future = grpc.channel_ready_future(channel)
    try:
        # Wait until the grpc server is ready or the timeout elapses.
        ready_future.result(timeout=timeout)
    except grpc.FutureTimeoutError as e:
        # The channel is never handed to the caller on failure, so release it
        # here instead of leaking it.
        ready_future.cancel()
        channel.close()
        raise RemoteAgentException(
            "Timeout in connecting to remote zoo manager.") from e
    stub = manager_pb2_grpc.ManagerStub(channel)
    return channel, stub
def acquire_remote_agent(self, retries=3) -> RemoteAgent:
    """Acquire a RemoteAgent, retrying on failure.

    Each attempt delegates to `self._try_to_acquire_remote_agent()`. Any
    exception from an attempt is logged at debug level and followed by a
    0.1 second pause before the next attempt.

    Args:
        retries: Maximum number of attempts. Defaults to 3.

    Raises:
        RemoteAgentException: If no attempt succeeds.

    Returns:
        RemoteAgent: The agent produced by the first successful attempt.
    """
    for retry in range(retries):
        try:
            agent = self._try_to_acquire_remote_agent()
        except Exception as e:
            message = f"Failed {retry+1}/{retries} times in acquiring remote agent. {repr(e)}"
            self._log.debug(message)
            time.sleep(0.1)
        else:
            return agent

    # Every attempt failed (or no attempt was made).
    raise RemoteAgentException("Failed to acquire remote agent.")
def get_manager_channel_stub(addr: Tuple[str, int], timeout: float = 10):
    """Connects to the gRPC server at `addr` and returns the channel and stub.

    Args:
        addr (Tuple[str,int]): gRPC server address as (host, port).
        timeout (float, optional): Time (seconds) to wait for the gRPC server
            to be ready. Defaults to 10.

    Raises:
        RemoteAgentException: If timeout occurs while connecting to the gRPC
            server. The channel is closed before the exception is raised.

    Returns:
        grpc.Channel: Channel to the gRPC server.
        manager_pb2_grpc.ManagerStub : gRPC stub.
    """
    channel = grpc.insecure_channel(f"{addr[0]}:{addr[1]}")
    ready_future = grpc.channel_ready_future(channel)
    try:
        ready_future.result(timeout=timeout)
    except grpc.FutureTimeoutError as e:
        # The channel is never handed to the caller on failure, so release it
        # here instead of leaking it.
        ready_future.cancel()
        channel.close()
        raise RemoteAgentException(
            "Timeout in connecting to remote zoo manager.") from e
    stub = manager_pb2_grpc.ManagerStub(channel)
    return channel, stub