def start_worker(self, image, tenant, worker_id):
    ch = WorkerChannel()
    # start an actor executor container and wait for a confirmation that image was pulled.
    worker_dict = run_worker(image, ch.name, worker_id)
    worker = Worker(tenant=tenant, **worker_dict)
    print("worker started successfully, waiting on ack that image was pulled...")
    result = ch.get()
    if result.get('status') == 'error':
        # there was a problem pulling the image; put the actor in an error state:
        msg = "Got an error back from the worker. Message: {}".format(result)
        print(msg)
        if 'msg' in result:
            raise SpawnerException(message=result['msg'])
        else:
            raise SpawnerException(message="Internal error starting worker process.")
    elif result['value']['status'] == 'ok':
        print("received ack from worker.")
        return ch, result['reply_to'], worker
    else:
        msg = "Got an error status from worker: {}. Raising an exception.".format(str(result))
        print(msg)
        raise SpawnerException(msg)

def start_worker(self, image, tenant, worker_id):
    ch = SpawnerWorkerChannel(worker_id=worker_id)
    # start an actor executor container and wait for a confirmation that image was pulled.
    worker_dict = run_worker(image, worker_id)
    worker_dict['ch_name'] = WorkerChannel.get_name(worker_id)
    worker = Worker(tenant=tenant, **worker_dict)
    logger.info("worker started successfully, waiting on ack that image was pulled...")
    result = ch.get()
    logger.debug("Got response back from worker. Response: {}".format(result))
    if result.get('status') == 'error':
        # there was a problem pulling the image; put the actor in an error state:
        msg = "Got an error back from the worker. Message: {}".format(result)
        logger.info(msg)
        if 'msg' in result:
            raise SpawnerException(message=result['msg'])
        else:
            logger.error("Spawner received invalid message from worker. 'msg' field missing. Message: {}".format(result))
            raise SpawnerException(message="Internal error starting worker process.")
    elif result['value']['status'] == 'ok':
        logger.debug("received ack from worker.")
        return ch, result['reply_to'], worker
    else:
        msg = "Got an error status from worker: {}. Raising an exception.".format(str(result))
        logger.error("Spawner received an invalid message from worker. Message: {}".format(result))
        raise SpawnerException(msg)

def start_worker(self, image):
    ch = WorkerChannel()
    # start an actor executor container and wait for a confirmation that image was pulled.
    worker = run_worker(image, ch._name)
    print("worker started successfully, waiting on ack that image was pulled...")
    result = ch.get()
    if result['value']['status'] == 'ok':
        print("received ack from worker.")
        return ch, result['reply_to'], worker
    else:
        print("Got an error status from worker: {}. Raising an exception.".format(str(result)))
        raise SpawnerException()

def start_worker(self, image, tenant):
    ch = WorkerChannel()
    # start an actor executor container and wait for a confirmation that image was pulled.
    worker_dict = run_worker(image, ch.name)
    worker = Worker(tenant=tenant, **worker_dict)
    print("worker started successfully, waiting on ack that image was pulled...")
    result = ch.get()
    if result["value"]["status"] == "ok":
        print("received ack from worker.")
        return ch, result["reply_to"], worker
    else:
        print("Got an error status from worker: {}. Raising an exception.".format(str(result)))
        raise SpawnerException()

def start_worker(self, image, tenant, actor_id, worker_id):
    ch = SpawnerWorkerChannel(worker_id=worker_id)
    # start an actor executor container and wait for a confirmation that image was pulled.
    attempts = 0
    while True:
        try:
            worker_dict = run_worker(image, actor_id, worker_id)
        except DockerError as e:
            logger.error("Spawner got a docker exception from run_worker; Exception: {}".format(e))
            if 'read timeout' in e.message:
                logger.info("Exception was a read timeout; trying run_worker again..")
                time.sleep(5)
                attempts = attempts + 1
                if attempts > 20:
                    msg = "Spawner continued to get DockerError for 20 attempts. Exception: {}".format(e)
                    logger.critical(msg)
                    raise SpawnerException(msg)
                continue
            else:
                logger.info("Exception was NOT a read timeout; quitting on this worker.")
                # delete this worker from the workers store:
                try:
                    self.kill_worker(actor_id, worker_id)
                except WorkerException as e:
                    logger.info("Got WorkerException from delete_worker(). "
                                "worker_id: {}"
                                "Exception: {}".format(worker_id, e))
                raise SpawnerException(message="Unable to start worker; error: {}".format(e))
        break
    worker_dict['ch_name'] = WorkerChannel.get_name(worker_id)
    worker = Worker(tenant=tenant, **worker_dict)
    logger.info("worker started successfully, waiting on ack that image was pulled...")
    result = ch.get()
    logger.debug("Got response back from worker. Response: {}".format(result))
    if result.get('status') == 'error':
        # there was a problem pulling the image; put the actor in an error state:
        msg = "Got an error back from the worker. Message: {}".format(result)
        logger.info(msg)
        if 'msg' in result:
            raise SpawnerException(message=result['msg'])
        else:
            logger.error("Spawner received invalid message from worker. 'msg' field missing. Message: {}".format(result))
            raise SpawnerException(message="Internal error starting worker process.")
    elif result['value']['status'] == 'ok':
        logger.debug("received ack from worker.")
        return ch, result['reply_to'], worker
    else:
        msg = "Got an error status from worker: {}. Raising an exception.".format(str(result))
        logger.error("Spawner received an invalid message from worker. Message: {}".format(result))
        raise SpawnerException(msg)

def start_worker(self, image, tenant, actor_id, worker_id, client_id, client_access_token,
                 client_refresh_token, ch, api_server, client_secret):
    # start an actor executor container and wait for a confirmation that image was pulled.
    attempts = 0
    # worker = get_worker(worker_id)
    # worker['status'] = PULLING_IMAGE
    Worker.update_worker_status(actor_id, worker_id, PULLING_IMAGE)
    try:
        logger.debug("Worker pulling image {}...".format(image))
        pull_image(image)
    except DockerError as e:
        # return a message to the spawner that there was an error pulling image and abort
        # this is not necessarily an error state: the user simply could have provided an
        # image name that does not exist in the registry. This is the first time we would
        # find that out.
        logger.info("worker got a DockerError trying to pull image. Error: {}.".format(e))
        raise e
    logger.info("Image {} pulled successfully.".format(image))
    # Done pulling image
    # Run Worker Container
    while True:
        try:
            Worker.update_worker_status(actor_id, worker_id, CREATING_CONTAINER)
            logger.debug('spawner creating worker container')
            worker_dict = run_worker(
                image,
                actor_id,
                worker_id,
                client_id,
                client_access_token,
                client_refresh_token,
                tenant,
                api_server,
                client_secret
            )
            logger.debug(f'finished run worker; worker dict: {worker_dict}')
        except DockerError as e:
            logger.error("Spawner got a docker exception from run_worker; Exception: {}".format(e))
            if 'read timeout' in e.message:
                logger.info("Exception was a read timeout; trying run_worker again..")
                time.sleep(5)
                attempts = attempts + 1
                if attempts > 20:
                    msg = "Spawner continued to get DockerError for 20 attempts. Exception: {}".format(e)
                    logger.critical(msg)
                    # todo - should we be calling kill_worker here? (it is called in the exception block of the else below)
                    raise SpawnerException(msg)
                continue
            else:
                logger.info("Exception was NOT a read timeout; quitting on this worker.")
                # delete this worker from the workers store:
                try:
                    self.kill_worker(actor_id, worker_id)
                except WorkerException as e:
                    logger.info("Got WorkerException from delete_worker(). "
                                "worker_id: {}"
                                "Exception: {}".format(worker_id, e))
                raise SpawnerException(message="Unable to start worker; error: {}".format(e))
        break
    logger.debug('finished loop')
    worker_dict['ch_name'] = WorkerChannel.get_name(worker_id)
    # if the actor is not already in READY status, set actor status to READY before worker status has been
    # set to READY.
    # it is possible the actor status is already READY because this request is the autoscaler starting a new worker
    # for an existing actor.
    actor = Actor.from_db(actors_store[actor_id])
    if not actor.status == READY:
        try:
            Actor.set_status(actor_id, READY, status_message=" ")
        except KeyError:
            # it is possible the actor was already deleted during worker start up; if
            # so, the worker should have a stop message waiting for it. starting subscribe
            # as usual should allow this process to work as expected.
            pass
    # finalize worker with READY status
    worker = Worker(tenant=tenant, **worker_dict)
    logger.info("calling add_worker for worker: {}.".format(worker))
    Worker.add_worker(actor_id, worker)
    ch.put('READY')  # step 4
    logger.info('sent message through channel')
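
# A minimal, standalone sketch of the retry pattern the later versions above use
# around run_worker: retry only on Docker read timeouts with a fixed delay, give
# up after a bounded number of attempts, and re-raise any other error immediately.
# The helper name and the RetryGaveUp exception are illustrative assumptions, not
# part of the code above.
import time


class RetryGaveUp(Exception):
    """Raised when the call keeps failing with a retryable (timeout) error."""


def call_with_read_timeout_retries(fn, max_attempts=20, delay=5):
    attempts = 0
    while True:
        try:
            # attempt the operation; on success, return its result and stop retrying
            return fn()
        except Exception as e:  # the real code catches docker's DockerError here
            if 'read timeout' not in str(e):
                # not a timeout: do not retry; let the caller clean up the worker
                raise
            attempts += 1
            if attempts > max_attempts:
                raise RetryGaveUp("still failing after {} attempts: {}".format(max_attempts, e))
            time.sleep(delay)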