def delete(self, actor_id):
    logger.debug("top of DELETE /actors/{}".format(actor_id))
    id = Actor.get_dbid(g.tenant, actor_id)
    logger.info("calling shutdown_workers() for actor: {}".format(id))
    shutdown_workers(id)
    logger.debug("shutdown_workers() done")
    try:
        actor = Actor.from_db(actors_store[id])
        executions = actor.get('executions') or {}
        for ex_id, val in executions.items():
            del logs_store[ex_id]
    except KeyError as e:
        logger.info("got KeyError {} trying to retrieve actor or executions with id {}".format(e, id))
    # delete the actor's message channel
    # TODO - needs work; each worker is subscribed to the ActorMsgChannel. If the workers are not
    # closed before the ch.delete() below, the ActorMsgChannel will survive.
    try:
        ch = ActorMsgChannel(actor_id=id)
        ch.delete()
        logger.info("Deleted actor message channel for actor: {}".format(id))
    except Exception as e:
        # if we get an error trying to remove the inbox, log it but keep going
        logger.error("Unable to delete the actor's message channel for actor: {}, exception: {}".format(id, e))
    del actors_store[id]
    logger.info("actor {} deleted from store.".format(id))
    del permissions_store[id]
    logger.info("actor {} permissions deleted from store.".format(id))
    del nonce_store[id]
    logger.info("actor {} nonces deleted from nonce store.".format(id))
    return ok(result=None, msg='Actor deleted successfully.')

def subscribe(actor_id, worker_ch):
    """
    Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor
    containers when messages arrive. Also subscribes to the worker channel for future communications.
    :return:
    """
    actor_ch = ActorMsgChannel(actor_id)
    t = threading.Thread(target=process_worker_ch, args=(worker_ch, actor_id, actor_ch))
    t.start()
    print("Worker subscribing to actor channel...")
    while keep_running:
        update_worker_status(actor_id, worker_ch.name, READY)
        try:
            msg = actor_ch.get(timeout=2)
        except channelpy.ChannelTimeoutException:
            continue
        print("Received message {}. Starting actor container...".format(str(msg)))
        message = msg.pop("msg", "")
        try:
            stats, logs = execute_actor(actor_id, worker_ch, image, message, msg)
        except DockerStartContainerError as e:
            print("Got DockerStartContainerError: {}".format(str(e)))
            Actor.set_status(actor_id, ERROR)
            continue
        # add the execution to the actor store
        print("Actor container finished successfully. Got stats object:{}".format(str(stats)))
        exc_id = Execution.add_execution(actor_id, stats)
        Execution.set_logs(exc_id, logs)

def get(self, actor_id):
    def get_hypermedia(actor):
        return {'_links': {'self': '{}/actors/v2/{}/messages'.format(actor.api_server, actor.id),
                           'owner': '{}/profiles/v2/{}'.format(actor.api_server, actor.owner),
                           },
                }

    logger.debug("top of GET /actors/{}/messages".format(actor_id))
    # check that actor exists
    id = Actor.get_dbid(g.tenant, actor_id)
    try:
        actor = Actor.from_db(actors_store[id])
    except KeyError:
        logger.debug("did not find actor: {}.".format(actor_id))
        raise ResourceError("No actor found with id: {}.".format(actor_id), 404)
    ch = ActorMsgChannel(actor_id=id)
    result = {'messages': len(ch._queue._queue)}
    ch.close()
    logger.debug("messages found for actor: {}.".format(actor_id))
    result.update(get_hypermedia(actor))
    return ok(result)

def stop_workers(self, actor_id, worker_ids):
    """Stop existing workers; used when updating an actor's image."""
    logger.debug("Top of stop_workers() for actor: {}.".format(actor_id))
    try:
        workers_dict = workers_store[actor_id]
    except KeyError:
        logger.debug("workers_store had no workers for actor: {}".format(actor_id))
        workers_dict = {}
    # if there are existing workers, we need to close the actor message channel and
    # gracefully shutdown the existing worker processes.
    if len(workers_dict.items()) > 0:
        logger.info("Found {} workers to stop.".format(len(workers_dict.items())))
        # first, close the actor msg channel to prevent any new messages from being pulled
        # by the old workers.
        actor_ch = ActorMsgChannel(actor_id)
        actor_ch.close()
        logger.info("Actor channel closed for actor: {}".format(actor_id))
        # now, send messages to workers for a graceful shutdown:
        for _, worker in workers_dict.items():
            # don't stop the new workers:
            if worker['id'] not in worker_ids:
                ch = WorkerChannel(worker_id=worker['id'])
                # since this is an update, there are new workers being started, so
                # don't delete the actor msg channel:
                ch.put('stop-no-delete')
                logger.info("Sent 'stop-no-delete' message to worker_id: {}".format(worker['id']))
                ch.close()
            else:
                logger.debug("skipping worker {} as it is in worker_ids.".format(worker))
    else:
        logger.info("No workers to stop.")

def post(self, actor_id):
    def get_hypermedia(actor, exc):
        return {'_links': {'self': '{}/actors/v2/{}/executions/{}'.format(actor.api_server, actor.id, exc),
                           'owner': '{}/profiles/v2/{}'.format(actor.api_server, actor.owner),
                           'messages': '{}/actors/v2/{}/messages'.format(actor.api_server, actor.id)},
                }

    args = self.validate_post()
    d = {}
    # build a dictionary of k:v pairs from the query parameters, and pass a single
    # additional object 'message' from within the post payload. Note that 'message'
    # need not be JSON data.
    for k, v in request.args.items():
        if k == 'message':
            continue
        d[k] = v
    if hasattr(g, 'user'):
        d['_abaco_username'] = g.user
    if hasattr(g, 'api_server'):
        d['_abaco_api_server'] = g.api_server
    # if hasattr(g, 'jwt'):
    #     d['_abaco_jwt'] = g.jwt
    # if hasattr(g, 'jwt_server'):
    #     d['_abaco_jwt_server'] = g.jwt_server
    if hasattr(g, 'jwt_header_name'):
        d['_abaco_jwt_header_name'] = g.jwt_header_name
    dbid = Actor.get_dbid(g.tenant, actor_id)
    # create an execution
    exc = Execution.add_execution(dbid, {'cpu': 0,
                                         'io': 0,
                                         'runtime': 0,
                                         'status': SUBMITTED,
                                         'executor': g.user})
    d['_abaco_execution_id'] = exc
    d['_abaco_Content-Type'] = args.get('_abaco_Content-Type', '')
    ch = ActorMsgChannel(actor_id=dbid)
    ch.put_msg(message=args['message'], d=d)
    # make sure at least one worker is available
    actor = Actor.from_db(actors_store[dbid])
    actor.ensure_one_worker()
    result = {'execution_id': exc, 'msg': args['message']}
    result.update(get_hypermedia(actor, exc))
    case = Config.get('web', 'case')
    if not case == 'camel':
        return ok(result)
    else:
        return ok(dict_to_camel(result))

def post(self, actor_id):
    args = self.validate_post()
    d = {}
    for k, v in request.args.items():
        if k == 'message':
            continue
        d[k] = v
    ch = ActorMsgChannel(actor_id=actor_id)
    ch.put_msg(msg=args['message'], d=d)
    return ok(result={'msg': args['message']})

def create_gauges(actor_ids): logger.debug("METRICS: Made it to create_gauges") for actor_id in actor_ids: if actor_id not in message_gauges.keys(): try: g = Gauge( 'message_count_for_actor_{}'.format( actor_id.decode("utf-8").replace('-', '_')), 'Number of messages for actor {}'.format( actor_id.decode("utf-8").replace('-', '_'))) message_gauges.update({actor_id: g}) logger.debug('Created gauge {}'.format(g)) except Exception as e: logger.info( "got exception trying to instantiate the Gauge: {}".format( e)) else: g = message_gauges[actor_id] try: ch = ActorMsgChannel(actor_id=actor_id.decode("utf-8")) except Exception as e: logger.error( "Exception connecting to ActorMsgChannel: {}".format(e)) raise e result = {'messages': len(ch._queue._queue)} ch.close() g.set(result['messages']) logger.debug("METRICS: {} messages found for actor: {}.".format( result['messages'], actor_id)) if actor_id not in worker_gaueges.keys(): try: g = Gauge( 'worker_count_for_actor_{}'.format( actor_id.decode("utf-8").replace('-', '_')), 'Number of workers for actor {}'.format( actor_id.decode("utf-8").replace('-', '_'))) worker_gaueges.update({actor_id: g}) logger.debug('Created worker gauge {}'.format(g)) except Exception as e: logger.info( "got exception trying to instantiate the Worker Gauge: {}". format(e)) else: g = worker_gaueges[actor_id] workers = Worker.get_workers(actor_id) result = {'workers': len(workers)} g.set(result['workers']) return actor_ids
def get(self): logger.debug("top of GET /admin/actors") actors = [] for k, v in actors_store.items(): actor = Actor.from_db(v) actor.workers = Worker.get_workers(actor.db_id) for id, worker in actor.workers.items(): actor.worker = worker break ch = ActorMsgChannel(actor_id=actor.db_id) actor.messages = len(ch._queue._queue) ch.close() summary = ExecutionsSummary(db_id=actor.db_id) actor.executions = summary.total_executions actor.runtime = summary.total_runtime actors.append(actor) logger.info("actors retrieved.") return ok(result=actors, msg="Actors retrieved successfully.")
def stop_workers(self, actor_id): """Stop existing workers; used when updating an actor's image.""" try: workers_dict = workers_store[actor_id] except KeyError: workers_dict = {} # if there are existing workers, we need to close the actor message channel and # gracefully shutdown the existing worker processes. if len(workers_dict.items()) > 0: # first, close the actor msg channel to prevent any new messages from being pulled # by the old workers. actor_ch = ActorMsgChannel(actor_id) actor_ch.close() # now, send messages to workers for a graceful shutdown: for _, worker in workers_dict.items(): ch = WorkerChannel(name=worker['ch_name']) ch.put('stop')
def stop_workers(self, actor_id, worker_ids): """Stop existing workers; used when updating an actor's image.""" try: workers_dict = workers_store[actor_id] except KeyError: workers_dict = {} # if there are existing workers, we need to close the actor message channel and # gracefully shutdown the existing worker processes. if len(workers_dict.items()) > 0: # first, close the actor msg channel to prevent any new messages from being pulled # by the old workers. actor_ch = ActorMsgChannel(actor_id) actor_ch.close() # now, send messages to workers for a graceful shutdown: for _, worker in workers_dict.items(): # don't stop the new workers: if worker['id'] not in worker_ids: ch = WorkerChannel(name=worker['ch_name']) ch.put('stop')
def stop_workers(self, actor_id): """Stop existing workers; used when updating an actor's image.""" try: workers = json.loads(workers_store[actor_id]) print("Found existing workers: {}".format(str(workers))) except KeyError: print("No existing workers.") workers = {} # if there are existing workers, we need to close the actor message channel and # gracefully shutdown the existing worker processes. if len(workers) > 0 : # first, close the actor msg channel to prevent any new messages from being pulled # by the old workers. actor_ch = ActorMsgChannel(actor_id) actor_ch.close() # now, send messages to workers for a graceful shutdown: for worker in workers: ch = WorkerChannel(name=worker['ch_name']) ch.put('stop')
def get_metrics(self):
    logger.debug("top of get in MetricResource")
    actor_ids = [db_id for db_id, _ in actors_store.items()]
    logger.debug("ACTOR IDS: {}".format(actor_ids))
    try:
        if actor_ids:
            for actor_id in actor_ids:
                if actor_id not in message_gauges.keys():
                    try:
                        g = Gauge('message_count_for_actor_{}'.format(actor_id.decode("utf-8").replace('-', '_')),
                                  'Number of messages for actor {}'.format(actor_id.decode("utf-8").replace('-', '_')))
                        message_gauges.update({actor_id: g})
                    except Exception as e:
                        logger.info("got exception trying to instantiate the Gauge: {}".format(e))
                else:
                    g = message_gauges[actor_id]
                try:
                    ch = ActorMsgChannel(actor_id=actor_id.decode("utf-8"))
                except Exception as e:
                    logger.error("Exception connecting to ActorMsgChannel: {}".format(e))
                    raise e
                result = {'messages': len(ch._queue._queue)}
                ch.close()
                g.set(result['messages'])
                logger.debug("METRICS: {} messages found for actor: {}.".format(result['messages'], actor_id))
        return actor_ids
    except Exception as e:
        logger.info("Got exception in get_metrics: {}".format(e))
        return []

def post(self, actor_id):
    def get_hypermedia(actor, exc):
        return {'_links': {'self': '{}/actors/v2/{}/executions/{}'.format(actor.api_server, actor.id, exc),
                           'owner': '{}/profiles/v2/{}'.format(actor.api_server, actor.owner),
                           'messages': '{}/actors/v2/{}/messages'.format(actor.api_server, actor.id)},
                }

    args = self.validate_post()
    d = {}
    # build a dictionary of k:v pairs from the query parameters, and pass a single
    # additional object 'message' from within the post payload. Note that 'message'
    # need not be JSON data.
    for k, v in request.args.items():
        if k == 'message':
            continue
        d[k] = v
    if hasattr(g, 'user'):
        d['_abaco_username'] = g.user
    if hasattr(g, 'api_server'):
        d['_abaco_api_server'] = g.api_server
    # if hasattr(g, 'jwt'):
    #     d['_abaco_jwt'] = g.jwt
    # if hasattr(g, 'jwt_server'):
    #     d['_abaco_jwt_server'] = g.jwt_server
    if hasattr(g, 'jwt_header_name'):
        d['_abaco_jwt_header_name'] = g.jwt_header_name
    dbid = Actor.get_dbid(g.tenant, actor_id)
    # create an execution
    exc = Execution.add_execution(dbid, {'cpu': 0,
                                         'io': 0,
                                         'runtime': 0,
                                         'status': SUBMITTED,
                                         'executor': g.user})
    d['_abaco_execution_id'] = exc
    d['_abaco_Content-Type'] = args.get('_abaco_Content-Type', '')
    ch = ActorMsgChannel(actor_id=dbid)
    ch.put_msg(message=args['message'], d=d)
    # make sure at least one worker is available
    workers = Worker.get_workers(dbid)
    actor = Actor.from_db(actors_store[dbid])
    if len(workers.items()) < 1:
        ch = CommandChannel()
        ch.put_cmd(actor_id=dbid, image=actor.image, tenant=g.tenant, num=1, stop_existing=False)
    result = {'execution_id': exc, 'msg': args['message']}
    result.update(get_hypermedia(actor, exc))
    case = Config.get('web', 'case')
    if not case == 'camel':
        return ok(result)
    else:
        return ok(dict_to_camel(result))

def get(self, actor_id):
    def get_hypermedia(actor):
        return {'_links': {'self': '{}/actors/v2/{}/messages'.format(actor.api_server, actor.id),
                           'owner': '{}/profiles/v2/{}'.format(actor.api_server, actor.owner),
                           },
                }

    # check that actor exists
    id = Actor.get_dbid(g.tenant, actor_id)
    try:
        actor = Actor.from_db(actors_store[id])
    except KeyError:
        raise ResourceError("actor not found: {}".format(actor_id), 404)
    result = {'messages': len(ActorMsgChannel(actor_id=id)._queue._queue)}
    result.update(get_hypermedia(actor))
    return ok(result)

def process_link(link, msg, d):
    """
    Process an event with a link.
    :return:
    """
    # ensure that the linked actor still exists; the link attribute is *always* the dbid of the linked
    # actor
    logger.debug("top of process_link")
    try:
        actors_store[link]
    except KeyError as e:
        logger.error("Processing event message for actor {} that does not exist. Quitting".format(link))
        raise e
    # create an execution for the linked actor with message
    exc = Execution.add_execution(link, {'cpu': 0,
                                         'io': 0,
                                         'runtime': 0,
                                         'status': SUBMITTED,
                                         'executor': 'Abaco Event'})
    logger.info("Events processor agent added execution {} for actor {}".format(exc, link))
    d['_abaco_execution_id'] = exc
    logger.debug("sending message to actor. Final message {} and message dictionary: {}".format(msg, d))
    ch = ActorMsgChannel(actor_id=link)
    ch.put_msg(message=msg, d=d)
    ch.close()
    logger.info("link processed.")

def create_gauges(actor_ids): """ Creates a Prometheus gauge for each actor id. The gauge is used to track the number of pending messages in the actor's queue. :param actor_ids: list of actors that should be processed. Does not include stateful actors or actors in a shutting down state. :return: """ logger.debug("top of create_gauges; actor_ids: {}".format(actor_ids)) # dictionary mapping actor_ids to their message queue lengths inbox_lengths = {} for actor_id in actor_ids: logger.debug("top of for loop for actor_id: {}".format(actor_id)) # first, make sure the actor still exists in the actor store try: actor = actors_store[actor_id] except KeyError: logger.error( f"actor {actor_id} does not exist in store; continuing to next actor." ) continue # If the actor doesn't have a gauge, add one if actor_id not in message_gauges.keys(): try: g = Gauge( 'message_count_for_actor_{}'.format( actor_id.replace('-', '_')), 'Number of messages for actor {}'.format( actor_id.replace('-', '_'))) message_gauges.update({actor_id: g}) logger.debug('Created gauge {}'.format(g)) except Exception as e: logger.error( "got exception trying to create/instantiate the gauge; " "actor {}; exception: {}".format(actor_id, e)) g = None else: # Otherwise, get this actor's existing gauge try: g = message_gauges[actor_id] except Exception as e: logger.info( "got exception trying to instantiate an existing gauge; " "actor: {}: exception:{}".format(actor_id, e)) g = None # Update this actor's gauge to its current # of messages try: ch = ActorMsgChannel(actor_id=actor_id) msg_length = len(ch._queue._queue) except Exception as e: logger.error( "Exception connecting to ActorMsgChannel: {}".format(e)) raise e ch.close() result = {'messages': msg_length} # add the actor's current message queue length to the inbox_lengths in-memory variable inbox_lengths[actor_id] = msg_length # if we were able to create the gauge, set it to the current message: if g: try: g.set(result['messages']) except Exception as e: logger.error( f"Got exception trying to set the messages on the gauge for actor: {actor_id}; " f"exception: {e}") logger.debug("METRICS: {} messages found for actor: {}.".format( result['messages'], actor_id)) # add a worker gauge for this actor if one does not exist if actor_id not in worker_gaueges.keys(): try: g = Gauge( 'worker_count_for_actor_{}'.format( actor_id.replace('-', '_')), 'Number of workers for actor {}'.format( actor_id.replace('-', '_'))) worker_gaueges.update({actor_id: g}) logger.debug('Created worker gauge {}'.format(g)) except Exception as e: logger.info( "got exception trying to instantiate the Worker Gauge: {}". format(e)) else: # Otherwise, get the worker gauge that already exists g = worker_gaueges[actor_id] # Update this actor's worker IDs workers = Worker.get_workers(actor_id) result = {'workers': len(workers)} try: g.set(result['workers']) except Exception as e: logger.error( f"got exception trying to set the worker gauge for actor {actor_id}; exception: {e}" ) logger.debug( f"METRICS: {result['workers']} workers found for actor: {actor_id}." ) # Update this actor's command channel metric # channel_name = actor.get("queue") # # queues_list = Config.get('spawner', 'host_queues').replace(' ', '') # valid_queues = queues_list.split(',') # # if not channel_name or channel_name not in valid_queues: # channel_name = 'default' # # if not channel_name: # # TODO -- this must be changed. there is no way returning no arguments will result in # # anythng but an exception. 
The calling function is expecting 3 arguments... # # if we really want to blow up right here we should just raise an appropriate exception. # return # TODO -- this code needs to be fixed. What follows is only a partial fix; what I think we want to do # is set the length of all of the different command channels once at the end of this loop. What was # happening instead was that it was only setting one of the command channel's lengths -- whatever command # channel happened to belong to the last actor in the loop. channel_name = 'default' ch = CommandChannel(name=channel_name) cmd_length = len(ch._queue._queue) command_gauge.labels(channel_name).set(cmd_length) logger.debug( f"METRICS COMMAND CHANNEL {channel_name} size: {command_gauge}") ch.close() # Return actor_ids so we don't have to query for them again later return actor_ids, inbox_lengths, cmd_length
def create_gauges(actor_ids): logger.debug( "METRICS: Made it to create_gauges; actor_ids: {}".format(actor_ids)) inbox_lengths = {} for actor_id in actor_ids: logger.debug("top of for loop for actor_id: {}".format(actor_id)) try: actor = actors_store[actor_id] except KeyError: logger.error("actor {} does not exist.".format(actor_id)) continue # If the actor doesn't have a gauge, add one if actor_id not in message_gauges.keys(): try: g = Gauge( 'message_count_for_actor_{}'.format( actor_id.decode("utf-8").replace('-', '_')), 'Number of messages for actor {}'.format( actor_id.decode("utf-8").replace('-', '_'))) message_gauges.update({actor_id: g}) logger.debug('Created gauge {}'.format(g)) except Exception as e: logger.error( "got exception trying to create/instantiate the gauge; " "actor {}; exception: {}".format(actor_id, e)) else: # Otherwise, get this actor's existing gauge try: g = message_gauges[actor_id] except Exception as e: logger.info( "got exception trying to instantiate an existing gauge; " "actor: {}: exception:{}".format(actor_id, e)) # Update this actor's command channel metric channel_name = actor.get("queue") queues_list = Config.get('spawner', 'host_queues').replace(' ', '') valid_queues = queues_list.split(',') if not channel_name or channel_name not in valid_queues: channel_name = 'default' # Update this actor's gauge to its current # of messages try: ch = ActorMsgChannel(actor_id=actor_id.decode("utf-8")) except Exception as e: logger.error( "Exception connecting to ActorMsgChannel: {}".format(e)) raise e result = {'messages': len(ch._queue._queue)} inbox_lengths[actor_id.decode("utf-8")] = len(ch._queue._queue) ch.close() g.set(result['messages']) logger.debug("METRICS: {} messages found for actor: {}.".format( result['messages'], actor_id)) # add a worker gauge for this actor if one does not exist if actor_id not in worker_gaueges.keys(): try: g = Gauge( 'worker_count_for_actor_{}'.format( actor_id.decode("utf-8").replace('-', '_')), 'Number of workers for actor {}'.format( actor_id.decode("utf-8").replace('-', '_'))) worker_gaueges.update({actor_id: g}) logger.debug('Created worker gauge {}'.format(g)) except Exception as e: logger.info( "got exception trying to instantiate the Worker Gauge: {}". format(e)) else: # Otherwise, get the worker gauge that already exists g = worker_gaueges[actor_id] # Update this actor's worker IDs workers = Worker.get_workers(actor_id) result = {'workers': len(workers)} g.set(result['workers']) ch = CommandChannel(name=channel_name) cmd_length = len(ch._queue._queue) command_gauge.labels(channel_name).set(cmd_length) logger.debug("METRICS COMMAND CHANNEL {} size: {}".format( channel_name, command_gauge)) ch.close() # Return actor_ids so we don't have to query for them again later return actor_ids, inbox_lengths, cmd_length
def post(self, actor_id):
    def get_hypermedia(actor, exc):
        return {'_links': {'self': '{}/actors/v2/{}/executions/{}'.format(actor.api_server, actor.id, exc),
                           'owner': '{}/profiles/v2/{}'.format(actor.api_server, actor.owner),
                           'messages': '{}/actors/v2/{}/messages'.format(actor.api_server, actor.id)},
                }

    logger.debug("top of POST /actors/{}/messages.".format(actor_id))
    dbid = Actor.get_dbid(g.tenant, actor_id)
    try:
        Actor.from_db(actors_store[dbid])
    except KeyError:
        logger.debug("did not find actor: {}.".format(actor_id))
        raise ResourceError("No actor found with id: {}.".format(actor_id), 404)
    args = self.validate_post()
    d = {}
    # build a dictionary of k:v pairs from the query parameters, and pass a single
    # additional object 'message' from within the post payload. Note that 'message'
    # need not be JSON data.
    logger.debug("POST body validated. actor: {}.".format(actor_id))
    for k, v in request.args.items():
        if k == 'message':
            continue
        d[k] = v
    logger.debug("extra fields added to message from query parameters: {}.".format(d))
    if hasattr(g, 'user'):
        d['_abaco_username'] = g.user
        logger.debug("_abaco_username: {} added to message.".format(g.user))
    if hasattr(g, 'api_server'):
        d['_abaco_api_server'] = g.api_server
        logger.debug("_abaco_api_server: {} added to message.".format(g.api_server))
    # if hasattr(g, 'jwt'):
    #     d['_abaco_jwt'] = g.jwt
    # if hasattr(g, 'jwt_server'):
    #     d['_abaco_jwt_server'] = g.jwt_server
    if hasattr(g, 'jwt_header_name'):
        d['_abaco_jwt_header_name'] = g.jwt_header_name
        logger.debug("abaco_jwt_header_name: {} added to message.".format(g.jwt_header_name))
    # create an execution
    exc = Execution.add_execution(dbid, {'cpu': 0,
                                         'io': 0,
                                         'runtime': 0,
                                         'status': SUBMITTED,
                                         'executor': g.user})
    logger.info("Execution {} added for actor {}".format(exc, actor_id))
    d['_abaco_execution_id'] = exc
    d['_abaco_Content_Type'] = args.get('_abaco_Content_Type', '')
    logger.debug("Final message dictionary: {}".format(d))
    ch = ActorMsgChannel(actor_id=dbid)
    ch.put_msg(message=args['message'], d=d)
    ch.close()
    logger.debug("Message added to actor inbox. id: {}.".format(actor_id))
    # make sure at least one worker is available
    actor = Actor.from_db(actors_store[dbid])
    actor.ensure_one_worker()
    logger.debug("ensure_one_worker() called. id: {}.".format(actor_id))
    if args.get('_abaco_Content_Type') == 'application/octet-stream':
        result = {'execution_id': exc, 'msg': 'binary - omitted'}
    else:
        result = {'execution_id': exc, 'msg': args['message']}
    result.update(get_hypermedia(actor, exc))
    case = Config.get('web', 'case')
    if not case == 'camel':
        return ok(result)
    else:
        return ok(dict_to_camel(result))

def subscribe(tenant, actor_id, worker_id, api_server, client_id, client_secret,
              access_token, refresh_token, worker_ch):
    """
    Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor
    containers when messages arrive. Also subscribes to the worker channel for future communications.
    :return:
    """
    logger.debug("Top of subscribe().")
    actor_ch = ActorMsgChannel(actor_id)
    try:
        leave_containers = Config.get('workers', 'leave_containers')
    except configparser.NoOptionError:
        logger.info("No leave_containers value configured.")
        leave_containers = False
    if hasattr(leave_containers, 'lower'):
        leave_containers = leave_containers.lower() == "true"
    logger.info("leave_containers: {}".format(leave_containers))
    ag = None
    if api_server and client_id and client_secret and access_token and refresh_token:
        logger.info("Creating agave client.")
        verify = get_tenant_verify(tenant)
        ag = Agave(api_server=api_server,
                   token=access_token,
                   refresh_token=refresh_token,
                   api_key=client_id,
                   api_secret=client_secret,
                   verify=verify)
    else:
        logger.info("Not creating agave client.")
    logger.info("Starting the process worker channel thread.")
    t = threading.Thread(target=process_worker_ch, args=(tenant, worker_ch, actor_id, worker_id, actor_ch, ag))
    t.start()
    logger.info("Worker subscribing to actor channel.")
    # keep track of whether we need to update the worker's status back to READY; otherwise, we
    # will hit redis with an UPDATE every time the subscription loop times out (i.e., every 2s)
    update_worker_status = True
    # shared global tracking whether this worker should keep running; shared between this thread and
    # the "worker channel processing" thread.
    global keep_running
    # main subscription loop -- processing messages from actor's mailbox
    while keep_running:
        if update_worker_status:
            Worker.update_worker_status(actor_id, worker_id, READY)
            update_worker_status = False
        try:
            msg = actor_ch.get_one()
        except channelpy.ChannelClosedException:
            logger.info("Channel closed, worker exiting...")
            keep_running = False
            sys.exit()
        logger.info("worker {} processing new msg.".format(worker_id))
        try:
            Worker.update_worker_status(actor_id, worker_id, BUSY)
        except Exception as e:
            logger.error("unexpected exception from call to update_worker_status. "
                         "actor_id: {}; worker_id: {}; status: {}; exception: {}".format(actor_id, worker_id, BUSY, e))
            raise e
        update_worker_status = True
        logger.info("Received message {}. Starting actor container...".format(msg))
        # the msg object is a dictionary with an entry called message and an arbitrary
        # set of k:v pairs coming in from the query parameters.
        message = msg.pop('message', '')
        actor = Actor.from_db(actors_store[actor_id])
        execution_id = msg['_abaco_execution_id']
        content_type = msg['_abaco_Content_Type']
        mounts = actor.mounts
        logger.debug("actor mounts: {}".format(mounts))
        # for results, create a socket in the configured directory.
        try:
            socket_host_path_dir = Config.get('workers', 'socket_host_path_dir')
        except (configparser.NoSectionError, configparser.NoOptionError):
            logger.error("No socket_host_path configured. Cannot manage results data.")
            Actor.set_status(actor_id, ERROR, msg="Abaco instance not configured for results data.")
            continue
        socket_host_path = '{}.sock'.format(os.path.join(socket_host_path_dir, worker_id, execution_id))
        logger.info("Create socket at path: {}".format(socket_host_path))
        # add the socket as a mount:
        mounts.append({'host_path': socket_host_path,
                       'container_path': '/_abaco_results.sock',
                       'format': 'ro'})
        # for binary data, create a fifo in the configured directory. The configured
        # fifo_host_path_dir is equal to the fifo path in the worker container:
        fifo_host_path = None
        if content_type == 'application/octet-stream':
            try:
                fifo_host_path_dir = Config.get('workers', 'fifo_host_path_dir')
            except (configparser.NoSectionError, configparser.NoOptionError):
                logger.error("No fifo_host_path configured. Cannot manage binary data.")
                Actor.set_status(actor_id, ERROR, msg="Abaco instance not configured for binary data.")
                continue
            fifo_host_path = os.path.join(fifo_host_path_dir, worker_id, execution_id)
            try:
                os.mkfifo(fifo_host_path)
                logger.info("Created fifo at path: {}".format(fifo_host_path))
            except Exception as e:
                logger.error("Could not create fifo_path. Exception: {}".format(e))
                raise e
            # add the fifo as a mount:
            mounts.append({'host_path': fifo_host_path,
                           'container_path': '/_abaco_binary_data',
                           'format': 'ro'})
        # the execution object was created by the controller, but we need to add the worker id to it now that we
        # know which worker will be working on the execution.
        logger.debug("Adding worker_id to execution.")
        Execution.add_worker_id(actor_id, execution_id, worker_id)
        # privileged dictates whether the actor container runs in privileged mode and if docker daemon is mounted.
        privileged = False
        if type(actor['privileged']) == bool and actor['privileged']:
            privileged = True
        logger.debug("privileged: {}".format(privileged))
        # retrieve the default environment registered with the actor.
        environment = actor['default_environment']
        logger.debug("Actor default environment: {}".format(environment))
        # construct the user field from the actor's uid and gid:
        user = get_container_user(actor)
        logger.debug("Final user value: {}".format(user))
        # overlay the default_environment registered for the actor with the msg
        # dictionary
        environment.update(msg)
        environment['_abaco_access_token'] = ''
        environment['_abaco_actor_dbid'] = actor_id
        environment['_abaco_actor_id'] = actor.id
        environment['_abaco_actor_state'] = actor.state
        logger.debug("Overlayed environment: {}".format(environment))
        # if we have an agave client, get a fresh set of tokens:
        if ag:
            try:
                ag.token.refresh()
                token = ag.token.token_info['access_token']
                environment['_abaco_access_token'] = token
                logger.info("Refreshed the tokens. Passed {} to the environment.".format(token))
            except Exception as e:
                logger.error("Got an exception trying to get an access token: {}".format(e))
        else:
            logger.info("Agave client `ag` is None -- not passing access token.")
        logger.info("Passing update environment: {}".format(environment))
        try:
            stats, logs, final_state, exit_code, start_time = execute_actor(actor_id, worker_id, execution_id,
                                                                            image, message, user, environment,
                                                                            privileged, mounts, leave_containers,
                                                                            fifo_host_path, socket_host_path)
        except DockerStartContainerError as e:
            logger.error("Got DockerStartContainerError: {}".format(e))
            Actor.set_status(actor_id, ERROR, "Error executing container: {}".format(e))
            continue
        # Add the completed stats to the execution
        logger.info("Actor container finished successfully. Got stats object:{}".format(str(stats)))
        Execution.finalize_execution(actor_id, execution_id, COMPLETE, stats, final_state, exit_code, start_time)
        logger.info("Added execution: {}".format(execution_id))
        # Add the logs to the execution
        Execution.set_logs(execution_id, logs)
        logger.info("Added execution logs.")
        # Update the worker's last updated and last execution fields:
        try:
            Worker.update_worker_execution_time(actor_id, worker_id)
        except KeyError:
            # it is possible that this worker was sent a graceful shutdown command in the other thread
            # and that spawner has already removed this worker from the store.
            logger.info("worker {} got unexpected key error trying to update its execution time. "
                        "Worker better be shutting down! keep_running: {}".format(worker_id, keep_running))
            if keep_running:
                logger.error("worker couldn't update its execution time but keep_running is still true!")
        logger.info("worker time stamps updated.")

def subscribe(tenant, actor_id, api_server, client_id, client_secret, access_token, refresh_token, worker_ch):
    """
    Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor
    containers when messages arrive. Also subscribes to the worker channel for future communications.
    :return:
    """
    actor_ch = ActorMsgChannel(actor_id)
    ag = None
    if api_server and client_id and client_secret and access_token and refresh_token:
        ag = Agave(api_server=api_server,
                   token=access_token,
                   refresh_token=refresh_token,
                   api_key=client_id,
                   api_secret=client_secret)
    else:
        print("Not creating agave client.")
    t = threading.Thread(target=process_worker_ch, args=(tenant, worker_ch, actor_id, actor_ch, ag))
    t.start()
    print("Worker subscribing to actor channel...")
    global keep_running
    while keep_running:
        Worker.update_worker_status(actor_id, worker_ch.name, READY)
        try:
            msg = actor_ch.get(timeout=2)
        except channelpy.ChannelTimeoutException:
            continue
        except channelpy.ChannelClosedException:
            print("Channel closed, worker exiting...")
            keep_running = False
            sys.exit()
        print("Received message {}. Starting actor container...".format(str(msg)))
        # the msg object is a dictionary with an entry called message and an arbitrary
        # set of k:v pairs coming in from the query parameters.
        message = msg.pop('message', '')
        actor = Actor.from_db(actors_store[actor_id])
        execution_id = msg['_abaco_execution_id']
        privileged = False
        if actor['privileged'] == 'TRUE':
            privileged = True
        environment = actor['default_environment']
        print("Actor default environment: {}".format(environment))
        print("Actor privileged: {}".format(privileged))
        # overlay the default_environment registered for the actor with the msg
        # dictionary
        environment.update(msg)
        environment['_abaco_access_token'] = ''
        # if we have an agave client, get a fresh set of tokens:
        if ag:
            try:
                ag.token.refresh()
                token = ag.token.token_info['access_token']
                environment['_abaco_access_token'] = token
                print("Refreshed the tokens. Passed {} to the environment.".format(token))
            except Exception as e:
                print("Got an exception trying to get an access token: {}".format(e))
        else:
            print("Agave client `ag` is None -- not passing access token.")
        print("Passing update environment: {}".format(environment))
        try:
            stats, logs = execute_actor(actor_id, worker_ch, image, message, environment, privileged)
        except DockerStartContainerError as e:
            print("Got DockerStartContainerError: {}".format(str(e)))
            Actor.set_status(actor_id, ERROR)
            continue
        # add the execution to the actor store
        print("Actor container finished successfully. Got stats object:{}".format(str(stats)))
        Execution.finalize_execution(actor_id, execution_id, COMPLETE, stats)
        print("Added execution: {}".format(execution_id))
        Execution.set_logs(execution_id, logs)
        Worker.update_worker_execution_time(actor_id, worker_ch.name)

def subscribe(tenant, actor_id, worker_id, api_server, client_id, client_secret,
              access_token, refresh_token, worker_ch):
    """
    Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor
    containers when messages arrive. Also subscribes to the worker channel for future communications.
    :return:
    """
    logger.debug("Top of subscribe().")
    actor_ch = ActorMsgChannel(actor_id)
    ag = None
    if api_server and client_id and client_secret and access_token and refresh_token:
        logger.info("Creating agave client.")
        ag = Agave(api_server=api_server,
                   token=access_token,
                   refresh_token=refresh_token,
                   api_key=client_id,
                   api_secret=client_secret)
    else:
        logger.info("Not creating agave client.")
    logger.info("Starting the process worker channel thread.")
    t = threading.Thread(target=process_worker_ch, args=(tenant, worker_ch, actor_id, worker_id, actor_ch, ag))
    t.start()
    logger.info("Worker subscribing to actor channel.")
    update_worker_status = True
    global keep_running
    while keep_running:
        if update_worker_status:
            Worker.update_worker_status(actor_id, worker_id, READY)
            update_worker_status = False
        try:
            msg = actor_ch.get(timeout=2)
        except channelpy.ChannelTimeoutException:
            continue
        except channelpy.ChannelClosedException:
            logger.info("Channel closed, worker exiting...")
            keep_running = False
            sys.exit()
        update_worker_status = True
        logger.info("Received message {}. Starting actor container...".format(msg))
        # the msg object is a dictionary with an entry called message and an arbitrary
        # set of k:v pairs coming in from the query parameters.
        message = msg.pop('message', '')
        actor = Actor.from_db(actors_store[actor_id])
        execution_id = msg['_abaco_execution_id']
        # the execution object was created by the controller, but we need to add the worker id to it now that we
        # know which worker will be working on the execution.
        logger.debug("Adding worker_id to execution.")
        Execution.add_worker_id(actor_id, execution_id, worker_id)
        # privileged dictates whether the actor container runs in privileged mode and if docker daemon is mounted.
        privileged = False
        if actor['privileged'] == 'TRUE':
            privileged = True
        logger.debug("privileged: {}".format(privileged))
        # retrieve the default environment registered with the actor.
        environment = actor['default_environment']
        logger.debug("Actor default environment: {}".format(environment))
        # overlay the default_environment registered for the actor with the msg
        # dictionary
        environment.update(msg)
        environment['_abaco_access_token'] = ''
        environment['_abaco_actor_dbid'] = actor_id
        environment['_abaco_actor_id'] = actor.id
        environment['_abaco_actor_state'] = actor.state
        logger.debug("Overlayed environment: {}".format(environment))
        # if we have an agave client, get a fresh set of tokens:
        if ag:
            try:
                ag.token.refresh()
                token = ag.token.token_info['access_token']
                environment['_abaco_access_token'] = token
                logger.info("Refreshed the tokens. Passed {} to the environment.".format(token))
            except Exception as e:
                logger.error("Got an exception trying to get an access token: {}".format(e))
        else:
            logger.info("Agave client `ag` is None -- not passing access token.")
        logger.info("Passing update environment: {}".format(environment))
        try:
            stats, logs, final_state, exit_code = execute_actor(actor_id, worker_id, worker_ch, image,
                                                                message, environment, privileged)
        except DockerStartContainerError as e:
            logger.error("Got DockerStartContainerError: {}".format(str(e)))
            Actor.set_status(actor_id, ERROR)
            continue
        # Add the completed stats to the execution
        logger.info("Actor container finished successfully. Got stats object:{}".format(str(stats)))
        Execution.finalize_execution(actor_id, execution_id, COMPLETE, stats, final_state, exit_code)
        logger.info("Added execution: {}".format(execution_id))
        # Add the logs to the execution
        Execution.set_logs(execution_id, logs)
        logger.info("Added execution logs.")
        # Update the worker's last updated and last execution fields:
        Worker.update_worker_execution_time(actor_id, worker_id)
        logger.info("worker time stamps updated.")

def subscribe(tenant, actor_id, image, worker_id, api_server, client_id, client_secret,
              access_token, refresh_token, worker_ch):
    """
    Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor
    containers when messages arrive. Also launches a separate thread which ultimately subscribes to the worker
    channel for future communications.
    :return:
    """
    logger.debug("Top of subscribe(). worker_id: {}".format(worker_id))
    actor_ch = ActorMsgChannel(actor_id)

    # establish configs for this worker -------
    try:
        leave_containers = Config.get('workers', 'leave_containers')
    except configparser.NoOptionError:
        logger.debug("No leave_containers value configured.")
        leave_containers = False
    if hasattr(leave_containers, 'lower'):
        leave_containers = leave_containers.lower() == "true"
    logger.debug("leave_containers: {}".format(leave_containers))

    try:
        mem_limit = Config.get('workers', 'mem_limit')
    except configparser.NoOptionError:
        logger.debug("No mem_limit value configured.")
        mem_limit = "-1"
    mem_limit = str(mem_limit)

    try:
        max_cpus = Config.get('workers', 'max_cpus')
    except configparser.NoOptionError:
        logger.debug("No max_cpus value configured.")
        max_cpus = "-1"
    logger.debug("max_cpus: {}".format(max_cpus))

    # instantiate an OAuth client python object if credentials were passed -----
    ag = None
    if api_server and client_id and client_secret and access_token and refresh_token:
        logger.info("Creating agave client.")
        verify = get_tenant_verify(tenant)
        ag = Agave(api_server=api_server,
                   token=access_token,
                   refresh_token=refresh_token,
                   api_key=client_id,
                   api_secret=client_secret,
                   verify=verify)
    else:
        logger.info("Not creating agave client.")

    # start a separate thread for handling messages sent to the worker channel ----
    logger.info("Starting the process worker channel thread.")
    t = threading.Thread(target=process_worker_ch, args=(tenant, worker_ch, actor_id, worker_id, actor_ch, ag))
    t.start()

    # subscribe to the actor message queue -----
    logger.info("Worker subscribing to actor channel. worker_id: {}".format(worker_id))
    # keep track of whether we need to update the worker's status back to READY; otherwise, we
    # will hit redis with an UPDATE every time the subscription loop times out (i.e., every 2s)
    update_worker_status = True

    # global tracks whether this worker should keep running.
    globals.keep_running = True

    # consecutive_errors tracks the number of consecutive times a worker has gotten an error trying to process a
    # message. Even though the message will be requeued, we do not want the worker to continue processing
    # indefinitely when a compute node is unhealthy.
    consecutive_errors = 0

    # main subscription loop -- processing messages from actor's mailbox
    while globals.keep_running:
        logger.debug("top of keep_running; worker id: {}".format(worker_id))
        if update_worker_status:
            Worker.update_worker_status(actor_id, worker_id, READY)
            logger.debug("updated worker status to READY in SUBSCRIBE; worker id: {}".format(worker_id))
            update_worker_status = False
        try:
            msg, msg_obj = actor_ch.get_one()
        except channelpy.ChannelClosedException:
            logger.info("Channel closed, worker exiting. worker id: {}".format(worker_id))
            globals.keep_running = False
            sys.exit()
        logger.info("worker {} processing new msg.".format(worker_id))
        try:
            Worker.update_worker_status(actor_id, worker_id, BUSY)
        except Exception as e:
            logger.error("unexpected exception from call to update_worker_status. Nacking message. "
                         "actor_id: {}; worker_id: {}; status: {}; exception: {}".format(actor_id, worker_id, BUSY, e))
            logger.info("worker exiting. worker_id: {}".format(worker_id))
            msg_obj.nack(requeue=True)
            raise e
        update_worker_status = True
        logger.info("Received message {}. Starting actor container. worker id: {}".format(msg, worker_id))
        # the msg object is a dictionary with an entry called message and an arbitrary
        # set of k:v pairs coming in from the query parameters.
        message = msg.pop('message', '')
        try:
            actor = Actor.from_db(actors_store[actor_id])
            execution_id = msg['_abaco_execution_id']
            content_type = msg['_abaco_Content_Type']
            mounts = actor.mounts
            logger.debug("actor mounts: {}".format(mounts))
        except Exception as e:
            logger.error("unexpected exception retrieving actor, execution, content-type, mounts. Nacking message. "
                         "actor_id: {}; worker_id: {}; status: {}; exception: {}".format(actor_id, worker_id, BUSY, e))
            msg_obj.nack(requeue=True)
            logger.info("worker exiting. worker_id: {}".format(worker_id))
            raise e
        # for results, create a socket in the configured directory.
        try:
            socket_host_path_dir = Config.get('workers', 'socket_host_path_dir')
        except (configparser.NoSectionError, configparser.NoOptionError) as e:
            logger.error("No socket_host_path configured. Cannot manage results data. Nacking message")
            Actor.set_status(actor_id, ERROR, status_message="Abaco instance not configured for results data.")
            msg_obj.nack(requeue=True)
            logger.info("worker exiting. worker_id: {}".format(worker_id))
            raise e
        socket_host_path = '{}.sock'.format(os.path.join(socket_host_path_dir, worker_id, execution_id))
        logger.info("Create socket at path: {}".format(socket_host_path))
        # add the socket as a mount:
        mounts.append({'host_path': socket_host_path,
                       'container_path': '/_abaco_results.sock',
                       'format': 'ro'})
        # for binary data, create a fifo in the configured directory. The configured
        # fifo_host_path_dir is equal to the fifo path in the worker container:
        fifo_host_path = None
        if content_type == 'application/octet-stream':
            try:
                fifo_host_path_dir = Config.get('workers', 'fifo_host_path_dir')
            except (configparser.NoSectionError, configparser.NoOptionError) as e:
                logger.error("No fifo_host_path configured. Cannot manage binary data.")
                Actor.set_status(actor_id, ERROR,
                                 status_message="Abaco instance not configured for binary data. Nacking message.")
                msg_obj.nack(requeue=True)
                logger.info("worker exiting. worker_id: {}".format(worker_id))
                raise e
            fifo_host_path = os.path.join(fifo_host_path_dir, worker_id, execution_id)
            try:
                os.mkfifo(fifo_host_path)
                logger.info("Created fifo at path: {}".format(fifo_host_path))
            except Exception as e:
                logger.error("Could not create fifo_path. Nacking message. Exception: {}".format(e))
                msg_obj.nack(requeue=True)
                logger.info("worker exiting. worker_id: {}".format(worker_id))
                raise e
            # add the fifo as a mount:
            mounts.append({'host_path': fifo_host_path,
                           'container_path': '/_abaco_binary_data',
                           'format': 'ro'})
        # the execution object was created by the controller, but we need to add the worker id to it now that we
        # know which worker will be working on the execution.
        logger.debug("Adding worker_id to execution. worker_id: {}".format(worker_id))
        try:
            Execution.add_worker_id(actor_id, execution_id, worker_id)
        except Exception as e:
            logger.error("Unexpected exception adding worker_id to the Execution. Nacking message. "
                         "Exception: {}".format(e))
            msg_obj.nack(requeue=True)
            logger.info("worker exiting. worker_id: {}".format(worker_id))
            raise e
        # privileged dictates whether the actor container runs in privileged mode and if docker daemon is mounted.
        privileged = False
        if type(actor['privileged']) == bool and actor['privileged']:
            privileged = True
        logger.debug("privileged: {}; worker_id: {}".format(privileged, worker_id))
        # overlay resource limits if set on actor:
        if actor.mem_limit:
            mem_limit = actor.mem_limit
        if actor.max_cpus:
            max_cpus = actor.max_cpus
        # retrieve the default environment registered with the actor.
        environment = actor['default_environment']
        logger.debug("Actor default environment: {}".format(environment))
        # construct the user field from the actor's uid and gid:
        user = get_container_user(actor)
        logger.debug("Final user value: {}".format(user))
        # overlay the default_environment registered for the actor with the msg
        # dictionary
        environment.update(msg)
        environment['_abaco_access_token'] = ''
        environment['_abaco_actor_dbid'] = actor_id
        environment['_abaco_actor_id'] = actor.id
        environment['_abaco_worker_id'] = worker_id
        environment['_abaco_container_repo'] = actor.image
        environment['_abaco_actor_state'] = actor.state
        environment['_abaco_actor_name'] = actor.name or 'None'
        logger.debug("Overlayed environment: {}; worker_id: {}".format(environment, worker_id))
        # if we have an agave client, get a fresh set of tokens:
        if ag:
            try:
                ag.token.refresh()
                token = ag.token.token_info['access_token']
                environment['_abaco_access_token'] = token
                logger.info("Refreshed the tokens. Passed {} to the environment.".format(token))
            except Exception as e:
                logger.error("Got an exception trying to get an access token. Stopping worker and nacking message. "
                             "Exception: {}".format(e))
                msg_obj.nack(requeue=True)
                logger.info("worker exiting. worker_id: {}".format(worker_id))
                raise e
        else:
            logger.info("Agave client `ag` is None -- not passing access token; worker_id: {}".format(worker_id))
        logger.info("Passing update environment: {}".format(environment))
        logger.info("About to execute actor; worker_id: {}".format(worker_id))
        try:
            stats, logs, final_state, exit_code, start_time = execute_actor(actor_id, worker_id, execution_id,
                                                                            image, message, user, environment,
                                                                            privileged, mounts, leave_containers,
                                                                            fifo_host_path, socket_host_path,
                                                                            mem_limit, max_cpus)
        except DockerStartContainerError as e:
            logger.error("Worker {} got DockerStartContainerError: {} trying to start actor for execution {}."
                         "Placing message back on queue.".format(worker_id, e, execution_id))
            # if we failed to start the actor container, we leave the worker up and re-queue the original message
            msg_obj.nack(requeue=True)
            logger.debug('message requeued.')
            consecutive_errors += 1
            if consecutive_errors > MAX_WORKER_CONSECUTIVE_ERRORS:
                logger.error("Worker {} failed to successfully start actor for execution {} {} consecutive times; "
                             "Exception: {}. Putting the actor in error status and shutting "
                             "down workers.".format(worker_id, execution_id, MAX_WORKER_CONSECUTIVE_ERRORS, e))
                Actor.set_status(actor_id, ERROR, "Error executing container: {}; w".format(e))
                shutdown_workers(actor_id, delete_actor_ch=False)
                # wait for worker to be shutdown..
                time.sleep(60)
                break
            else:
                # sleep five seconds before getting a message again to give time for the compute
                # node and/or docker health to recover
                time.sleep(5)
                continue
        except DockerStopContainerError as e:
            logger.error("Worker {} was not able to stop actor for execution: {}; Exception: {}. "
                         "Putting the actor in error status and shutting down workers.".format(worker_id, execution_id, e))
            Actor.set_status(actor_id, ERROR, "Error executing container: {}".format(e))
            # since the error was with stopping the actor, we will consider this message "processed"; this choice
            # could be reconsidered/changed
            msg_obj.ack()
            shutdown_workers(actor_id, delete_actor_ch=False)
            # wait for worker to be shutdown..
            time.sleep(60)
            break
        except Exception as e:
            logger.error("Worker {} got an unexpected exception trying to run actor for execution: {}."
                         "Putting the actor in error status and shutting down workers. "
                         "Exception: {}; type: {}".format(worker_id, execution_id, e, type(e)))
            Actor.set_status(actor_id, ERROR, "Error executing container: {}".format(e))
            # the execute_actor function raises a DockerStartContainerError if it met an exception before starting the
            # actor container; if the container was started, then another exception should be raised. Therefore,
            # we can assume here that the container was at least started and we can ack the message.
            msg_obj.ack()
            shutdown_workers(actor_id, delete_actor_ch=False)
            # wait for worker to be shutdown..
            time.sleep(60)
            break
        # ack the message
        msg_obj.ack()
        logger.debug("container finished successfully; worker_id: {}".format(worker_id))
        # Add the completed stats to the execution
        logger.info("Actor container finished successfully. Got stats object:{}".format(str(stats)))
        Execution.finalize_execution(actor_id, execution_id, COMPLETE, stats, final_state, exit_code, start_time)
        logger.info("Added execution: {}; worker_id: {}".format(execution_id, worker_id))
        # Add the logs to the execution
        try:
            Execution.set_logs(execution_id, logs)
            logger.debug("Successfully added execution logs.")
        except Exception as e:
            msg = "Got exception trying to set logs for exception {}; " \
                  "Exception: {}; worker_id: {}".format(execution_id, e, worker_id)
            logger.error(msg)
        # Update the worker's last updated and last execution fields:
        try:
            Worker.update_worker_execution_time(actor_id, worker_id)
            logger.debug("worker execution time updated. worker_id: {}".format(worker_id))
        except KeyError:
            # it is possible that this worker was sent a graceful shutdown command in the other thread
            # and that spawner has already removed this worker from the store.
            logger.info("worker {} got unexpected key error trying to update its execution time. "
                        "Worker better be shutting down! keep_running: {}".format(worker_id, globals.keep_running))
            if globals.keep_running:
                logger.error("worker couldn't update its execution time but keep_running is still true!")
        # we completed an execution successfully; reset the consecutive_errors counter
        consecutive_errors = 0
        logger.info("worker time stamps updated; worker_id: {}".format(worker_id))
    logger.info("global.keep_running no longer true. worker is now exited. worker id: {}".format(worker_id))

def subscribe(tenant, actor_id, worker_id, api_server, client_id, client_secret,
              access_token, refresh_token, worker_ch):
    """
    Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor
    containers when messages arrive. Also subscribes to the worker channel for future communications.
    :return:
    """
    actor_ch = ActorMsgChannel(actor_id)
    ag = None
    if api_server and client_id and client_secret and access_token and refresh_token:
        ag = Agave(api_server=api_server,
                   token=access_token,
                   refresh_token=refresh_token,
                   api_key=client_id,
                   api_secret=client_secret)
    else:
        print("Not creating agave client.")
    t = threading.Thread(target=process_worker_ch, args=(tenant, worker_ch, actor_id, worker_id, actor_ch, ag))
    t.start()
    print("Worker subscribing to actor channel...")
    global keep_running
    while keep_running:
        Worker.update_worker_status(actor_id, worker_id, READY)
        try:
            msg = actor_ch.get(timeout=2)
        except channelpy.ChannelTimeoutException:
            continue
        except channelpy.ChannelClosedException:
            print("Channel closed, worker exiting...")
            keep_running = False
            sys.exit()
        print("Received message {}. Starting actor container...".format(str(msg)))
        # the msg object is a dictionary with an entry called message and an arbitrary
        # set of k:v pairs coming in from the query parameters.
        message = msg.pop('message', '')
        actor = Actor.from_db(actors_store[actor_id])
        execution_id = msg['_abaco_execution_id']
        Execution.add_worker_id(actor_id, execution_id, worker_id)
        privileged = False
        if actor['privileged'] == 'TRUE':
            privileged = True
        environment = actor['default_environment']
        print("Actor default environment: {}".format(environment))
        print("Actor privileged: {}".format(privileged))
        # overlay the default_environment registered for the actor with the msg
        # dictionary
        environment.update(msg)
        environment['_abaco_access_token'] = ''
        environment['_abaco_actor_dbid'] = actor_id
        environment['_abaco_actor_id'] = actor.id
        environment['_abaco_actor_state'] = actor.state
        # if we have an agave client, get a fresh set of tokens:
        if ag:
            try:
                ag.token.refresh()
                token = ag.token.token_info['access_token']
                environment['_abaco_access_token'] = token
                print("Refreshed the tokens. Passed {} to the environment.".format(token))
            except Exception as e:
                print("Got an exception trying to get an access token: {}".format(e))
        else:
            print("Agave client `ag` is None -- not passing access token.")
        print("Passing update environment: {}".format(environment))
        try:
            stats, logs, final_state, exit_code = execute_actor(actor_id, worker_id, worker_ch, image,
                                                                message, environment, privileged)
        except DockerStartContainerError as e:
            print("Got DockerStartContainerError: {}".format(str(e)))
            Actor.set_status(actor_id, ERROR)
            continue
        # add the execution to the actor store
        print("Actor container finished successfully. Got stats object:{}".format(str(stats)))
        Execution.finalize_execution(actor_id, execution_id, COMPLETE, stats, final_state, exit_code)
        print("Added execution: {}".format(execution_id))
        Execution.set_logs(execution_id, logs)
        Worker.update_worker_execution_time(actor_id, worker_id)