def put(self, actor_id):
    """Update the definition of an existing actor.

    The actor is looked up in ``actors_store`` under the tenant-qualified id,
    rebuilt from the arguments returned by ``self.validate_put`` and written
    back to the store. When the image changed, the status is reset to
    ``SUBMITTED``, a worker is requested and a command is put on the command
    channel.

    :param actor_id: id of the actor as supplied by the caller; combined with
        ``g.tenant`` to form the store key.
    :return: the ``ok(...)`` response wrapping ``actor.display()``.
    :raises ResourceError: with code 404 when the actor is not in the store.
    """
    dbid = Actor.get_dbid(g.tenant, actor_id)
    try:
        actor = Actor.from_db(actors_store[dbid])
    except KeyError:
        raise ResourceError("actor not found: {}".format(actor_id), 404)
    previous_image = actor.image
    args = self.validate_put(actor)
    args['tenant'] = g.tenant
    update_image = args['image'] != previous_image
    # An unchanged image keeps the current status; a new image resubmits the actor.
    args['status'] = SUBMITTED if update_image else actor.status
    args['api_server'] = g.api_server
    args['owner'] = g.user
    actor = Actor(**args)
    actors_store[actor.db_id] = actor.to_db()
    if update_image:
        # Only request a worker when a command will actually be sent for it;
        # requesting one unconditionally leaves a worker nobody is told to start.
        worker_ids = Worker.request_worker(actor.db_id)
        ch = CommandChannel()
        try:
            ch.put_cmd(actor_id=actor.db_id,
                       worker_ids=worker_ids,
                       image=actor.image,
                       tenant=args['tenant'])
        finally:
            # Release the channel even if publishing the command fails.
            ch.close()
    return ok(result=actor.display(), msg="Actor updated successfully.")
def scale_up(actor_id):
    """Request one additional worker for an actor and notify the command channel.

    :param actor_id: bytes key of the actor in ``actors_store``; it is decoded
        as UTF-8 and split on ``_`` to obtain the tenant.
    :return: the name of the command channel the command was put on
        (``actor.queue`` or ``'default'``), or ``None`` if anything failed
        while creating the worker or sending the command.
    :raises ValueError: if the decoded id does not contain exactly one ``_``
        (the split happens before the guarded section).
    """
    tenant, _ = actor_id.decode('utf8').split('_')
    logger.debug(
        'METRICS Attempting to create a new worker for {}'.format(actor_id))
    try:
        # create a worker & add to this actor
        actor = Actor.from_db(actors_store[actor_id])
        worker_id = Worker.request_worker(tenant=tenant, actor_id=actor_id)
        logger.info("New worker id: {}".format(worker_id))
        channel_name = actor.queue or 'default'
        ch = CommandChannel(name=channel_name)
        try:
            ch.put_cmd(actor_id=actor.db_id,
                       worker_id=worker_id,
                       image=actor.image,
                       tenant=tenant,
                       stop_existing=False)
        finally:
            # Close the channel even when put_cmd raises, so the failure path
            # does not leak the connection.
            ch.close()
        logger.debug(
            'METRICS Added worker successfully for {}'.format(actor_id))
        return channel_name
    except Exception as e:
        logger.debug("METRICS - SOMETHING BROKE: {} - {} - {}".format(
            type(e), e, e.args))
        return None
def post(self, actor_id):
    """Ensure a certain number of workers are running for an actor.

    Reads ``num`` from the arguments returned by ``self.validate_post``
    (missing or zero means 1). If the actor currently has fewer workers than
    that, the difference is requested and a single command listing the new
    worker ids is put on the command channel.

    :param actor_id: id of the actor as supplied by the caller; combined with
        ``g.tenant`` to form the store key.
    :return: an ``ok(...)`` response describing what was scheduled.
    :raises ResourceError: with code 404 when the actor is not in the store.
    """
    dbid = Actor.get_dbid(g.tenant, actor_id)
    try:
        actor = Actor.from_db(actors_store[dbid])
    except KeyError:
        raise ResourceError("actor not found: {}".format(actor_id), 404)
    args = self.validate_post()
    num = args.get('num')
    # Normalize once so the comparison and the arithmetic below use the same int.
    num = int(num) if num else 1
    if num == 0:
        num = 1
    workers = Worker.get_workers(dbid)
    current_number_workers = len(workers)
    if current_number_workers >= num:
        return ok(result=None,
                  msg="Actor {} already had {} worker(s).".format(
                      actor_id, num))
    num_to_add = num - current_number_workers
    # Workers are looked up under dbid above, so they are requested under the
    # same key; the raw actor_id is not tenant-qualified.
    worker_ids = [Worker.request_worker(dbid) for _ in range(num_to_add)]
    ch = CommandChannel()
    try:
        ch.put_cmd(actor_id=actor.db_id,
                   worker_ids=worker_ids,
                   image=actor.image,
                   tenant=g.tenant,
                   num=num_to_add,
                   stop_existing=False)
    finally:
        # Release the channel even if publishing the command fails.
        ch.close()
    return ok(
        result=None,
        msg="Scheduled {} new worker(s) to start. There were only {} worker(s).".format(
            num_to_add, current_number_workers))
def put(self, actor_id):
    """Update the definition of an existing actor.

    The actor is looked up in ``actors_store`` under the tenant-qualified id,
    rebuilt from the arguments returned by ``self.validate_put`` and written
    back to the store. When the image changed, the status is reset to
    ``SUBMITTED``, a worker is requested and a command is put on the command
    channel.

    :param actor_id: id of the actor as supplied by the caller; combined with
        ``g.tenant`` to form the store key.
    :return: the ``ok(...)`` response wrapping ``actor.display()``.
    :raises ResourceError: with code 404 when the actor is not in the store.
    """
    logger.debug("top of PUT /actors/{}".format(actor_id))
    dbid = Actor.get_dbid(g.tenant, actor_id)
    try:
        actor = Actor.from_db(actors_store[dbid])
    except KeyError:
        logger.debug("did not find actor {} in store.".format(dbid))
        raise ResourceError(
            "No actor found with id: {}.".format(actor_id), 404)
    previous_image = actor.image
    args = self.validate_put(actor)
    logger.debug("PUT args validated successfully.")
    args['tenant'] = g.tenant
    update_image = args['image'] != previous_image
    if update_image:
        args['status'] = SUBMITTED
        logger.debug("new image is different. updating actor.")
    else:
        logger.debug("new image is the same. not updating actor.")
        args['status'] = actor.status
    args['api_server'] = g.api_server
    args['owner'] = g.user
    actor = Actor(**args)
    actors_store[actor.db_id] = actor.to_db()
    logger.info("updated actor {} stored in db.".format(actor_id))
    if update_image:
        # Only request a worker when a command will actually be sent for it;
        # requesting one unconditionally leaves a worker nobody is told to start.
        worker_ids = Worker.request_worker(actor.db_id)
        ch = CommandChannel()
        try:
            ch.put_cmd(actor_id=actor.db_id,
                       worker_ids=worker_ids,
                       image=actor.image,
                       tenant=args['tenant'])
        finally:
            # Release the channel even if publishing the command fails.
            ch.close()
        logger.debug("put new command on command channel to update actor.")
    return ok(result=actor.display(), msg="Actor updated successfully.")
def post(self, actor_id):
    """Ensure a certain number of workers are running for an actor.

    Reads ``num`` from the arguments returned by ``self.validate_post``
    (missing or zero means 1). If the actor currently has fewer workers than
    that, the difference is requested and a single command listing the new
    worker ids is put on the command channel.

    :param actor_id: id of the actor as supplied by the caller; combined with
        ``g.tenant`` to form the store key.
    :return: an ``ok(...)`` response describing what was scheduled.
    :raises ResourceError: with code 404 when the actor is not in the store or
        when ``Worker.get_workers`` raises ``WorkerException``.
    """
    logger.debug("top of POST /actors/{}/workers.".format(actor_id))
    dbid = Actor.get_dbid(g.tenant, actor_id)
    try:
        actor = Actor.from_db(actors_store[dbid])
    except KeyError:
        logger.debug("did not find actor: {}.".format(actor_id))
        raise ResourceError("No actor found with id: {}.".format(actor_id), 404)
    args = self.validate_post()
    logger.debug(
        "workers POST params validated. actor: {}.".format(actor_id))
    num = args.get('num')
    if not num or int(num) == 0:
        logger.debug("did not get a num: {}.".format(actor_id))
        num = 1
    # Normalize once so the comparison and the arithmetic below use the same int.
    num = int(num)
    logger.debug("ensuring at least {} workers. actor: {}.".format(
        num, actor_id))
    try:
        workers = Worker.get_workers(dbid)
    except WorkerException as e:
        logger.debug(
            "did not find workers for actor: {}.".format(actor_id))
        raise ResourceError(e.msg, 404)
    current_number_workers = len(workers)
    if current_number_workers >= num:
        return ok(result=None,
                  msg="Actor {} already had {} worker(s).".format(
                      actor_id, num))
    logger.debug(
        "There were only {} workers for actor: {} so we're adding more."
        .format(current_number_workers, actor_id))
    num_to_add = num - current_number_workers
    logger.info("adding {} more workers for actor {}".format(
        num_to_add, actor_id))
    worker_ids = [
        Worker.request_worker(tenant=g.tenant, actor_id=actor_id)
        for _ in range(num_to_add)
    ]
    logger.info("New worker ids: {}".format(worker_ids))
    ch = CommandChannel()
    try:
        ch.put_cmd(actor_id=actor.db_id,
                   worker_ids=worker_ids,
                   image=actor.image,
                   tenant=g.tenant,
                   num=num_to_add,
                   stop_existing=False)
    finally:
        # Release the channel even if publishing the command fails.
        ch.close()
    logger.info(
        "Message put on command channel for new worker ids: {}".format(
            worker_ids))
    return ok(
        result=None,
        msg="Scheduled {} new worker(s) to start. There were only {} worker(s).".format(
            num_to_add, current_number_workers))
def __init__(self):
    """Read worker/spawner settings and open the command channel.

    Sets ``num_workers`` from the ``workers.init_count`` config value,
    ``secret`` from the ``_abaco_secret`` environment variable, opens a
    ``CommandChannel`` and starts ``tot_workers`` at 0. A failure to read
    ``spawner.host_id`` is logged as critical and the exception is re-raised.
    """
    init_count = Config.get('workers', 'init_count')
    self.num_workers = int(init_count)
    self.secret = os.environ.get('_abaco_secret')
    self.cmd_ch = CommandChannel()
    self.tot_workers = 0
    try:
        configured_host_id = Config.get('spawner', 'host_id')
    except Exception as exc:
        logger.critical("Spawner not configured with a host_id! Aborting! Exception: {}".format(exc))
        raise exc
    self.host_id = configured_host_id
def ensure_one_worker(self):
    """Check the workers store for the actor and request a new worker if none exist.

    :return: a one-element list with the new worker id when
        ``Worker.ensure_one_worker`` returned one (a command for it is put on
        the command channel), otherwise ``None``.
    """
    worker_id = Worker.ensure_one_worker(self.db_id)
    if not worker_id:
        return None
    worker_ids = [worker_id]
    ch = CommandChannel()
    try:
        ch.put_cmd(actor_id=self.db_id,
                   worker_ids=worker_ids,
                   image=self.image,
                   tenant=self.tenant,
                   num=1,
                   stop_existing=False)
    finally:
        # Close the channel whether or not the command was published, so each
        # call does not leave a connection open.
        ch.close()
    return worker_ids
def ensure_one_worker(self):
    """Check the workers store for the actor and request a new worker if none exist.

    :return: a one-element list with the new worker id when
        ``Worker.ensure_one_worker`` returned one (a command for it is put on
        the command channel), otherwise ``None``.
    """
    logger.debug("top of Actor.ensure_one_worker().")
    worker_id = Worker.ensure_one_worker(self.db_id)
    logger.debug("Worker.ensure_one_worker returned worker_id: {}".format(worker_id))
    if not worker_id:
        logger.debug("Actor.ensure_one_worker() returning None.")
        return None
    worker_ids = [worker_id]
    logger.info("Actor.ensure_one_worker() putting message on command channel for worker_id: {}".format(
        worker_id))
    ch = CommandChannel()
    try:
        ch.put_cmd(actor_id=self.db_id,
                   worker_ids=worker_ids,
                   image=self.image,
                   tenant=self.tenant,
                   num=1,
                   stop_existing=False)
    finally:
        # Close the channel whether or not the command was published, so each
        # call does not leave a connection open.
        ch.close()
    return worker_ids
def check_metrics(self, actor_ids):
    """Query Prometheus for each actor's message count and scale its workers.

    For every id in ``actor_ids`` the ``message_count_for_actor_<id>`` metric
    is fetched from ``PROMETHEUS_URL`` and remembered in ``last_metric``.
    A count of 1 or more requests one new worker and puts a command on the
    command channel; otherwise the actor's workers are inspected and, unless
    there is exactly one, every worker whose status is ``'READY'`` is shut
    down. Failures while scaling are logged and do not stop the loop.

    :param actor_ids: iterable of bytes actor ids (they are decoded as UTF-8).
    :return: None
    """
    for actor_id in actor_ids:
        logger.debug("TOP OF CHECK METRICS")

        query = {
            'query': 'message_count_for_actor_{}'.format(actor_id.decode("utf-8").replace('-', '_')),
            'time': datetime.datetime.utcnow().isoformat() + "Z"
        }
        r = requests.get(PROMETHEUS_URL + '/api/v1/query', params=query)
        data = json.loads(r.text)['data']['result']

        # NOTE(review): change_rate is computed but not used further in this method.
        change_rate = 0
        try:
            previous_data = last_metric[actor_id]
        except KeyError:
            logger.info("No previous data yet for new actor {}".format(actor_id))
        else:
            try:
                change_rate = int(data[0]['value'][1]) - int(previous_data[0]['value'][1])
            except (IndexError, KeyError, TypeError, ValueError):
                logger.debug("Could not calculate change rate.")

        last_metric.update({actor_id: data})

        # Add a worker if message count reaches a given number
        try:
            logger.debug("METRICS current message count: {}".format(data[0]['value'][1]))
            message_count = int(data[0]['value'][1])
            if message_count >= 1:
                tenant, aid = actor_id.decode('utf8').split('_')
                logger.debug('METRICS Attempting to create a new worker for {}'.format(actor_id))
                try:
                    # create a worker & add to this actor
                    actor = Actor.from_db(actors_store[actor_id])
                    worker_ids = [Worker.request_worker(tenant=tenant, actor_id=aid)]
                    logger.info("New worker id: {}".format(worker_ids[0]))
                    ch = CommandChannel()
                    try:
                        ch.put_cmd(actor_id=actor.db_id,
                                   worker_ids=worker_ids,
                                   image=actor.image,
                                   tenant=tenant,
                                   num=1,
                                   stop_existing=False)
                    finally:
                        # Do not leak the channel when put_cmd raises.
                        ch.close()
                    logger.debug('METRICS Added worker successfully for {}'.format(actor_id))
                except Exception as e:
                    logger.debug("METRICS - SOMETHING BROKE: {} - {} - {}".format(type(e), e, e.args))
            else:
                # An integer count that is not >= 1 is necessarily <= 1, so
                # this is the same branch the former `elif ... <= 1` selected.
                logger.debug("METRICS made it to scale down block")
                # Check the number of workers for this actor before deciding to scale down
                workers = Worker.get_workers(actor_id)
                logger.debug('METRICS NUMBER OF WORKERS: {}'.format(len(workers)))
                try:
                    if len(workers) == 1:
                        logger.debug("METRICS only one worker, won't scale down")
                    else:
                        # NOTE(review): this shuts down every READY worker,
                        # which can leave the actor with none - confirm intent.
                        while len(workers) > 0:
                            logger.debug('METRICS made it STATUS check')
                            worker = workers.popitem()[1]
                            logger.debug('METRICS SCALE DOWN current worker: {}'.format(worker['status']))
                            # check status of the worker is ready
                            if worker['status'] != 'READY':
                                continue
                            logger.debug("METRICS I MADE IT")
                            # scale down
                            try:
                                shutdown_worker(worker['id'])
                            except Exception as e:
                                logger.debug('METRICS ERROR shutting down worker: {} - {} - {}'.format(type(e), e, e.args))
                            else:
                                # Report success only when the shutdown call returned.
                                logger.debug('METRICS shut down worker {}'.format(worker['id']))
                except IndexError:
                    logger.debug('METRICS only one worker found for actor {}. '
                                 'Will not scale down'.format(actor_id))
                except Exception as e:
                    logger.debug("METRICS SCALE DOWN FAILED: {}".format(e))

        except Exception as e:
            logger.debug("METRICS - ANOTHER ERROR: {} - {} - {}".format(type(e), e, e.args))
def put(self, actor_id):
    """Update the definition of an existing actor.

    The actor is looked up in ``actors_store`` under the tenant-qualified id,
    rebuilt from the arguments returned by ``self.validate_put`` and written
    back to the store. The owner is never changed by a PUT. When the image
    changed, or the ``force`` argument is truthy, the status is reset to
    ``SUBMITTED``, a worker is requested and a command is put on the command
    channel. If the caller is not the previous owner, the caller is granted
    ``UPDATE`` permission on the actor.

    :param actor_id: id of the actor as supplied by the caller; combined with
        ``g.tenant`` to form the store key.
    :return: the ``ok(...)`` response wrapping ``actor.display()``.
    :raises ResourceError: with code 404 when the actor is not in the store.
    """
    logger.debug("top of PUT /actors/{}".format(actor_id))
    dbid = Actor.get_dbid(g.tenant, actor_id)
    try:
        actor = Actor.from_db(actors_store[dbid])
    except KeyError:
        logger.debug("did not find actor {} in store.".format(dbid))
        raise ResourceError(
            "No actor found with id: {}.".format(actor_id), 404)
    previous_image = actor.image
    previous_status = actor.status
    previous_owner = actor.owner
    args = self.validate_put(actor)
    logger.debug("PUT args validated successfully.")
    args['tenant'] = g.tenant
    # user can force an update by setting the force param:
    update_image = args.get('force')
    if not update_image and args['image'] == previous_image:
        logger.debug("new image is the same and force was false. not updating actor.")
        logger.debug("Setting status to the actor's previous status which is: {}".format(previous_status))
        args['status'] = previous_status
    else:
        update_image = True
        args['status'] = SUBMITTED
        logger.debug("new image is different. updating actor.")
    args['api_server'] = g.api_server

    # we do not allow a PUT to override the owner in case the PUT is issued by another user
    args['owner'] = previous_owner
    use_container_uid = args.get('use_container_uid')
    if Config.get('web', 'case') == 'camel':
        use_container_uid = args.get('useContainerUid')
    try:
        use_tas = Config.get('workers', 'use_tas_uid')
    except configparser.NoOptionError:
        logger.debug("no use_tas_uid config.")
        use_tas = False
    else:
        # Only inspect the value when the option exists; the False default
        # above must not be reported as a misconfiguration.
        if hasattr(use_tas, 'lower'):
            use_tas = use_tas.lower() == 'true'
        else:
            logger.error("use_tas_uid configured but not as a string. use_tas_uid: {}".format(use_tas))
    logger.debug("use_tas={}. user_container_uid={}".format(use_tas, use_container_uid))
    if use_tas and not use_container_uid:
        uid, gid, tasdir = get_tas_data(g.user, g.tenant)
        if uid and gid:
            args['uid'] = uid
            args['gid'] = gid
        if tasdir:
            args['tasdir'] = tasdir
    args['mounts'] = get_all_mounts(args)
    args['last_update_time'] = get_current_utc_time()
    logger.debug("update args: {}".format(args))
    actor = Actor(**args)
    actors_store[actor.db_id] = actor.to_db()
    logger.info("updated actor {} stored in db.".format(actor_id))
    if update_image:
        worker_ids = [Worker.request_worker(tenant=g.tenant, actor_id=actor.db_id)]
        ch = CommandChannel()
        try:
            ch.put_cmd(actor_id=actor.db_id,
                       worker_ids=worker_ids,
                       image=actor.image,
                       tenant=args['tenant'])
        finally:
            # Release the channel even if publishing the command fails.
            ch.close()
        logger.debug("put new command on command channel to update actor.")
    # The PUT may have been issued by someone other than the owner; in that
    # case grant the issuing user UPDATE on the actor.
    if previous_owner != g.user:
        set_permission(g.user, actor.db_id, UPDATE)
    return ok(result=actor.display(), msg="Actor updated successfully.")
def create_gauges(actor_ids):
    """Create and update the Prometheus gauges for each actor.

    For every actor id found in ``actors_store`` this sets a message-count
    gauge (length of the actor's message channel queue), a worker-count gauge
    (number of entries returned by ``Worker.get_workers``) and the command
    gauge for the actor's command channel. Ids missing from the store are
    logged and skipped.

    :param actor_ids: iterable of bytes actor ids (they are decoded as UTF-8).
    :return: tuple ``(actor_ids, inbox_lengths, cmd_length)`` where
        ``inbox_lengths`` maps decoded actor id to its message queue length
        and ``cmd_length`` is the length of the command channel read for the
        last processed actor (0 if no actor was processed).
    :raises Exception: re-raises whatever ``ActorMsgChannel`` raises on connect.
    """
    logger.debug(
        "METRICS: Made it to create_gauges; actor_ids: {}".format(actor_ids))
    inbox_lengths = {}
    # Bound up front so the return below cannot hit an unbound local when
    # actor_ids is empty or every id is skipped.
    cmd_length = 0
    for actor_id in actor_ids:
        logger.debug("top of for loop for actor_id: {}".format(actor_id))

        try:
            actor = actors_store[actor_id]
        except KeyError:
            logger.error("actor {} does not exist.".format(actor_id))
            continue

        decoded_id = actor_id.decode("utf-8")
        metric_suffix = decoded_id.replace('-', '_')

        # If the actor doesn't have a gauge, add one. The gauge is reset to
        # None on failure so a gauge belonging to a previous actor is never
        # updated by mistake.
        message_gauge = None
        if actor_id not in message_gauges:
            try:
                message_gauge = Gauge(
                    'message_count_for_actor_{}'.format(metric_suffix),
                    'Number of messages for actor {}'.format(metric_suffix))
                message_gauges.update({actor_id: message_gauge})
                logger.debug('Created gauge {}'.format(message_gauge))
            except Exception as e:
                logger.error(
                    "got exception trying to create/instantiate the gauge; "
                    "actor {}; exception: {}".format(actor_id, e))
                message_gauge = None
        else:
            # Otherwise, get this actor's existing gauge
            try:
                message_gauge = message_gauges[actor_id]
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate an existing gauge; "
                    "actor: {}: exception:{}".format(actor_id, e))

        # Update this actor's command channel metric
        channel_name = actor.get("queue")
        queues_list = Config.get('spawner', 'host_queues').replace(' ', '')
        valid_queues = queues_list.split(',')
        if not channel_name or channel_name not in valid_queues:
            channel_name = 'default'

        # Update this actor's gauge to its current # of messages
        try:
            ch = ActorMsgChannel(actor_id=decoded_id)
        except Exception as e:
            logger.error(
                "Exception connecting to ActorMsgChannel: {}".format(e))
            raise e
        try:
            msg_length = len(ch._queue._queue)
        finally:
            ch.close()
        inbox_lengths[decoded_id] = msg_length
        if message_gauge is not None:
            message_gauge.set(msg_length)
        logger.debug("METRICS: {} messages found for actor: {}.".format(
            msg_length, actor_id))

        # add a worker gauge for this actor if one does not exist
        worker_gauge = None
        if actor_id not in worker_gaueges:
            try:
                worker_gauge = Gauge(
                    'worker_count_for_actor_{}'.format(metric_suffix),
                    'Number of workers for actor {}'.format(metric_suffix))
                worker_gaueges.update({actor_id: worker_gauge})
                logger.debug('Created worker gauge {}'.format(worker_gauge))
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate the Worker Gauge: {}".
                    format(e))
                worker_gauge = None
        else:
            # Otherwise, get the worker gauge that already exists
            worker_gauge = worker_gaueges[actor_id]

        # Update this actor's worker IDs
        workers = Worker.get_workers(actor_id)
        if worker_gauge is not None:
            worker_gauge.set(len(workers))

        ch = CommandChannel(name=channel_name)
        try:
            cmd_length = len(ch._queue._queue)
            command_gauge.labels(channel_name).set(cmd_length)
            logger.debug("METRICS COMMAND CHANNEL {} size: {}".format(
                channel_name, cmd_length))
        finally:
            ch.close()

    # Return actor_ids so we don't have to query for them again later
    return actor_ids, inbox_lengths, cmd_length
def __init__(self):
    """Read the initial worker count and secret, then open the command channel.

    ``num_workers`` comes from the ``workers.init_count`` config value and
    ``secret`` from the ``_abaco_secret`` environment variable (``None`` when
    the variable is not set).
    """
    init_count = Config.get('workers', 'init_count')
    self.num_workers = int(init_count)
    self.secret = os.environ.get('_abaco_secret')
    self.cmd_ch = CommandChannel()
def create_gauges(actor_ids):
    """
    Creates a Prometheus gauge for each actor id. The gauge is used to track the number of pending messages
    in the actor's queue. A second gauge per actor tracks its number of workers, and the command gauge is set
    to the length of the 'default' command channel.

    :param actor_ids: list of actors that should be processed. Does not include stateful actors or
    actors in a shutting down state.
    :return: tuple (actor_ids, inbox_lengths, cmd_length); inbox_lengths maps each processed actor id to its
    message queue length and cmd_length is the length of the 'default' command channel.
    :raises Exception: re-raises whatever is raised while connecting to, or reading, an ActorMsgChannel.
    """
    logger.debug("top of create_gauges; actor_ids: {}".format(actor_ids))
    # dictionary mapping actor_ids to their message queue lengths
    inbox_lengths = {}
    for actor_id in actor_ids:
        logger.debug("top of for loop for actor_id: {}".format(actor_id))
        # first, make sure the actor still exists in the actor store
        try:
            actors_store[actor_id]
        except KeyError:
            logger.error(
                f"actor {actor_id} does not exist in store; continuing to next actor."
            )
            continue

        metric_suffix = actor_id.replace('-', '_')

        # If the actor doesn't have a gauge, add one
        message_gauge = None
        if actor_id not in message_gauges:
            try:
                message_gauge = Gauge(
                    'message_count_for_actor_{}'.format(metric_suffix),
                    'Number of messages for actor {}'.format(metric_suffix))
                message_gauges.update({actor_id: message_gauge})
                logger.debug('Created gauge {}'.format(message_gauge))
            except Exception as e:
                logger.error(
                    "got exception trying to create/instantiate the gauge; "
                    "actor {}; exception: {}".format(actor_id, e))
                message_gauge = None
        else:
            # Otherwise, get this actor's existing gauge
            try:
                message_gauge = message_gauges[actor_id]
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate an existing gauge; "
                    "actor: {}: exception:{}".format(actor_id, e))
                message_gauge = None

        # Update this actor's gauge to its current # of messages
        ch = None
        try:
            ch = ActorMsgChannel(actor_id=actor_id)
            msg_length = len(ch._queue._queue)
        except Exception as e:
            logger.error(
                "Exception connecting to ActorMsgChannel: {}".format(e))
            raise e
        finally:
            # Close the channel on every path where it was opened, including
            # a failure while reading the queue length.
            if ch is not None:
                ch.close()
        # add the actor's current message queue length to the inbox_lengths in-memory variable
        inbox_lengths[actor_id] = msg_length
        # if we were able to create the gauge, set it to the current message:
        if message_gauge is not None:
            try:
                message_gauge.set(msg_length)
            except Exception as e:
                logger.error(
                    f"Got exception trying to set the messages on the gauge for actor: {actor_id}; "
                    f"exception: {e}")
        logger.debug("METRICS: {} messages found for actor: {}.".format(
            msg_length, actor_id))

        # add a worker gauge for this actor if one does not exist. A separate
        # variable is used so that a failure here can never cause the message
        # gauge to be overwritten with the worker count.
        worker_gauge = None
        if actor_id not in worker_gaueges:
            try:
                worker_gauge = Gauge(
                    'worker_count_for_actor_{}'.format(metric_suffix),
                    'Number of workers for actor {}'.format(metric_suffix))
                worker_gaueges.update({actor_id: worker_gauge})
                logger.debug('Created worker gauge {}'.format(worker_gauge))
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate the Worker Gauge: {}".
                    format(e))
                worker_gauge = None
        else:
            # Otherwise, get the worker gauge that already exists
            worker_gauge = worker_gaueges[actor_id]

        # Update this actor's worker IDs
        workers = Worker.get_workers(actor_id)
        worker_count = len(workers)
        if worker_gauge is not None:
            try:
                worker_gauge.set(worker_count)
            except Exception as e:
                logger.error(
                    f"got exception trying to set the worker gauge for actor {actor_id}; exception: {e}"
                )
        logger.debug(
            f"METRICS: {worker_count} workers found for actor: {actor_id}."
        )

    # TODO -- only the 'default' command channel is measured. What we probably want is to set the length of
    # every command channel (each actor's "queue", restricted to the spawner's configured host_queues) here,
    # once, after the loop. Because the channel name is fixed, it is read a single time rather than once per
    # actor, which also keeps cmd_length defined when no actor was processed.
    channel_name = 'default'
    ch = CommandChannel(name=channel_name)
    try:
        cmd_length = len(ch._queue._queue)
        command_gauge.labels(channel_name).set(cmd_length)
        logger.debug(
            f"METRICS COMMAND CHANNEL {channel_name} size: {cmd_length}")
    finally:
        ch.close()

    # Return actor_ids so we don't have to query for them again later
    return actor_ids, inbox_lengths, cmd_length