示例#1
0
 def put(self, actor_id):
     """Replace the stored definition of an existing actor.

     The actor is looked up for the current tenant (``g.tenant``), the PUT
     payload is validated with ``self.validate_put`` and a new ``Actor`` built
     from it is written to ``actors_store``. When the image differs from the
     stored one, the status is reset to ``SUBMITTED``, a worker is requested
     and a command is put on the command channel; otherwise the previous
     status is carried over and nothing is queued.

     :param actor_id: actor id taken from the request path.
     :return: ``ok`` response wrapping ``actor.display()``.
     :raises ResourceError: with code 404 when the actor is not in the store.
     """
     dbid = Actor.get_dbid(g.tenant, actor_id)
     try:
         actor = Actor.from_db(actors_store[dbid])
     except KeyError:
         raise ResourceError("actor not found: {}".format(actor_id), 404)
     previous_image = actor.image
     args = self.validate_put(actor)
     args['tenant'] = g.tenant
     update_image = args['image'] != previous_image
     args['status'] = SUBMITTED if update_image else actor.status
     args['api_server'] = g.api_server
     args['owner'] = g.user
     actor = Actor(**args)
     actors_store[actor.db_id] = actor.to_db()
     if update_image:
         # Request the worker only when a command is actually sent for it;
         # otherwise the requested worker would never be started.
         worker_ids = Worker.request_worker(actor.db_id)
         ch = CommandChannel()
         try:
             ch.put_cmd(actor_id=actor.db_id,
                        worker_ids=worker_ids,
                        image=actor.image,
                        tenant=args['tenant'])
         finally:
             # release the channel even if publishing the command fails
             ch.close()
     return ok(result=actor.display(), msg="Actor updated successfully.")
示例#2
0
def scale_up(actor_id):
    """Request one more worker for an actor and queue the command to start it.

    :param actor_id: database id of the actor as bytes. It is decoded as UTF-8
        and must split on ``'_'`` into exactly two parts; the first part is
        used as the tenant.
    :return: name of the command channel the command was put on (the actor's
        ``queue`` or ``'default'``), or ``None`` when requesting the worker or
        publishing the command failed (the failure is logged).
    :raises ValueError: when the decoded id does not split into exactly two
        parts; this happens before the guarded section.
    """
    tenant, _ = actor_id.decode('utf8').split('_')
    logger.debug(
        'METRICS Attempting to create a new worker for {}'.format(actor_id))
    try:
        # create a worker & add to this actor
        actor = Actor.from_db(actors_store[actor_id])
        worker_id = Worker.request_worker(tenant=tenant, actor_id=actor_id)
        logger.info("New worker id: {}".format(worker_id))
        channel_name = actor.queue or 'default'
        ch = CommandChannel(name=channel_name)
        try:
            ch.put_cmd(actor_id=actor.db_id,
                       worker_id=worker_id,
                       image=actor.image,
                       tenant=tenant,
                       stop_existing=False)
        finally:
            # close the channel even when put_cmd raises
            ch.close()
        logger.debug(
            'METRICS Added worker successfully for {}'.format(actor_id))
        return channel_name
    except Exception as e:
        logger.debug("METRICS - SOMETHING BROKE: {} - {} - {}".format(
            type(e), e, e.args))
        return None
示例#3
0
 def post(self, actor_id):
     """Ensure a certain number of workers are running for an actor.

     Reads ``num`` from the validated POST arguments (a missing, empty or zero
     value means 1), compares it with the number of workers currently stored
     for the actor and, when there are fewer, requests the missing workers and
     puts one command for all of them on the command channel.

     :param actor_id: actor id taken from the request path.
     :return: ``ok`` response with ``result=None`` and a message describing
         whether workers were scheduled.
     :raises ResourceError: with code 404 when the actor is not in the store.
     """
     dbid = Actor.get_dbid(g.tenant, actor_id)
     try:
         actor = Actor.from_db(actors_store[dbid])
     except KeyError:
         raise ResourceError("actor not found: {}".format(actor_id), 404)
     args = self.validate_post()
     # normalise once so the comparison and the arithmetic use the same int
     num = int(args.get('num') or 1)
     workers = Worker.get_workers(dbid)
     current_number_workers = len(workers.items())
     if current_number_workers >= num:
         return ok(result=None,
                   msg="Actor {} already had {} worker(s).".format(
                       actor_id, num))
     num_to_add = num - current_number_workers
     # NOTE(review): workers are looked up by dbid but requested with the raw
     # actor_id -- confirm which id Worker.request_worker expects.
     worker_ids = [Worker.request_worker(actor_id)
                   for _ in range(num_to_add)]
     ch = CommandChannel()
     try:
         ch.put_cmd(actor_id=actor.db_id,
                    worker_ids=worker_ids,
                    image=actor.image,
                    tenant=g.tenant,
                    num=num_to_add,
                    stop_existing=False)
     finally:
         # release the channel even if publishing the command fails
         ch.close()
     return ok(
         result=None,
         msg="Scheduled {} new worker(s) to start. "
             "Previously, there were {} worker(s).".format(
                 num_to_add, current_number_workers))
示例#4
0
 def put(self, actor_id):
     """Replace the stored definition of an existing actor.

     The actor is looked up for the current tenant (``g.tenant``), the PUT
     payload is validated with ``self.validate_put`` and a new ``Actor`` built
     from it is written to ``actors_store``. When the image differs from the
     stored one, the status is reset to ``SUBMITTED``, a worker is requested
     and a command is put on the command channel; otherwise the previous
     status is carried over and nothing is queued.

     :param actor_id: actor id taken from the request path.
     :return: ``ok`` response wrapping ``actor.display()``.
     :raises ResourceError: with code 404 when the actor is not in the store.
     """
     logger.debug("top of PUT /actors/{}".format(actor_id))
     dbid = Actor.get_dbid(g.tenant, actor_id)
     try:
         actor = Actor.from_db(actors_store[dbid])
     except KeyError:
         logger.debug("did not find actor {} in store.".format(dbid))
         raise ResourceError(
             "No actor found with id: {}.".format(actor_id), 404)
     previous_image = actor.image
     args = self.validate_put(actor)
     logger.debug("PUT args validated successfully.")
     args['tenant'] = g.tenant
     update_image = args['image'] != previous_image
     if update_image:
         args['status'] = SUBMITTED
         logger.debug("new image is different. updating actor.")
     else:
         logger.debug("new image is the same. not updating actor.")
         args['status'] = actor.status
     args['api_server'] = g.api_server
     args['owner'] = g.user
     actor = Actor(**args)
     actors_store[actor.db_id] = actor.to_db()
     logger.info("updated actor {} stored in db.".format(actor_id))
     if update_image:
         # Request the worker only when a command is actually sent for it;
         # otherwise the requested worker would never be started.
         worker_ids = Worker.request_worker(actor.db_id)
         ch = CommandChannel()
         try:
             ch.put_cmd(actor_id=actor.db_id,
                        worker_ids=worker_ids,
                        image=actor.image,
                        tenant=args['tenant'])
         finally:
             # release the channel even if publishing the command fails
             ch.close()
         logger.debug("put new command on command channel to update actor.")
     return ok(result=actor.display(),
               msg="Actor updated successfully.")
示例#5
0
 def post(self, actor_id):
     """Ensure a certain number of workers are running for an actor.

     Reads ``num`` from the validated POST arguments (a missing, empty or zero
     value means 1), compares it with the number of workers currently stored
     for the actor and, when there are fewer, requests the missing workers and
     puts one command for all of them on the command channel.

     :param actor_id: actor id taken from the request path.
     :return: ``ok`` response with ``result=None`` and a message describing
         whether workers were scheduled.
     :raises ResourceError: with code 404 when the actor is not in the store
         or when ``Worker.get_workers`` raises ``WorkerException``.
     """
     logger.debug("top of POST /actors/{}/workers.".format(actor_id))
     dbid = Actor.get_dbid(g.tenant, actor_id)
     try:
         actor = Actor.from_db(actors_store[dbid])
     except KeyError:
         logger.debug("did not find actor: {}.".format(actor_id))
         raise ResourceError("No actor found with id: {}.".format(actor_id),
                             404)
     args = self.validate_post()
     logger.debug(
         "workers POST params validated. actor: {}.".format(actor_id))
     num = args.get('num')
     if not num:
         logger.debug("did not get a num: {}.".format(actor_id))
         num = 1
     # normalise once so the comparison and the arithmetic use the same int
     num = int(num)
     logger.debug("ensuring at least {} workers. actor: {}.".format(
         num, actor_id))
     try:
         workers = Worker.get_workers(dbid)
     except WorkerException as e:
         logger.debug(
             "did not find workers for actor: {}.".format(actor_id))
         raise ResourceError(e.msg, 404)
     current_number_workers = len(workers.items())
     if current_number_workers >= num:
         return ok(result=None,
                   msg="Actor {} already had {} worker(s).".format(
                       actor_id, num))
     logger.debug(
         "There were only {} workers for actor: {} so we're adding more."
         .format(current_number_workers, actor_id))
     num_to_add = num - current_number_workers
     logger.info("adding {} more workers for actor {}".format(
         num_to_add, actor_id))
     worker_ids = [
         Worker.request_worker(tenant=g.tenant, actor_id=actor_id)
         for _ in range(num_to_add)
     ]
     logger.info("New worker ids: {}".format(worker_ids))
     ch = CommandChannel()
     try:
         ch.put_cmd(actor_id=actor.db_id,
                    worker_ids=worker_ids,
                    image=actor.image,
                    tenant=g.tenant,
                    num=num_to_add,
                    stop_existing=False)
     finally:
         # release the channel even if publishing the command fails
         ch.close()
     logger.info(
         "Message put on command channel for new worker ids: {}".format(
             worker_ids))
     return ok(
         result=None,
         msg="Scheduled {} new worker(s) to start. "
             "Previously, there were {} worker(s).".format(
                 num_to_add, current_number_workers))
示例#6
0
 def __init__(self):
     """Load the spawner configuration and create its command channel.

     Reads ``workers.init_count`` and ``spawner.host_id`` from ``Config`` and
     the ``_abaco_secret`` environment variable. The command channel is only
     created once the configuration has been read, so an aborted start-up
     does not leave a channel behind.

     :raises Exception: whatever ``Config.get`` raised when ``host_id`` is not
         configured; it is logged as critical and re-raised unchanged.
     """
     self.num_workers = int(Config.get('workers', 'init_count'))
     self.secret = os.environ.get('_abaco_secret')
     self.tot_workers = 0
     try:
         self.host_id = Config.get('spawner', 'host_id')
     except Exception as e:
         logger.critical("Spawner not configured with a host_id! Aborting! Exception: {}".format(e))
         raise
     self.cmd_ch = CommandChannel()
示例#7
0
文件: models.py 项目: mwvaughn/abaco
 def ensure_one_worker(self):
     """Make sure this actor has a worker, queueing a start command if one was requested.

     Delegates the check to ``Worker.ensure_one_worker``; when that returns a
     worker id, a command for that single worker is put on the command channel.

     :return: one-element list holding the new worker id, or ``None`` when
         ``Worker.ensure_one_worker`` returned a falsy value.
     """
     worker_id = Worker.ensure_one_worker(self.db_id)
     if not worker_id:
         return None
     worker_ids = [worker_id]
     ch = CommandChannel()
     try:
         ch.put_cmd(actor_id=self.db_id,
                    worker_ids=worker_ids,
                    image=self.image,
                    tenant=self.tenant,
                    num=1,
                    stop_existing=False)
     finally:
         # release the channel even if publishing the command fails
         ch.close()
     return worker_ids
示例#8
0
 def ensure_one_worker(self):
     """Make sure this actor has a worker, queueing a start command if one was requested.

     Delegates the check to ``Worker.ensure_one_worker``; when that returns a
     worker id, a command for that single worker is put on the command channel.

     :return: one-element list holding the new worker id, or ``None`` when
         ``Worker.ensure_one_worker`` returned a falsy value.
     """
     logger.debug("top of Actor.ensure_one_worker().")
     worker_id = Worker.ensure_one_worker(self.db_id)
     logger.debug("Worker.ensure_one_worker returned worker_id: {}".format(worker_id))
     if not worker_id:
         logger.debug("Actor.ensure_one_worker() returning None.")
         return None
     worker_ids = [worker_id]
     logger.info("Actor.ensure_one_worker() putting message on command channel for worker_id: {}".format(
         worker_id))
     ch = CommandChannel()
     try:
         ch.put_cmd(actor_id=self.db_id,
                    worker_ids=worker_ids,
                    image=self.image,
                    tenant=self.tenant,
                    num=1,
                    stop_existing=False)
     finally:
         # release the channel even if publishing the command fails
         ch.close()
     return worker_ids
示例#9
0
    def check_metrics(self, actor_ids):
        """Query Prometheus for each actor's message count and scale its workers.

        For every id the ``message_count_for_actor_<id>`` metric is queried at
        ``PROMETHEUS_URL`` and the raw result is stored in ``last_metric``.
        When the count is at least 1, one new worker is requested and a command
        is put on the command channel. Otherwise the actor's READY workers are
        shut down, unless the actor has exactly one worker.

        Failures in the scaling steps are logged and swallowed so the remaining
        actors are still processed; the Prometheus request itself is not
        guarded.

        :param actor_ids: iterable of actor database ids as bytes; each id is
            decoded as UTF-8 and, when scaling up, must split on ``'_'`` into
            exactly (tenant, actor id).
        :return: None
        """
        for actor_id in actor_ids:
            logger.debug("TOP OF CHECK METRICS")

            query = {
                'query': 'message_count_for_actor_{}'.format(actor_id.decode("utf-8").replace('-', '_')),
                'time': datetime.datetime.utcnow().isoformat() + "Z"
            }
            r = requests.get(PROMETHEUS_URL + '/api/v1/query', params=query)
            data = json.loads(r.text)['data']['result']

            # NOTE: change_rate is computed (and its failure modes logged) but
            # is not yet used in the scaling decision below.
            change_rate = 0
            try:
                previous_data = last_metric[actor_id]
            except KeyError:
                logger.info("No previous data yet for new actor {}".format(actor_id))
            else:
                try:
                    change_rate = int(data[0]['value'][1]) - int(previous_data[0]['value'][1])
                except (IndexError, KeyError, TypeError, ValueError):
                    logger.debug("Could not calculate change rate.")

            last_metric.update({actor_id: data})
            try:
                logger.debug("METRICS current message count: {}".format(data[0]['value'][1]))
                message_count = int(data[0]['value'][1])
                if message_count >= 1:
                    # Add a worker if message count reaches a given number
                    tenant, aid = actor_id.decode('utf8').split('_')
                    logger.debug('METRICS Attempting to create a new worker for {}'.format(actor_id))
                    try:
                        # create a worker & add to this actor
                        actor = Actor.from_db(actors_store[actor_id])
                        worker_ids = [Worker.request_worker(tenant=tenant, actor_id=aid)]
                        logger.info("New worker id: {}".format(worker_ids[0]))
                        ch = CommandChannel()
                        try:
                            ch.put_cmd(actor_id=actor.db_id,
                                       worker_ids=worker_ids,
                                       image=actor.image,
                                       tenant=tenant,
                                       num=1,
                                       stop_existing=False)
                        finally:
                            # close the channel even when put_cmd raises
                            ch.close()
                        logger.debug('METRICS Added worker successfully for {}'.format(actor_id))
                    except Exception as e:
                        logger.debug("METRICS - SOMETHING BROKE: {} - {} - {}".format(type(e), e, e.args))
                else:
                    # message count is below 1: consider scaling down
                    logger.debug("METRICS made it to scale down block")
                    # Check the number of workers for this actor before deciding to scale down
                    workers = Worker.get_workers(actor_id)
                    logger.debug('METRICS NUMBER OF WORKERS: {}'.format(len(workers)))
                    try:
                        if len(workers) == 1:
                            logger.debug("METRICS only one worker, won't scale down")
                        else:
                            # NOTE(review): with two or more workers every READY
                            # worker is shut down, none is kept -- confirm this
                            # is intended.
                            while len(workers) > 0:
                                logger.debug('METRICS made it STATUS check')
                                worker = workers.popitem()[1]
                                logger.debug('METRICS SCALE DOWN current worker: {}'.format(worker['status']))
                                # only workers in READY status are shut down
                                if worker['status'] != 'READY':
                                    continue
                                logger.debug("METRICS I MADE IT")
                                try:
                                    shutdown_worker(worker['id'])
                                except Exception as e:
                                    logger.debug('METRICS ERROR shutting down worker: {} - {} - {}'.format(type(e), e, e.args))
                                else:
                                    # report success only when the shutdown call did not raise
                                    logger.debug('METRICS shut down worker {}'.format(worker['id']))
                    except IndexError:
                        logger.debug('METRICS only one worker found for actor {}. '
                                     'Will not scale down'.format(actor_id))
                    except Exception as e:
                        logger.debug("METRICS SCALE DOWN FAILED: {}".format(e))
            except Exception as e:
                logger.debug("METRICS - ANOTHER ERROR: {} - {} - {}".format(type(e), e, e.args))
示例#10
0
    def put(self, actor_id):
        """Replace the stored definition of an existing actor.

        The actor is looked up for the current tenant, the PUT payload is
        validated and a new ``Actor`` built from it is written to
        ``actors_store``. The original owner is always preserved. When the
        image changed or the ``force`` argument is truthy, the status is reset
        to ``SUBMITTED``, a worker is requested and a command is put on the
        command channel. If the request was issued by a user other than the
        owner, that user is granted ``UPDATE`` permission on the actor.

        :param actor_id: actor id taken from the request path.
        :return: ``ok`` response wrapping ``actor.display()``.
        :raises ResourceError: with code 404 when the actor is not in the store.
        """
        logger.debug("top of PUT /actors/{}".format(actor_id))
        dbid = Actor.get_dbid(g.tenant, actor_id)
        try:
            actor = Actor.from_db(actors_store[dbid])
        except KeyError:
            logger.debug("did not find actor {} in store.".format(dbid))
            raise ResourceError(
                "No actor found with id: {}.".format(actor_id), 404)
        previous_image = actor.image
        previous_status = actor.status
        previous_owner = actor.owner
        args = self.validate_put(actor)
        logger.debug("PUT args validated successfully.")
        args['tenant'] = g.tenant
        # user can force an update by setting the force param:
        update_image = args.get('force')
        if not update_image and args['image'] == previous_image:
            logger.debug("new image is the same and force was false. not updating actor.")
            logger.debug("Setting status to the actor's previous status which is: {}".format(previous_status))
            args['status'] = previous_status
        else:
            update_image = True
            args['status'] = SUBMITTED
            logger.debug("new image is different. updating actor.")
        args['api_server'] = g.api_server

        # we do not allow a PUT to override the owner in case the PUT is issued by another user
        args['owner'] = previous_owner
        use_container_uid = args.get('use_container_uid')
        if Config.get('web', 'case') == 'camel':
            use_container_uid = args.get('useContainerUid')
        try:
            use_tas = Config.get('workers', 'use_tas_uid')
        except configparser.NoOptionError:
            logger.debug("no use_tas_uid config.")
            use_tas = False
        else:
            # only a value that was actually configured can have the wrong
            # type; an absent option must not be reported as an error
            if hasattr(use_tas, 'lower'):
                use_tas = use_tas.lower() == 'true'
            else:
                logger.error("use_tas_uid configured but not as a string. use_tas_uid: {}".format(use_tas))
        logger.debug("use_tas={}. user_container_uid={}".format(use_tas, use_container_uid))
        if use_tas and not use_container_uid:
            uid, gid, tasdir = get_tas_data(g.user, g.tenant)
            if uid and gid:
                args['uid'] = uid
                args['gid'] = gid
            if tasdir:
                args['tasdir'] = tasdir
        args['mounts'] = get_all_mounts(args)
        args['last_update_time'] = get_current_utc_time()
        logger.debug("update args: {}".format(args))
        actor = Actor(**args)
        actors_store[actor.db_id] = actor.to_db()
        logger.info("updated actor {} stored in db.".format(actor_id))
        if update_image:
            worker_ids = [Worker.request_worker(tenant=g.tenant, actor_id=actor.db_id)]
            ch = CommandChannel()
            try:
                ch.put_cmd(actor_id=actor.db_id, worker_ids=worker_ids, image=actor.image, tenant=args['tenant'])
            finally:
                # release the channel even if publishing the command fails
                ch.close()
            logger.debug("put new command on command channel to update actor.")
        # the PUT could have been issued by a user other than the owner; make
        # sure that user holds UPDATE permission on the actor
        if previous_owner != g.user:
            set_permission(g.user, actor.db_id, UPDATE)
        return ok(result=actor.display(),
                  msg="Actor updated successfully.")
示例#11
0
def create_gauges(actor_ids):
    """Create or refresh the Prometheus gauges for the given actors.

    For every actor id still present in ``actors_store`` this sets a
    message-count gauge to the current length of the actor's message channel
    and a worker-count gauge to the number of workers returned by
    ``Worker.get_workers``. Afterwards the length of one command channel is
    published on ``command_gauge``.

    :param actor_ids: iterable of actor database ids as bytes (each id is
        decoded as UTF-8).
    :return: tuple ``(actor_ids, inbox_lengths, cmd_length)`` where
        ``inbox_lengths`` maps each decoded actor id to its message queue
        length and ``cmd_length`` is the length of the measured command channel.
    :raises Exception: re-raised when connecting to an actor's message channel
        fails.
    """
    logger.debug(
        "METRICS: Made it to create_gauges; actor_ids: {}".format(actor_ids))
    inbox_lengths = {}
    # Fallback for when no actor in this pass selects a command channel
    # (empty input, or only actors that had no gauge yet).
    channel_name = 'default'
    for actor_id in actor_ids:
        logger.debug("top of for loop for actor_id: {}".format(actor_id))

        try:
            actor = actors_store[actor_id]
        except KeyError:
            logger.error("actor {} does not exist.".format(actor_id))
            continue

        actor_name = actor_id.decode("utf-8")
        metric_suffix = actor_name.replace('-', '_')

        # Keep the two gauges in separate names so a failed creation can never
        # make one metric overwrite the other (or a previous actor's gauge).
        msg_gauge = None
        if actor_id not in message_gauges:
            # If the actor doesn't have a gauge, add one
            try:
                msg_gauge = Gauge(
                    'message_count_for_actor_{}'.format(metric_suffix),
                    'Number of messages for actor {}'.format(metric_suffix))
                message_gauges.update({actor_id: msg_gauge})
                logger.debug('Created gauge {}'.format(msg_gauge))
            except Exception as e:
                logger.error(
                    "got exception trying to create/instantiate the gauge; "
                    "actor {}; exception: {}".format(actor_id, e))
        else:
            # Otherwise, get this actor's existing gauge
            try:
                msg_gauge = message_gauges[actor_id]
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate an existing gauge; "
                    "actor: {}: exception:{}".format(actor_id, e))

            # Update this actor's command channel metric
            # NOTE(review): the channel is only taken from actors that already
            # had a gauge, and only the last such actor wins -- confirm intent.
            channel_name = actor.get("queue")

            queues_list = Config.get('spawner', 'host_queues').replace(' ', '')
            valid_queues = queues_list.split(',')

            if not channel_name or channel_name not in valid_queues:
                channel_name = 'default'

        # Update this actor's gauge to its current # of messages
        try:
            ch = ActorMsgChannel(actor_id=actor_name)
        except Exception as e:
            logger.error(
                "Exception connecting to ActorMsgChannel: {}".format(e))
            raise
        try:
            msg_count = len(ch._queue._queue)
        finally:
            # close the channel even if reading its length fails
            ch.close()
        inbox_lengths[actor_name] = msg_count
        if msg_gauge is not None:
            msg_gauge.set(msg_count)
        logger.debug("METRICS: {} messages found for actor: {}.".format(
            msg_count, actor_id))

        # add a worker gauge for this actor if one does not exist
        worker_gauge = None
        if actor_id not in worker_gaueges:
            try:
                worker_gauge = Gauge(
                    'worker_count_for_actor_{}'.format(metric_suffix),
                    'Number of workers for actor {}'.format(metric_suffix))
                worker_gaueges.update({actor_id: worker_gauge})
                logger.debug('Created worker gauge {}'.format(worker_gauge))
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate the Worker Gauge: {}".
                    format(e))
        else:
            # Otherwise, get the worker gauge that already exists
            worker_gauge = worker_gaueges[actor_id]

        # Update this actor's worker count
        workers = Worker.get_workers(actor_id)
        if worker_gauge is not None:
            worker_gauge.set(len(workers))

    ch = CommandChannel(name=channel_name)
    try:
        cmd_length = len(ch._queue._queue)
    finally:
        ch.close()
    command_gauge.labels(channel_name).set(cmd_length)
    logger.debug("METRICS COMMAND CHANNEL {} size: {}".format(
        channel_name, cmd_length))

    # Return actor_ids so we don't have to query for them again later
    return actor_ids, inbox_lengths, cmd_length
示例#12
0
 def __init__(self):
     """Read the initial worker count and the shared secret, then create a command channel.

     ``workers.init_count`` comes from ``Config`` and is converted to an int;
     the secret is read from the ``_abaco_secret`` environment variable and is
     ``None`` when that variable is unset.
     """
     init_count = Config.get('workers', 'init_count')
     self.num_workers = int(init_count)
     self.secret = os.environ.get('_abaco_secret')
     self.cmd_ch = CommandChannel()
示例#13
0
def create_gauges(actor_ids):
    """
    Creates or refreshes the Prometheus gauges for each actor id: one gauge tracking the number of
    pending messages in the actor's queue and one tracking its number of workers. Afterwards the
    length of the 'default' command channel is published on ``command_gauge``.
    :param actor_ids: list of actors that should be processed. Does not include stateful actors or
    actors in a shutting down state.
    :return: tuple ``(actor_ids, inbox_lengths, cmd_length)``; ``inbox_lengths`` maps each actor id
    to its message queue length and ``cmd_length`` is the length of the 'default' command channel.
    :raises Exception: re-raised when opening or reading an actor's message channel fails.
    """
    logger.debug("top of create_gauges; actor_ids: {}".format(actor_ids))
    # dictionary mapping actor_ids to their message queue lengths
    inbox_lengths = {}
    for actor_id in actor_ids:
        logger.debug("top of for loop for actor_id: {}".format(actor_id))
        # first, make sure the actor still exists in the actor store
        if actor_id not in actors_store:
            logger.error(
                f"actor {actor_id} does not exist in store; continuing to next actor."
            )
            continue
        metric_suffix = actor_id.replace('-', '_')

        # The message gauge and the worker gauge are kept in separate names so that a failure to
        # obtain one can never cause the other metric to be written to it.
        msg_gauge = None
        if actor_id not in message_gauges:
            # If the actor doesn't have a gauge, add one
            try:
                msg_gauge = Gauge(
                    'message_count_for_actor_{}'.format(metric_suffix),
                    'Number of messages for actor {}'.format(metric_suffix))
                message_gauges.update({actor_id: msg_gauge})
                logger.debug('Created gauge {}'.format(msg_gauge))
            except Exception as e:
                logger.error(
                    "got exception trying to create/instantiate the gauge; "
                    "actor {}; exception: {}".format(actor_id, e))
                msg_gauge = None
        else:
            # Otherwise, get this actor's existing gauge
            try:
                msg_gauge = message_gauges[actor_id]
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate an existing gauge; "
                    "actor: {}: exception:{}".format(actor_id, e))

        # Read the actor's current # of messages
        try:
            ch = ActorMsgChannel(actor_id=actor_id)
            try:
                msg_length = len(ch._queue._queue)
            finally:
                # close the channel even if reading its length fails
                ch.close()
        except Exception as e:
            logger.error(
                "Exception connecting to ActorMsgChannel: {}".format(e))
            raise
        # add the actor's current message queue length to the inbox_lengths in-memory variable
        inbox_lengths[actor_id] = msg_length
        # if we were able to create the gauge, set it to the current message:
        if msg_gauge is not None:
            try:
                msg_gauge.set(msg_length)
            except Exception as e:
                logger.error(
                    f"Got exception trying to set the messages on the gauge for actor: {actor_id}; "
                    f"exception: {e}")
        logger.debug("METRICS: {} messages found for actor: {}.".format(
            msg_length, actor_id))

        # add a worker gauge for this actor if one does not exist
        worker_gauge = None
        if actor_id not in worker_gaueges:
            try:
                worker_gauge = Gauge(
                    'worker_count_for_actor_{}'.format(metric_suffix),
                    'Number of workers for actor {}'.format(metric_suffix))
                worker_gaueges.update({actor_id: worker_gauge})
                logger.debug('Created worker gauge {}'.format(worker_gauge))
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate the Worker Gauge: {}".
                    format(e))
                worker_gauge = None
        else:
            # Otherwise, get the worker gauge that already exists
            worker_gauge = worker_gaueges[actor_id]

        # Update this actor's worker count
        worker_count = len(Worker.get_workers(actor_id))
        if worker_gauge is not None:
            try:
                worker_gauge.set(worker_count)
            except Exception as e:
                logger.error(
                    f"got exception trying to set the worker gauge for actor {actor_id}; exception: {e}"
                )
        logger.debug(
            f"METRICS: {worker_count} workers found for actor: {actor_id}."
        )

    # TODO -- this code needs to be fixed. What follows is only a partial fix; what I think we want to do
    # is set the length of all of the different command channels once at the end of this loop. What was
    # happening instead was that it was only setting one of the command channel's lengths -- whatever command
    # channel happened to belong to the last actor in the loop.
    channel_name = 'default'
    ch = CommandChannel(name=channel_name)
    try:
        cmd_length = len(ch._queue._queue)
    finally:
        ch.close()
    command_gauge.labels(channel_name).set(cmd_length)
    logger.debug(
        f"METRICS COMMAND CHANNEL {channel_name} size: {cmd_length}")

    # Return actor_ids so we don't have to query for them again later
    return actor_ids, inbox_lengths, cmd_length