Exemplo n.º 1
0
 def put(self, actor_id):
     """Update an actor's definition and redeploy it when its image changed.

     The actor is looked up in ``actors_store`` under the tenant-qualified id,
     rebuilt from the validated PUT arguments and written back to the store.
     A worker is requested and a command is put on the command channel only
     when the new image differs from the stored one.

     :param actor_id: actor id taken from the request URL.
     :return: the ``ok`` response wrapping ``actor.display()``.
     :raises ResourceError: with a 404 code when the actor is not in the store.
     """
     dbid = Actor.get_dbid(g.tenant, actor_id)
     try:
         actor = Actor.from_db(actors_store[dbid])
     except KeyError:
         raise ResourceError("actor not found: {}".format(actor_id), 404)
     previous_image = actor.image
     args = self.validate_put(actor)
     args['tenant'] = g.tenant
     update_image = args['image'] != previous_image
     # an unchanged image keeps the stored status; a new image is re-submitted.
     args['status'] = SUBMITTED if update_image else actor.status
     args['api_server'] = g.api_server
     args['owner'] = g.user
     actor = Actor(**args)
     actors_store[actor.db_id] = actor.to_db()
     if update_image:
         # request the worker only when a command is actually sent; requesting
         # it unconditionally left a requested worker that nothing ever started.
         # NOTE(review): the return value of request_worker is forwarded as-is
         # as ``worker_ids`` -- confirm it has the shape the consumer expects.
         worker_ids = Worker.request_worker(actor.db_id)
         ch = CommandChannel()
         ch.put_cmd(actor_id=actor.db_id,
                    worker_ids=worker_ids,
                    image=actor.image,
                    tenant=args['tenant'])
     return ok(result=actor.display(), msg="Actor updated successfully.")
Exemplo n.º 2
0
 def post(self, actor_id):
     """Ensure a certain number of workers are running for an actor.

     Reads ``num`` from the validated POST arguments (a missing or zero value
     means one worker). When the actor has fewer workers than that, the
     missing workers are requested and a single command is put on the command
     channel for them.

     :param actor_id: actor id taken from the request URL.
     :return: an ``ok`` response whose message says how many workers were
         scheduled, or that the actor already had enough.
     :raises ResourceError: with a 404 code when the actor is not in the store.
     """
     dbid = Actor.get_dbid(g.tenant, actor_id)
     try:
         actor = Actor.from_db(actors_store[dbid])
     except KeyError:
         raise ResourceError("actor not found: {}".format(actor_id), 404)
     args = self.validate_post()
     # coerce once so the comparison and the arithmetic below use the same int.
     num = int(args.get('num') or 1)
     workers = Worker.get_workers(dbid)
     current_number_workers = len(workers.items())
     if current_number_workers < num:
         num_to_add = num - current_number_workers
         # workers are stored under the tenant-qualified db id (see the
         # get_workers call above), so request them under that id as well.
         worker_ids = [Worker.request_worker(dbid) for _ in range(num_to_add)]
         ch = CommandChannel()
         ch.put_cmd(actor_id=actor.db_id,
                    worker_ids=worker_ids,
                    image=actor.image,
                    tenant=g.tenant,
                    num=num_to_add,
                    stop_existing=False)
         return ok(
             result=None,
             msg="Scheduled {} new worker(s) to start. "
                 "Previously, there were {} worker(s).".format(
                     num_to_add, current_number_workers))
     return ok(result=None,
               msg="Actor {} already had {} worker(s).".format(
                   actor_id, num))
Exemplo n.º 3
0
 def put(self, actor_id):
     """Update an actor's definition and redeploy it when its image changed.

     The actor is looked up in ``actors_store`` under the tenant-qualified id,
     rebuilt from the validated PUT arguments and written back to the store.
     A worker is requested and a command is put on the command channel only
     when the new image differs from the stored one.

     :param actor_id: actor id taken from the request URL.
     :return: the ``ok`` response wrapping ``actor.display()``.
     :raises ResourceError: with a 404 code when the actor is not in the store.
     """
     logger.debug("top of PUT /actors/{}".format(actor_id))
     dbid = Actor.get_dbid(g.tenant, actor_id)
     try:
         actor = Actor.from_db(actors_store[dbid])
     except KeyError:
         logger.debug("did not find actor {} in store.".format(dbid))
         raise ResourceError(
             "No actor found with id: {}.".format(actor_id), 404)
     previous_image = actor.image
     args = self.validate_put(actor)
     logger.debug("PUT args validated successfully.")
     args['tenant'] = g.tenant
     update_image = args['image'] != previous_image
     if update_image:
         # a new image means the actor has to go through submission again.
         args['status'] = SUBMITTED
         logger.debug("new image is different. updating actor.")
     else:
         logger.debug("new image is the same. not updating actor.")
         args['status'] = actor.status
     args['api_server'] = g.api_server
     args['owner'] = g.user
     actor = Actor(**args)
     actors_store[actor.db_id] = actor.to_db()
     logger.info("updated actor {} stored in db.".format(actor_id))
     if update_image:
         # request the worker only when a command is actually sent; requesting
         # it unconditionally left a requested worker that nothing ever started.
         # NOTE(review): the return value of request_worker is forwarded as-is
         # as ``worker_ids`` -- confirm it has the shape the consumer expects.
         worker_ids = Worker.request_worker(actor.db_id)
         ch = CommandChannel()
         ch.put_cmd(actor_id=actor.db_id, worker_ids=worker_ids, image=actor.image, tenant=args['tenant'])
         logger.debug("put new command on command channel to update actor.")
     return ok(result=actor.display(),
               msg="Actor updated successfully.")
Exemplo n.º 4
0
 def put(self, actor_id):
     """Update an actor's definition and redeploy it when its image changed.

     The actor is looked up in ``actors_store`` under the tenant-qualified id,
     rebuilt from the validated PUT arguments and written back to the store.
     A command is put on the command channel only when the image changed.

     :param actor_id: actor id taken from the request URL.
     :return: the ``ok`` response wrapping ``actor.display()``.
     :raises APIException: with a 404 code when the actor is not in the store.
     """
     dbid = Actor.get_dbid(g.tenant, actor_id)
     try:
         actor = Actor.from_db(actors_store[dbid])
     except KeyError:
         # the message previously ended with a stray, unmatched quote.
         raise APIException(
             "actor not found: {}".format(actor_id), 404)
     previous_image = actor.image
     args = self.validate_put(actor)
     args['tenant'] = g.tenant
     update_image = args['image'] != previous_image
     # an unchanged image keeps the stored status; a new image is re-submitted.
     args['status'] = SUBMITTED if update_image else actor.status
     args['api_server'] = g.api_server
     args['owner'] = g.user
     actor = Actor(**args)
     actors_store[actor.db_id] = actor.to_db()
     if update_image:
         ch = CommandChannel()
         ch.put_cmd(actor_id=actor.db_id, image=actor.image, tenant=args['tenant'])
     return ok(result=actor.display(),
               msg="Actor updated successfully.")
Exemplo n.º 5
0
 def __init__(self):
     """Set up the spawner from configuration and the environment.

     Reads the initial worker count from the ``workers`` config section, the
     secret from the ``_abaco_secret`` environment variable, creates a
     ``CommandChannel`` and reads the mandatory ``host_id`` from the
     ``spawner`` config section. Any failure reading ``host_id`` is logged
     as critical and re-raised.
     """
     init_count = Config.get('workers', 'init_count')
     self.num_workers = int(init_count)
     self.secret = os.environ.get('_abaco_secret')
     self.cmd_ch = CommandChannel()
     self.tot_workers = 0
     try:
         configured_host_id = Config.get('spawner', 'host_id')
     except Exception as err:
         message = "Spawner not configured with a host_id! Aborting! Exception: {}".format(err)
         logger.critical(message)
         raise err
     else:
         self.host_id = configured_host_id
Exemplo n.º 6
0
 def post(self):
     """Create a new actor from the validated POST arguments.

     The arguments are completed with empty executions, state and
     subscriptions and a ``SUBMITTED`` status; the actor is stored in
     ``actors_store`` and a command carrying its id and image is put on the
     command channel.

     :return: the ``ok`` response wrapping the new actor.
     """
     args = self.validate_post()
     initial_fields = (
         ('executions', {}),
         ('state', ''),
         ('subscriptions', []),
         ('status', SUBMITTED),
     )
     for field, value in initial_fields:
         args[field] = value
     new_actor = Actor(args)
     actors_store[new_actor.id] = new_actor.to_db()
     command_channel = CommandChannel()
     command_channel.put_cmd(actor_id=new_actor.id, image=new_actor.image)
     return ok(result=new_actor, msg="Actor created successfully.")
Exemplo n.º 7
0
 def post(self):
     """Create a new actor owned by the requesting user.

     The validated POST arguments are completed with the tenant, API server
     and user found on ``g``; the actor is stored under its db id, a command
     is put on the command channel and the user is given the ``'UPDATE'``
     permission on the actor.

     :return: the ``ok`` response wrapping ``actor.display()``.
     """
     args = self.validate_post()
     request_context = (
         ('tenant', g.tenant),
         ('api_server', g.api_server),
         ('owner', g.user),
     )
     for field, value in request_context:
         args[field] = value
     new_actor = Actor(**args)
     actors_store[new_actor.db_id] = new_actor.to_db()
     command_channel = CommandChannel()
     command_channel.put_cmd(
         actor_id=new_actor.db_id,
         image=new_actor.image,
         tenant=args['tenant'],
     )
     add_permission(g.user, new_actor.db_id, 'UPDATE')
     return ok(
         result=new_actor.display(),
         msg="Actor created successfully.",
         request=request,
     )
Exemplo n.º 8
0
 def post(self, actor_id):
     """Start new workers for an actor.

     Puts a command on the command channel asking for ``num`` additional
     workers (one when ``num`` is missing or zero) without stopping the
     existing ones.

     :param actor_id: key of the actor in ``actors_store``.
     :return: an ``ok`` response stating how many workers were scheduled.
     :raises APIException: with a 404 code when the actor is not in the store.
     """
     try:
         actor = Actor.from_db(actors_store[actor_id])
     except KeyError:
         # the message previously ended with a stray, unmatched quote.
         raise APIException("actor not found: {}".format(actor_id), 404)
     args = self.validate_post()
     # a missing or zero ``num`` falls back to a single worker.
     num = args.get("num") or 1
     ch = CommandChannel()
     ch.put_cmd(actor_id=actor.id, image=actor.image, num=num, stop_existing=False)
     return ok(result=None, msg="Scheduled {} new worker(s) to start.".format(num))
Exemplo n.º 9
0
 def ensure_one_worker(self):
     """Ask ``Worker.ensure_one_worker`` for a worker and schedule it if one was requested.

     When the call returns a worker id, a command for exactly that worker is
     put on the command channel (existing workers are not stopped).

     :return: a one-element list with the new worker id, or ``None`` when no
         worker id was returned.
     """
     requested_id = Worker.ensure_one_worker(self.db_id)
     if not requested_id:
         return None
     requested_ids = [requested_id]
     command_channel = CommandChannel()
     command_channel.put_cmd(
         actor_id=self.db_id,
         worker_ids=requested_ids,
         image=self.image,
         tenant=self.tenant,
         num=1,
         stop_existing=False,
     )
     return requested_ids
Exemplo n.º 10
0
 def post(self, actor_id):
     """Ensure a certain number of workers are running for an actor.

     Reads ``num`` from the validated POST arguments (a missing or zero value
     means one worker). When the actor has fewer workers than that, the
     missing workers are requested and a single command is put on the command
     channel for them.

     :param actor_id: actor id taken from the request URL.
     :return: an ``ok`` response whose message says how many workers were
         scheduled, or that the actor already had enough.
     :raises ResourceError: with a 404 code when the actor is not in the
         store or ``Worker.get_workers`` raises ``WorkerException``.
     """
     logger.debug("top of POST /actors/{}/workers.".format(actor_id))
     dbid = Actor.get_dbid(g.tenant, actor_id)
     try:
         actor = Actor.from_db(actors_store[dbid])
     except KeyError:
         logger.debug("did not find actor: {}.".format(actor_id))
         raise ResourceError("No actor found with id: {}.".format(actor_id), 404)
     args = self.validate_post()
     logger.debug("workers POST params validated. actor: {}.".format(actor_id))
     num = args.get('num')
     if not num:
         logger.debug("did not get a num: {}.".format(actor_id))
         num = 1
     # coerce once so the comparison and the arithmetic below use the same int.
     num = int(num)
     logger.debug("ensuring at least {} workers. actor: {}.".format(num, actor_id))
     try:
         workers = Worker.get_workers(dbid)
     except WorkerException as e:
         logger.debug("did not find workers for actor: {}.".format(actor_id))
         raise ResourceError(e.msg, 404)
     current_number_workers = len(workers.items())
     if current_number_workers < num:
         logger.debug("There were only {} workers for actor: {} so we're adding more.".format(current_number_workers,
                                                                                              actor_id))
         num_to_add = num - current_number_workers
         logger.info("adding {} more workers for actor {}".format(num_to_add, actor_id))
         # workers are stored under the tenant-qualified db id (see the
         # get_workers call above), so request them under that id as well.
         worker_ids = [Worker.request_worker(dbid) for _ in range(num_to_add)]
         logger.info("New worker ids: {}".format(worker_ids))
         ch = CommandChannel()
         ch.put_cmd(actor_id=actor.db_id,
                    worker_ids=worker_ids,
                    image=actor.image,
                    tenant=g.tenant,
                    num=num_to_add,
                    stop_existing=False)
         logger.info("Message put on command channel for new worker ids: {}".format(worker_ids))
         return ok(result=None,
                   msg="Scheduled {} new worker(s) to start. "
                       "Previously, there were {} worker(s).".format(num_to_add, current_number_workers))
     return ok(result=None, msg="Actor {} already had {} worker(s).".format(actor_id, num))
Exemplo n.º 11
0
 def ensure_one_worker(self):
     """Ask ``Worker.ensure_one_worker`` for a worker and schedule it if one was requested.

     When the call returns a worker id, a command for exactly that worker is
     put on the command channel (existing workers are not stopped).

     :return: a one-element list with the new worker id, or ``None`` when no
         worker id was returned.
     """
     logger.debug("top of Actor.ensure_one_worker().")
     requested_id = Worker.ensure_one_worker(self.db_id)
     logger.debug("Worker.ensure_one_worker returned worker_id: {}".format(requested_id))
     if not requested_id:
         logger.debug("Actor.ensure_one_worker() returning None.")
         return None
     requested_ids = [requested_id]
     logger.info(
         "Actor.ensure_one_worker() putting message on command channel for worker_id: {}".format(requested_id))
     command_channel = CommandChannel()
     command_channel.put_cmd(
         actor_id=self.db_id,
         worker_ids=requested_ids,
         image=self.image,
         tenant=self.tenant,
         num=1,
         stop_existing=False,
     )
     return requested_ids
Exemplo n.º 12
0
def scale_up(actor_id):
    """Request one more worker for an actor and put a start command on its command channel.

    :param actor_id: the actor's db id as bytes, of the form
        ``b"<tenant>_<id>"``; it is decoded and split on ``'_'`` to obtain
        the tenant.
    :return: the name of the command channel the command was put on (the
        actor's ``queue`` or ``'default'``), or ``None`` when anything failed.
    :raises ValueError: when the decoded id does not split into exactly two
        parts (the split happens outside the ``try`` block).
    """
    # NOTE(review): ids with more than one underscore make this unpacking
    # raise -- confirm tenants and actor ids can never contain '_'.
    tenant, _ = actor_id.decode('utf8').split('_')
    logger.debug(
        'METRICS Attempting to create a new worker for {}'.format(actor_id))
    try:
        # create a worker & add to this actor
        actor = Actor.from_db(actors_store[actor_id])
        worker_id = Worker.request_worker(tenant=tenant, actor_id=actor_id)
        logger.info("New worker id: {}".format(worker_id))
        channel_name = actor.queue or 'default'
        ch = CommandChannel(name=channel_name)
        try:
            ch.put_cmd(actor_id=actor.db_id,
                       worker_id=worker_id,
                       image=actor.image,
                       tenant=tenant,
                       stop_existing=False)
        finally:
            # close the channel even when put_cmd raises, so it is not leaked.
            ch.close()
        logger.debug(
            'METRICS Added worker successfully for {}'.format(actor_id))
        return channel_name
    except Exception as e:
        # best effort: the caller only sees None, so make the failure visible
        # at error level instead of hiding it in the debug log.
        logger.error("METRICS - SOMETHING BROKE: {} - {} - {}".format(
            type(e), e, e.args))
        return None
Exemplo n.º 13
0
 def put(self, actor_id):
     """Update an actor's definition and redeploy it when its image changed.

     The name, id, executions and state of the stored actor are carried over
     into the validated PUT arguments; the status is kept when the image is
     unchanged and reset to ``SUBMITTED`` otherwise, in which case a command
     is also put on the command channel.

     :param actor_id: key of the actor in ``actors_store``.
     :return: the ``ok`` response wrapping the updated actor.
     :raises APIException: with a 404 code when the actor is not in the store.
     """
     try:
         actor = Actor.from_db(actors_store[actor_id])
     except KeyError:
         # the message previously ended with a stray, unmatched quote.
         raise APIException(
             "actor not found: {}".format(actor_id), 404)
     args = self.validate_put()
     # these fields cannot be changed through a PUT; copy them from the store.
     for field in ('name', 'id', 'executions', 'state'):
         args[field] = actor[field]
     update_image = args['image'] != actor.image
     args['status'] = SUBMITTED if update_image else actor.status
     actor = Actor(args)
     actors_store[actor.id] = actor.to_db()
     if update_image:
         ch = CommandChannel()
         ch.put_cmd(actor_id=actor.id, image=actor.image)
     return ok(result=actor, msg="Actor updated successfully.")
Exemplo n.º 14
0
    def check_metrics(self, actor_ids):
        """Query Prometheus for each actor's message count and scale its workers.

        For every actor id the ``message_count_for_actor_<id>`` metric is
        queried and remembered in ``last_metric``. A count of one or more
        requests a new worker and puts a start command on the command channel;
        otherwise, when the actor has more than one worker, every worker whose
        status is ``'READY'`` is shut down. Errors in the scaling steps are
        logged and never propagated.

        :param actor_ids: iterable of actor db ids as bytes (they are decoded
            as UTF-8).
        :return: None
        """
        for actor_id in actor_ids:
            logger.debug("TOP OF CHECK METRICS")

            query = {
                'query': 'message_count_for_actor_{}'.format(actor_id.decode("utf-8").replace('-', '_')),
                'time': datetime.datetime.utcnow().isoformat() + "Z"
            }
            r = requests.get(PROMETHEUS_URL + '/api/v1/query', params=query)
            data = json.loads(r.text)['data']['result']

            change_rate = 0
            try:
                previous_data = last_metric[actor_id]
            except KeyError:
                logger.info("No previous data yet for new actor {}".format(actor_id))
            else:
                try:
                    change_rate = int(data[0]['value'][1]) - int(previous_data[0]['value'][1])
                except (IndexError, KeyError, TypeError, ValueError):
                    logger.debug("Could not calculate change rate.")
            # NOTE(review): change_rate is only reported; no decision uses it.
            logger.debug("METRICS change rate: {}".format(change_rate))

            last_metric.update({actor_id: data})
            try:
                logger.debug("METRICS current message count: {}".format(data[0]['value'][1]))
                message_count = int(data[0]['value'][1])
                if message_count >= 1:
                    # Add a worker if message count reaches a given number
                    tenant, aid = actor_id.decode('utf8').split('_')
                    logger.debug('METRICS Attempting to create a new worker for {}'.format(actor_id))
                    try:
                        # create a worker & add to this actor
                        actor = Actor.from_db(actors_store[actor_id])
                        worker_ids = [Worker.request_worker(tenant=tenant, actor_id=aid)]
                        logger.info("New worker id: {}".format(worker_ids[0]))
                        ch = CommandChannel()
                        try:
                            ch.put_cmd(actor_id=actor.db_id,
                                       worker_ids=worker_ids,
                                       image=actor.image,
                                       tenant=tenant,
                                       num=1,
                                       stop_existing=False)
                        finally:
                            # close the channel even when put_cmd raises.
                            ch.close()
                        logger.debug('METRICS Added worker successfully for {}'.format(actor_id))
                    except Exception as e:
                        logger.debug("METRICS - SOMETHING BROKE: {} - {} - {}".format(type(e), e, e.args))
                else:
                    logger.debug("METRICS made it to scale down block")
                    # Check the number of workers for this actor before deciding to scale down
                    workers = Worker.get_workers(actor_id)
                    logger.debug('METRICS NUMBER OF WORKERS: {}'.format(len(workers)))
                    try:
                        if len(workers) == 1:
                            logger.debug("METRICS only one worker, won't scale down")
                        else:
                            # NOTE(review): this shuts down every READY worker,
                            # not all but one -- confirm that is intended.
                            while len(workers) > 0:
                                logger.debug('METRICS made it STATUS check')
                                worker = workers.popitem()[1]
                                logger.debug('METRICS SCALE DOWN current worker: {}'.format(worker['status']))
                                # check status of the worker is ready
                                if worker['status'] == 'READY':
                                    logger.debug("METRICS I MADE IT")
                                    # scale down
                                    try:
                                        shutdown_worker(worker['id'])
                                    except Exception as e:
                                        logger.debug('METRICS ERROR shutting down worker: {} - {} - {}'.format(type(e), e, e.args))
                                    else:
                                        # report success only when the shutdown really succeeded.
                                        logger.debug('METRICS shut down worker {}'.format(worker['id']))
                    except IndexError:
                        logger.debug('METRICS only one worker found for actor {}. '
                                     'Will not scale down'.format(actor_id))
                    except Exception as e:
                        logger.debug("METRICS SCALE DOWN FAILED: {}".format(e))
            except Exception as e:
                logger.debug("METRICS - ANOTHER ERROR: {} - {} - {}".format(type(e), e, e.args))
Exemplo n.º 15
0
    def put(self, actor_id):
        """Update an actor's definition and redeploy it when required.

        The actor is looked up under the tenant-qualified id, rebuilt from
        the validated PUT arguments and written back to ``actors_store``. The
        owner is always kept from the stored actor. When the image changed,
        or the ``force`` argument is set, the status is reset to
        ``SUBMITTED``, a worker is requested and a command is put on the
        command channel. A requesting user other than the owner is given the
        ``UPDATE`` permission on the actor.

        :param actor_id: actor id taken from the request URL.
        :return: the ``ok`` response wrapping ``actor.display()``.
        :raises ResourceError: with a 404 code when the actor is not in the store.
        """
        logger.debug("top of PUT /actors/{}".format(actor_id))
        dbid = Actor.get_dbid(g.tenant, actor_id)
        try:
            actor = Actor.from_db(actors_store[dbid])
        except KeyError:
            logger.debug("did not find actor {} in store.".format(dbid))
            raise ResourceError(
                "No actor found with id: {}.".format(actor_id), 404)
        previous_image = actor.image
        previous_status = actor.status
        previous_owner = actor.owner
        args = self.validate_put(actor)
        logger.debug("PUT args validated successfully.")
        args['tenant'] = g.tenant
        # user can force an update by setting the force param:
        update_image = args.get('force')
        if not update_image and args['image'] == previous_image:
            logger.debug("new image is the same and force was false. not updating actor.")
            logger.debug("Setting status to the actor's previous status which is: {}".format(previous_status))
            args['status'] = previous_status
        else:
            update_image = True
            args['status'] = SUBMITTED
            logger.debug("new image is different. updating actor.")
        args['api_server'] = g.api_server

        # we do not allow a PUT to override the owner in case the PUT is issued by another user
        args['owner'] = previous_owner
        use_container_uid = args.get('use_container_uid')
        if Config.get('web', 'case') == 'camel':
            use_container_uid = args.get('useContainerUid')
        try:
            use_tas = Config.get('workers', 'use_tas_uid')
        except configparser.NoOptionError:
            logger.debug("no use_tas_uid config.")
            use_tas = False
        else:
            # only complain about the type when the option is really set; the
            # False default above used to trigger this error log as well.
            if hasattr(use_tas, 'lower'):
                use_tas = use_tas.lower() == 'true'
            else:
                logger.error("use_tas_uid configured but not as a string. use_tas_uid: {}".format(use_tas))
        logger.debug("use_tas={}. user_container_uid={}".format(use_tas, use_container_uid))
        if use_tas and not use_container_uid:
            uid, gid, tasdir = get_tas_data(g.user, g.tenant)
            if uid and gid:
                args['uid'] = uid
                args['gid'] = gid
            if tasdir:
                args['tasdir'] = tasdir
        args['mounts'] = get_all_mounts(args)
        args['last_update_time'] = get_current_utc_time()
        logger.debug("update args: {}".format(args))
        actor = Actor(**args)
        actors_store[actor.db_id] = actor.to_db()
        logger.info("updated actor {} stored in db.".format(actor_id))
        if update_image:
            worker_ids = [Worker.request_worker(tenant=g.tenant, actor_id=actor.db_id)]
            ch = CommandChannel()
            try:
                ch.put_cmd(actor_id=actor.db_id, worker_ids=worker_ids, image=actor.image, tenant=args['tenant'])
            finally:
                # close the channel even when put_cmd raises.
                ch.close()
            logger.debug("put new command on command channel to update actor.")
        # the PUT could have been issued by a user other than the owner; that
        # user is given the UPDATE permission on the actor.
        if previous_owner != g.user:
            set_permission(g.user, actor.db_id, UPDATE)
        return ok(result=actor.display(),
                  msg="Actor updated successfully.")
Exemplo n.º 16
0
 def __init__(self):
     """Read the initial worker count from the config and create the command channel."""
     init_count = Config.get('workers', 'init_count')
     self.num_workers = int(init_count)
     self.cmd_ch = CommandChannel()
Exemplo n.º 17
0
class Spawner(object):
    """Consume commands from the command channel and start/stop actor workers.

    Each command is a mapping with at least ``actor_id`` and ``image`` and,
    optionally, ``num`` (number of workers to start) and ``stop_existing``.
    The workers of an actor are kept in ``workers_store`` as a JSON list.
    """

    def __init__(self):
        """Read the default worker count from the config and create the command channel."""
        self.num_workers = int(Config.get('workers', 'init_count'))
        self.cmd_ch = CommandChannel()

    def run(self):
        """Process commands from the command channel forever."""
        while True:
            cmd = self.cmd_ch.get()
            self.process(cmd)

    def stop_workers(self, actor_id):
        """Stop existing workers; used when updating an actor's image.

        :param actor_id: key of the actor's workers in ``workers_store``.
        """
        try:
            workers = json.loads(workers_store[actor_id])
            print("Found existing workers: {}".format(str(workers)))
        except KeyError:
            print("No existing workers.")
            workers = []

        # if there are existing workers, we need to close the actor message channel and
        # gracefully shutdown the existing worker processes.
        if workers:
            # first, close the actor msg channel to prevent any new messages from being pulled
            # by the old workers.
            actor_ch = ActorMsgChannel(actor_id)
            actor_ch.close()

            # now, send messages to workers for a graceful shutdown:
            for worker in workers:
                ch = WorkerChannel(name=worker['ch_name'])
                ch.put('stop')

    def process(self, cmd):
        """Start the workers requested by ``cmd`` and record them in ``workers_store``.

        :param cmd: mapping with ``actor_id`` and ``image``; ``stop_existing``
            defaults to True and ``num`` to the configured worker count.
        """
        print("Processing cmd:{}".format(str(cmd)))
        actor_id = cmd['actor_id']
        image = cmd['image']
        stop_existing = cmd.get('stop_existing', True)
        num_workers = cmd.get('num', self.num_workers)
        print("Actor id:{}".format(actor_id))
        try:
            _, anon_channels, new_workers = self.start_workers(actor_id, image, num_workers)
        except SpawnerException:
            # for now, start_workers will do clean up for a SpawnerException, so we just need
            # to return back to the run loop.
            return
        print("Created new workers: {}".format(str(new_workers)))

        # stop any existing workers:
        if stop_existing:
            self.stop_workers(actor_id)

        # tell new workers to subscribe to the actor channel.
        for channel in anon_channels:
            channel.put({'status': 'ok', 'actor_id': actor_id})

        if stop_existing:
            workers_store[actor_id] = json.dumps(new_workers)
            return
        try:
            workers = json.loads(workers_store[actor_id])
        except KeyError:
            # the actor has no stored workers yet; start from an empty list
            # instead of failing after the new workers were already started.
            workers = []
        workers.extend(new_workers)
        workers_store[actor_id] = json.dumps(workers)

    def start_workers(self, actor_id, image, num_workers):
        """Start ``num_workers`` workers for an image.

        On a ``SpawnerException`` the actor is put in ``ERROR`` status, the
        workers started so far are passed to ``kill_worker`` and a new
        ``SpawnerException`` is raised.

        :return: a ``(channels, anon_channels, workers)`` tuple of lists.
        :raises SpawnerException: when any worker fails to start.
        """
        # report the number actually requested, not the configured default.
        print("starting {} workers. actor_id: {} image: {}".format(str(num_workers), actor_id, image))
        channels = []
        anon_channels = []
        workers = []
        try:
            for i in range(num_workers):
                print("starting worker {}".format(str(i)))
                ch, anon_ch, worker = self.start_worker(image)
                print("channel for worker {} is: {}".format(str(i), ch._name))
                channels.append(ch)
                anon_channels.append(anon_ch)
                workers.append(worker)
        except SpawnerException as e:
            print("Caught SpawnerException:{}".format(str(e)))
            # in case of an error, put the actor in error state and kill all workers
            Actor.set_status(actor_id, ERROR)
            for worker in workers:
                try:
                    self.kill_worker(worker)
                except DockerError as docker_err:
                    print("Received DockerError trying to kill worker: {}".format(str(docker_err)))
            raise SpawnerException()
        return channels, anon_channels, workers

    def start_worker(self, image):
        """Start one worker for ``image`` and wait for its acknowledgement.

        :return: a ``(channel, reply_to, worker)`` tuple, where ``reply_to``
            is taken from the worker's first message.
        :raises SpawnerException: when the worker reports a non-ok status.
        """
        ch = WorkerChannel()
        # start an actor executor container and wait for a confirmation that image was pulled.
        worker = run_worker(image, ch._name)
        print("worker started successfully, waiting on ack that image was pulled...")
        result = ch.get()
        if result['value']['status'] == 'ok':
            print("received ack from worker.")
            return ch, result['reply_to'], worker
        print("Got an error status from worker: {}. Raising an exception.".format(str(result)))
        raise SpawnerException()

    def kill_worker(self, worker):
        """Placeholder: killing a worker is not implemented; does nothing."""
        pass
Exemplo n.º 18
0
 def __init__(self):
     """Load the initial worker count and the secret, then create the command channel.

     The worker count comes from the ``workers`` config section and the
     secret from the ``_abaco_secret`` environment variable (``None`` when
     it is not set).
     """
     init_count = Config.get('workers', 'init_count')
     self.num_workers = int(init_count)
     self.secret = os.environ.get('_abaco_secret')
     self.cmd_ch = CommandChannel()
Exemplo n.º 19
0
 def __init__(self):
     """Read the initial worker count from the config and create the command channel."""
     init_count = Config.get("workers", "init_count")
     self.num_workers = int(init_count)
     self.cmd_ch = CommandChannel()
Exemplo n.º 20
0
def create_gauges(actor_ids):
    """
    Creates or updates the Prometheus gauges for each actor id: one gauge tracks the number of
    pending messages in the actor's queue and one tracks the actor's number of workers. The size
    of the 'default' command channel is also written to the command gauge.
    :param actor_ids: list of actors that should be processed. Does not include stateful actors or
    actors in a shutting down state.
    :return: a tuple (actor_ids, inbox_lengths, cmd_length) where inbox_lengths maps each processed
    actor id to its message queue length and cmd_length is the 'default' command channel length.
    """
    logger.debug("top of create_gauges; actor_ids: {}".format(actor_ids))
    # dictionary mapping actor_ids to their message queue lengths
    inbox_lengths = {}
    for actor_id in actor_ids:
        logger.debug("top of for loop for actor_id: {}".format(actor_id))
        # first, make sure the actor still exists in the actor store
        try:
            actors_store[actor_id]
        except KeyError:
            logger.error(
                f"actor {actor_id} does not exist in store; continuing to next actor."
            )
            continue

        # If the actor doesn't have a message gauge, add one
        if actor_id not in message_gauges:
            try:
                message_gauge = Gauge(
                    'message_count_for_actor_{}'.format(
                        actor_id.replace('-', '_')),
                    'Number of messages for actor {}'.format(
                        actor_id.replace('-', '_')))
                message_gauges.update({actor_id: message_gauge})
                logger.debug('Created gauge {}'.format(message_gauge))
            except Exception as e:
                logger.error(
                    "got exception trying to create/instantiate the gauge; "
                    "actor {}; exception: {}".format(actor_id, e))
                message_gauge = None
        else:
            # Otherwise, get this actor's existing gauge
            try:
                message_gauge = message_gauges[actor_id]
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate an existing gauge; "
                    "actor: {}: exception:{}".format(actor_id, e))
                message_gauge = None

        # Read this actor's current # of messages
        try:
            ch = ActorMsgChannel(actor_id=actor_id)
            try:
                msg_length = len(ch._queue._queue)
            finally:
                # close the channel even when reading the queue length fails.
                ch.close()
        except Exception as e:
            logger.error(
                "Exception connecting to ActorMsgChannel: {}".format(e))
            raise
        # add the actor's current message queue length to the inbox_lengths in-memory variable
        inbox_lengths[actor_id] = msg_length
        # if we were able to create the gauge, set it to the current message:
        if message_gauge is not None:
            try:
                message_gauge.set(msg_length)
            except Exception as e:
                logger.error(
                    f"Got exception trying to set the messages on the gauge for actor: {actor_id}; "
                    f"exception: {e}")
        logger.debug("METRICS: {} messages found for actor: {}.".format(
            msg_length, actor_id))

        # add a worker gauge for this actor if one does not exist. The worker gauge has its
        # own variable: sharing one with the message gauge meant that a failed creation here
        # wrote the worker count into the message gauge.
        if actor_id not in worker_gaueges:
            try:
                worker_gauge = Gauge(
                    'worker_count_for_actor_{}'.format(
                        actor_id.replace('-', '_')),
                    'Number of workers for actor {}'.format(
                        actor_id.replace('-', '_')))
                worker_gaueges.update({actor_id: worker_gauge})
                logger.debug('Created worker gauge {}'.format(worker_gauge))
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate the Worker Gauge: {}".
                    format(e))
                worker_gauge = None
        else:
            # Otherwise, get the worker gauge that already exists
            worker_gauge = worker_gaueges[actor_id]

        # Update this actor's worker count
        workers = Worker.get_workers(actor_id)
        worker_count = len(workers)
        if worker_gauge is not None:
            try:
                worker_gauge.set(worker_count)
            except Exception as e:
                logger.error(
                    f"got exception trying to set the worker gauge for actor {actor_id}; exception: {e}"
                )
        logger.debug(
            f"METRICS: {worker_count} workers found for actor: {actor_id}."
        )

    # TODO -- this code needs to be fixed. What follows is only a partial fix; what I think we want to do
    # is set the length of all of the different command channels once at the end of this loop. What was
    # happening instead was that it was only setting one of the command channel's lengths -- whatever command
    # channel happened to belong to the last actor in the loop.
    channel_name = 'default'
    ch = CommandChannel(name=channel_name)
    try:
        cmd_length = len(ch._queue._queue)
        command_gauge.labels(channel_name).set(cmd_length)
        # log the measured size rather than the gauge object itself.
        logger.debug(
            f"METRICS COMMAND CHANNEL {channel_name} size: {cmd_length}")
    finally:
        ch.close()

    # Return actor_ids so we don't have to query for them again later
    return actor_ids, inbox_lengths, cmd_length
Exemplo n.º 21
0
 def __init__(self):
     """Load the initial worker count and the secret, then create the command channel.

     The worker count comes from the ``workers`` config section and the
     secret from the ``_abaco_secret`` environment variable (``None`` when
     it is not set).
     """
     init_count = Config.get('workers', 'init_count')
     self.num_workers = int(init_count)
     self.secret = os.environ.get('_abaco_secret')
     self.cmd_ch = CommandChannel()
Exemplo n.º 22
0
class Spawner(object):
    """Start worker containers for actors in response to commands on the command channel.

    A command is a dict with the keys ``actor_id``, ``worker_ids``, ``image`` and
    ``tenant``, and optionally ``stop_existing`` (defaults to True) and ``num``
    (defaults to ``self.num_workers``).
    """

    def __init__(self):
        # default worker count, used when a command carries no 'num' key.
        self.num_workers = int(Config.get('workers', 'init_count'))
        # secret passed along when requesting clients for new workers.
        self.secret = os.environ.get('_abaco_secret')
        self.cmd_ch = CommandChannel()

    def run(self):
        """Read commands from the command channel forever, processing one at a time."""
        while True:
            cmd = self.cmd_ch.get()
            self.process(cmd)

    def stop_workers(self, actor_id, worker_ids):
        """Stop existing workers; used when updating an actor's image.

        :param actor_id: key into ``workers_store`` of the actor whose workers are stopped.
        :param worker_ids: ids of the newly started workers; these are not sent 'stop'.
        """
        try:
            workers_dict = workers_store[actor_id]
        except KeyError:
            workers_dict = {}

        # nothing to do when no workers are recorded for this actor.
        if len(workers_dict.items()) == 0:
            return

        # first, close the actor msg channel to prevent any new messages from being pulled
        # by the old workers.
        actor_ch = ActorMsgChannel(actor_id)
        actor_ch.close()
        # now, send messages to workers for a graceful shutdown:
        for _, worker in workers_dict.items():
            # don't stop the new workers:
            if worker['id'] not in worker_ids:
                ch = WorkerChannel(name=worker['ch_name'])
                ch.put('stop')

    def process(self, cmd):
        """Start the workers requested by ``cmd`` and send each one its subscribe message.

        Returns early (after start_workers has done its own clean up) when starting the
        workers raises SpawnerException.

        :param cmd: command dict; see the class docstring for its keys.
        """
        print("Processing cmd:{}".format(str(cmd)))
        actor_id = cmd['actor_id']
        worker_ids = cmd['worker_ids']
        image = cmd['image']
        tenant = cmd['tenant']
        stop_existing = cmd.get('stop_existing', True)
        num_workers = cmd.get('num', self.num_workers)
        print("Actor id:{}".format(actor_id))
        try:
            # the first element (the per-worker channels) is not needed here.
            _, anon_channels, new_workers = self.start_workers(
                actor_id, worker_ids, image, tenant, num_workers)
        except SpawnerException:
            # for now, start_workers will do clean up for a SpawnerException, so we just need
            # to return back to the run loop.
            return
        print("Created new workers: {}".format(str(new_workers)))

        # add workers to store first so that the records will be there when the workers go
        # to update their status
        if stop_existing:
            # stop any existing workers; the actor's collection then is just the new workers.
            self.stop_workers(actor_id, worker_ids)
            workers_store[actor_id] = new_workers
        else:
            for _, worker in new_workers.items():
                Worker.add_worker(actor_id, worker)

        # Tell new worker to subscribe to the actor channel.
        # If abaco is configured to generate clients for the workers, generate them now
        # and send new workers their clients.
        generate_clients = Config.get('workers', 'generate_clients').lower()
        # new_workers is a dictionary of dictionaries filled in the same order as
        # anon_channels, so the idx^th key of new_workers belongs to the idx^th channel.
        # Build the key list once rather than on every iteration.
        worker_keys = list(new_workers)
        for idx, channel in enumerate(anon_channels):
            # fields common to every message sent to a new worker.
            reply = {'status': 'ok', 'actor_id': actor_id, 'tenant': tenant}
            if generate_clients != 'true':
                print("Not generating clients. Config value was: {}".format(
                    generate_clients))
                reply['client'] = 'no'
                channel.put(reply)
                continue

            print("Getting client for worker {}".format(idx))
            client_ch = ClientsChannel()
            client_msg = client_ch.request_client(
                tenant=tenant,
                actor_id=actor_id,
                worker_id=new_workers[worker_keys[idx]]['id'],
                secret=self.secret)
            # we need to ignore errors when generating clients because it's possible it is not
            # set up for a specific tenant. we log it instead.
            if client_msg.get('status') == 'error':
                print("Error generating client: {}".format(
                    client_msg.get('message')))
                reply['client'] = 'no'
            # else, client was generated successfully:
            else:
                print("Got a client: {}, {}, {}".format(
                    client_msg['client_id'], client_msg['access_token'],
                    client_msg['refresh_token']))
                reply.update({
                    'client': 'yes',
                    'client_id': client_msg['client_id'],
                    'client_secret': client_msg['client_secret'],
                    'access_token': client_msg['access_token'],
                    'refresh_token': client_msg['refresh_token'],
                    'api_server': client_msg['api_server'],
                })
            channel.put(reply)

        print("Done processing command.")

    def start_workers(self, actor_id, worker_ids, image, tenant, num_workers):
        """Start ``num_workers`` workers, using ``worker_ids[i]`` as the id of the i-th one.

        :return: tuple ``(channels, anon_channels, workers)``; ``workers`` maps worker id to
            the object returned by start_worker, the two lists are in start order.
        :raises SpawnerException: when any worker fails to start; the actor is put in the
            ERROR status and kill_worker is called for every worker started so far.
        """
        # report the number actually requested, which may differ from self.num_workers.
        print("starting {} workers. actor_id: {} image: {}".format(
            str(num_workers), actor_id, image))
        channels = []
        anon_channels = []
        workers = {}
        try:
            for i in range(num_workers):
                worker_id = worker_ids[i]
                print("starting worker {} with id: {}".format(i, worker_id))
                ch, anon_ch, worker = self.start_worker(
                    image, tenant, worker_id)
                print("channel for worker {} is: {}".format(str(i), ch.name))
                channels.append(ch)
                anon_channels.append(anon_ch)
                workers[worker_id] = worker
        except SpawnerException as e:
            print("Caught SpawnerException:{}".format(str(e)))
            # in case of an error, put the actor in error state and kill all workers
            Actor.set_status(actor_id, ERROR, status_message=e.message)
            # iterating the dict yields its keys, i.e. the worker ids.
            for started_id in workers:
                try:
                    self.kill_worker(started_id)
                # use a distinct name: `except ... as e` would unbind the outer `e`
                # that the raise below still needs.
                except DockerError as docker_err:
                    print("Received DockerError trying to kill worker: {}".
                          format(str(docker_err)))
            raise SpawnerException(message=e.message)
        return channels, anon_channels, workers

    def start_worker(self, image, tenant, worker_id):
        """Start a single worker and wait for its first message on a new WorkerChannel.

        :return: tuple ``(ch, result['reply_to'], worker)`` once the worker answers 'ok'.
        :raises SpawnerException: when the worker reports an error or a non-'ok' status.
        """
        ch = WorkerChannel()
        # start an actor executor container and wait for a confirmation that image was pulled.
        worker_dict = run_worker(image, ch.name, worker_id)
        worker = Worker(tenant=tenant, **worker_dict)
        print(
            "worker started successfully, waiting on ack that image was pulled..."
        )
        result = ch.get()
        if result.get('status') == 'error':
            # there was a problem pulling the image; put the actor in an error state:
            msg = "got an error back from the worker. Message: {}".format(
                result)
            print(msg)
            if 'msg' in result:
                raise SpawnerException(message=result['msg'])
            raise SpawnerException(
                message="Internal error starting worker process.")
        if result['value']['status'] == 'ok':
            print("received ack from worker.")
            return ch, result['reply_to'], worker
        msg = "Got an error status from worker: {}. Raising an exception.".format(
            str(result))
        print(msg)
        raise SpawnerException(msg)

    def kill_worker(self, worker):
        """Placeholder: currently does nothing with the worker id it is given."""
        pass
Exemplo n.º 23
0
class Spawner(object):
    """Start worker containers for actors in response to commands on the command channel.

    A command is a dict with the keys ``actor_id``, ``worker_ids``, ``image`` and
    ``tenant``, and optionally ``stop_existing`` (defaults to True) and ``num``
    (defaults to ``self.num_workers``). Before pulling a command the spawner waits
    until the number of workers recorded for its ``host_id`` is below MAX_WORKERS.
    """

    def __init__(self):
        # default worker count, used when a command carries no 'num' key.
        self.num_workers = int(Config.get('workers', 'init_count'))
        # secret passed along when requesting clients for new workers.
        self.secret = os.environ.get('_abaco_secret')
        self.cmd_ch = CommandChannel()
        # refreshed by get_tot_workers().
        self.tot_workers = 0
        try:
            self.host_id = Config.get('spawner', 'host_id')
        except Exception as e:
            logger.critical("Spawner not configured with a host_id! Aborting! Exception: {}".format(e))
            raise e

    def run(self):
        """Loop forever: wait while overloaded, then take, ack and process one command."""
        while True:
            # check resource threshold before subscribing
            while self.overloaded():
                logger.critical("METRICS - SPAWNER FOR HOST {} OVERLOADED!!!".format(self.host_id))
                # self.update_status to OVERLOADED
                time.sleep(5)
            cmd, msg_obj = self.cmd_ch.get_one()
            # directly ack the messages from the command channel; problems generated from starting workers are
            # handled downstream; e.g., by setting the actor in an ERROR state; command messages should not be re-queued
            msg_obj.ack()
            self.process(cmd)

    def get_tot_workers(self):
        """Recount the workers in ``workers_store`` whose 'host_id' equals this spawner's.

        The count is stored on ``self.tot_workers`` and also returned.
        """
        logger.debug("top of get_tot_workers")
        self.tot_workers = 0
        logger.debug('spawner host_id: {}'.format(self.host_id))
        for _, actor_workers in workers_store.items():
            for _, worker in actor_workers.items():
                if worker.get('host_id') == self.host_id:
                    self.tot_workers += 1
        logger.debug("returning total workers: {}".format(self.tot_workers))
        return self.tot_workers

    def overloaded(self):
        """Return True when this host has MAX_WORKERS or more workers, otherwise False."""
        logger.debug("top of overloaded")
        self.get_tot_workers()
        logger.info("total workers for this host: {}".format(self.tot_workers))
        # always return a real bool (previously the not-overloaded path returned None).
        return self.tot_workers >= MAX_WORKERS

    def stop_workers(self, actor_id, worker_ids):
        """Stop existing workers; used when updating an actor's image.

        :param actor_id: key into ``workers_store`` of the actor whose workers are stopped.
        :param worker_ids: ids of the newly started workers; these are not sent a stop message.
        """
        logger.debug("Top of stop_workers() for actor: {}.".format(actor_id))
        try:
            workers_dict = workers_store[actor_id]
        except KeyError:
            logger.debug("workers_store had no workers for actor: {}".format(actor_id))
            workers_dict = {}

        # if there are existing workers, we need to close the actor message channel and
        # gracefully shutdown the existing worker processes.
        if len(workers_dict.items()) > 0:
            logger.info("Found {} workers to stop.".format(len(workers_dict.items())))
            # first, close the actor msg channel to prevent any new messages from being pulled
            # by the old workers.
            actor_ch = ActorMsgChannel(actor_id)
            actor_ch.close()
            logger.info("Actor channel closed for actor: {}".format(actor_id))
            # now, send messages to workers for a graceful shutdown:
            for _, worker in workers_dict.items():
                # don't stop the new workers:
                if worker['id'] not in worker_ids:
                    ch = WorkerChannel(worker_id=worker['id'])
                    # since this is an update, there are new workers being started, so
                    # don't delete the actor msg channel:
                    ch.put('stop-no-delete')
                    logger.info("Sent 'stop-no-delete' message to worker_id: {}".format(worker['id']))
                    ch.close()
        else:
            logger.info("No workers to stop.")

    def process(self, cmd):
        """Main spawner method for processing a command from the CommandChannel.

        Starts the requested workers, optionally stops the existing ones, records the new
        workers in ``workers_store``, sends every new worker its subscribe message (with
        client credentials when client generation is enabled) and finally deletes the
        channels that were used to talk to the new workers.

        Returns early when start_workers raises SpawnerException or when requesting a
        client times out; in the latter case the actor is put in the ERROR status.

        :param cmd: command dict; see the class docstring for its keys.
        """
        logger.info("Spawner processing new command:{}".format(cmd))
        actor_id = cmd['actor_id']
        worker_ids = cmd['worker_ids']
        image = cmd['image']
        tenant = cmd['tenant']
        stop_existing = cmd.get('stop_existing', True)
        num_workers = cmd.get('num', self.num_workers)
        # one placeholder per argument so every value is printed under its own label.
        logger.info("command params: actor_id: {} worker_ids: {} image: {} tenant: {} "
                    "stop_existing: {} num_workers: {}".format(
                        actor_id, worker_ids, image, tenant, stop_existing, num_workers))
        try:
            new_channels, anon_channels, new_workers = self.start_workers(actor_id,
                                                                          worker_ids,
                                                                          image,
                                                                          tenant,
                                                                          num_workers)
        except SpawnerException:
            # for now, start_workers will do clean up for a SpawnerException, so we just need
            # to return back to the run loop.
            logger.info("Spawner returning to main run loop.")
            return
        logger.info("Created new workers: {}".format(new_workers))

        # stop any existing workers:
        if stop_existing:
            logger.info("Stopping existing workers: {}".format(worker_ids))
            self.stop_workers(actor_id, worker_ids)

        # add workers to store first so that the records will be there when the workers go
        # to update their status
        if not stop_existing:
            # if we're not stopping the existing workers, we need to add each worker to the
            # actor's collection.
            for _, worker in new_workers.items():
                logger.info("calling add_worker for worker: {}.".format(worker))
                Worker.add_worker(actor_id, worker)
        else:
            # since we're stopping the existing workers, the actor's collection should just
            # be equal to the new_workers.
            workers_store[actor_id] = new_workers
            logger.info("workers_store set to new_workers: {}.".format(new_workers))

        # Tell new worker to subscribe to the actor channel.
        # If abaco is configured to generate clients for the workers, generate them now
        # and send new workers their clients.
        generate_clients = Config.get('workers', 'generate_clients').lower()
        logger.info("Sending messages to new workers over anonymous channels to subscribe to inbox.")
        # new_workers is a dictionary of dictionaries filled in the same order as
        # anon_channels; list(d) creates a list of keys for a dictionary d. hence, the idx^th
        # entry of list(new_workers) is the key for the idx^th channel. Built once, up front.
        worker_keys = list(new_workers)
        for idx, channel in enumerate(anon_channels):
            if generate_clients == 'true':
                worker_id = new_workers[worker_keys[idx]]['id']
                logger.info("Getting client for worker number {}, id: {}".format(idx, worker_id))
                client_ch = ClientsChannel()
                try:
                    client_msg = client_ch.request_client(tenant=tenant,
                                                          actor_id=actor_id,
                                                          worker_id=worker_id,
                                                          secret=self.secret)
                except ChannelTimeoutException as e:
                    logger.error("Got a ChannelTimeoutException trying to generate a client for "
                                 "actor_id: {}; worker_id: {}; exception: {}".format(actor_id, worker_id, e))
                    # put actor in an error state and return
                    self.error_out_actor(actor_id, worker_id, "Abaco was unable to generate an OAuth client for a new "
                                                              "worker for this actor. System administrators have been notified.")
                    client_ch.close()
                    return
                client_ch.close()
                # we need to ignore errors when generating clients because it's possible it is not set up for a specific
                # tenant. we log it instead.
                if client_msg.get('status') == 'error':
                    logger.error("Error generating client: {}".format(client_msg.get('message')))
                    channel.put({'status': 'ok',
                                 'actor_id': actor_id,
                                 'tenant': tenant,
                                 'client': 'no'})
                    logger.debug("Sent OK message over anonymous worker channel.")
                # else, client was generated successfully:
                else:
                    logger.info("Got a client: {}, {}, {}".format(client_msg['client_id'],
                                                                  client_msg['access_token'],
                                                                  client_msg['refresh_token']))
                    channel.put({'status': 'ok',
                                 'actor_id': actor_id,
                                 'tenant': tenant,
                                 'client': 'yes',
                                 'client_id': client_msg['client_id'],
                                 'client_secret': client_msg['client_secret'],
                                 'access_token': client_msg['access_token'],
                                 'refresh_token': client_msg['refresh_token'],
                                 'api_server': client_msg['api_server'],
                                 })
                    logger.debug("Sent OK message AND client over anonymous worker channel.")
            else:
                logger.info("Not generating clients. Config value was: {}".format(generate_clients))
                channel.put({'status': 'ok',
                             'actor_id': actor_id,
                             'tenant': tenant,
                             'client': 'no'})
                logger.debug("Sent OK message over anonymous worker channel.")
            # @TODO -
            # delete the anonymous channel from this thread but sleep first to avoid the race condition.
            time.sleep(1.5)
            channel.delete()

        # due to the race condition deleting channels (potentially before all workers have received all messages)
        # we put a sleep here.
        time.sleep(1)
        for ch in new_channels:
            try:
                # the new_channels are the spawnerworker channels so they can be deleted.
                ch.delete()
            except Exception as e:
                logger.error("Got exception trying to delete spawnerworker channel: {}".format(e))
        logger.info("Done processing command.")

    def start_workers(self, actor_id, worker_ids, image, tenant, num_workers):
        """Start ``num_workers`` workers, using ``worker_ids[i]`` as the id of the i-th one.

        :return: tuple ``(channels, anon_channels, workers)``; ``workers`` maps worker id to
            the Worker built by start_worker, the two lists are in start order.
        :raises SpawnerException: when a worker fails to start; error_out_actor is called
            for the failing worker id before re-raising.
        """
        # report the number actually requested, which may differ from self.num_workers.
        logger.info("starting {} workers. actor_id: {} image: {}".format(str(num_workers), actor_id, image))
        channels = []
        anon_channels = []
        workers = {}
        try:
            for i in range(num_workers):
                worker_id = worker_ids[i]
                logger.info("starting worker {} with id: {}".format(i, worker_id))
                ch, anon_ch, worker = self.start_worker(image, tenant, actor_id, worker_id)
                logger.debug("channel for worker {} is: {}".format(str(i), ch.name))
                channels.append(ch)
                anon_channels.append(anon_ch)
                workers[worker_id] = worker
        except SpawnerException as e:
            logger.info("Caught SpawnerException:{}".format(str(e)))
            # in case of an error, put the actor in error state and kill the failing worker.
            # NOTE(review): workers started earlier in this loop are not killed here -- confirm
            # whether that is intended.
            self.error_out_actor(actor_id, worker_id, e.message)
            raise SpawnerException(message=e.message)
        return channels, anon_channels, workers

    def start_worker(self, image, tenant, actor_id, worker_id):
        """Start a single worker and wait for its first message on a SpawnerWorkerChannel.

        run_worker is retried every 5 seconds while it fails with a DockerError whose
        message contains 'read timeout', giving up after more than 20 such failures.

        :return: tuple ``(ch, result['reply_to'], worker)`` once the worker answers 'ok'.
        :raises SpawnerException: when run_worker cannot start the worker, or the worker
            reports an error or a non-'ok' status.
        """
        ch = SpawnerWorkerChannel(worker_id=worker_id)
        # start an actor executor container and wait for a confirmation that image was pulled.
        attempts = 0
        while True:
            try:
                worker_dict = run_worker(image, actor_id, worker_id)
            except DockerError as e:
                logger.error("Spawner got a docker exception from run_worker; Exception: {}".format(e))
                if 'read timeout' in e.message:
                    logger.info("Exception was a read timeout; trying run_worker again..")
                    time.sleep(5)
                    attempts = attempts + 1
                    if attempts > 20:
                        msg = "Spawner continued to get DockerError for 20 attempts. Exception: {}".format(e)
                        logger.critical(msg)
                        raise SpawnerException(msg)
                    continue
                else:
                    logger.info("Exception was NOT a read timeout; quiting on this worker.")
                    # delete this worker from the workers store:
                    try:
                        self.kill_worker(actor_id, worker_id)
                    # use a distinct name: `except ... as e` would unbind the DockerError `e`
                    # that the raise below still formats.
                    except WorkerException as worker_exc:
                        logger.info("Got WorkerException from delete_worker(). "
                                    "worker_id: {}"
                                    "Exception: {}".format(worker_id, worker_exc))

                    raise SpawnerException(message="Unable to start worker; error: {}".format(e))
            break
        worker_dict['ch_name'] = WorkerChannel.get_name(worker_id)
        worker = Worker(tenant=tenant, **worker_dict)
        logger.info("worker started successfully, waiting on ack that image was pulled...")
        result = ch.get()
        logger.debug("Got response back from worker. Response: {}".format(result))
        if result.get('status') == 'error':
            # there was a problem pulling the image; put the actor in an error state:
            msg = "Got an error back from the worker. Message: {}".format(result)
            logger.info(msg)
            if 'msg' in result:
                raise SpawnerException(message=result['msg'])
            else:
                logger.error("Spawner received invalid message from worker. 'msg' field missing. Message: {}".format(result))
                raise SpawnerException(message="Internal error starting worker process.")
        elif result['value']['status'] == 'ok':
            logger.debug("received ack from worker.")
            return ch, result['reply_to'], worker
        else:
            msg = "Got an error status from worker: {}. Raising an exception.".format(str(result))
            logger.error("Spawner received an invalid message from worker. Message: {}".format(result))
            raise SpawnerException(msg)

    def error_out_actor(self, actor_id, worker_id, message):
        """In case of an error, put the actor in error state and kill the given worker.

        :param message: stored as the actor's status message.
        """
        Actor.set_status(actor_id, ERROR, status_message=message)
        try:
            self.kill_worker(actor_id, worker_id)
        except DockerError as e:
            logger.info("Received DockerError trying to kill worker: {}. Exception: {}".format(worker_id, e))
            logger.info("Spawner will continue on since this is exception processing.")

    def kill_worker(self, actor_id, worker_id):
        """Delete the worker via Worker.delete_worker, logging (not raising) any failure."""
        try:
            Worker.delete_worker(actor_id, worker_id)
        except WorkerException as e:
            logger.info("Got WorkerException from delete_worker(). "
                        "worker_id: {}"
                        "Exception: {}".format(worker_id, e))
        except Exception as e:
            logger.error("Got an unexpected exception from delete_worker(). "
                        "worker_id: {}"
                        "Exception: {}".format(worker_id, e))
Exemplo n.º 24
0
def create_gauges(actor_ids):
    """Refresh the per-actor message and worker gauges and the command channel gauge.

    For every id in ``actor_ids`` found in ``actors_store`` the actor's message gauge is
    set to the length of its ActorMsgChannel queue and its worker gauge to the number of
    workers returned by ``Worker.get_workers``. Gauges are created on first use and
    cached in ``message_gauges`` / ``worker_gaueges``. Ids missing from ``actors_store``
    are logged and skipped.

    After the loop, the command gauge is set for a single channel: the one named by the
    'queue' of the last processed actor, or 'default' when that name is missing, not
    listed in the ('spawner', 'host_queues') config, or no actor was processed.

    :param actor_ids: iterable of actor ids as bytes (each is decoded as UTF-8).
    :return: tuple ``(actor_ids, inbox_lengths, cmd_length)``; ``inbox_lengths`` maps the
        decoded actor id to its inbox length, ``cmd_length`` is the command channel length.
    :raises Exception: whatever ActorMsgChannel raises on connection is logged and re-raised.
    """
    logger.debug(
        "METRICS: Made it to create_gauges; actor_ids: {}".format(actor_ids))
    inbox_lengths = {}
    # Bound up front so the command channel lookup after the loop never hits an unbound
    # name, e.g. when actor_ids is empty or every actor is skipped.
    channel_name = 'default'
    # Parsed lazily, the first time an actor is processed.
    valid_queues = None
    for actor_id in actor_ids:
        logger.debug("top of for loop for actor_id: {}".format(actor_id))

        try:
            actor = actors_store[actor_id]
        except KeyError:
            logger.error("actor {} does not exist.".format(actor_id))
            continue

        actor_name = actor_id.decode("utf-8")
        metric_suffix = actor_name.replace('-', '_')

        # Look up this actor's message gauge, creating one if it does not exist yet.
        # A dedicated variable per gauge (reset every iteration) makes sure a failed
        # creation can never make us write into some other gauge.
        msg_gauge = None
        if actor_id not in message_gauges:
            try:
                msg_gauge = Gauge(
                    'message_count_for_actor_{}'.format(metric_suffix),
                    'Number of messages for actor {}'.format(metric_suffix))
                message_gauges.update({actor_id: msg_gauge})
                logger.debug('Created gauge {}'.format(msg_gauge))
            except Exception as e:
                logger.error(
                    "got exception trying to create/instantiate the gauge; "
                    "actor {}; exception: {}".format(actor_id, e))
        else:
            msg_gauge = message_gauges[actor_id]

        # Remember this actor's command channel; missing or unknown queue names fall
        # back to 'default'.
        if valid_queues is None:
            queues_list = Config.get('spawner', 'host_queues').replace(' ', '')
            valid_queues = queues_list.split(',')
        queue_name = actor.get("queue")
        if queue_name and queue_name in valid_queues:
            channel_name = queue_name
        else:
            channel_name = 'default'

        # Update this actor's gauge to its current # of messages
        try:
            ch = ActorMsgChannel(actor_id=actor_name)
        except Exception as e:
            logger.error(
                "Exception connecting to ActorMsgChannel: {}".format(e))
            raise
        try:
            inbox_length = len(ch._queue._queue)
        finally:
            # always release the channel, even if reading the length fails.
            ch.close()
        inbox_lengths[actor_name] = inbox_length
        if msg_gauge is not None:
            msg_gauge.set(inbox_length)
        logger.debug("METRICS: {} messages found for actor: {}.".format(
            inbox_length, actor_id))

        # add a worker gauge for this actor if one does not exist
        worker_gauge = None
        if actor_id not in worker_gaueges:
            try:
                worker_gauge = Gauge(
                    'worker_count_for_actor_{}'.format(metric_suffix),
                    'Number of workers for actor {}'.format(metric_suffix))
                worker_gaueges.update({actor_id: worker_gauge})
                logger.debug('Created worker gauge {}'.format(worker_gauge))
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate the Worker Gauge: {}".
                    format(e))
        else:
            # Otherwise, get the worker gauge that already exists
            worker_gauge = worker_gaueges[actor_id]

        # Update this actor's worker count
        workers = Worker.get_workers(actor_id)
        if worker_gauge is not None:
            worker_gauge.set(len(workers))

    ch = CommandChannel(name=channel_name)
    try:
        cmd_length = len(ch._queue._queue)
    finally:
        ch.close()
    command_gauge.labels(channel_name).set(cmd_length)
    # log the measured length rather than the gauge object itself.
    logger.debug("METRICS COMMAND CHANNEL {} size: {}".format(
        channel_name, cmd_length))

    # Return actor_ids so we don't have to query for them again later
    return actor_ids, inbox_lengths, cmd_length
Exemplo n.º 25
0
class Spawner(object):
    """Start worker containers for actors in response to commands on the command channel.

    A command is a dict with the keys ``actor_id``, ``image`` and ``tenant``, and
    optionally ``stop_existing`` (defaults to True) and ``num`` (defaults to
    ``self.num_workers``).
    """

    def __init__(self):
        # default worker count, used when a command carries no 'num' key.
        self.num_workers = int(Config.get('workers', 'init_count'))
        # secret passed along when requesting clients for new workers.
        self.secret = os.environ.get('_abaco_secret')
        self.cmd_ch = CommandChannel()

    def run(self):
        """Read commands from the command channel forever, processing one at a time."""
        while True:
            cmd = self.cmd_ch.get()
            self.process(cmd)

    def stop_workers(self, actor_id):
        """Stop existing workers; used when updating an actor's image.

        :param actor_id: key into ``workers_store`` of the actor whose workers are stopped.
        """
        try:
            workers_dict = workers_store[actor_id]
        except KeyError:
            workers_dict = {}

        # if there are existing workers, we need to close the actor message channel and
        # gracefully shutdown the existing worker processes.
        if len(workers_dict.items()) > 0:
            # first, close the actor msg channel to prevent any new messages from being pulled
            # by the old workers.
            actor_ch = ActorMsgChannel(actor_id)
            actor_ch.close()
            # now, send messages to workers for a graceful shutdown:
            for _, worker in workers_dict.items():
                ch = WorkerChannel(name=worker['ch_name'])
                ch.put('stop')

    def process(self, cmd):
        """Start the workers requested by ``cmd`` and send each one its subscribe message.

        Returns early (after start_workers has done its own clean up) when starting the
        workers raises SpawnerException.

        :param cmd: command dict; see the class docstring for its keys.
        """
        print("Processing cmd:{}".format(str(cmd)))
        actor_id = cmd['actor_id']
        image = cmd['image']
        tenant = cmd['tenant']
        stop_existing = cmd.get('stop_existing', True)
        num_workers = cmd.get('num', self.num_workers)
        print("Actor id:{}".format(actor_id))
        try:
            # the first element (the per-worker channels) is not needed here.
            _, anon_channels, new_workers = self.start_workers(actor_id, image, tenant, num_workers)
        except SpawnerException:
            # for now, start_workers will do clean up for a SpawnerException, so we just need
            # to return back to the run loop.
            return
        print("Created new workers: {}".format(str(new_workers)))

        # add workers to store first so that the records will be there when the workers go
        # to update their status
        if stop_existing:
            # stop any existing workers; the actor's collection then is just the new workers.
            self.stop_workers(actor_id)
            workers_store[actor_id] = new_workers
        else:
            for _, worker in new_workers.items():
                Worker.add_worker(actor_id, worker)

        # send new workers their clients and tell them to subscribe to the actor channel.
        # new_workers is a dictionary of dictionaries filled in the same order as
        # anon_channels, so the idx^th key of new_workers belongs to the idx^th channel.
        # Build the key list once rather than on every iteration.
        worker_keys = list(new_workers)
        for idx, channel in enumerate(anon_channels):
            print("Getting client for worker {}".format(idx))
            client_ch = ClientsChannel()
            client_msg = client_ch.request_client(tenant=tenant,
                                                  actor_id=actor_id,
                                                  worker_id=new_workers[worker_keys[idx]]['ch_name'],
                                                  secret=self.secret)
            # we need to ignore errors when generating clients because it's possible it is not set up for a specific
            # tenant. we log it instead.
            if client_msg.get('status') == 'error':
                print("Error generating client: {}".format(client_msg.get('message')))
                channel.put({'status': 'ok',
                             'actor_id': actor_id,
                             'tenant': tenant,
                             'client': 'no'})
            # else, client was generated successfully:
            else:
                print("Got a client: {}, {}, {}".format(client_msg['client_id'],
                                                        client_msg['access_token'],
                                                        client_msg['refresh_token']))
                channel.put({'status': 'ok',
                             'actor_id': actor_id,
                             'tenant': tenant,
                             'client': 'yes',
                             'client_id': client_msg['client_id'],
                             'client_secret': client_msg['client_secret'],
                             'access_token': client_msg['access_token'],
                             'refresh_token': client_msg['refresh_token'],
                             'api_server': client_msg['api_server'],
                             })
        print("Done processing command.")

    def start_workers(self, actor_id, image, tenant, num_workers):
        """Start ``num_workers`` workers for the actor.

        :return: tuple ``(channels, anon_channels, workers)``; ``workers`` maps each
            worker's 'ch_name' to the object returned by start_worker, the two lists are
            in start order.
        :raises SpawnerException: when any worker fails to start; the actor is put in the
            ERROR status and kill_worker is called for every worker started so far.
        """
        # report the number actually requested, which may differ from self.num_workers.
        print("starting {} workers. actor_id: {} image: {}".format(str(num_workers), actor_id, image))
        channels = []
        anon_channels = []
        workers = {}
        try:
            for i in range(num_workers):
                print("starting worker {}".format(str(i)))
                ch, anon_ch, worker = self.start_worker(image, tenant)
                print("channel for worker {} is: {}".format(str(i), ch.name))
                channels.append(ch)
                anon_channels.append(anon_ch)
                workers[worker['ch_name']] = worker
        except SpawnerException as e:
            print("Caught SpawnerException:{}".format(str(e)))
            # in case of an error, put the actor in error state and kill all workers
            Actor.set_status(actor_id, ERROR)
            # iterating the dict yields its keys, i.e. the workers' channel names.
            for started_key in workers:
                try:
                    self.kill_worker(started_key)
                # distinct name so the outer `e` stays bound for the raise below.
                except DockerError as docker_err:
                    print("Received DockerError trying to kill worker: {}".format(str(docker_err)))
            # keep the original failure attached as the cause of the new exception.
            raise SpawnerException() from e
        return channels, anon_channels, workers

    def start_worker(self, image, tenant):
        """Start a single worker and wait for its first message on a new WorkerChannel.

        :return: tuple ``(ch, result['reply_to'], worker)`` once the worker answers 'ok'.
        :raises SpawnerException: when the worker answers with any other status.
        """
        ch = WorkerChannel()
        # start an actor executor container and wait for a confirmation that image was pulled.
        worker_dict = run_worker(image, ch.name)
        worker = Worker(tenant=tenant, **worker_dict)
        print("worker started successfully, waiting on ack that image was pulled...")
        result = ch.get()
        if result['value']['status'] == 'ok':
            print("received ack from worker.")
            return ch, result['reply_to'], worker
        print("Got an error status from worker: {}. Raising an exception.".format(str(result)))
        raise SpawnerException()

    def kill_worker(self, worker):
        """Placeholder: currently does nothing with the worker key it is given."""
        pass
Exemplo n.º 26
0
class Spawner(object):
    """Consume commands from the command channel and start a worker for each one.

    ``run()`` is the entry point: it waits until this host is below ``MAX_WORKERS``,
    pulls one command, acks it and hands it to ``process()``. Each command names one
    actor and one worker id.
    """

    # Status message stored on the actor when OAuth client generation fails.
    _CLIENT_GENERATION_ERROR_MSG = ("Abaco was unable to generate an OAuth client for a new "
                                    "worker for this actor. System administrators have been notified.")

    def __init__(self):
        """Read spawner settings from ``Config`` and the environment and open the command channel.

        :raises Exception: whatever ``Config.get('spawner', 'host_id')`` raises when the
            host id is not configured (re-raised after logging).
        """
        self.num_workers = int(Config.get('workers', 'init_count'))
        self.secret = os.environ.get('_abaco_secret')
        self.queue = os.environ.get('queue', 'default')
        self.cmd_ch = CommandChannel(name=self.queue)
        self.tot_workers = 0
        try:
            self.host_id = Config.get('spawner', 'host_id')
        except Exception as e:
            logger.critical("Spawner not configured with a host_id! Aborting! Exception: {}".format(e))
            # bare raise keeps the original traceback
            raise

    def run(self):
        """Loop forever, processing one command at a time while the host is not overloaded."""
        while True:
            # check resource threshold before subscribing
            while self.overloaded():
                logger.critical("METRICS - SPAWNER FOR HOST {} OVERLOADED!!!".format(self.host_id))
                # self.update_status to OVERLOADED
                time.sleep(5)
            cmd, msg_obj = self.cmd_ch.get_one()
            # directly ack the messages from the command channel; problems generated from starting workers are
            # handled downstream; e.g., by setting the actor in an ERROR state; command messages should not be re-queued
            msg_obj.ack()
            try:
                self.process(cmd)
            except Exception as e:
                logger.error("spawner got an exception trying to process cmd: {}. "
                             "Exception type: {}. Exception: {}".format(cmd, type(e), e))

    def get_tot_workers(self):
        """Count the workers in ``workers_store`` whose ``host_id`` matches this spawner.

        The count is stored on ``self.tot_workers`` and also returned.
        """
        logger.debug("top of get_tot_workers")
        self.tot_workers = 0
        logger.debug('spawner host_id: {}'.format(self.host_id))
        for _, actor_workers in workers_store.items():
            for _, worker in actor_workers.items():
                if worker.get('host_id') == self.host_id:
                    self.tot_workers += 1
        logger.debug("returning total workers: {}".format(self.tot_workers))
        return self.tot_workers

    def overloaded(self):
        """Return True when this host already has ``MAX_WORKERS`` or more workers, else False."""
        logger.debug("top of overloaded")
        self.get_tot_workers()
        logger.info("total workers for this host: {}".format(self.tot_workers))
        # always return a real bool (previously fell through and returned None when not overloaded)
        return self.tot_workers >= MAX_WORKERS

    def stop_workers(self, actor_id, worker_ids):
        """Stop existing workers; used when updating an actor's image.

        :param actor_id: actor whose workers should be stopped.
        :param worker_ids: ids of workers that must NOT be stopped (the newly started ones).
        """
        logger.debug("Top of stop_workers() for actor: {}.".format(actor_id))
        try:
            workers_dict = workers_store[actor_id]
        except KeyError:
            logger.debug("workers_store had no workers for actor: {}".format(actor_id))
            workers_dict = {}

        if len(workers_dict.items()) == 0:
            logger.info("No workers to stop.")
            return

        # there are existing workers: close the actor message channel and
        # gracefully shutdown the existing worker processes.
        logger.info("Found {} workers to stop.".format(len(workers_dict.items())))
        # first, close the actor msg channel to prevent any new messages from being pulled
        # by the old workers.
        actor_ch = ActorMsgChannel(actor_id)
        actor_ch.close()
        logger.info("Actor channel closed for actor: {}".format(actor_id))
        # now, send messages to workers for a graceful shutdown:
        for _, worker in workers_dict.items():
            # don't stop the new workers:
            if worker['id'] in worker_ids:
                logger.debug("skipping worker {} as it is in worker_ids.".format(worker))
                continue
            ch = WorkerChannel(worker_id=worker['id'])
            try:
                # since this is an update, there are new workers being started, so
                # don't delete the actor msg channel:
                ch.put('stop-no-delete')
                logger.info("Sent 'stop-no-delete' message to worker_id: {}".format(worker['id']))
            finally:
                # release the channel even if the put fails
                ch.close()

    def process(self, cmd):
        """Main spawner method for processing a command from the CommandChannel.

        :param cmd: mapping with keys ``actor_id``, ``worker_id``, ``image``, ``tenant``
            and optionally ``stop_existing`` (defaults to True).

        The command is dropped (after logging) when the actor or worker cannot be
        retrieved or the worker is not in REQUESTED status. Exceptions raised by
        ``client_generation`` propagate to the caller.
        """
        logger.info("top of process; cmd: {}".format(cmd))
        actor_id = cmd['actor_id']
        try:
            actor = Actor.from_db(actors_store[actor_id])
        except Exception as e:
            msg = f"Exception in spawner trying to retrieve actor object from store. Aborting. Exception: {e}"
            logger.error(msg)
            return
        worker_id = cmd['worker_id']
        image = cmd['image']
        tenant = cmd['tenant']
        stop_existing = cmd.get('stop_existing', True)
        num_workers = 1
        logger.debug("spawner command params: actor_id: {} worker_id: {} image: {} tenant: {}"
                    "stop_existing: {} num_workers: {}".format(actor_id, worker_id,
                                                               image, tenant, stop_existing, num_workers))
        # if the worker was sent a delete request before spawner received this message to create the worker,
        # the status will be SHUTDOWN_REQUESTED, not REQUESTED. in that case, we simply abort and remove the
        # worker from the collection.
        try:
            logger.debug("spawner checking worker's status for SHUTDOWN_REQUESTED")
            worker = Worker.get_worker(actor_id, worker_id)
            logger.debug(f"spawner got worker; worker: {worker}")
        except Exception as e:
            logger.error(f"spawner got exception trying to retrieve worker. "
                         f"actor_id: {actor_id}; worker_id: {worker_id}; e: {e}")
            return

        status = worker.get('status')
        if status != REQUESTED:
            logger.debug(f"worker was NOT in REQUESTED status. status: {status}")
            if status in (SHUTDOWN_REQUESTED, SHUTTING_DOWN, ERROR):
                logger.debug(f"worker status was {status}; spawner deleting worker and returning..")
                try:
                    Worker.delete_worker(actor_id, worker_id)
                    logger.debug("spawner deleted worker because it was SHUTDOWN_REQUESTED.")
                except Exception as e:
                    logger.error(f"spawner got exception trying to delete a worker in SHUTDOWN_REQUESTED status."
                                 f"actor_id: {actor_id}; worker_id: {worker_id}; e: {e}")
            else:
                logger.error(f"spawner found worker in unexpected status: {status}. Not processing command and returning.")
            # in every non-REQUESTED case the command is not processed any further
            return

        # worker status was REQUESTED; moving on to SPAWNER_SETUP ----
        Worker.update_worker_status(actor_id, worker_id, SPAWNER_SETUP)
        logger.debug("spawner has updated worker status to SPAWNER_SETUP; worker_id: {}".format(worker_id))
        client_id = None
        client_secret = None
        client_access_token = None
        client_refresh_token = None
        api_server = None

        # ---- Oauth client generation for the worker -------
        # check if tenant and instance configured for client generation -
        try:
            generate_clients = Config.get('workers', f'{tenant}_generate_clients').lower()
        except Exception:
            logger.debug(f"Did not find a {tenant}_generate_clients config. Looking for a global config.")
            generate_clients = Config.get('workers', 'generate_clients').lower()
        logger.debug(f"final generate_clients: {generate_clients}")
        if generate_clients == "true":
            logger.debug("client generation was configured to be available; now checking the actor's token attr.")
            # updated 1.3.0-- check whether the actor requires a token:
            if actor.token:
                logger.debug("spawner starting client generation")
                (client_id,
                 client_access_token,
                 client_refresh_token,
                 api_server,
                 client_secret) = self.client_generation(actor_id, worker_id, tenant)
            else:
                logger.debug("actor's token attribute was False. Not generating client.")
        ch = SpawnerWorkerChannel(worker_id=worker_id)

        logger.debug("spawner attempting to start worker; worker_id: {}".format(worker_id))
        try:
            worker = self.start_worker(
                image,
                tenant,
                actor_id,
                worker_id,
                client_id,
                client_access_token,
                client_refresh_token,
                ch,
                api_server,
                client_secret
            )
        except Exception as e:
            msg = "Spawner got an exception from call to start_worker. Exception:{}".format(e)
            logger.error(msg)
            try:
                self.error_out_actor(actor_id, worker_id, msg)
                if client_id:
                    self.delete_client(tenant, actor_id, worker_id, client_id, client_secret)
            finally:
                # the channel was opened above; do not leak it on the failure path
                ch.close()
            return

        logger.debug("Returned from start_worker; Created new worker: {}".format(worker))
        ch.close()
        logger.debug("Client channel closed")

        if stop_existing:
            logger.info("Stopping existing workers: {}".format(worker_id))
            # TODO - update status to stop_requested
            self.stop_workers(actor_id, [worker_id])

    def client_generation(self, actor_id, worker_id, tenant):
        """Request an OAuth client for a worker over a ``ClientsChannel``.

        :return: tuple ``(client_id, access_token, refresh_token, api_server, client_secret)``
            taken from the reply message.
        :raises Exception: whatever ``request_client`` raised, re-raised after the actor and
            worker have been put in ERROR status.
        :raises SpawnerException: if the reply message has status ``'error'``.
        """
        client_ch = ClientsChannel()
        try:
            client_msg = client_ch.request_client(
                tenant=tenant,
                actor_id=actor_id,
                worker_id=worker_id,
                secret=self.secret
            )
        except Exception as e:
            logger.error("Got a ChannelTimeoutException trying to generate a client for "
                         "actor_id: {}; worker_id: {}; exception: {}".format(actor_id, worker_id, e))
            # put worker in an error state and return
            self.error_out_actor(actor_id, worker_id, self._CLIENT_GENERATION_ERROR_MSG)
            Worker.update_worker_status(actor_id, worker_id, ERROR)
            logger.critical("Client generation FAILED.")
            raise
        finally:
            # close on success and on every failure path, including errors raised while erroring out
            client_ch.close()

        if client_msg.get('status') == 'error':
            logger.error("Error generating client: {}".format(client_msg.get('message')))
            self.error_out_actor(actor_id, worker_id, self._CLIENT_GENERATION_ERROR_MSG)
            Worker.update_worker_status(actor_id, worker_id, ERROR)
            raise SpawnerException("Error generating client")  # TODO - clean up error message

        # client was generated successfully:
        logger.info("Got a client: {}, {}, {}".format(client_msg['client_id'],
                                                      client_msg['access_token'],
                                                      client_msg['refresh_token']))
        return (client_msg['client_id'],
                client_msg['access_token'],
                client_msg['refresh_token'],
                client_msg['api_server'],
                client_msg['client_secret'])

    def delete_client(self, tenant, actor_id, worker_id, client_id, secret):
        """Request deletion of a worker's OAuth client and log the outcome."""
        clients_ch = ClientsChannel()
        try:
            msg = clients_ch.request_delete_client(tenant=tenant,
                                                   actor_id=actor_id,
                                                   worker_id=worker_id,
                                                   client_id=client_id,
                                                   secret=secret)
            if msg['status'] == 'ok':
                logger.info("Client delete request completed successfully for "
                            "worker_id: {}, client_id: {}.".format(worker_id, client_id))
            else:
                # arguments now line up with the placeholders: worker id, client id, message
                logger.error("Error deleting client for "
                             "worker_id: {}, client_id: {}. Message: {}".format(worker_id, client_id, msg['message']))
        finally:
            clients_ch.close()

    def start_worker(self,
                     image,
                     tenant,
                     actor_id,
                     worker_id,
                     client_id,
                     client_access_token,
                     client_refresh_token,
                     ch,
                     api_server,
                     client_secret):
        """Pull the image, run the worker container, register the worker and signal READY.

        :param ch: channel on which ``'READY'`` is put once the worker has been added.
        :return: the ``Worker`` object that was added for the actor.
        :raises DockerError: if pulling the image fails.
        :raises SpawnerException: if ``run_worker`` fails with a non-timeout DockerError, or
            keeps timing out for more than 20 attempts.
        """
        # start an actor executor container and wait for a confirmation that image was pulled.
        attempts = 0
        Worker.update_worker_status(actor_id, worker_id, PULLING_IMAGE)
        try:
            logger.debug("Worker pulling image {}...".format(image))
            pull_image(image)
        except DockerError as e:
            # return a message to the spawner that there was an error pulling image and abort
            # this is not necessarily an error state: the user simply could have provided an
            # image name that does not exist in the registry. This is the first time we would
            # find that out.
            logger.info("worker got a DockerError trying to pull image. Error: {}.".format(e))
            raise
        logger.info("Image {} pulled successfully.".format(image))
        # Done pulling image
        # Run Worker Container
        while True:
            try:
                Worker.update_worker_status(actor_id, worker_id, CREATING_CONTAINER)
                logger.debug('spawner creating worker container')
                worker_dict = run_worker(
                    image,
                    actor_id,
                    worker_id,
                    client_id,
                    client_access_token,
                    client_refresh_token,
                    tenant,
                    api_server,
                    client_secret
                )
                logger.debug(f'finished run worker; worker dict: {worker_dict}')
            except DockerError as e:
                logger.error("Spawner got a docker exception from run_worker; Exception: {}".format(e))
                # NOTE(review): relies on DockerError exposing a `message` attribute - confirm
                if 'read timeout' in e.message:
                    logger.info("Exception was a read timeout; trying run_worker again..")
                    time.sleep(5)
                    attempts += 1
                    if attempts > 20:
                        msg = "Spawner continued to get DockerError for 20 attempts. Exception: {}".format(e)
                        logger.critical(msg)
                        # todo - should we be calling kill_worker here? (it is called in the exception block of the else below)
                        raise SpawnerException(msg)
                    continue
                logger.info("Exception was NOT a read timeout; quiting on this worker.")
                # delete this worker from the workers store:
                try:
                    self.kill_worker(actor_id, worker_id)
                except WorkerException as worker_exc:
                    # distinct name: reusing `e` here would unbind the DockerError needed below
                    logger.info("Got WorkerException from delete_worker(). "
                                "worker_id: {}"
                                "Exception: {}".format(worker_id, worker_exc))
                raise SpawnerException(message="Unable to start worker; error: {}".format(e))
            break
        logger.debug('finished loop')
        worker_dict['ch_name'] = WorkerChannel.get_name(worker_id)
        # if the actor is not already in READY status, set actor status to READY before worker status has been
        # set to READY.
        # it is possible the actor status is already READY because this request is the autoscaler starting a new worker
        # for an existing actor.
        try:
            # the store lookup sits inside the try as well: a deleted actor raises KeyError here first
            actor = Actor.from_db(actors_store[actor_id])
            if not actor.status == READY:
                Actor.set_status(actor_id, READY, status_message=" ")
        except KeyError:
            # it is possible the actor was already deleted during worker start up; if
            # so, the worker should have a stop message waiting for it. starting subscribe
            # as usual should allow this process to work as expected.
            logger.debug("actor {} not found while setting READY status; continuing.".format(actor_id))
        # finalize worker with READY status
        worker = Worker(tenant=tenant, **worker_dict)
        logger.info("calling add_worker for worker: {}.".format(worker))
        Worker.add_worker(actor_id, worker)

        ch.put('READY')  # step 4
        logger.info('sent message through channel')
        # hand the new worker back so the caller can log it
        return worker

    def error_out_actor(self, actor_id, worker_id, message):
        """In case of an error, put the actor in error state and kill all workers.

        ``kill_worker`` is only used as a fallback when the graceful ``stop_workers``
        call raises.
        """
        logger.debug("top of error_out_actor for worker: {}_{}".format(actor_id, worker_id))
        Actor.set_status(actor_id, ERROR, status_message=message)
        # first we try to stop workers using the "graceful" approach -
        try:
            self.stop_workers(actor_id, worker_ids=[])
            logger.info("Spawner just stopped worker {}_{} in error_out_actor".format(actor_id, worker_id))
            return
        except Exception as e:
            logger.error("spawner got exception trying to run stop_workers. Exception: {}".format(e))
        try:
            self.kill_worker(actor_id, worker_id)
            logger.info("Spawner just killed worker {}_{} in error_out_actor".format(actor_id, worker_id))
        except DockerError as e:
            logger.info("Received DockerError trying to kill worker: {}. Exception: {}".format(worker_id, e))
            logger.info("Spawner will continue on since this is exception processing.")

    def kill_worker(self, actor_id, worker_id):
        """Delete the worker via ``Worker.delete_worker``; any exception is logged, never raised."""
        try:
            Worker.delete_worker(actor_id, worker_id)
        except WorkerException as e:
            logger.info("Got WorkerException from delete_worker(). "
                        "worker_id: {}"
                        "Exception: {}".format(worker_id, e))
        except Exception as e:
            logger.error("Got an unexpected exception from delete_worker(). "
                        "worker_id: {}"
                        "Exception: {}".format(worker_id, e))
Exemplo n.º 27
0
 def __init__(self):
     self.num_workers = int(Config.get("workers", "init_count"))
     self.secret = os.environ.get("_abaco_secret")
     self.cmd_ch = CommandChannel()