示例#1
0
 def delete(self, actor_id):
     logger.debug("top of DELETE /actors/{}".format(actor_id))
     id = Actor.get_dbid(g.tenant, actor_id)
     logger.info("calling shutdown_workers() for actor: {}".format(id))
     shutdown_workers(id)
     logger.debug("shutdown_workers() done")
     try:
         actor = Actor.from_db(actors_store[id])
         executions = actor.get('executions') or {}
         for ex_id, val in executions.items():
             del logs_store[ex_id]
     except KeyError as e:
         logger.info("got KeyError {} trying to retrieve actor or executions with id {}".format(
             e, id))
     # delete the actor's message channel
     # TODO - needs work; each worker is subscribed to the ActorMsgChannel. If the workers are not
     # closed before the ch.delete() below, the ActorMsgChannel will survive.
     try:
         ch = ActorMsgChannel(actor_id=id)
         ch.delete()
         logger.info("Deleted actor message channel for actor: {}".format(id))
     except Exception as e:
         # if we get an error trying to remove the inbox, log it but keep going
         logger.error("Unable to delete the actor's message channel for actor: {}, exception: {}".format(id, e))
     del actors_store[id]
     logger.info("actor {} deleted from store.".format(id))
     del permissions_store[id]
     logger.info("actor {} permissions deleted from store.".format(id))
     del nonce_store[id]
     logger.info("actor {} nonnces delete from nonce store.".format(id))
     return ok(result=None, msg='Actor deleted successfully.')
示例#2
0
def subscribe(actor_id, worker_ch):
    """
    Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor
    containers when message arrive. Also subscribes to the worker channel for future communications.
    :return:
    """
    actor_ch = ActorMsgChannel(actor_id)
    t = threading.Thread(target=process_worker_ch, args=(worker_ch, actor_id, actor_ch))
    t.start()
    print("Worker subscribing to actor channel...")
    while keep_running:
        update_worker_status(actor_id, worker_ch.name, READY)
        try:
            msg = actor_ch.get(timeout=2)
        except channelpy.ChannelTimeoutException:
            continue
        print("Received message {}. Starting actor container...".format(str(msg)))
        message = msg.pop("msg", "")
        try:
            stats, logs = execute_actor(actor_id, worker_ch, image, message, msg)
        except DockerStartContainerError as e:
            print("Got DockerStartContainerError: {}".format(str(e)))
            Actor.set_status(actor_id, ERROR)
            continue
        # add the execution to the actor store
        print("Actor container finished successfully. Got stats object:{}".format(str(stats)))
        exc_id = Execution.add_execution(actor_id, stats)
        Execution.set_logs(exc_id, logs)
示例#3
0
    def get(self, actor_id):
        def get_hypermedia(actor):
            return {
                '_links': {
                    'self':
                    '{}/actors/v2/{}/messages'.format(actor.api_server,
                                                      actor.id),
                    'owner':
                    '{}/profiles/v2/{}'.format(actor.api_server, actor.owner),
                },
            }

        logger.debug("top of GET /actors/{}/messages".format(actor_id))
        # check that actor exists
        id = Actor.get_dbid(g.tenant, actor_id)
        try:
            actor = Actor.from_db(actors_store[id])
        except KeyError:
            logger.debug("did not find actor: {}.".format(actor_id))
            raise ResourceError("No actor found with id: {}.".format(actor_id),
                                404)
        ch = ActorMsgChannel(actor_id=id)
        result = {'messages': len(ch._queue._queue)}
        ch.close()
        logger.debug("messages found for actor: {}.".format(actor_id))
        result.update(get_hypermedia(actor))
        return ok(result)
示例#4
0
    def stop_workers(self, actor_id, worker_ids):
        """Stop existing workers; used when updating an actor's image."""
        logger.debug("Top of stop_workers() for actor: {}.".format(actor_id))
        try:
            workers_dict = workers_store[actor_id]
        except KeyError:
            logger.debug("workers_store had no workers for actor: {}".format(actor_id))
            workers_dict = {}

        # if there are existing workers, we need to close the actor message channel and
        # gracefully shutdown the existing worker processes.
        if len(workers_dict.items()) > 0:
            logger.info("Found {} workers to stop.".format(len(workers_dict.items())))
            # first, close the actor msg channel to prevent any new messages from being pulled
            # by the old workers.
            actor_ch = ActorMsgChannel(actor_id)
            actor_ch.close()
            logger.info("Actor channel closed for actor: {}".format(actor_id))
            # now, send messages to workers for a graceful shutdown:
            for _, worker in workers_dict.items():
                # don't stop the new workers:
                if worker['id'] not in worker_ids:
                    ch = WorkerChannel(worker_id=worker['id'])
                    # since this is an update, there are new workers being started, so
                    # don't delete the actor msg channel:
                    ch.put('stop-no-delete')
                    logger.info("Sent 'stop-no-delete' message to worker_id: {}".format(worker['id']))
                    ch.close()
                else:
                    logger.debug("skipping worker {} as it it not in worker_ids.".format(worker))
        else:
            logger.info("No workers to stop.")
示例#5
0
    def post(self, actor_id):
        def get_hypermedia(actor, exc):
            return {
                '_links': {
                    'self':
                    '{}/actors/v2/{}/executions/{}'.format(
                        actor.api_server, actor.id, exc),
                    'owner':
                    '{}/profiles/v2/{}'.format(actor.api_server, actor.owner),
                    'messages':
                    '{}/actors/v2/{}/messages'.format(actor.api_server,
                                                      actor.id)
                },
            }

        args = self.validate_post()
        d = {}
        # build a dictionary of k:v pairs from the query parameters, and pass a single
        # additional object 'message' from within the post payload. Note that 'message'
        # need not be JSON data.
        for k, v in request.args.items():
            if k == 'message':
                continue
            d[k] = v
        if hasattr(g, 'user'):
            d['_abaco_username'] = g.user
        if hasattr(g, 'api_server'):
            d['_abaco_api_server'] = g.api_server
        # if hasattr(g, 'jwt'):
        #     d['_abaco_jwt'] = g.jwt
        # if hasattr(g, 'jwt_server'):
        #     d['_abaco_jwt_server'] = g.jwt_server
        if hasattr(g, 'jwt_header_name'):
            d['_abaco_jwt_header_name'] = g.jwt_header_name
        dbid = Actor.get_dbid(g.tenant, actor_id)
        # create an execution
        exc = Execution.add_execution(
            dbid, {
                'cpu': 0,
                'io': 0,
                'runtime': 0,
                'status': SUBMITTED,
                'executor': g.user
            })
        d['_abaco_execution_id'] = exc
        d['_abaco_Content-Type'] = args.get('_abaco_Content-Type', '')
        ch = ActorMsgChannel(actor_id=dbid)
        ch.put_msg(message=args['message'], d=d)
        # make sure at least one worker is available
        actor = Actor.from_db(actors_store[dbid])
        actor.ensure_one_worker()
        result = {'execution_id': exc, 'msg': args['message']}
        result.update(get_hypermedia(actor, exc))
        case = Config.get('web', 'case')
        if not case == 'camel':
            return ok(result)
        else:
            return ok(dict_to_camel(result))
示例#6
0
 def post(self, actor_id):
     args = self.validate_post()
     d = {}
     for k, v in request.args.items():
         if k == 'message':
             continue
         d[k] = v
     ch = ActorMsgChannel(actor_id=actor_id)
     ch.put_msg(msg=args['message'], d=d)
     return ok(result={'msg': args['message']})
示例#7
0
def create_gauges(actor_ids):
    logger.debug("METRICS: Made it to create_gauges")
    for actor_id in actor_ids:
        if actor_id not in message_gauges.keys():
            try:
                g = Gauge(
                    'message_count_for_actor_{}'.format(
                        actor_id.decode("utf-8").replace('-', '_')),
                    'Number of messages for actor {}'.format(
                        actor_id.decode("utf-8").replace('-', '_')))
                message_gauges.update({actor_id: g})
                logger.debug('Created gauge {}'.format(g))
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate the Gauge: {}".format(
                        e))
        else:
            g = message_gauges[actor_id]

        try:
            ch = ActorMsgChannel(actor_id=actor_id.decode("utf-8"))
        except Exception as e:
            logger.error(
                "Exception connecting to ActorMsgChannel: {}".format(e))
            raise e
        result = {'messages': len(ch._queue._queue)}
        ch.close()
        g.set(result['messages'])
        logger.debug("METRICS: {} messages found for actor: {}.".format(
            result['messages'], actor_id))
        if actor_id not in worker_gaueges.keys():
            try:
                g = Gauge(
                    'worker_count_for_actor_{}'.format(
                        actor_id.decode("utf-8").replace('-', '_')),
                    'Number of workers for actor {}'.format(
                        actor_id.decode("utf-8").replace('-', '_')))
                worker_gaueges.update({actor_id: g})
                logger.debug('Created worker gauge {}'.format(g))
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate the Worker Gauge: {}".
                    format(e))
        else:
            g = worker_gaueges[actor_id]
        workers = Worker.get_workers(actor_id)
        result = {'workers': len(workers)}
        g.set(result['workers'])

    return actor_ids
示例#8
0
 def get(self):
     logger.debug("top of GET /admin/actors")
     actors = []
     for k, v in actors_store.items():
         actor = Actor.from_db(v)
         actor.workers = Worker.get_workers(actor.db_id)
         for id, worker in actor.workers.items():
             actor.worker = worker
             break
         ch = ActorMsgChannel(actor_id=actor.db_id)
         actor.messages = len(ch._queue._queue)
         ch.close()
         summary = ExecutionsSummary(db_id=actor.db_id)
         actor.executions = summary.total_executions
         actor.runtime = summary.total_runtime
         actors.append(actor)
     logger.info("actors retrieved.")
     return ok(result=actors, msg="Actors retrieved successfully.")
示例#9
0
文件: spawner.py 项目: TACC/abaco
    def stop_workers(self, actor_id):
        """Stop existing workers; used when updating an actor's image."""

        try:
            workers_dict = workers_store[actor_id]
        except KeyError:
            workers_dict = {}

        # if there are existing workers, we need to close the actor message channel and
        # gracefully shutdown the existing worker processes.
        if len(workers_dict.items()) > 0:
            # first, close the actor msg channel to prevent any new messages from being pulled
            # by the old workers.
            actor_ch = ActorMsgChannel(actor_id)
            actor_ch.close()
            # now, send messages to workers for a graceful shutdown:
            for _, worker in workers_dict.items():
                ch = WorkerChannel(name=worker['ch_name'])
                ch.put('stop')
示例#10
0
    def stop_workers(self, actor_id, worker_ids):
        """Stop existing workers; used when updating an actor's image."""

        try:
            workers_dict = workers_store[actor_id]
        except KeyError:
            workers_dict = {}

        # if there are existing workers, we need to close the actor message channel and
        # gracefully shutdown the existing worker processes.
        if len(workers_dict.items()) > 0:
            # first, close the actor msg channel to prevent any new messages from being pulled
            # by the old workers.
            actor_ch = ActorMsgChannel(actor_id)
            actor_ch.close()
            # now, send messages to workers for a graceful shutdown:
            for _, worker in workers_dict.items():
                # don't stop the new workers:
                if worker['id'] not in worker_ids:
                    ch = WorkerChannel(name=worker['ch_name'])
                    ch.put('stop')
示例#11
0
    def stop_workers(self, actor_id):
        """Stop existing workers; used when updating an actor's image."""

        try:
            workers = json.loads(workers_store[actor_id])
            print("Found existing workers: {}".format(str(workers)))
        except KeyError:
            print("No existing workers.")
            workers = {}

        # if there are existing workers, we need to close the actor message channel and
        # gracefully shutdown the existing worker processes.
        if len(workers) > 0 :
            # first, close the actor msg channel to prevent any new messages from being pulled
            # by the old workers.
            actor_ch = ActorMsgChannel(actor_id)
            actor_ch.close()

            # now, send messages to workers for a graceful shutdown:
            for worker in workers:
                ch = WorkerChannel(name=worker['ch_name'])
                ch.put('stop')
示例#12
0
    def get_metrics(self):
        logger.debug("top of get in MetricResource")

        actor_ids = [
            db_id
            for db_id, _
            in actors_store.items()
        ]
        logger.debug("ACTOR IDS: {}".format(actor_ids))
        try:
            if actor_ids:
                for actor_id in actor_ids:
                    if actor_id not in message_gauges.keys():
                        try:
                            g = Gauge(
                                'message_count_for_actor_{}'.format(actor_id.decode("utf-8").replace('-', '_')),
                                'Number of messages for actor {}'.format(actor_id.decode("utf-8").replace('-', '_'))
                            )
                            message_gauges.update({actor_id: g})
                        except Exception as e:
                            logger.info("got exception trying to instantiate the Gauge: {}".format(e))
                    else:
                        g = message_gauges[actor_id]

                    try:
                        ch = ActorMsgChannel(actor_id=actor_id.decode("utf-8"))
                    except Exception as e:
                        logger.error("Exception connecting to ActorMsgChannel: {}".format(e))
                        raise e
                    result = {'messages': len(ch._queue._queue)}
                    ch.close()
                    g.set(result['messages'])
                    logger.debug("METRICS: {} messages found for actor: {}.".format(result['messages'], actor_id))
                return actor_ids
        except Exception as e:
            logger.info("Got exception in get_metrics: {}".format(e))
            return []
示例#13
0
文件: controllers.py 项目: TACC/abaco
    def post(self, actor_id):
        def get_hypermedia(actor, exc):
            return {'_links': {'self': '{}/actors/v2/{}/executions/{}'.format(actor.api_server, actor.id, exc),
                               'owner': '{}/profiles/v2/{}'.format(actor.api_server, actor.owner),
                               'messages': '{}/actors/v2/{}/messages'.format(actor.api_server, actor.id)},}

        args = self.validate_post()
        d = {}
        # build a dictionary of k:v pairs from the query parameters, and pass a single
        # additional object 'message' from within the post payload. Note that 'message'
        # need not be JSON data.
        for k, v in request.args.items():
            if k == 'message':
                continue
            d[k] = v
        if hasattr(g, 'user'):
            d['_abaco_username'] = g.user
        if hasattr(g, 'api_server'):
            d['_abaco_api_server'] = g.api_server
        # if hasattr(g, 'jwt'):
        #     d['_abaco_jwt'] = g.jwt
        # if hasattr(g, 'jwt_server'):
        #     d['_abaco_jwt_server'] = g.jwt_server
        if hasattr(g, 'jwt_header_name'):
            d['_abaco_jwt_header_name'] = g.jwt_header_name
        dbid = Actor.get_dbid(g.tenant, actor_id)
        # create an execution
        exc = Execution.add_execution(dbid, {'cpu': 0,
                                             'io': 0,
                                             'runtime': 0,
                                             'status': SUBMITTED,
                                             'executor': g.user})
        d['_abaco_execution_id'] = exc
        d['_abaco_Content-Type'] = args.get('_abaco_Content-Type', '')
        ch = ActorMsgChannel(actor_id=dbid)
        ch.put_msg(message=args['message'], d=d)
        # make sure at least one worker is available
        workers = Worker.get_workers(dbid)
        actor = Actor.from_db(actors_store[dbid])
        if len(workers.items()) < 1:
            ch = CommandChannel()
            ch.put_cmd(actor_id=dbid, image=actor.image, tenant=g.tenant, num=1, stop_existing=False)
        result={'execution_id': exc, 'msg': args['message']}
        result.update(get_hypermedia(actor, exc))
        case = Config.get('web', 'case')
        if not case == 'camel':
            return ok(result)
        else:
            return ok(dict_to_camel(result))
示例#14
0
    def get(self, actor_id):
        def get_hypermedia(actor):
            return {
                '_links': {
                    'self':
                    '{}/actors/v2/{}/messages'.format(actor.api_server,
                                                      actor.id),
                    'owner':
                    '{}/profiles/v2/{}'.format(actor.api_server, actor.owner),
                },
            }

        # check that actor exists
        id = Actor.get_dbid(g.tenant, actor_id)
        try:
            actor = Actor.from_db(actors_store[id])
        except KeyError:
            raise ResourceError("actor not found: {}'".format(actor_id), 404)
        result = {'messages': len(ActorMsgChannel(actor_id=id)._queue._queue)}
        result.update(get_hypermedia(actor))
        return ok(result)
示例#15
0
def process_link(link, msg, d):
    """
    Process an event with a link.
    :return: 
    """
    # ensure that the linked actor still exists; the link attribute is *always* the dbid of the linked
    # actor
    logger.debug("top of process_link")
    try:
        actors_store[link]
    except KeyError as e:
        logger.error(
            "Processing event message for actor {} that does not exist. Quiting"
            .format(link))
        raise e

    # create an execution for the linked actor with message
    exc = Execution.add_execution(
        link, {
            'cpu': 0,
            'io': 0,
            'runtime': 0,
            'status': SUBMITTED,
            'executor': 'Abaco Event'
        })
    logger.info(
        "Events processor agent added execution {} for actor {}".format(
            exc, link))
    d['_abaco_execution_id'] = exc
    logger.debug(
        "sending message to actor. Final message {} and message dictionary: {}"
        .format(msg, d))
    ch = ActorMsgChannel(actor_id=link)
    ch.put_msg(message=msg, d=d)
    ch.close()
    logger.info("link processed.")
示例#16
0
def create_gauges(actor_ids):
    """
    Creates a Prometheus gauge for each actor id. The gauge is used to track the number of
    pending messages in the actor's queue.
    :param actor_ids: list of actors that should be processed. Does not include stateful actors or
    actors in a shutting down state.
    :return:
    """
    logger.debug("top of create_gauges; actor_ids: {}".format(actor_ids))
    # dictionary mapping actor_ids to their message queue lengths
    inbox_lengths = {}
    for actor_id in actor_ids:
        logger.debug("top of for loop for actor_id: {}".format(actor_id))
        # first, make sure the actor still exists in the actor store
        try:
            actor = actors_store[actor_id]
        except KeyError:
            logger.error(
                f"actor {actor_id} does not exist in store; continuing to next actor."
            )
            continue
        # If the actor doesn't have a gauge, add one
        if actor_id not in message_gauges.keys():
            try:
                g = Gauge(
                    'message_count_for_actor_{}'.format(
                        actor_id.replace('-', '_')),
                    'Number of messages for actor {}'.format(
                        actor_id.replace('-', '_')))
                message_gauges.update({actor_id: g})
                logger.debug('Created gauge {}'.format(g))
            except Exception as e:
                logger.error(
                    "got exception trying to create/instantiate the gauge; "
                    "actor {}; exception: {}".format(actor_id, e))
                g = None
        else:
            # Otherwise, get this actor's existing gauge
            try:
                g = message_gauges[actor_id]
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate an existing gauge; "
                    "actor: {}: exception:{}".format(actor_id, e))
                g = None
        # Update this actor's gauge to its current # of messages
        try:
            ch = ActorMsgChannel(actor_id=actor_id)
            msg_length = len(ch._queue._queue)
        except Exception as e:
            logger.error(
                "Exception connecting to ActorMsgChannel: {}".format(e))
            raise e
        ch.close()
        result = {'messages': msg_length}
        # add the actor's current message queue length to the inbox_lengths in-memory variable
        inbox_lengths[actor_id] = msg_length
        # if we were able to create the gauge, set it to the current message:
        if g:
            try:
                g.set(result['messages'])
            except Exception as e:
                logger.error(
                    f"Got exception trying to set the messages on the gauge for actor: {actor_id}; "
                    f"exception: {e}")
        logger.debug("METRICS: {} messages found for actor: {}.".format(
            result['messages'], actor_id))

        # add a worker gauge for this actor if one does not exist
        if actor_id not in worker_gaueges.keys():
            try:
                g = Gauge(
                    'worker_count_for_actor_{}'.format(
                        actor_id.replace('-', '_')),
                    'Number of workers for actor {}'.format(
                        actor_id.replace('-', '_')))
                worker_gaueges.update({actor_id: g})
                logger.debug('Created worker gauge {}'.format(g))
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate the Worker Gauge: {}".
                    format(e))
        else:
            # Otherwise, get the worker gauge that already exists
            g = worker_gaueges[actor_id]

        # Update this actor's worker IDs
        workers = Worker.get_workers(actor_id)
        result = {'workers': len(workers)}
        try:
            g.set(result['workers'])
        except Exception as e:
            logger.error(
                f"got exception trying to set the worker gauge for actor {actor_id}; exception: {e}"
            )
        logger.debug(
            f"METRICS: {result['workers']} workers found for actor: {actor_id}."
        )

        # Update this actor's command channel metric
        # channel_name = actor.get("queue")
        #
        # queues_list = Config.get('spawner', 'host_queues').replace(' ', '')
        # valid_queues = queues_list.split(',')
        #
        # if not channel_name or channel_name not in valid_queues:
        #     channel_name = 'default'
        #
        # if not channel_name:
        #     # TODO -- this must be changed. there is no way returning no arguments will result in
        #     # anythng but an exception. The calling function is expecting 3 arguments...
        #     # if we really want to blow up right here we should just raise an appropriate exception.
        #     return

    # TODO -- this code needs to be fixed. What follows is only a partial fix; what I think we want to do
    # is set the length of all of the different command channels once at the end of this loop. What was
    # happening instead was that it was only setting one of the command channel's lengths -- whatever command
    # channel happened to belong to the last actor in the loop.
    channel_name = 'default'
    ch = CommandChannel(name=channel_name)
    cmd_length = len(ch._queue._queue)
    command_gauge.labels(channel_name).set(cmd_length)
    logger.debug(
        f"METRICS COMMAND CHANNEL {channel_name} size: {command_gauge}")
    ch.close()

    # Return actor_ids so we don't have to query for them again later
    return actor_ids, inbox_lengths, cmd_length
示例#17
0
def create_gauges(actor_ids):
    logger.debug(
        "METRICS: Made it to create_gauges; actor_ids: {}".format(actor_ids))
    inbox_lengths = {}
    for actor_id in actor_ids:
        logger.debug("top of for loop for actor_id: {}".format(actor_id))

        try:
            actor = actors_store[actor_id]
        except KeyError:
            logger.error("actor {} does not exist.".format(actor_id))
            continue

            # If the actor doesn't have a gauge, add one
        if actor_id not in message_gauges.keys():
            try:
                g = Gauge(
                    'message_count_for_actor_{}'.format(
                        actor_id.decode("utf-8").replace('-', '_')),
                    'Number of messages for actor {}'.format(
                        actor_id.decode("utf-8").replace('-', '_')))
                message_gauges.update({actor_id: g})
                logger.debug('Created gauge {}'.format(g))
            except Exception as e:
                logger.error(
                    "got exception trying to create/instantiate the gauge; "
                    "actor {}; exception: {}".format(actor_id, e))
        else:
            # Otherwise, get this actor's existing gauge
            try:
                g = message_gauges[actor_id]
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate an existing gauge; "
                    "actor: {}: exception:{}".format(actor_id, e))

            # Update this actor's command channel metric
            channel_name = actor.get("queue")

            queues_list = Config.get('spawner', 'host_queues').replace(' ', '')
            valid_queues = queues_list.split(',')

            if not channel_name or channel_name not in valid_queues:
                channel_name = 'default'

        # Update this actor's gauge to its current # of messages
        try:
            ch = ActorMsgChannel(actor_id=actor_id.decode("utf-8"))
        except Exception as e:
            logger.error(
                "Exception connecting to ActorMsgChannel: {}".format(e))
            raise e
        result = {'messages': len(ch._queue._queue)}
        inbox_lengths[actor_id.decode("utf-8")] = len(ch._queue._queue)
        ch.close()
        g.set(result['messages'])
        logger.debug("METRICS: {} messages found for actor: {}.".format(
            result['messages'], actor_id))

        # add a worker gauge for this actor if one does not exist
        if actor_id not in worker_gaueges.keys():
            try:
                g = Gauge(
                    'worker_count_for_actor_{}'.format(
                        actor_id.decode("utf-8").replace('-', '_')),
                    'Number of workers for actor {}'.format(
                        actor_id.decode("utf-8").replace('-', '_')))
                worker_gaueges.update({actor_id: g})
                logger.debug('Created worker gauge {}'.format(g))
            except Exception as e:
                logger.info(
                    "got exception trying to instantiate the Worker Gauge: {}".
                    format(e))
        else:
            # Otherwise, get the worker gauge that already exists
            g = worker_gaueges[actor_id]

        # Update this actor's worker IDs
        workers = Worker.get_workers(actor_id)
        result = {'workers': len(workers)}
        g.set(result['workers'])

    ch = CommandChannel(name=channel_name)
    cmd_length = len(ch._queue._queue)
    command_gauge.labels(channel_name).set(cmd_length)
    logger.debug("METRICS COMMAND CHANNEL {} size: {}".format(
        channel_name, command_gauge))
    ch.close()

    # Return actor_ids so we don't have to query for them again later
    return actor_ids, inbox_lengths, cmd_length
示例#18
0
    def post(self, actor_id):
        def get_hypermedia(actor, exc):
            return {'_links': {'self': '{}/actors/v2/{}/executions/{}'.format(actor.api_server, actor.id, exc),
                               'owner': '{}/profiles/v2/{}'.format(actor.api_server, actor.owner),
                               'messages': '{}/actors/v2/{}/messages'.format(actor.api_server, actor.id)},}

        logger.debug("top of POST /actors/{}/messages.".format(actor_id))
        dbid = Actor.get_dbid(g.tenant, actor_id)
        try:
            Actor.from_db(actors_store[dbid])
        except KeyError:
            logger.debug("did not find actor: {}.".format(actor_id))
            raise ResourceError("No actor found with id: {}.".format(actor_id), 404)
        args = self.validate_post()
        d = {}
        # build a dictionary of k:v pairs from the query parameters, and pass a single
        # additional object 'message' from within the post payload. Note that 'message'
        # need not be JSON data.
        logger.debug("POST body validated. actor: {}.".format(actor_id))
        for k, v in request.args.items():
            if k == 'message':
                continue
            d[k] = v
        logger.debug("extra fields added to message from query parameters: {}.".format(d))
        if hasattr(g, 'user'):
            d['_abaco_username'] = g.user
            logger.debug("_abaco_username: {} added to message.".format(g.user))
        if hasattr(g, 'api_server'):
            d['_abaco_api_server'] = g.api_server
            logger.debug("_abaco_api_server: {} added to message.".format(g.api_server))
        # if hasattr(g, 'jwt'):
        #     d['_abaco_jwt'] = g.jwt
        # if hasattr(g, 'jwt_server'):
        #     d['_abaco_jwt_server'] = g.jwt_server
        if hasattr(g, 'jwt_header_name'):
            d['_abaco_jwt_header_name'] = g.jwt_header_name
            logger.debug("abaco_jwt_header_name: {} added to message.".format(g.jwt_header_name))

        # create an execution
        exc = Execution.add_execution(dbid, {'cpu': 0,
                                             'io': 0,
                                             'runtime': 0,
                                             'status': SUBMITTED,
                                             'executor': g.user})
        logger.info("Execution {} added for actor {}".format(exc, actor_id))
        d['_abaco_execution_id'] = exc
        d['_abaco_Content_Type'] = args.get('_abaco_Content_Type', '')
        logger.debug("Final message dictionary: {}".format(d))
        ch = ActorMsgChannel(actor_id=dbid)
        ch.put_msg(message=args['message'], d=d)
        ch.close()
        logger.debug("Message added to actor inbox. id: {}.".format(actor_id))
        # make sure at least one worker is available
        actor = Actor.from_db(actors_store[dbid])
        actor.ensure_one_worker()
        logger.debug("ensure_one_worker() called. id: {}.".format(actor_id))
        if args.get('_abaco_Content_Type') == 'application/octet-stream':
            result = {'execution_id': exc, 'msg': 'binary - omitted'}
        else:
            result={'execution_id': exc, 'msg': args['message']}
        result.update(get_hypermedia(actor, exc))
        case = Config.get('web', 'case')
        if not case == 'camel':
            return ok(result)
        else:
            return ok(dict_to_camel(result))
示例#19
0
def subscribe(tenant,
              actor_id,
              worker_id,
              api_server,
              client_id,
              client_secret,
              access_token,
              refresh_token,
              worker_ch):
    """
    Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor
    containers when message arrive. Also subscribes to the worker channel for future communications.
    :return:
    """
    logger.debug("Top of subscribe().")
    actor_ch = ActorMsgChannel(actor_id)
    try:
        leave_containers = Config.get('workers', 'leave_containers')
    except configparser.NoOptionError:
        logger.info("No leave_containers value confiured.")
        leave_containers = False
    if hasattr(leave_containers, 'lower'):
        leave_containers = leave_containers.lower() == "true"
    logger.info("leave_containers: {}".format(leave_containers))
    ag = None
    if api_server and client_id and client_secret and access_token and refresh_token:
        logger.info("Creating agave client.")
        verify = get_tenant_verify(tenant)
        ag = Agave(api_server=api_server,
                   token=access_token,
                   refresh_token=refresh_token,
                   api_key=client_id,
                   api_secret=client_secret,
                   verify=verify)
    else:
        logger.info("Not creating agave client.")
    logger.info("Starting the process worker channel thread.")
    t = threading.Thread(target=process_worker_ch, args=(tenant, worker_ch, actor_id, worker_id, actor_ch, ag))
    t.start()
    logger.info("Worker subscribing to actor channel.")

    # keep track of whether we need to update the worker's status back to READY; otherwise, we
    # will hit redis with an UPDATE every time the subscription loop times out (i.e., every 2s)
    update_worker_status = True

    # shared global tracking whether this worker should keep running; shared between this thread and
    # the "worker channel processing" thread.
    global keep_running

    # main subscription loop -- processing messages from actor's mailbox
    while keep_running:
        if update_worker_status:
            Worker.update_worker_status(actor_id, worker_id, READY)
            update_worker_status = False
        try:
            msg = actor_ch.get_one()
        except channelpy.ChannelClosedException:
            logger.info("Channel closed, worker exiting...")
            keep_running = False
            sys.exit()
        logger.info("worker {} processing new msg.".format(worker_id))
        try:
            Worker.update_worker_status(actor_id, worker_id, BUSY)
        except Exception as e:
            logger.error("unexpected exception from call to update_worker_status."
                         "actor_id: {}; worker_id: {}; status: {}; exception: {}".format(actor_id,
                                                                                         worker_id,
                                                                                         BUSY,
                                                                                         e))
            raise e
        update_worker_status = True
        logger.info("Received message {}. Starting actor container...".format(msg))
        # the msg object is a dictionary with an entry called message and an arbitrary
        # set of k:v pairs coming in from the query parameters.
        message = msg.pop('message', '')
        actor = Actor.from_db(actors_store[actor_id])
        execution_id = msg['_abaco_execution_id']
        content_type = msg['_abaco_Content_Type']
        mounts = actor.mounts
        logger.debug("actor mounts: {}".format(mounts))
        # for results, create a socket in the configured directory.
        try:
            socket_host_path_dir = Config.get('workers', 'socket_host_path_dir')
        except (configparser.NoSectionError, configparser.NoOptionError):
            logger.error("No socket_host_path configured. Cannot manage results data.")
            Actor.set_status(actor_id, ERROR, msg="Abaco instance not configured for results data.")
            continue
        socket_host_path = '{}.sock'.format(os.path.join(socket_host_path_dir, worker_id, execution_id))
        logger.info("Create socket at path: {}".format(socket_host_path))
        # add the socket as a mount:
        mounts.append({'host_path': socket_host_path,
                       'container_path': '/_abaco_results.sock',
                       'format': 'ro'})
        # for binary data, create a fifo in the configured directory. The configured
        # fifo_host_path_dir is equal to the fifo path in the worker container:
        fifo_host_path = None
        if content_type == 'application/octet-stream':
            try:
                fifo_host_path_dir = Config.get('workers', 'fifo_host_path_dir')
            except (configparser.NoSectionError, configparser.NoOptionError):
                logger.error("No fifo_host_path configured. Cannot manage binary data.")
                Actor.set_status(actor_id, ERROR, msg="Abaco instance not configured for binary data.")
                continue
            fifo_host_path = os.path.join(fifo_host_path_dir, worker_id, execution_id)
            try:
                os.mkfifo(fifo_host_path)
                logger.info("Created fifo at path: {}".format(fifo_host_path))
            except Exception as e:
                logger.error("Could not create fifo_path. Exception: {}".format(e))
                raise e
            # add the fifo as a mount:
            mounts.append({'host_path': fifo_host_path,
                           'container_path': '/_abaco_binary_data',
                           'format': 'ro'})

        # the execution object was created by the controller, but we need to add the worker id to it now that we
        # know which worker will be working on the execution.
        logger.debug("Adding worker_id to execution.")
        Execution.add_worker_id(actor_id, execution_id, worker_id)

        # privileged dictates whether the actor container runs in privileged mode and if docker daemon is mounted.
        privileged = False
        if type(actor['privileged']) == bool and actor['privileged']:
            privileged = True
        logger.debug("privileged: {}".format(privileged))

        # retrieve the default environment registered with the actor.
        environment = actor['default_environment']
        logger.debug("Actor default environment: {}".format(environment))

        # construct the user field from the actor's uid and gid:
        user = get_container_user(actor)
        logger.debug("Final user valiue: {}".format(user))
        # overlay the default_environment registered for the actor with the msg
        # dictionary
        environment.update(msg)
        environment['_abaco_access_token'] = ''
        environment['_abaco_actor_dbid'] = actor_id
        environment['_abaco_actor_id'] = actor.id
        environment['_abaco_actor_state'] = actor.state
        logger.debug("Overlayed environment: {}".format(environment))

        # if we have an agave client, get a fresh set of tokens:
        if ag:
            try:
                ag.token.refresh()
                token = ag.token.token_info['access_token']
                environment['_abaco_access_token'] = token
                logger.info("Refreshed the tokens. Passed {} to the environment.".format(token))
            except Exception as e:
                logger.error("Got an exception trying to get an access token: {}".format(e))
        else:
            logger.info("Agave client `ag` is None -- not passing access token.")
        logger.info("Passing update environment: {}".format(environment))
        try:
            stats, logs, final_state, exit_code, start_time = execute_actor(actor_id,
                                                                            worker_id,
                                                                            execution_id,
                                                                            image,
                                                                            message,
                                                                            user,
                                                                            environment,
                                                                            privileged,
                                                                            mounts,
                                                                            leave_containers,
                                                                            fifo_host_path,
                                                                            socket_host_path)
        except DockerStartContainerError as e:
            logger.error("Got DockerStartContainerError: {}".format(e))
            Actor.set_status(actor_id, ERROR, "Error executing container: {}".format(e))
            continue
        # Add the completed stats to the execution
        logger.info("Actor container finished successfully. Got stats object:{}".format(str(stats)))
        Execution.finalize_execution(actor_id, execution_id, COMPLETE, stats, final_state, exit_code, start_time)
        logger.info("Added execution: {}".format(execution_id))

        # Add the logs to the execution
        Execution.set_logs(execution_id, logs)
        logger.info("Added execution logs.")

        # Update the worker's last updated and last execution fields:
        try:
            Worker.update_worker_execution_time(actor_id, worker_id)
        except KeyError:
            # it is possible that this worker was sent a gracful shutdown command in the other thread
            # and that spawner has already removed this worker from the store.
            logger.info("worker {} got unexpected key error trying to update its execution time. "
                        "Worker better be shutting down! keep_running: {}".format(worker_id, keep_running))
            if keep_running:
                logger.error("worker couldn't update's its execution time but keep_running is still true!")

        logger.info("worker time stamps updated.")
示例#20
0
文件: worker.py 项目: TACC/abaco
def subscribe(tenant,
              actor_id,
              api_server,
              client_id,
              client_secret,
              access_token,
              refresh_token,
              worker_ch):
    """
    Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor
    containers when message arrive. Also subscribes to the worker channel for future communications.
    :return:
    """
    actor_ch = ActorMsgChannel(actor_id)
    ag = None
    if api_server and client_id and client_secret and access_token and refresh_token:
        ag = Agave(api_server=api_server,
                   token=access_token,
                   refresh_token=refresh_token,
                   api_key=client_id,
                   api_secret=client_secret)
    else:
        print("Not creating agave client.")
    t = threading.Thread(target=process_worker_ch, args=(tenant, worker_ch, actor_id, actor_ch, ag))
    t.start()
    print("Worker subscribing to actor channel...")
    global keep_running
    while keep_running:
        Worker.update_worker_status(actor_id, worker_ch.name, READY)
        try:
            msg = actor_ch.get(timeout=2)
        except channelpy.ChannelTimeoutException:
            continue
        except channelpy.ChannelClosedException:
            print("Channel closed, worker exiting...")
            keep_running = False
            sys.exit()
        print("Received message {}. Starting actor container...".format(str(msg)))
        # the msg object is a dictionary with an entry called message and an arbitrary
        # set of k:v pairs coming in from the query parameters.
        message = msg.pop('message', '')
        actor = Actor.from_db(actors_store[actor_id])
        execution_id = msg['_abaco_execution_id']
        privileged = False
        if actor['privileged'] == 'TRUE':
            privileged = True
        environment = actor['default_environment']
        print("Actor default environment: {}".format(environment))
        print("Actor privileged: {}".format(privileged))
        # overlay the default_environment registered for the actor with the msg
        # dictionary
        environment.update(msg)
        environment['_abaco_access_token'] = ''
        # if we have an agave client, get a fresh set of tokens:
        if ag:
            try:
                ag.token.refresh()
                token = ag.token.token_info['access_token']
                environment['_abaco_access_token'] = token
                print("Refreshed the tokens. Passed {} to the environment.".format(token))
            except Exception as e:
                print("Got an exception trying to get an access token: {}".format(e))
        else:
            print("Agave client `ag` is None -- not passing access token.")
        print("Passing update environment: {}".format(environment))
        try:
            stats, logs = execute_actor(actor_id, worker_ch, image, message,
                                        environment, privileged)
        except DockerStartContainerError as e:
            print("Got DockerStartContainerError: {}".format(str(e)))
            Actor.set_status(actor_id, ERROR)
            continue
        # add the execution to the actor store
        print("Actor container finished successfully. Got stats object:{}".format(str(stats)))
        Execution.finalize_execution(actor_id, execution_id, COMPLETE, stats)
        print("Added execution: {}".format(execution_id))
        Execution.set_logs(execution_id, logs)
        Worker.update_worker_execution_time(actor_id, worker_ch.name)
示例#21
0
def subscribe(tenant, actor_id, worker_id, api_server, client_id,
              client_secret, access_token, refresh_token, worker_ch):
    """
    Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor
    containers when message arrive. Also subscribes to the worker channel for future communications.
    :return:
    """
    logger.debug("Top of subscribe().")
    actor_ch = ActorMsgChannel(actor_id)
    ag = None
    if api_server and client_id and client_secret and access_token and refresh_token:
        logger.info("Creating agave client.")
        ag = Agave(api_server=api_server,
                   token=access_token,
                   refresh_token=refresh_token,
                   api_key=client_id,
                   api_secret=client_secret)
    else:
        logger.info("Not creating agave client.")
    logger.info("Starting the process worker channel thread.")
    t = threading.Thread(target=process_worker_ch,
                         args=(tenant, worker_ch, actor_id, worker_id,
                               actor_ch, ag))
    t.start()
    logger.info("Worker subscribing to actor channel.")
    update_worker_status = True
    global keep_running
    while keep_running:
        if update_worker_status:
            Worker.update_worker_status(actor_id, worker_id, READY)
            update_worker_status = False
        try:
            msg = actor_ch.get(timeout=2)
        except channelpy.ChannelTimeoutException:
            continue
        except channelpy.ChannelClosedException:
            logger.info("Channel closed, worker exiting...")
            keep_running = False
            sys.exit()
        update_worker_status = True
        logger.info(
            "Received message {}. Starting actor container...".format(msg))
        # the msg object is a dictionary with an entry called message and an arbitrary
        # set of k:v pairs coming in from the query parameters.
        message = msg.pop('message', '')
        actor = Actor.from_db(actors_store[actor_id])
        execution_id = msg['_abaco_execution_id']

        # the execution object was created by the controller, but we need to add the worker id to it now that we
        # know which worker will be working on the execution.
        logger.debug("Adding worker_id to execution.")
        Execution.add_worker_id(actor_id, execution_id, worker_id)

        # privileged dictates whether the actor container runs in privileged mode and if docker daemon is mounted.
        privileged = False
        if actor['privileged'] == 'TRUE':
            privileged = True
        logger.debug("privileged: {}".format(privileged))

        # retrieve the default environment registered with the actor.
        environment = actor['default_environment']
        logger.debug("Actor default environment: {}".format(environment))

        # overlay the default_environment registered for the actor with the msg
        # dictionary
        environment.update(msg)
        environment['_abaco_access_token'] = ''
        environment['_abaco_actor_dbid'] = actor_id
        environment['_abaco_actor_id'] = actor.id
        environment['_abaco_actor_state'] = actor.state
        logger.debug("Overlayed environment: {}".format(environment))

        # if we have an agave client, get a fresh set of tokens:
        if ag:
            try:
                ag.token.refresh()
                token = ag.token.token_info['access_token']
                environment['_abaco_access_token'] = token
                logger.info(
                    "Refreshed the tokens. Passed {} to the environment.".
                    format(token))
            except Exception as e:
                logger.error(
                    "Got an exception trying to get an access token: {}".
                    format(e))
        else:
            logger.info(
                "Agave client `ag` is None -- not passing access token.")
        logger.info("Passing update environment: {}".format(environment))
        try:
            stats, logs, final_state, exit_code = execute_actor(
                actor_id, worker_id, worker_ch, image, message, environment,
                privileged)
        except DockerStartContainerError as e:
            logger.error("Got DockerStartContainerError: {}".format(str(e)))
            Actor.set_status(actor_id, ERROR)
            continue
        # Add the completed stats to the execution
        logger.info(
            "Actor container finished successfully. Got stats object:{}".
            format(str(stats)))
        Execution.finalize_execution(actor_id, execution_id, COMPLETE, stats,
                                     final_state, exit_code)
        logger.info("Added execution: {}".format(execution_id))

        # Add the logs to the execution
        Execution.set_logs(execution_id, logs)
        logger.info("Added execution logs.")

        # Update the worker's last updated and last execution fields:
        Worker.update_worker_execution_time(actor_id, worker_id)
        logger.info("worker time stamps updated.")
示例#22
0
def subscribe(tenant, actor_id, image, worker_id, api_server, client_id,
              client_secret, access_token, refresh_token, worker_ch):
    """
    Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor
    containers when message arrive. Also launches a separate thread which ultimately subscribes to the worker channel
    for future communications.
    :return:
    """
    logger.debug("Top of subscribe(). worker_id: {}".format(worker_id))
    actor_ch = ActorMsgChannel(actor_id)
    # establish configs for this worker -------
    try:
        leave_containers = Config.get('workers', 'leave_containers')
    except configparser.NoOptionError:
        logger.debug("No leave_containers value configured.")
        leave_containers = False
    if hasattr(leave_containers, 'lower'):
        leave_containers = leave_containers.lower() == "true"
    logger.debug("leave_containers: {}".format(leave_containers))

    try:
        mem_limit = Config.get('workers', 'mem_limit')
    except configparser.NoOptionError:
        logger.debug("No mem_limit value configured.")
        mem_limit = "-1"
    mem_limit = str(mem_limit)

    try:
        max_cpus = Config.get('workers', 'max_cpus')
    except configparser.NoOptionError:
        logger.debug("No max_cpus value configured.")
        max_cpus = "-1"

    logger.debug("max_cpus: {}".format(max_cpus))

    # instantiate an OAuth client python object if credentials were passed -----
    ag = None
    if api_server and client_id and client_secret and access_token and refresh_token:
        logger.info("Creating agave client.")
        verify = get_tenant_verify(tenant)
        ag = Agave(api_server=api_server,
                   token=access_token,
                   refresh_token=refresh_token,
                   api_key=client_id,
                   api_secret=client_secret,
                   verify=verify)
    else:
        logger.info("Not creating agave client.")

    # start a separate thread for handling messages sent to the worker channel ----
    logger.info("Starting the process worker channel thread.")
    t = threading.Thread(target=process_worker_ch,
                         args=(tenant, worker_ch, actor_id, worker_id,
                               actor_ch, ag))
    t.start()

    # subscribe to the actor message queue -----
    logger.info(
        "Worker subscribing to actor channel. worker_id: {}".format(worker_id))
    # keep track of whether we need to update the worker's status back to READY; otherwise, we
    # will hit redis with an UPDATE every time the subscription loop times out (i.e., every 2s)
    update_worker_status = True

    # global tracks whether this worker should keep running.
    globals.keep_running = True

    # consecutive_errors tracks the number of consecutive times a worker has gotten an error trying to process a
    # message. Even though the message will be requeued, we do not want the worker to continue processing
    # indefinitely when a compute node is unhealthy.
    consecutive_errors = 0

    # main subscription loop -- processing messages from actor's mailbox
    while globals.keep_running:
        logger.debug("top of keep_running; worker id: {}".format(worker_id))
        if update_worker_status:
            Worker.update_worker_status(actor_id, worker_id, READY)
            logger.debug(
                "updated worker status to READY in SUBSCRIBE; worker id: {}".
                format(worker_id))
            update_worker_status = False
        try:
            msg, msg_obj = actor_ch.get_one()
        except channelpy.ChannelClosedException:
            logger.info("Channel closed, worker exiting. worker id: {}".format(
                worker_id))
            globals.keep_running = False
            sys.exit()
        logger.info("worker {} processing new msg.".format(worker_id))

        try:
            Worker.update_worker_status(actor_id, worker_id, BUSY)
        except Exception as e:
            logger.error(
                "unexpected exception from call to update_worker_status. Nacking message."
                "actor_id: {}; worker_id: {}; status: {}; exception: {}".
                format(actor_id, worker_id, BUSY, e))
            logger.info("worker exiting. worker_id: {}".format(worker_id))
            msg_obj.nack(requeue=True)
            raise e
        update_worker_status = True
        logger.info(
            "Received message {}. Starting actor container. worker id: {}".
            format(msg, worker_id))
        # the msg object is a dictionary with an entry called message and an arbitrary
        # set of k:v pairs coming in from the query parameters.
        message = msg.pop('message', '')
        try:
            actor = Actor.from_db(actors_store[actor_id])
            execution_id = msg['_abaco_execution_id']
            content_type = msg['_abaco_Content_Type']
            mounts = actor.mounts
            logger.debug("actor mounts: {}".format(mounts))
        except Exception as e:
            logger.error(
                "unexpected exception retrieving actor, execution, content-type, mounts. Nacking message."
                "actor_id: {}; worker_id: {}; status: {}; exception: {}".
                format(actor_id, worker_id, BUSY, e))
            msg_obj.nack(requeue=True)
            logger.info("worker exiting. worker_id: {}".format(worker_id))
            raise e

        # for results, create a socket in the configured directory.
        try:
            socket_host_path_dir = Config.get('workers',
                                              'socket_host_path_dir')
        except (configparser.NoSectionError, configparser.NoOptionError) as e:
            logger.error(
                "No socket_host_path configured. Cannot manage results data. Nacking message"
            )
            Actor.set_status(
                actor_id,
                ERROR,
                status_message="Abaco instance not configured for results data."
            )
            msg_obj.nack(requeue=True)
            logger.info("worker exiting. worker_id: {}".format(worker_id))
            raise e
        socket_host_path = '{}.sock'.format(
            os.path.join(socket_host_path_dir, worker_id, execution_id))
        logger.info("Create socket at path: {}".format(socket_host_path))
        # add the socket as a mount:
        mounts.append({
            'host_path': socket_host_path,
            'container_path': '/_abaco_results.sock',
            'format': 'ro'
        })
        # for binary data, create a fifo in the configured directory. The configured
        # fifo_host_path_dir is equal to the fifo path in the worker container:
        fifo_host_path = None
        if content_type == 'application/octet-stream':
            try:
                fifo_host_path_dir = Config.get('workers',
                                                'fifo_host_path_dir')
            except (configparser.NoSectionError,
                    configparser.NoOptionError) as e:
                logger.error(
                    "No fifo_host_path configured. Cannot manage binary data.")
                Actor.set_status(
                    actor_id,
                    ERROR,
                    status_message=
                    "Abaco instance not configured for binary data. Nacking message."
                )
                msg_obj.nack(requeue=True)
                logger.info("worker exiting. worker_id: {}".format(worker_id))
                raise e
            fifo_host_path = os.path.join(fifo_host_path_dir, worker_id,
                                          execution_id)
            try:
                os.mkfifo(fifo_host_path)
                logger.info("Created fifo at path: {}".format(fifo_host_path))
            except Exception as e:
                logger.error(
                    "Could not create fifo_path. Nacking message. Exception: {}"
                    .format(e))
                msg_obj.nack(requeue=True)
                logger.info("worker exiting. worker_id: {}".format(worker_id))
                raise e
            # add the fifo as a mount:
            mounts.append({
                'host_path': fifo_host_path,
                'container_path': '/_abaco_binary_data',
                'format': 'ro'
            })

        # the execution object was created by the controller, but we need to add the worker id to it now that we
        # know which worker will be working on the execution.
        logger.debug(
            "Adding worker_id to execution. woker_id: {}".format(worker_id))
        try:
            Execution.add_worker_id(actor_id, execution_id, worker_id)
        except Exception as e:
            logger.error(
                "Unexpected exception adding working_id to the Execution. Nacking message. Exception: {}"
                .format(e))
            msg_obj.nack(requeue=True)
            logger.info("worker exiting. worker_id: {}".format(worker_id))
            raise e

        # privileged dictates whether the actor container runs in privileged mode and if docker daemon is mounted.
        privileged = False
        if type(actor['privileged']) == bool and actor['privileged']:
            privileged = True
        logger.debug("privileged: {}; worker_id: {}".format(
            privileged, worker_id))

        # overlay resource limits if set on actor:
        if actor.mem_limit:
            mem_limit = actor.mem_limit
        if actor.max_cpus:
            max_cpus = actor.max_cpus

        # retrieve the default environment registered with the actor.
        environment = actor['default_environment']
        logger.debug("Actor default environment: {}".format(environment))

        # construct the user field from the actor's uid and gid:
        user = get_container_user(actor)
        logger.debug("Final user valiue: {}".format(user))
        # overlay the default_environment registered for the actor with the msg
        # dictionary
        environment.update(msg)
        environment['_abaco_access_token'] = ''
        environment['_abaco_actor_dbid'] = actor_id
        environment['_abaco_actor_id'] = actor.id
        environment['_abaco_worker_id'] = worker_id
        environment['_abaco_container_repo'] = actor.image
        environment['_abaco_actor_state'] = actor.state
        environment['_abaco_actor_name'] = actor.name or 'None'
        logger.debug("Overlayed environment: {}; worker_id: {}".format(
            environment, worker_id))

        # if we have an agave client, get a fresh set of tokens:
        if ag:
            try:
                ag.token.refresh()
                token = ag.token.token_info['access_token']
                environment['_abaco_access_token'] = token
                logger.info(
                    "Refreshed the tokens. Passed {} to the environment.".
                    format(token))
            except Exception as e:
                logger.error(
                    "Got an exception trying to get an access token. Stoping worker and nacking message. "
                    "Exception: {}".format(e))
                msg_obj.nack(requeue=True)
                logger.info("worker exiting. worker_id: {}".format(worker_id))
                raise e
        else:
            logger.info(
                "Agave client `ag` is None -- not passing access token; worker_id: {}"
                .format(worker_id))
        logger.info("Passing update environment: {}".format(environment))
        logger.info("About to execute actor; worker_id: {}".format(worker_id))
        try:
            stats, logs, final_state, exit_code, start_time = execute_actor(
                actor_id, worker_id, execution_id, image, message, user,
                environment, privileged, mounts, leave_containers,
                fifo_host_path, socket_host_path, mem_limit, max_cpus)
        except DockerStartContainerError as e:
            logger.error(
                "Worker {} got DockerStartContainerError: {} trying to start actor for execution {}."
                "Placing message back on queue.".format(
                    worker_id, e, execution_id))
            # if we failed to start the actor container, we leave the worker up and re-queue the original message
            msg_obj.nack(requeue=True)
            logger.debug('message requeued.')
            consecutive_errors += 1
            if consecutive_errors > MAX_WORKER_CONSECUTIVE_ERRORS:
                logger.error(
                    "Worker {} failed to successfully start actor for execution {} {} consecutive times; "
                    "Exception: {}. Putting the actor in error status and shutting "
                    "down workers.".format(worker_id, execution_id,
                                           MAX_WORKER_CONSECUTIVE_ERRORS, e))
                Actor.set_status(actor_id, ERROR,
                                 "Error executing container: {}; w".format(e))
                shutdown_workers(actor_id, delete_actor_ch=False)
                # wait for worker to be shutdown..
                time.sleep(60)
                break
            else:
                # sleep five seconds before getting a message again to give time for the compute
                # node and/or docker health to recover
                time.sleep(5)
                continue
        except DockerStopContainerError as e:
            logger.error(
                "Worker {} was not able to stop actor for execution: {}; Exception: {}. "
                "Putting the actor in error status and shutting down workers.".
                format(worker_id, execution_id, e))
            Actor.set_status(actor_id, ERROR,
                             "Error executing container: {}".format(e))
            # since the error was with stopping the actor, we will consider this message "processed"; this choice
            # could be reconsidered/changed
            msg_obj.ack()
            shutdown_workers(actor_id, delete_actor_ch=False)
            # wait for worker to be shutdown..
            time.sleep(60)
            break
        except Exception as e:
            logger.error(
                "Worker {} got an unexpected exception trying to run actor for execution: {}."
                "Putting the actor in error status and shutting down workers. "
                "Exception: {}; type: {}".format(worker_id, execution_id, e,
                                                 type(e)))
            Actor.set_status(actor_id, ERROR,
                             "Error executing container: {}".format(e))
            # the execute_actor function raises a DockerStartContainerError if it met an exception before starting the
            # actor container; if the container was started, then another exception should be raised. Therefore,
            # we can assume here that the container was at least started and we can ack the message.
            msg_obj.ack()
            shutdown_workers(actor_id, delete_actor_ch=False)
            # wait for worker to be shutdown..
            time.sleep(60)
            break
        # ack the message
        msg_obj.ack()
        logger.debug(
            "container finished successfully; worker_id: {}".format(worker_id))
        # Add the completed stats to the execution
        logger.info(
            "Actor container finished successfully. Got stats object:{}".
            format(str(stats)))
        Execution.finalize_execution(actor_id, execution_id, COMPLETE, stats,
                                     final_state, exit_code, start_time)
        logger.info("Added execution: {}; worker_id: {}".format(
            execution_id, worker_id))

        # Add the logs to the execution
        try:
            Execution.set_logs(execution_id, logs)
            logger.debug("Successfully added execution logs.")
        except Exception as e:
            msg = "Got exception trying to set logs for exception {}; " \
                  "Exception: {}; worker_id: {}".format(execution_id, e, worker_id)
            logger.error(msg)

        # Update the worker's last updated and last execution fields:
        try:
            Worker.update_worker_execution_time(actor_id, worker_id)
            logger.debug("worker execution time updated. worker_id: {}".format(
                worker_id))
        except KeyError:
            # it is possible that this worker was sent a gracful shutdown command in the other thread
            # and that spawner has already removed this worker from the store.
            logger.info(
                "worker {} got unexpected key error trying to update its execution time. "
                "Worker better be shutting down! keep_running: {}".format(
                    worker_id, globals.keep_running))
            if globals.keep_running:
                logger.error(
                    "worker couldn't update's its execution time but keep_running is still true!"
                )

        # we completed an execution successfully; reset the consecutive_errors counter
        consecutive_errors = 0
        logger.info(
            "worker time stamps updated; worker_id: {}".format(worker_id))
    logger.info(
        "global.keep_running no longer true. worker is now exited. worker id: {}"
        .format(worker_id))
示例#23
0
文件: worker.py 项目: mwvaughn/abaco
def subscribe(tenant, actor_id, worker_id, api_server, client_id,
              client_secret, access_token, refresh_token, worker_ch):
    """
    Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor
    containers when message arrive. Also subscribes to the worker channel for future communications.
    :return:
    """
    actor_ch = ActorMsgChannel(actor_id)
    ag = None
    if api_server and client_id and client_secret and access_token and refresh_token:
        ag = Agave(api_server=api_server,
                   token=access_token,
                   refresh_token=refresh_token,
                   api_key=client_id,
                   api_secret=client_secret)
    else:
        print("Not creating agave client.")
    t = threading.Thread(target=process_worker_ch,
                         args=(tenant, worker_ch, actor_id, worker_id,
                               actor_ch, ag))
    t.start()
    print("Worker subscribing to actor channel...")
    global keep_running
    while keep_running:
        Worker.update_worker_status(actor_id, worker_id, READY)
        try:
            msg = actor_ch.get(timeout=2)
        except channelpy.ChannelTimeoutException:
            continue
        except channelpy.ChannelClosedException:
            print("Channel closed, worker exiting...")
            keep_running = False
            sys.exit()
        print("Received message {}. Starting actor container...".format(
            str(msg)))
        # the msg object is a dictionary with an entry called message and an arbitrary
        # set of k:v pairs coming in from the query parameters.
        message = msg.pop('message', '')
        actor = Actor.from_db(actors_store[actor_id])
        execution_id = msg['_abaco_execution_id']
        Execution.add_worker_id(actor_id, execution_id, worker_id)
        privileged = False
        if actor['privileged'] == 'TRUE':
            privileged = True
        environment = actor['default_environment']
        print("Actor default environment: {}".format(environment))
        print("Actor privileged: {}".format(privileged))
        # overlay the default_environment registered for the actor with the msg
        # dictionary
        environment.update(msg)
        environment['_abaco_access_token'] = ''
        environment['_abaco_actor_dbid'] = actor_id
        environment['_abaco_actor_id'] = actor.id
        environment['_abaco_actor_state'] = actor.state
        # if we have an agave client, get a fresh set of tokens:
        if ag:
            try:
                ag.token.refresh()
                token = ag.token.token_info['access_token']
                environment['_abaco_access_token'] = token
                print("Refreshed the tokens. Passed {} to the environment.".
                      format(token))
            except Exception as e:
                print("Got an exception trying to get an access token: {}".
                      format(e))
        else:
            print("Agave client `ag` is None -- not passing access token.")
        print("Passing update environment: {}".format(environment))
        try:
            stats, logs, final_state, exit_code = execute_actor(
                actor_id, worker_id, worker_ch, image, message, environment,
                privileged)
        except DockerStartContainerError as e:
            print("Got DockerStartContainerError: {}".format(str(e)))
            Actor.set_status(actor_id, ERROR)
            continue
        # add the execution to the actor store
        print("Actor container finished successfully. Got stats object:{}".
              format(str(stats)))
        Execution.finalize_execution(actor_id, execution_id, COMPLETE, stats,
                                     final_state, exit_code)
        print("Added execution: {}".format(execution_id))
        Execution.set_logs(execution_id, logs)
        Worker.update_worker_execution_time(actor_id, worker_id)