Exemplo n.º 1
0
def execute_actor(actor_id,
                  worker_id,
                  execution_id,
                  image,
                  msg,
                  user=None,
                  d={},
                  privileged=False,
                  mounts=[],
                  leave_container=False,
                  fifo_host_path=None,
                  socket_host_path=None,
                  mem_limit=None,
                  max_cpus=None):
    """
    Creates and runs an actor container and supervises the execution, collecting statistics about resource consumption
    from the Docker daemon.

    :param actor_id: the dbid of the actor; for updating worker status
    :param worker_id: the worker id; also for updating worker status
    :param execution_id: the id of the execution.
    :param image: the actor's image; worker must have already downloaded this image to the local docker registry.
    :param msg: the message being passed to the actor.
    :param user: string in the form {uid}:{gid} representing the uid and gid to run the command as.
    :param d: dictionary representing the environment to instantiate within the actor container.
    :param privileged: whether this actor is "privileged"; i.e., its container should run in privileged mode with the
    docker daemon mounted.
    :param mounts: list of dictionaries representing the mounts to add; each dictionary mount should have 3 keys:
    host_path, container_path and format (which should have value 'ro' or 'rw').
    :param fifo_host_path: If not None, a string representing a path on the host to a FIFO used for passing binary data to the actor.
    :param socket_host_path: If not None, a string representing a path on the host to a socket used for collecting results from the actor.
    :param mem_limit: The maximum amount of memory the Actor container can use; should be the same format as the --memory Docker flag.
    :param max_cpus: The maximum number of CPUs each actor will have available to them. Does not guarantee these CPU resources; serves as upper bound.
    :return: result (dict), logs (str) - `result`: statistics about resource consumption; `logs`: output from docker logs.
    """
    logger.debug("top of execute_actor(); (worker {};{})".format(
        worker_id, execution_id))

    # initially set the global force_quit variable to False
    globals.force_quit = False

    # initial stats object, environment, binds and volumes
    result = {'cpu': 0, 'io': 0, 'runtime': 0}

    # instantiate docker client
    cli = docker.APIClient(base_url=dd, version="auto")

    # don't try to pass binary messages through the environment as these can cause
    # broken pipe errors. the binary data will be passed through the FIFO momentarily.
    if not fifo_host_path:
        d['MSG'] = msg
    binds = {}
    volumes = []

    # if container is privileged, mount the docker daemon so that additional
    # containers can be started.
    logger.debug("privileged: {};(worker {};{})".format(
        privileged, worker_id, execution_id))
    if privileged:
        binds = {
            '/var/run/docker.sock': {
                'bind': '/var/run/docker.sock',
                'ro': False
            }
        }
        volumes = ['/var/run/docker.sock']

    # add a bind key and dictionary as well as a volume for each mount
    for m in mounts:
        binds[m.get('host_path')] = {
            'bind': m.get('container_path'),
            'ro': m.get('format') == 'ro'
        }
        volumes.append(m.get('host_path'))

    # mem_limit
    # -1 => unlimited memory
    if mem_limit == '-1':
        mem_limit = None

    # max_cpus
    try:
        max_cpus = int(max_cpus)
    except:
        max_cpus = None
    # -1 => unlimited cpus
    if max_cpus == -1:
        max_cpus = None

    host_config = cli.create_host_config(binds=binds,
                                         privileged=privileged,
                                         mem_limit=mem_limit,
                                         nano_cpus=max_cpus)
    logger.debug("host_config object created by (worker {};{}).".format(
        worker_id, execution_id))

    # write binary data to FIFO if it exists:
    if fifo_host_path:
        try:
            fifo = os.open(fifo_host_path, os.O_RDWR)
            os.write(fifo, msg)
        except Exception as e:
            logger.error(
                "Error writing the FIFO. Exception: {};(worker {};{})".format(
                    e, worker_id, execution_id))
            os.remove(fifo_host_path)
            raise DockerStartContainerError("Error writing to fifo: {}; "
                                            "(worker {};{})".format(
                                                e, worker_id, execution_id))

    # set up results socket -----------------------
    # make sure socket doesn't already exist:
    try:
        os.unlink(socket_host_path)
    except OSError as e:
        if os.path.exists(socket_host_path):
            logger.error(
                "socket at {} already exists; Exception: {}; (worker {};{})".
                format(socket_host_path, e, worker_id, execution_id))
            raise DockerStartContainerError(
                "Got an OSError trying to create the results docket; "
                "exception: {}".format(e))

    # use retry logic since, when the compute node is under load, we see errors initially trying to create the socket
    # server object.
    keep_trying = True
    count = 0
    server = None
    while keep_trying and count < 10:
        keep_trying = False
        count = count + 1
        try:
            server = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM)
        except Exception as e:
            keep_trying = True
            logger.info("Could not instantiate socket at {}. "
                        "Count: {}; Will keep trying. "
                        "Exception: {}; type: {}; (worker {};{})".format(
                            socket_host_path, count, e, type(e), worker_id,
                            execution_id))
        try:
            server.bind(socket_host_path)
        except Exception as e:
            keep_trying = True
            logger.info("Could not bind socket at {}. "
                        "Count: {}; Will keep trying. "
                        "Exception: {}; type: {}; (worker {};{})".format(
                            socket_host_path, count, e, type(e), worker_id,
                            execution_id))
        try:
            os.chmod(socket_host_path, 0o777)
            logger.debug(
                "results socket permissions set to 777. socket_host_path: {}".
                format(socket_host_path))
        except Exception as e:
            msg = f"Got exception trying to set permissions on the results socket. Not sure what to do. e: {e}"
            logger.error(msg)
            # for now, we'll just swallow it but this is really a TODO.

        try:
            server.settimeout(RESULTS_SOCKET_TIMEOUT)
        except Exception as e:
            keep_trying = True
            logger.info("Could not set timeout for socket at {}. "
                        "Count: {}; Will keep trying. "
                        "Exception: {}; type: {}; (worker {};{})".format(
                            socket_host_path, count, e, type(e), worker_id,
                            execution_id))
    if not server:
        msg = "Failed to instantiate results socket. " \
              "Abaco compute host could be overloaded. Exception: {}; (worker {};{})".format(e, worker_id, execution_id)
        logger.error(msg)
        raise DockerStartContainerError(msg)

    logger.debug(
        "results socket server instantiated. path: {} (worker {};{})".format(
            socket_host_path, worker_id, execution_id))

    # instantiate the results channel:
    results_ch = ExecutionResultsChannel(actor_id, execution_id)

    # create and start the container
    logger.debug("Final container environment: {};(worker {};{})".format(
        d, worker_id, execution_id))
    logger.debug(
        "Final binds: {} and host_config: {} for the container.(worker {};{})".
        format(binds, host_config, worker_id, execution_id))
    container = cli.create_container(image=image,
                                     environment=d,
                                     user=user,
                                     volumes=volumes,
                                     host_config=host_config)
    # get the UTC time stamp
    start_time = get_current_utc_time()
    # start the timer to track total execution time.
    start = timeit.default_timer()
    logger.debug("right before cli.start: {}; container id: {}; "
                 "(worker {};{})".format(start, container.get('Id'), worker_id,
                                         execution_id))
    try:
        cli.start(container=container.get('Id'))
    except Exception as e:
        # if there was an error starting the container, user will need to debug
        logger.info(
            "Got exception starting actor container: {}; (worker {};{})".
            format(e, worker_id, execution_id))
        raise DockerStartContainerError(
            "Could not start container {}. Exception {}".format(
                container.get('Id'), str(e)))

    # local bool tracking whether the actor container is still running
    running = True
    Execution.update_status(actor_id, execution_id, RUNNING)

    logger.debug("right before creating stats_cli: {}; (worker {};{})".format(
        timeit.default_timer(), worker_id, execution_id))
    # create a separate cli for checking stats objects since these should be fast and we don't want to wait
    stats_cli = docker.APIClient(base_url=dd, timeout=1, version="auto")
    logger.debug("right after creating stats_cli: {}; (worker {};{})".format(
        timeit.default_timer(), worker_id, execution_id))

    # under load, we can see UnixHTTPConnectionPool ReadTimeout's trying to create the stats_obj
    # so here we are trying up to 3 times to create the stats object for a possible total of 3s
    # timeouts
    ct = 0
    stats_obj = None
    logs = None
    while ct < 3:
        try:
            stats_obj = stats_cli.stats(container=container.get('Id'),
                                        decode=True)
            break
        except ReadTimeout:
            ct += 1
        except Exception as e:
            logger.error(
                "Unexpected exception creating stats_obj. Exception: {}; (worker {};{})"
                .format(e, worker_id, execution_id))
            # in this case, we need to kill the container since we cannot collect stats;
            # UPDATE - 07-2018: under load, a errors can occur attempting to create the stats object.
            # the container could still be running; we need to explicitly check the container status
            # to be sure.
    logger.debug(
        "right after attempting to create stats_obj: {}; (worker {};{})".
        format(timeit.default_timer(), worker_id, execution_id))
    # a counter of the number of iterations through the main "running" loop;
    # this counter is used to determine when less frequent actions, such as log aggregation, need to run.
    loop_idx = 0
    while running and not globals.force_quit:
        loop_idx += 1
        logger.debug(
            "top of while running loop; loop_idx: {}".format(loop_idx))
        datagram = None
        stats = None
        try:
            datagram = server.recv(MAX_RESULT_FRAME_SIZE)
        except socket.timeout:
            pass
        except Exception as e:
            logger.error(
                "got exception from server.recv: {}; (worker {};{})".format(
                    e, worker_id, execution_id))
        logger.debug(
            "right after try/except datagram block: {}; (worker {};{})".format(
                timeit.default_timer(), worker_id, execution_id))
        if datagram:
            try:
                results_ch.put(datagram)
            except Exception as e:
                logger.error(
                    "Error trying to put datagram on results channel. "
                    "Exception: {}; (worker {};{})".format(
                        e, worker_id, execution_id))
        logger.debug("right after results ch.put: {}; (worker {};{})".format(
            timeit.default_timer(), worker_id, execution_id))

        # only try to collect stats if we have a stats_obj:
        if stats_obj:
            logger.debug(
                "we have a stats_obj; trying to collect stats. (worker {};{})".
                format(worker_id, execution_id))
            try:
                logger.debug(
                    "waiting on a stats obj: {}; (worker {};{})".format(
                        timeit.default_timer(), worker_id, execution_id))
                stats = next(stats_obj)
                logger.debug("got the stats obj: {}; (worker {};{})".format(
                    timeit.default_timer(), worker_id, execution_id))
            except StopIteration:
                # we have read the last stats object - no need for processing
                logger.debug(
                    "Got StopIteration; no stats object. (worker {};{})".
                    format(worker_id, execution_id))
            except ReadTimeoutError:
                # this is a ReadTimeoutError from docker, not requests. container is finished.
                logger.info(
                    "next(stats) just timed out: {}; (worker {};{})".format(
                        timeit.default_timer(), worker_id, execution_id))
                # UPDATE - 07-2018: under load, a ReadTimeoutError from the attempt to get a stats object
                # does NOT imply the container has stopped; we need to explicitly check the container status
                # to be sure.

        # if we got a stats object, add it to the results; it is possible stats collection timed out and the object
        # is None
        if stats:
            logger.debug("adding stats to results; (worker {};{})".format(
                worker_id, execution_id))
            try:
                result['cpu'] += stats['cpu_stats']['cpu_usage']['total_usage']
            except KeyError as e:
                logger.info(
                    "Got a KeyError trying to fetch the cpu object: {}; "
                    "(worker {};{})".format(e, worker_id, execution_id))
            try:
                result['io'] += stats['networks']['eth0']['rx_bytes']
            except KeyError as e:
                logger.info(
                    "Got KeyError exception trying to grab the io object. "
                    "running: {}; Exception: {}; (worker {};{})".format(
                        running, e, worker_id, execution_id))

        # grab the logs every 5th iteration --
        if loop_idx % 5 == 0:
            logs = cli.logs(container.get('Id'))
            Execution.set_logs(execution_id, logs)
            logs = None

        # checking the container status to see if it is still running ----
        if running:
            logger.debug(
                "about to check container status: {}; (worker {};{})".format(
                    timeit.default_timer(), worker_id, execution_id))
            # we need to wait for the container id to be available
            i = 0
            while i < 10:
                try:
                    c = cli.containers(all=True,
                                       filters={'id': container.get('Id')})[0]
                    break
                except IndexError:
                    logger.error(
                        "Got an IndexError trying to get the container object. "
                        "(worker {};{})".format(worker_id, execution_id))
                    time.sleep(0.1)
                    i += 1
            logger.debug(
                "done checking status: {}; i: {}; (worker {};{})".format(
                    timeit.default_timer(), i, worker_id, execution_id))
            # if we were never able to get the container object, we need to stop processing and kill this
            # worker; the docker daemon could be under heavy load, but we need to not launch another
            # actor container with this worker, because the existing container may still be running,
            if i == 10 or not c:
                # we'll try to stop the container
                logger.error(
                    "Never could retrieve the container object! Attempting to stop container; "
                    "container id: {}; (worker {};{})".format(
                        container.get('Id'), worker_id, execution_id))
                # stop_container could raise an exception - if so, we let it pass up and have the worker
                # shut itself down.
                stop_container(cli, container.get('Id'))
                logger.info("container {} stopped. (worker {};{})".format(
                    container.get('Id'), worker_id, execution_id))

                # if we were able to stop the container, we can set running to False and keep the
                # worker running
                running = False
                continue
            state = c.get('State')
            if not state == 'running':
                logger.debug(
                    "container finished, final state: {}; (worker {};{})".
                    format(state, worker_id, execution_id))
                running = False
                continue
            else:
                # container still running; check if a force_quit has been sent OR
                # we are beyond the max_run_time
                runtime = timeit.default_timer() - start
                if globals.force_quit or (max_run_time > 0
                                          and max_run_time < runtime):
                    logs = cli.logs(container.get('Id'))
                    if globals.force_quit:
                        logger.info(
                            "issuing force quit: {}; (worker {};{})".format(
                                timeit.default_timer(), worker_id,
                                execution_id))
                    else:
                        logger.info(
                            "hit runtime limit: {}; (worker {};{})".format(
                                timeit.default_timer(), worker_id,
                                execution_id))
                    cli.stop(container.get('Id'))
                    running = False
            logger.debug(
                "right after checking container state: {}; (worker {};{})".
                format(timeit.default_timer(), worker_id, execution_id))
    logger.info("container stopped:{}; (worker {};{})".format(
        timeit.default_timer(), worker_id, execution_id))
    stop = timeit.default_timer()
    globals.force_quit = False

    # get info from container execution, including exit code; Exceptions from any of these commands
    # should not cause the worker to shutdown or prevent starting subsequent actor containers.
    try:
        container_info = cli.inspect_container(container.get('Id'))
        try:
            container_state = container_info['State']
            try:
                exit_code = container_state['ExitCode']
            except KeyError as e:
                logger.error("Could not determine ExitCode for container {}. "
                             "Exception: {}; (worker {};{})".format(
                                 container.get('Id'), e, worker_id,
                                 execution_id))
                exit_code = 'undetermined'
        except KeyError as e:
            logger.error(
                "Could not determine final state for container {}. "
                "Exception: {}; (worker {};{})".format(container.get('Id')), e,
                worker_id, execution_id)
            container_state = {'unavailable': True}
    except docker.errors.APIError as e:
        logger.error("Could not inspect container {}. "
                     "Exception: {}; (worker {};{})".format(
                         container.get('Id'), e, worker_id, execution_id))

    logger.debug(
        "right after getting container_info: {}; (worker {};{})".format(
            timeit.default_timer(), worker_id, execution_id))
    # get logs from container
    if not logs:
        logs = cli.logs(container.get('Id'))
    if not logs:
        # there are issues where container do not have logs associated with them when they should.
        logger.info("Container id {} had NO logs associated with it. "
                    "(worker {};{})".format(container.get('Id'), worker_id,
                                            execution_id))
    logger.debug(
        "right after getting container logs: {}; (worker {};{})".format(
            timeit.default_timer(), worker_id, execution_id))

    # get any additional results from the execution:
    while True:
        datagram = None
        try:
            datagram = server.recv(MAX_RESULT_FRAME_SIZE)
        except socket.timeout:
            break
        except Exception as e:
            logger.error(
                "Got exception from server.recv: {}; (worker {};{})".format(
                    e, worker_id, execution_id))
        if datagram:
            try:
                results_ch.put(datagram)
            except Exception as e:
                logger.error(
                    "Error trying to put datagram on results channel. "
                    "Exception: {}; (worker {};{})".format(
                        e, worker_id, execution_id))
    logger.debug(
        "right after getting last execution results from datagram socket: {}; "
        "(worker {};{})".format(timeit.default_timer(), worker_id,
                                execution_id))
    if socket_host_path:
        server.close()
        os.remove(socket_host_path)
    logger.debug("right after removing socket: {}; (worker {};{})".format(
        timeit.default_timer(), worker_id, execution_id))

    # remove actor container with retrying logic -- check for specific filesystem errors from the docker daemon:
    if not leave_container:
        keep_trying = True
        count = 0
        while keep_trying and count < 10:
            keep_trying = False
            count = count + 1
            try:
                cli.remove_container(container=container)
                logger.info("Actor container removed. (worker {};{})".format(
                    worker_id, execution_id))
            except Exception as e:
                # if the container is already gone we definitely want to quit:
                if 'No such container' in str(e):
                    logger.info("Got 'no such container' exception - quiting. "
                                "Exception: {}; (worker {};{})".format(
                                    e, worker_id, execution_id))
                    break
                # if we get a resource busy/internal server error from docker, we need to keep trying to remove the
                # container.
                elif 'device or resource busy' in str(
                        e) or 'failed to remove root filesystem' in str(e):
                    logger.error(
                        "Got resource busy/failed to remove filesystem exception trying to remove "
                        "actor container; will keep trying."
                        "Count: {}; Exception: {}; (worker {};{})".format(
                            count, e, worker_id, execution_id))
                    time.sleep(1)
                    keep_trying = True
                else:
                    logger.error(
                        "Unexpected exception trying to remove actor container. Giving up."
                        "Exception: {}; type: {}; (worker {};{})".format(
                            e, type(e), worker_id, execution_id))
    else:
        logger.debug("leaving actor container since leave_container was True. "
                     "(worker {};{})".format(worker_id, execution_id))
    logger.debug(
        "right after removing actor container: {}; (worker {};{})".format(
            timeit.default_timer(), worker_id, execution_id))

    if fifo_host_path:
        os.close(fifo)
        os.remove(fifo_host_path)
    if results_ch:
        results_ch.close()
    result['runtime'] = int(stop - start)
    logger.debug(
        "right after removing fifo; about to return: {}; (worker {};{})".
        format(timeit.default_timer(), worker_id, execution_id))
    return result, logs, container_state, exit_code, start_time
Exemplo n.º 2
0
def run_worker(image, actor_id, worker_id, client_id, client_access_token,
               client_refresh_token, tenant, api_server, client_secret):
    """
    Run an actor executor worker with a given channel and image.
    :return:
    """
    logger.debug("top of run_worker()")
    command = 'python3 -u /actors/worker.py'
    logger.debug("docker_utils running worker. image:{}, command:{}".format(
        image, command))

    # mount the directory on the host for creating fifos
    try:
        fifo_host_path_dir = Config.get('workers', 'fifo_host_path_dir')
        logger.info("Using fifo_host_path_dir: {}".format(fifo_host_path_dir))
    except (configparser.NoSectionError, configparser.NoOptionError) as e:
        logger.error(
            "Got exception trying to look up fifo_host_path_dir. Setting to None. Exception: {}"
            .format(e))
        fifo_host_path_dir = None
    if fifo_host_path_dir:
        mounts = [{
            'host_path': os.path.join(fifo_host_path_dir, worker_id),
            'container_path': os.path.join(fifo_host_path_dir, worker_id),
            'format': 'rw'
        }]
    else:
        mounts = []

    # mount the directory on the host for creating result sockets
    try:
        socket_host_path_dir = Config.get('workers', 'socket_host_path_dir')
        logger.info(
            "Using socket_host_path_dir: {}".format(socket_host_path_dir))
    except (configparser.NoSectionError, configparser.NoOptionError) as e:
        logger.error(
            "Got exception trying to look up fifo_host_path_dir. Setting to None. Exception: {}"
            .format(e))
        socket_host_path_dir = None
    if socket_host_path_dir:
        mounts.append({
            'host_path':
            os.path.join(socket_host_path_dir, worker_id),
            'container_path':
            os.path.join(socket_host_path_dir, worker_id),
            'format':
            'rw'
        })

    logger.info(
        "Final fifo_host_path_dir: {}; socket_host_path_dir: {}".format(
            fifo_host_path_dir, socket_host_path_dir))
    try:
        auto_remove = Config.get('workers', 'auto_remove')
    except (configparser.NoSectionError, configparser.NoOptionError) as e:
        logger.debug("no auto_remove in the workers stanza.")
        auto_remove = True
    if hasattr(auto_remove, 'lower'):
        if auto_remove.lower() == 'false':
            auto_remove = False
        else:
            auto_remove = True
    elif not auto_remove == True:
        auto_remove = False
    container = run_container_with_docker(
        image=AE_IMAGE,
        command=command,
        environment={
            'image': image,
            'worker_id': worker_id,
            '_abaco_secret': os.environ.get('_abaco_secret')
        },
        mounts=mounts,
        log_file=None,
        auto_remove=auto_remove,
        name='worker_{}_{}'.format(actor_id, worker_id),
        client_id=client_id,
        client_access_token=client_access_token,
        client_refresh_token=client_refresh_token,
        actor_id=actor_id,
        tenant=tenant,
        api_server=api_server,
        client_secret=client_secret)
    # don't catch errors -- if we get an error trying to run a worker, let it bubble up.
    # TODO - determines worker structure; should be placed in a proper DAO class.
    logger.info(
        "worker container running. worker_id: {}. container: {}".format(
            worker_id, container))
    return {
        'image': image,
        # @todo - location will need to change to support swarm or cluster
        'location': dd,
        'id': worker_id,
        'cid': container.get('Id'),
        'status': READY,
        'host_id': host_id,
        'host_ip': host_ip,
        'last_execution_time': 0,
        'last_health_check_time': get_current_utc_time()
    }
Exemplo n.º 3
0
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor."""
    logger.info("Checking health for actor: {}".format(actor_id))
    try:
        workers = Worker.get_workers(actor_id)
    except Exception as e:
        logger.error("Got exception trying to retrieve workers: {}".format(e))
        return None
    logger.debug("workers: {}".format(workers))
    host_id = os.environ.get('SPAWNER_HOST_ID',
                             Config.get('spawner', 'host_id'))
    logger.debug("host_id: {}".format(host_id))
    for worker in workers:
        worker_id = worker['id']
        worker_status = worker.get('status')
        # if the worker has only been requested, it will not have a host_id. it is possible
        # the worker will ultimately get scheduled on a different host; however, if there is
        # some issue and the worker is "stuck" in the early phases, we should remove it..
        if 'host_id' not in worker:
            # check for an old create time
            worker_create_t = worker.get('create_time')
            # in versions prior to 1.9, worker create_time was not set until after it was READY
            if not worker_create_t:
                hard_delete_worker(
                    actor_id,
                    worker_id,
                    reason_str=
                    'Worker did not have a host_id or create_time field.')
            # if still no host after 5 minutes, delete it
            if worker_create_t < get_current_utc_time() - datetime.timedelta(
                    minutes=5):
                hard_delete_worker(
                    actor_id,
                    worker_id,
                    reason_str='Worker did not have a host_id and had '
                    'old create_time field.')

        # ignore workers on different hosts because this health agent cannot interact with the
        # docker daemon responsible for the worker container..
        if not host_id == worker['host_id']:
            continue

        # we need to delete any worker that is in SHUTDOWN REQUESTED or SHUTTING down for too long
        if worker_status == codes.SHUTDOWN_REQUESTED or worker_status == codes.SHUTTING_DOWN:
            worker_last_health_check_time = worker.get(
                'last_health_check_time')
            if not worker_last_health_check_time:
                worker_last_health_check_time = worker.get('create_time')
            if not worker_last_health_check_time:
                hard_delete_worker(
                    actor_id,
                    worker_id,
                    reason_str='Worker in SHUTDOWN and no health checks.')
            elif worker_last_health_check_time < get_current_utc_time(
            ) - datetime.timedelta(minutes=5):
                hard_delete_worker(
                    actor_id,
                    worker_id,
                    reason_str='Worker in SHUTDOWN for too long.')

        # check if the worker has not responded to a health check recently; we use a relatively long period
        # (60 minutes) of idle health checks in case there is an issue with sending health checks through rabbitmq.
        # this needs to be watched closely though...
        worker_last_health_check_time = worker.get('last_health_check_time')
        if not worker_last_health_check_time or \
                (worker_last_health_check_time < get_current_utc_time() - datetime.timedelta(minutes=60)):
            hard_delete_worker(
                actor_id,
                worker_id,
                reason_str='Worker has not health checked for too long.')

        # first send worker a health check
        logger.info(f"sending worker {worker_id} a health check")
        ch = WorkerChannel(worker_id=worker_id)
        try:
            logger.debug("Issuing status check to channel: {}".format(
                worker['ch_name']))
            ch.put('status')
        except (channelpy.exceptions.ChannelTimeoutException, Exception) as e:
            logger.error(
                f"Got exception of type {type(e)} trying to send worker {worker_id} a "
                f"health check. e: {e}")
        finally:
            try:
                ch.close()
            except Exception as e:
                logger.error(
                    "Got an error trying to close the worker channel for dead worker. Exception: {}"
                    .format(e))

        # now check if the worker has been idle beyond the max worker_ttl configured for this abaco:
        if ttl < 0:
            # ttl < 0 means infinite life
            logger.info("Infinite ttl configured; leaving worker")
            continue
        # we don't shut down workers that are currently running:
        if not worker['status'] == codes.BUSY:
            last_execution = worker.get('last_execution_time', 0)
            # if worker has made zero executions, use the create_time
            if last_execution == 0:
                last_execution = worker.get('create_time',
                                            datetime.datetime.min)
            logger.debug("using last_execution: {}".format(last_execution))
            try:
                assert type(last_execution) == datetime.datetime
            except:
                logger.error(
                    "Time received for TTL measurements is not of type datetime."
                )
                last_execution = datetime.datetime.min
            if last_execution + datetime.timedelta(
                    seconds=ttl) < datetime.datetime.utcnow():
                # shutdown worker
                logger.info("Shutting down worker beyond ttl.")
                shutdown_worker(actor_id, worker['id'])
            else:
                logger.info("Still time left for this worker.")

        if worker['status'] == codes.ERROR:
            # shutdown worker
            logger.info("Shutting down worker in error status.")
            shutdown_worker(actor_id, worker['id'])
Exemplo n.º 4
0
def execute_actor(actor_id,
                  worker_id,
                  execution_id,
                  image,
                  msg,
                  user=None,
                  d={},
                  privileged=False,
                  mounts=[],
                  leave_container=False,
                  fifo_host_path=None,
                  socket_host_path=None):
    """
    Creates and runs an actor container and supervises the execution, collecting statistics about resource consumption
    from the Docker daemon.

    :param actor_id: the dbid of the actor; for updating worker status
    :param worker_id: the worker id; also for updating worker status
    :param execution_id: the id of the execution.
    :param image: the actor's image; worker must have already downloaded this image to the local docker registry.
    :param msg: the message being passed to the actor.
    :param user: string in the form {uid}:{gid} representing the uid and gid to run the command as.
    :param d: dictionary representing the environment to instantiate within the actor container.
    :param privileged: whether this actor is "privileged"; i.e., its container should run in privileged mode with the
    docker daemon mounted.
    :param mounts: list of dictionaries representing the mounts to add; each dictionary mount should have 3 keys:
    host_path, container_path and format (which should have value 'ro' or 'rw').
    :param fifo_host_path: If not None, a string representing a path on the host to a FIFO used for passing binary data to the actor.
    :param socket_host_path: If not None, a string representing a path on the host to a socket used for collecting results from the actor.
    :return: result (dict), logs (str) - `result`: statistics about resource consumption; `logs`: output from docker logs.
    """
    logger.debug("top of execute_actor()")

    # initial stats object, environment, binds and volumes
    result = {'cpu': 0,
              'io': 0,
              'runtime': 0 }

    # instantiate docker client
    cli = docker.APIClient(base_url=dd, version="auto")

    # don't try to pass binary messages through the environment as these can cause
    # broken pipe errors. the binary data will be passed through the FIFO momentarily.
    if not fifo_host_path:
        d['MSG'] = msg
    binds = {}
    volumes = []

    # if container is privileged, mount the docker daemon so that additional
    # containers can be started.
    logger.debug("privileged: {}".format(privileged))
    if privileged:
        binds = {'/var/run/docker.sock':{
                    'bind': '/var/run/docker.sock',
                    'ro': False }}
        volumes = ['/var/run/docker.sock']

    # add a bind key and dictionary as well as a volume for each mount
    for m in mounts:
        binds[m.get('host_path')] = {'bind': m.get('container_path'),
                                     'ro': m.get('format') == 'ro'}
        volumes.append(m.get('host_path'))
    host_config = cli.create_host_config(binds=binds, privileged=privileged)

    # write binary data to FIFO if it exists:
    if fifo_host_path:
        try:
            fifo = os.open(fifo_host_path, os.O_RDWR)
            os.write(fifo, msg)
        except Exception as e:
            logger.error("Error writing the FIFO. Exception: {}".format(e))
            os.remove(fifo_host_path)
            raise DockerStartContainerError("Error writing to fifo: {}".format(e))

    # set up results socket
    try:
        server = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM)
        server.bind(socket_host_path)
        server.settimeout(RESULTS_SOCKET_TIMEOUT)
    except Exception as e:
        logger.error("could not instantiate or bind socket. Exception: {}".format(e))
        raise e

    # instantiate the results channel:
    results_ch = ExecutionResultsChannel(actor_id, execution_id)

    # create and start the container
    logger.debug("Final container environment: {}".format(d))
    logger.debug("Final binds: {} and host_config: {} for the container.".format(binds, host_config))
    container = cli.create_container(image=image,
                                     environment=d,
                                     user=user,
                                     volumes=volumes,
                                     host_config=host_config)
    # get the URC time stampe
    start_time = get_current_utc_time()
    # start the timer to track total execution time.
    start = timeit.default_timer()
    logger.debug("right before cli.start: {}".format(start))
    try:
        cli.start(container=container.get('Id'))
    except Exception as e:
        # if there was an error starting the container, user will need to debug
        logger.info("Got exception starting actor container: {}".format(e))
        raise DockerStartContainerError("Could not start container {}. Exception {}".format(container.get('Id'), str(e)))

    # local bool tracking whether the actor container is still running
    running = True

    logger.debug("right before creating stats_cli: {}".format(timeit.default_timer()))
    # create a separate cli for checking stats objects since these should be fast and we don't want to wait
    stats_cli = docker.APIClient(base_url=dd, timeout=1, version="auto")
    logger.debug("right after creating stats_cli: {}".format(timeit.default_timer()))

    # under load, we can see UnixHTTPConnectionPool ReadTimeout's trying to create the stats_obj
    # so here we are trying up to 3 times to create the stats object for a possible total of 3s
    # timeouts
    ct = 0
    while ct < 3:
        try:
            stats_obj = stats_cli.stats(container=container.get('Id'), decode=True)
            break
        except ReadTimeout:
            ct += 1
        except Exception as e:
            logger.error("Unexpected exception creating stats_obj. Exception: {}".format(e))
            # in this case, we need to kill the container since we cannot collect stats;
            running = False
    logger.debug("right after creating stats_obj: {}".format(timeit.default_timer()))

    while running:
        datagram = None
        try:
            datagram = server.recv(MAX_RESULT_FRAME_SIZE)
        except socket.timeout:
            pass
        except Exception as e:
            logger.error("got exception from server.recv: {}".format(e))
        logger.debug("right after try/except datagram block: {}".format(timeit.default_timer()))
        if datagram:
            try:
                results_ch.put(datagram)
            except Exception as e:
                logger.error("Error trying to put datagram on results channel. Exception: {}".format(e))
        logger.debug("right after results ch.put: {}".format(timeit.default_timer()))
        try:
            logger.debug("waiting on a stats obj: {}".format(timeit.default_timer()))
            stats = next(stats_obj)
            logger.debug("got the stats obj: {}".format(timeit.default_timer()))
        except ReadTimeoutError:
            # this is a ReadTimeoutError from docker, not requests. container is finished.
            logger.debug("next(stats) just timed out: {}".format(timeit.default_timer()))
            # container stopped before another stats record could be read, just ignore and move on
            running = False
            break
        try:
            result['cpu'] += stats['cpu_stats']['cpu_usage']['total_usage']
        except KeyError as e:
            logger.info("Got a KeyError trying to fetch the cpu object: {}".format(e))
        try:
            result['io'] += stats['networks']['eth0']['rx_bytes']
        except KeyError as e:
            logger.info("Got KeyError exception trying to grab the io object. running: {}; Exception: {}".format(running, e))
        if running:
            logger.debug("about to check container status: {}".format(timeit.default_timer()))
            # we need to wait for the container id to be available
            i = 0
            while i < 10:
                try:
                    c = cli.containers(all=True, filters={'id': container.get('Id')})[0]
                    break
                except IndexError:
                    logger.error("Got an IndexError trying to get the container object.")
                    time.sleep(0.1)
                    i += 1
            logger.debug("done checking status: {}; i: {}".format(timeit.default_timer(), i))
            if i == 10:
                logger.error("Never could retrieve the container object! container id: {}".format(container.get('Id')))
                try:
                    cli.stop(container.get('Id'))
                except Exception as e:
                    logger.error("Got another exception trying to stop the actor container. Exception: {}".format(e))
                finally:
                    running = False
                continue
            state = c.get('State')
            if not state == 'running':
                logger.debug("container finished, final state: {}".format(state))
                running = False
            else:
                # container still running; check if we are beyond the max_run_time
                runtime = timeit.default_timer() - start
                if max_run_time > 0 and max_run_time < runtime:
                    logger.info("hit runtime limit: {}".format(timeit.default_timer()))
                    cli.stop(container.get('Id'))
                    running = False
            logger.debug("right after checking container state: {}".format(timeit.default_timer()))
    logger.info("container stopped:{}".format(timeit.default_timer()))
    stop = timeit.default_timer()
    # get info from container execution, including exit code
    try:
        container_info = cli.inspect_container(container.get('Id'))
        try:
            container_state = container_info['State']
            try:
                exit_code = container_state['ExitCode']
            except KeyError as e:
                logger.error("Could not determine ExitCode for container {}. e: {}".format(container.get('Id'), e))
                exit_code = 'undetermined'
        except KeyError as e:
            logger.error("Could not determine final state for container {}. e: {} ".format(container.get('Id')), e)
            container_state = {'unavailable': True}
    except docker.errors.APIError as e:
        logger.error("Could not inspect container {}. e: {}".format(container.get('Id'), e))

    logger.debug("right after getting container_info: {}".format(timeit.default_timer()))
    # get logs from container
    logs = cli.logs(container.get('Id'))
    logger.debug("right after getting container logs: {}".format(timeit.default_timer()))

    # get any additional results from the execution:
    while True:
        datagram = None
        try:
            datagram = server.recv(MAX_RESULT_FRAME_SIZE)
        except socket.timeout:
            break
        except Exception as e:
            logger.error("Got exception from server.recv: {}".format(e))
        if datagram:
            try:
                results_ch.put(datagram)
            except Exception as e:
                logger.error("Error trying to put datagram on results channel. Exception: {}".format(e))
    logger.debug("right after getting last execution results from datagram socket: {}".format(timeit.default_timer()))
    if socket_host_path:
        server.close()
        os.remove(socket_host_path)
    logger.debug("right after removing socket: {}".format(timeit.default_timer()))

    # remove container, ignore errors
    if not leave_container:
        try:
            cli.remove_container(container=container)
            logger.info("Container removed.")
        except Exception as e:
            logger.error("Exception trying to remove actor: {}".format(e))
    else:
        logger.debug("leaving actor container since leave_container was True.")
    logger.debug("right after removing actor container: {}".format(timeit.default_timer()))

    if fifo_host_path:
        os.close(fifo)
        os.remove(fifo_host_path)
    result['runtime'] = int(stop - start)
    logger.debug("right after removing fifo; about to return: {}".format(timeit.default_timer()))
    return result, logs, container_state, exit_code, start_time
Exemplo n.º 5
0
    def put(self, actor_id):
        logger.debug("top of PUT /actors/{}".format(actor_id))
        dbid = Actor.get_dbid(g.tenant, actor_id)
        try:
            actor = Actor.from_db(actors_store[dbid])
        except KeyError:
            logger.debug("did not find actor {} in store.".format(dbid))
            raise ResourceError(
                "No actor found with id: {}.".format(actor_id), 404)
        previous_image = actor.image
        previous_status = actor.status
        previous_owner = actor.owner
        args = self.validate_put(actor)
        logger.debug("PUT args validated successfully.")
        args['tenant'] = g.tenant
        # user can force an update by setting the force param:
        update_image = args.get('force')
        if not update_image and args['image'] == previous_image:
            logger.debug("new image is the same and force was false. not updating actor.")
            logger.debug("Setting status to the actor's previous status which is: {}".format(previous_status))
            args['status'] = previous_status
        else:
            update_image = True
            args['status'] = SUBMITTED
            logger.debug("new image is different. updating actor.")
        args['api_server'] = g.api_server

        # we do not allow a PUT to override the owner in case the PUT is issued by another user
        args['owner'] = previous_owner
        use_container_uid = args.get('use_container_uid')
        if Config.get('web', 'case') == 'camel':
            use_container_uid = args.get('useContainerUid')
        try:
            use_tas = Config.get('workers', 'use_tas_uid')
        except configparser.NoOptionError:
            logger.debug("no use_tas_uid config.")
            use_tas = False
        if hasattr(use_tas, 'lower'):
            use_tas = use_tas.lower() == 'true'
        else:
            logger.error("use_tas_uid configured but not as a string. use_tas_uid: {}".format(use_tas))
        logger.debug("use_tas={}. user_container_uid={}".format(use_tas, use_container_uid))
        if use_tas and not use_container_uid:
            uid, gid, tasdir = get_tas_data(g.user, g.tenant)
            if uid and gid:
                args['uid'] = uid
                args['gid'] = gid
            if tasdir:
                args['tasdir'] = tasdir
        args['mounts'] = get_all_mounts(args)
        args['last_update_time'] = get_current_utc_time()
        logger.debug("update args: {}".format(args))
        actor = Actor(**args)
        actors_store[actor.db_id] = actor.to_db()
        logger.info("updated actor {} stored in db.".format(actor_id))
        if update_image:
            worker_ids = [Worker.request_worker(tenant=g.tenant, actor_id=actor.db_id)]
            ch = CommandChannel()
            ch.put_cmd(actor_id=actor.db_id, worker_ids=worker_ids, image=actor.image, tenant=args['tenant'])
            ch.close()
            logger.debug("put new command on command channel to update actor.")
        # put could have been issued by a user with
        if not previous_owner == g.user:
            set_permission(g.user, actor.db_id, UPDATE)
        return ok(result=actor.display(),
                  msg="Actor updated successfully.")