def execute_actor(actor_id, worker_id, execution_id, image, msg, user=None, d={}, privileged=False, mounts=[], leave_container=False, fifo_host_path=None, socket_host_path=None, mem_limit=None, max_cpus=None): """ Creates and runs an actor container and supervises the execution, collecting statistics about resource consumption from the Docker daemon. :param actor_id: the dbid of the actor; for updating worker status :param worker_id: the worker id; also for updating worker status :param execution_id: the id of the execution. :param image: the actor's image; worker must have already downloaded this image to the local docker registry. :param msg: the message being passed to the actor. :param user: string in the form {uid}:{gid} representing the uid and gid to run the command as. :param d: dictionary representing the environment to instantiate within the actor container. :param privileged: whether this actor is "privileged"; i.e., its container should run in privileged mode with the docker daemon mounted. :param mounts: list of dictionaries representing the mounts to add; each dictionary mount should have 3 keys: host_path, container_path and format (which should have value 'ro' or 'rw'). :param fifo_host_path: If not None, a string representing a path on the host to a FIFO used for passing binary data to the actor. :param socket_host_path: If not None, a string representing a path on the host to a socket used for collecting results from the actor. :param mem_limit: The maximum amount of memory the Actor container can use; should be the same format as the --memory Docker flag. :param max_cpus: The maximum number of CPUs each actor will have available to them. Does not guarantee these CPU resources; serves as upper bound. :return: result (dict), logs (str) - `result`: statistics about resource consumption; `logs`: output from docker logs. """ logger.debug("top of execute_actor(); (worker {};{})".format( worker_id, execution_id)) # initially set the global force_quit variable to False globals.force_quit = False # initial stats object, environment, binds and volumes result = {'cpu': 0, 'io': 0, 'runtime': 0} # instantiate docker client cli = docker.APIClient(base_url=dd, version="auto") # don't try to pass binary messages through the environment as these can cause # broken pipe errors. the binary data will be passed through the FIFO momentarily. if not fifo_host_path: d['MSG'] = msg binds = {} volumes = [] # if container is privileged, mount the docker daemon so that additional # containers can be started. logger.debug("privileged: {};(worker {};{})".format( privileged, worker_id, execution_id)) if privileged: binds = { '/var/run/docker.sock': { 'bind': '/var/run/docker.sock', 'ro': False } } volumes = ['/var/run/docker.sock'] # add a bind key and dictionary as well as a volume for each mount for m in mounts: binds[m.get('host_path')] = { 'bind': m.get('container_path'), 'ro': m.get('format') == 'ro' } volumes.append(m.get('host_path')) # mem_limit # -1 => unlimited memory if mem_limit == '-1': mem_limit = None # max_cpus try: max_cpus = int(max_cpus) except: max_cpus = None # -1 => unlimited cpus if max_cpus == -1: max_cpus = None host_config = cli.create_host_config(binds=binds, privileged=privileged, mem_limit=mem_limit, nano_cpus=max_cpus) logger.debug("host_config object created by (worker {};{}).".format( worker_id, execution_id)) # write binary data to FIFO if it exists: if fifo_host_path: try: fifo = os.open(fifo_host_path, os.O_RDWR) os.write(fifo, msg) except Exception as e: logger.error( "Error writing the FIFO. Exception: {};(worker {};{})".format( e, worker_id, execution_id)) os.remove(fifo_host_path) raise DockerStartContainerError("Error writing to fifo: {}; " "(worker {};{})".format( e, worker_id, execution_id)) # set up results socket ----------------------- # make sure socket doesn't already exist: try: os.unlink(socket_host_path) except OSError as e: if os.path.exists(socket_host_path): logger.error( "socket at {} already exists; Exception: {}; (worker {};{})". format(socket_host_path, e, worker_id, execution_id)) raise DockerStartContainerError( "Got an OSError trying to create the results docket; " "exception: {}".format(e)) # use retry logic since, when the compute node is under load, we see errors initially trying to create the socket # server object. keep_trying = True count = 0 server = None while keep_trying and count < 10: keep_trying = False count = count + 1 try: server = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM) except Exception as e: keep_trying = True logger.info("Could not instantiate socket at {}. " "Count: {}; Will keep trying. " "Exception: {}; type: {}; (worker {};{})".format( socket_host_path, count, e, type(e), worker_id, execution_id)) try: server.bind(socket_host_path) except Exception as e: keep_trying = True logger.info("Could not bind socket at {}. " "Count: {}; Will keep trying. " "Exception: {}; type: {}; (worker {};{})".format( socket_host_path, count, e, type(e), worker_id, execution_id)) try: os.chmod(socket_host_path, 0o777) logger.debug( "results socket permissions set to 777. socket_host_path: {}". format(socket_host_path)) except Exception as e: msg = f"Got exception trying to set permissions on the results socket. Not sure what to do. e: {e}" logger.error(msg) # for now, we'll just swallow it but this is really a TODO. try: server.settimeout(RESULTS_SOCKET_TIMEOUT) except Exception as e: keep_trying = True logger.info("Could not set timeout for socket at {}. " "Count: {}; Will keep trying. " "Exception: {}; type: {}; (worker {};{})".format( socket_host_path, count, e, type(e), worker_id, execution_id)) if not server: msg = "Failed to instantiate results socket. " \ "Abaco compute host could be overloaded. Exception: {}; (worker {};{})".format(e, worker_id, execution_id) logger.error(msg) raise DockerStartContainerError(msg) logger.debug( "results socket server instantiated. path: {} (worker {};{})".format( socket_host_path, worker_id, execution_id)) # instantiate the results channel: results_ch = ExecutionResultsChannel(actor_id, execution_id) # create and start the container logger.debug("Final container environment: {};(worker {};{})".format( d, worker_id, execution_id)) logger.debug( "Final binds: {} and host_config: {} for the container.(worker {};{})". format(binds, host_config, worker_id, execution_id)) container = cli.create_container(image=image, environment=d, user=user, volumes=volumes, host_config=host_config) # get the UTC time stamp start_time = get_current_utc_time() # start the timer to track total execution time. start = timeit.default_timer() logger.debug("right before cli.start: {}; container id: {}; " "(worker {};{})".format(start, container.get('Id'), worker_id, execution_id)) try: cli.start(container=container.get('Id')) except Exception as e: # if there was an error starting the container, user will need to debug logger.info( "Got exception starting actor container: {}; (worker {};{})". format(e, worker_id, execution_id)) raise DockerStartContainerError( "Could not start container {}. Exception {}".format( container.get('Id'), str(e))) # local bool tracking whether the actor container is still running running = True Execution.update_status(actor_id, execution_id, RUNNING) logger.debug("right before creating stats_cli: {}; (worker {};{})".format( timeit.default_timer(), worker_id, execution_id)) # create a separate cli for checking stats objects since these should be fast and we don't want to wait stats_cli = docker.APIClient(base_url=dd, timeout=1, version="auto") logger.debug("right after creating stats_cli: {}; (worker {};{})".format( timeit.default_timer(), worker_id, execution_id)) # under load, we can see UnixHTTPConnectionPool ReadTimeout's trying to create the stats_obj # so here we are trying up to 3 times to create the stats object for a possible total of 3s # timeouts ct = 0 stats_obj = None logs = None while ct < 3: try: stats_obj = stats_cli.stats(container=container.get('Id'), decode=True) break except ReadTimeout: ct += 1 except Exception as e: logger.error( "Unexpected exception creating stats_obj. Exception: {}; (worker {};{})" .format(e, worker_id, execution_id)) # in this case, we need to kill the container since we cannot collect stats; # UPDATE - 07-2018: under load, a errors can occur attempting to create the stats object. # the container could still be running; we need to explicitly check the container status # to be sure. logger.debug( "right after attempting to create stats_obj: {}; (worker {};{})". format(timeit.default_timer(), worker_id, execution_id)) # a counter of the number of iterations through the main "running" loop; # this counter is used to determine when less frequent actions, such as log aggregation, need to run. loop_idx = 0 while running and not globals.force_quit: loop_idx += 1 logger.debug( "top of while running loop; loop_idx: {}".format(loop_idx)) datagram = None stats = None try: datagram = server.recv(MAX_RESULT_FRAME_SIZE) except socket.timeout: pass except Exception as e: logger.error( "got exception from server.recv: {}; (worker {};{})".format( e, worker_id, execution_id)) logger.debug( "right after try/except datagram block: {}; (worker {};{})".format( timeit.default_timer(), worker_id, execution_id)) if datagram: try: results_ch.put(datagram) except Exception as e: logger.error( "Error trying to put datagram on results channel. " "Exception: {}; (worker {};{})".format( e, worker_id, execution_id)) logger.debug("right after results ch.put: {}; (worker {};{})".format( timeit.default_timer(), worker_id, execution_id)) # only try to collect stats if we have a stats_obj: if stats_obj: logger.debug( "we have a stats_obj; trying to collect stats. (worker {};{})". format(worker_id, execution_id)) try: logger.debug( "waiting on a stats obj: {}; (worker {};{})".format( timeit.default_timer(), worker_id, execution_id)) stats = next(stats_obj) logger.debug("got the stats obj: {}; (worker {};{})".format( timeit.default_timer(), worker_id, execution_id)) except StopIteration: # we have read the last stats object - no need for processing logger.debug( "Got StopIteration; no stats object. (worker {};{})". format(worker_id, execution_id)) except ReadTimeoutError: # this is a ReadTimeoutError from docker, not requests. container is finished. logger.info( "next(stats) just timed out: {}; (worker {};{})".format( timeit.default_timer(), worker_id, execution_id)) # UPDATE - 07-2018: under load, a ReadTimeoutError from the attempt to get a stats object # does NOT imply the container has stopped; we need to explicitly check the container status # to be sure. # if we got a stats object, add it to the results; it is possible stats collection timed out and the object # is None if stats: logger.debug("adding stats to results; (worker {};{})".format( worker_id, execution_id)) try: result['cpu'] += stats['cpu_stats']['cpu_usage']['total_usage'] except KeyError as e: logger.info( "Got a KeyError trying to fetch the cpu object: {}; " "(worker {};{})".format(e, worker_id, execution_id)) try: result['io'] += stats['networks']['eth0']['rx_bytes'] except KeyError as e: logger.info( "Got KeyError exception trying to grab the io object. " "running: {}; Exception: {}; (worker {};{})".format( running, e, worker_id, execution_id)) # grab the logs every 5th iteration -- if loop_idx % 5 == 0: logs = cli.logs(container.get('Id')) Execution.set_logs(execution_id, logs) logs = None # checking the container status to see if it is still running ---- if running: logger.debug( "about to check container status: {}; (worker {};{})".format( timeit.default_timer(), worker_id, execution_id)) # we need to wait for the container id to be available i = 0 while i < 10: try: c = cli.containers(all=True, filters={'id': container.get('Id')})[0] break except IndexError: logger.error( "Got an IndexError trying to get the container object. " "(worker {};{})".format(worker_id, execution_id)) time.sleep(0.1) i += 1 logger.debug( "done checking status: {}; i: {}; (worker {};{})".format( timeit.default_timer(), i, worker_id, execution_id)) # if we were never able to get the container object, we need to stop processing and kill this # worker; the docker daemon could be under heavy load, but we need to not launch another # actor container with this worker, because the existing container may still be running, if i == 10 or not c: # we'll try to stop the container logger.error( "Never could retrieve the container object! Attempting to stop container; " "container id: {}; (worker {};{})".format( container.get('Id'), worker_id, execution_id)) # stop_container could raise an exception - if so, we let it pass up and have the worker # shut itself down. stop_container(cli, container.get('Id')) logger.info("container {} stopped. (worker {};{})".format( container.get('Id'), worker_id, execution_id)) # if we were able to stop the container, we can set running to False and keep the # worker running running = False continue state = c.get('State') if not state == 'running': logger.debug( "container finished, final state: {}; (worker {};{})". format(state, worker_id, execution_id)) running = False continue else: # container still running; check if a force_quit has been sent OR # we are beyond the max_run_time runtime = timeit.default_timer() - start if globals.force_quit or (max_run_time > 0 and max_run_time < runtime): logs = cli.logs(container.get('Id')) if globals.force_quit: logger.info( "issuing force quit: {}; (worker {};{})".format( timeit.default_timer(), worker_id, execution_id)) else: logger.info( "hit runtime limit: {}; (worker {};{})".format( timeit.default_timer(), worker_id, execution_id)) cli.stop(container.get('Id')) running = False logger.debug( "right after checking container state: {}; (worker {};{})". format(timeit.default_timer(), worker_id, execution_id)) logger.info("container stopped:{}; (worker {};{})".format( timeit.default_timer(), worker_id, execution_id)) stop = timeit.default_timer() globals.force_quit = False # get info from container execution, including exit code; Exceptions from any of these commands # should not cause the worker to shutdown or prevent starting subsequent actor containers. try: container_info = cli.inspect_container(container.get('Id')) try: container_state = container_info['State'] try: exit_code = container_state['ExitCode'] except KeyError as e: logger.error("Could not determine ExitCode for container {}. " "Exception: {}; (worker {};{})".format( container.get('Id'), e, worker_id, execution_id)) exit_code = 'undetermined' except KeyError as e: logger.error( "Could not determine final state for container {}. " "Exception: {}; (worker {};{})".format(container.get('Id')), e, worker_id, execution_id) container_state = {'unavailable': True} except docker.errors.APIError as e: logger.error("Could not inspect container {}. " "Exception: {}; (worker {};{})".format( container.get('Id'), e, worker_id, execution_id)) logger.debug( "right after getting container_info: {}; (worker {};{})".format( timeit.default_timer(), worker_id, execution_id)) # get logs from container if not logs: logs = cli.logs(container.get('Id')) if not logs: # there are issues where container do not have logs associated with them when they should. logger.info("Container id {} had NO logs associated with it. " "(worker {};{})".format(container.get('Id'), worker_id, execution_id)) logger.debug( "right after getting container logs: {}; (worker {};{})".format( timeit.default_timer(), worker_id, execution_id)) # get any additional results from the execution: while True: datagram = None try: datagram = server.recv(MAX_RESULT_FRAME_SIZE) except socket.timeout: break except Exception as e: logger.error( "Got exception from server.recv: {}; (worker {};{})".format( e, worker_id, execution_id)) if datagram: try: results_ch.put(datagram) except Exception as e: logger.error( "Error trying to put datagram on results channel. " "Exception: {}; (worker {};{})".format( e, worker_id, execution_id)) logger.debug( "right after getting last execution results from datagram socket: {}; " "(worker {};{})".format(timeit.default_timer(), worker_id, execution_id)) if socket_host_path: server.close() os.remove(socket_host_path) logger.debug("right after removing socket: {}; (worker {};{})".format( timeit.default_timer(), worker_id, execution_id)) # remove actor container with retrying logic -- check for specific filesystem errors from the docker daemon: if not leave_container: keep_trying = True count = 0 while keep_trying and count < 10: keep_trying = False count = count + 1 try: cli.remove_container(container=container) logger.info("Actor container removed. (worker {};{})".format( worker_id, execution_id)) except Exception as e: # if the container is already gone we definitely want to quit: if 'No such container' in str(e): logger.info("Got 'no such container' exception - quiting. " "Exception: {}; (worker {};{})".format( e, worker_id, execution_id)) break # if we get a resource busy/internal server error from docker, we need to keep trying to remove the # container. elif 'device or resource busy' in str( e) or 'failed to remove root filesystem' in str(e): logger.error( "Got resource busy/failed to remove filesystem exception trying to remove " "actor container; will keep trying." "Count: {}; Exception: {}; (worker {};{})".format( count, e, worker_id, execution_id)) time.sleep(1) keep_trying = True else: logger.error( "Unexpected exception trying to remove actor container. Giving up." "Exception: {}; type: {}; (worker {};{})".format( e, type(e), worker_id, execution_id)) else: logger.debug("leaving actor container since leave_container was True. " "(worker {};{})".format(worker_id, execution_id)) logger.debug( "right after removing actor container: {}; (worker {};{})".format( timeit.default_timer(), worker_id, execution_id)) if fifo_host_path: os.close(fifo) os.remove(fifo_host_path) if results_ch: results_ch.close() result['runtime'] = int(stop - start) logger.debug( "right after removing fifo; about to return: {}; (worker {};{})". format(timeit.default_timer(), worker_id, execution_id)) return result, logs, container_state, exit_code, start_time
def run_worker(image, actor_id, worker_id, client_id, client_access_token, client_refresh_token, tenant, api_server, client_secret): """ Run an actor executor worker with a given channel and image. :return: """ logger.debug("top of run_worker()") command = 'python3 -u /actors/worker.py' logger.debug("docker_utils running worker. image:{}, command:{}".format( image, command)) # mount the directory on the host for creating fifos try: fifo_host_path_dir = Config.get('workers', 'fifo_host_path_dir') logger.info("Using fifo_host_path_dir: {}".format(fifo_host_path_dir)) except (configparser.NoSectionError, configparser.NoOptionError) as e: logger.error( "Got exception trying to look up fifo_host_path_dir. Setting to None. Exception: {}" .format(e)) fifo_host_path_dir = None if fifo_host_path_dir: mounts = [{ 'host_path': os.path.join(fifo_host_path_dir, worker_id), 'container_path': os.path.join(fifo_host_path_dir, worker_id), 'format': 'rw' }] else: mounts = [] # mount the directory on the host for creating result sockets try: socket_host_path_dir = Config.get('workers', 'socket_host_path_dir') logger.info( "Using socket_host_path_dir: {}".format(socket_host_path_dir)) except (configparser.NoSectionError, configparser.NoOptionError) as e: logger.error( "Got exception trying to look up fifo_host_path_dir. Setting to None. Exception: {}" .format(e)) socket_host_path_dir = None if socket_host_path_dir: mounts.append({ 'host_path': os.path.join(socket_host_path_dir, worker_id), 'container_path': os.path.join(socket_host_path_dir, worker_id), 'format': 'rw' }) logger.info( "Final fifo_host_path_dir: {}; socket_host_path_dir: {}".format( fifo_host_path_dir, socket_host_path_dir)) try: auto_remove = Config.get('workers', 'auto_remove') except (configparser.NoSectionError, configparser.NoOptionError) as e: logger.debug("no auto_remove in the workers stanza.") auto_remove = True if hasattr(auto_remove, 'lower'): if auto_remove.lower() == 'false': auto_remove = False else: auto_remove = True elif not auto_remove == True: auto_remove = False container = run_container_with_docker( image=AE_IMAGE, command=command, environment={ 'image': image, 'worker_id': worker_id, '_abaco_secret': os.environ.get('_abaco_secret') }, mounts=mounts, log_file=None, auto_remove=auto_remove, name='worker_{}_{}'.format(actor_id, worker_id), client_id=client_id, client_access_token=client_access_token, client_refresh_token=client_refresh_token, actor_id=actor_id, tenant=tenant, api_server=api_server, client_secret=client_secret) # don't catch errors -- if we get an error trying to run a worker, let it bubble up. # TODO - determines worker structure; should be placed in a proper DAO class. logger.info( "worker container running. worker_id: {}. container: {}".format( worker_id, container)) return { 'image': image, # @todo - location will need to change to support swarm or cluster 'location': dd, 'id': worker_id, 'cid': container.get('Id'), 'status': READY, 'host_id': host_id, 'host_ip': host_ip, 'last_execution_time': 0, 'last_health_check_time': get_current_utc_time() }
def check_workers(actor_id, ttl): """Check health of all workers for an actor.""" logger.info("Checking health for actor: {}".format(actor_id)) try: workers = Worker.get_workers(actor_id) except Exception as e: logger.error("Got exception trying to retrieve workers: {}".format(e)) return None logger.debug("workers: {}".format(workers)) host_id = os.environ.get('SPAWNER_HOST_ID', Config.get('spawner', 'host_id')) logger.debug("host_id: {}".format(host_id)) for worker in workers: worker_id = worker['id'] worker_status = worker.get('status') # if the worker has only been requested, it will not have a host_id. it is possible # the worker will ultimately get scheduled on a different host; however, if there is # some issue and the worker is "stuck" in the early phases, we should remove it.. if 'host_id' not in worker: # check for an old create time worker_create_t = worker.get('create_time') # in versions prior to 1.9, worker create_time was not set until after it was READY if not worker_create_t: hard_delete_worker( actor_id, worker_id, reason_str= 'Worker did not have a host_id or create_time field.') # if still no host after 5 minutes, delete it if worker_create_t < get_current_utc_time() - datetime.timedelta( minutes=5): hard_delete_worker( actor_id, worker_id, reason_str='Worker did not have a host_id and had ' 'old create_time field.') # ignore workers on different hosts because this health agent cannot interact with the # docker daemon responsible for the worker container.. if not host_id == worker['host_id']: continue # we need to delete any worker that is in SHUTDOWN REQUESTED or SHUTTING down for too long if worker_status == codes.SHUTDOWN_REQUESTED or worker_status == codes.SHUTTING_DOWN: worker_last_health_check_time = worker.get( 'last_health_check_time') if not worker_last_health_check_time: worker_last_health_check_time = worker.get('create_time') if not worker_last_health_check_time: hard_delete_worker( actor_id, worker_id, reason_str='Worker in SHUTDOWN and no health checks.') elif worker_last_health_check_time < get_current_utc_time( ) - datetime.timedelta(minutes=5): hard_delete_worker( actor_id, worker_id, reason_str='Worker in SHUTDOWN for too long.') # check if the worker has not responded to a health check recently; we use a relatively long period # (60 minutes) of idle health checks in case there is an issue with sending health checks through rabbitmq. # this needs to be watched closely though... worker_last_health_check_time = worker.get('last_health_check_time') if not worker_last_health_check_time or \ (worker_last_health_check_time < get_current_utc_time() - datetime.timedelta(minutes=60)): hard_delete_worker( actor_id, worker_id, reason_str='Worker has not health checked for too long.') # first send worker a health check logger.info(f"sending worker {worker_id} a health check") ch = WorkerChannel(worker_id=worker_id) try: logger.debug("Issuing status check to channel: {}".format( worker['ch_name'])) ch.put('status') except (channelpy.exceptions.ChannelTimeoutException, Exception) as e: logger.error( f"Got exception of type {type(e)} trying to send worker {worker_id} a " f"health check. e: {e}") finally: try: ch.close() except Exception as e: logger.error( "Got an error trying to close the worker channel for dead worker. Exception: {}" .format(e)) # now check if the worker has been idle beyond the max worker_ttl configured for this abaco: if ttl < 0: # ttl < 0 means infinite life logger.info("Infinite ttl configured; leaving worker") continue # we don't shut down workers that are currently running: if not worker['status'] == codes.BUSY: last_execution = worker.get('last_execution_time', 0) # if worker has made zero executions, use the create_time if last_execution == 0: last_execution = worker.get('create_time', datetime.datetime.min) logger.debug("using last_execution: {}".format(last_execution)) try: assert type(last_execution) == datetime.datetime except: logger.error( "Time received for TTL measurements is not of type datetime." ) last_execution = datetime.datetime.min if last_execution + datetime.timedelta( seconds=ttl) < datetime.datetime.utcnow(): # shutdown worker logger.info("Shutting down worker beyond ttl.") shutdown_worker(actor_id, worker['id']) else: logger.info("Still time left for this worker.") if worker['status'] == codes.ERROR: # shutdown worker logger.info("Shutting down worker in error status.") shutdown_worker(actor_id, worker['id'])
def execute_actor(actor_id, worker_id, execution_id, image, msg, user=None, d={}, privileged=False, mounts=[], leave_container=False, fifo_host_path=None, socket_host_path=None): """ Creates and runs an actor container and supervises the execution, collecting statistics about resource consumption from the Docker daemon. :param actor_id: the dbid of the actor; for updating worker status :param worker_id: the worker id; also for updating worker status :param execution_id: the id of the execution. :param image: the actor's image; worker must have already downloaded this image to the local docker registry. :param msg: the message being passed to the actor. :param user: string in the form {uid}:{gid} representing the uid and gid to run the command as. :param d: dictionary representing the environment to instantiate within the actor container. :param privileged: whether this actor is "privileged"; i.e., its container should run in privileged mode with the docker daemon mounted. :param mounts: list of dictionaries representing the mounts to add; each dictionary mount should have 3 keys: host_path, container_path and format (which should have value 'ro' or 'rw'). :param fifo_host_path: If not None, a string representing a path on the host to a FIFO used for passing binary data to the actor. :param socket_host_path: If not None, a string representing a path on the host to a socket used for collecting results from the actor. :return: result (dict), logs (str) - `result`: statistics about resource consumption; `logs`: output from docker logs. """ logger.debug("top of execute_actor()") # initial stats object, environment, binds and volumes result = {'cpu': 0, 'io': 0, 'runtime': 0 } # instantiate docker client cli = docker.APIClient(base_url=dd, version="auto") # don't try to pass binary messages through the environment as these can cause # broken pipe errors. the binary data will be passed through the FIFO momentarily. if not fifo_host_path: d['MSG'] = msg binds = {} volumes = [] # if container is privileged, mount the docker daemon so that additional # containers can be started. logger.debug("privileged: {}".format(privileged)) if privileged: binds = {'/var/run/docker.sock':{ 'bind': '/var/run/docker.sock', 'ro': False }} volumes = ['/var/run/docker.sock'] # add a bind key and dictionary as well as a volume for each mount for m in mounts: binds[m.get('host_path')] = {'bind': m.get('container_path'), 'ro': m.get('format') == 'ro'} volumes.append(m.get('host_path')) host_config = cli.create_host_config(binds=binds, privileged=privileged) # write binary data to FIFO if it exists: if fifo_host_path: try: fifo = os.open(fifo_host_path, os.O_RDWR) os.write(fifo, msg) except Exception as e: logger.error("Error writing the FIFO. Exception: {}".format(e)) os.remove(fifo_host_path) raise DockerStartContainerError("Error writing to fifo: {}".format(e)) # set up results socket try: server = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM) server.bind(socket_host_path) server.settimeout(RESULTS_SOCKET_TIMEOUT) except Exception as e: logger.error("could not instantiate or bind socket. Exception: {}".format(e)) raise e # instantiate the results channel: results_ch = ExecutionResultsChannel(actor_id, execution_id) # create and start the container logger.debug("Final container environment: {}".format(d)) logger.debug("Final binds: {} and host_config: {} for the container.".format(binds, host_config)) container = cli.create_container(image=image, environment=d, user=user, volumes=volumes, host_config=host_config) # get the URC time stampe start_time = get_current_utc_time() # start the timer to track total execution time. start = timeit.default_timer() logger.debug("right before cli.start: {}".format(start)) try: cli.start(container=container.get('Id')) except Exception as e: # if there was an error starting the container, user will need to debug logger.info("Got exception starting actor container: {}".format(e)) raise DockerStartContainerError("Could not start container {}. Exception {}".format(container.get('Id'), str(e))) # local bool tracking whether the actor container is still running running = True logger.debug("right before creating stats_cli: {}".format(timeit.default_timer())) # create a separate cli for checking stats objects since these should be fast and we don't want to wait stats_cli = docker.APIClient(base_url=dd, timeout=1, version="auto") logger.debug("right after creating stats_cli: {}".format(timeit.default_timer())) # under load, we can see UnixHTTPConnectionPool ReadTimeout's trying to create the stats_obj # so here we are trying up to 3 times to create the stats object for a possible total of 3s # timeouts ct = 0 while ct < 3: try: stats_obj = stats_cli.stats(container=container.get('Id'), decode=True) break except ReadTimeout: ct += 1 except Exception as e: logger.error("Unexpected exception creating stats_obj. Exception: {}".format(e)) # in this case, we need to kill the container since we cannot collect stats; running = False logger.debug("right after creating stats_obj: {}".format(timeit.default_timer())) while running: datagram = None try: datagram = server.recv(MAX_RESULT_FRAME_SIZE) except socket.timeout: pass except Exception as e: logger.error("got exception from server.recv: {}".format(e)) logger.debug("right after try/except datagram block: {}".format(timeit.default_timer())) if datagram: try: results_ch.put(datagram) except Exception as e: logger.error("Error trying to put datagram on results channel. Exception: {}".format(e)) logger.debug("right after results ch.put: {}".format(timeit.default_timer())) try: logger.debug("waiting on a stats obj: {}".format(timeit.default_timer())) stats = next(stats_obj) logger.debug("got the stats obj: {}".format(timeit.default_timer())) except ReadTimeoutError: # this is a ReadTimeoutError from docker, not requests. container is finished. logger.debug("next(stats) just timed out: {}".format(timeit.default_timer())) # container stopped before another stats record could be read, just ignore and move on running = False break try: result['cpu'] += stats['cpu_stats']['cpu_usage']['total_usage'] except KeyError as e: logger.info("Got a KeyError trying to fetch the cpu object: {}".format(e)) try: result['io'] += stats['networks']['eth0']['rx_bytes'] except KeyError as e: logger.info("Got KeyError exception trying to grab the io object. running: {}; Exception: {}".format(running, e)) if running: logger.debug("about to check container status: {}".format(timeit.default_timer())) # we need to wait for the container id to be available i = 0 while i < 10: try: c = cli.containers(all=True, filters={'id': container.get('Id')})[0] break except IndexError: logger.error("Got an IndexError trying to get the container object.") time.sleep(0.1) i += 1 logger.debug("done checking status: {}; i: {}".format(timeit.default_timer(), i)) if i == 10: logger.error("Never could retrieve the container object! container id: {}".format(container.get('Id'))) try: cli.stop(container.get('Id')) except Exception as e: logger.error("Got another exception trying to stop the actor container. Exception: {}".format(e)) finally: running = False continue state = c.get('State') if not state == 'running': logger.debug("container finished, final state: {}".format(state)) running = False else: # container still running; check if we are beyond the max_run_time runtime = timeit.default_timer() - start if max_run_time > 0 and max_run_time < runtime: logger.info("hit runtime limit: {}".format(timeit.default_timer())) cli.stop(container.get('Id')) running = False logger.debug("right after checking container state: {}".format(timeit.default_timer())) logger.info("container stopped:{}".format(timeit.default_timer())) stop = timeit.default_timer() # get info from container execution, including exit code try: container_info = cli.inspect_container(container.get('Id')) try: container_state = container_info['State'] try: exit_code = container_state['ExitCode'] except KeyError as e: logger.error("Could not determine ExitCode for container {}. e: {}".format(container.get('Id'), e)) exit_code = 'undetermined' except KeyError as e: logger.error("Could not determine final state for container {}. e: {} ".format(container.get('Id')), e) container_state = {'unavailable': True} except docker.errors.APIError as e: logger.error("Could not inspect container {}. e: {}".format(container.get('Id'), e)) logger.debug("right after getting container_info: {}".format(timeit.default_timer())) # get logs from container logs = cli.logs(container.get('Id')) logger.debug("right after getting container logs: {}".format(timeit.default_timer())) # get any additional results from the execution: while True: datagram = None try: datagram = server.recv(MAX_RESULT_FRAME_SIZE) except socket.timeout: break except Exception as e: logger.error("Got exception from server.recv: {}".format(e)) if datagram: try: results_ch.put(datagram) except Exception as e: logger.error("Error trying to put datagram on results channel. Exception: {}".format(e)) logger.debug("right after getting last execution results from datagram socket: {}".format(timeit.default_timer())) if socket_host_path: server.close() os.remove(socket_host_path) logger.debug("right after removing socket: {}".format(timeit.default_timer())) # remove container, ignore errors if not leave_container: try: cli.remove_container(container=container) logger.info("Container removed.") except Exception as e: logger.error("Exception trying to remove actor: {}".format(e)) else: logger.debug("leaving actor container since leave_container was True.") logger.debug("right after removing actor container: {}".format(timeit.default_timer())) if fifo_host_path: os.close(fifo) os.remove(fifo_host_path) result['runtime'] = int(stop - start) logger.debug("right after removing fifo; about to return: {}".format(timeit.default_timer())) return result, logs, container_state, exit_code, start_time
def put(self, actor_id): logger.debug("top of PUT /actors/{}".format(actor_id)) dbid = Actor.get_dbid(g.tenant, actor_id) try: actor = Actor.from_db(actors_store[dbid]) except KeyError: logger.debug("did not find actor {} in store.".format(dbid)) raise ResourceError( "No actor found with id: {}.".format(actor_id), 404) previous_image = actor.image previous_status = actor.status previous_owner = actor.owner args = self.validate_put(actor) logger.debug("PUT args validated successfully.") args['tenant'] = g.tenant # user can force an update by setting the force param: update_image = args.get('force') if not update_image and args['image'] == previous_image: logger.debug("new image is the same and force was false. not updating actor.") logger.debug("Setting status to the actor's previous status which is: {}".format(previous_status)) args['status'] = previous_status else: update_image = True args['status'] = SUBMITTED logger.debug("new image is different. updating actor.") args['api_server'] = g.api_server # we do not allow a PUT to override the owner in case the PUT is issued by another user args['owner'] = previous_owner use_container_uid = args.get('use_container_uid') if Config.get('web', 'case') == 'camel': use_container_uid = args.get('useContainerUid') try: use_tas = Config.get('workers', 'use_tas_uid') except configparser.NoOptionError: logger.debug("no use_tas_uid config.") use_tas = False if hasattr(use_tas, 'lower'): use_tas = use_tas.lower() == 'true' else: logger.error("use_tas_uid configured but not as a string. use_tas_uid: {}".format(use_tas)) logger.debug("use_tas={}. user_container_uid={}".format(use_tas, use_container_uid)) if use_tas and not use_container_uid: uid, gid, tasdir = get_tas_data(g.user, g.tenant) if uid and gid: args['uid'] = uid args['gid'] = gid if tasdir: args['tasdir'] = tasdir args['mounts'] = get_all_mounts(args) args['last_update_time'] = get_current_utc_time() logger.debug("update args: {}".format(args)) actor = Actor(**args) actors_store[actor.db_id] = actor.to_db() logger.info("updated actor {} stored in db.".format(actor_id)) if update_image: worker_ids = [Worker.request_worker(tenant=g.tenant, actor_id=actor.db_id)] ch = CommandChannel() ch.put_cmd(actor_id=actor.db_id, worker_ids=worker_ids, image=actor.image, tenant=args['tenant']) ch.close() logger.debug("put new command on command channel to update actor.") # put could have been issued by a user with if not previous_owner == g.user: set_permission(g.user, actor.db_id, UPDATE) return ok(result=actor.display(), msg="Actor updated successfully.")