def ip_address(self):
    """Return the service's current address on the configured overlay network.

    Nothing is cached: Swarm is queried on every call, because the address
    can change outside of our control.

    :return: the ``ip_address`` entry of the inspected container for the
        configured overlay network name, or an empty dict when the service's
        Docker status is not ``DOCKER_START_STATUS``
    """
    if self.docker_status != self.DOCKER_START_STATUS:
        # Only the start status leads to a Swarm query.
        return {}

    container_info = SwarmClient(get_conf()).inspect_container(self.docker_id)
    addresses_by_network = container_info['ip_address']
    return addresses_by_network[get_conf().overlay_network_name]
def guest_check_thread(args):
    """Poll Swarm forever for exited Zoe containers, then check guests.

    On every pass the containers labelled with this deployment's name are
    listed.  Each one whose status contains ``'Exited'`` is reported through
    ``container_died``; if that call raises ``ZoeAPIException`` the container
    is logged as unknown to Zoe and deleted from Swarm.  ``check_guests`` is
    run afterwards.

    Any other exception raised during a pass is logged and the loop goes on.

    :param args: configuration object handed to ``SwarmClient``
    """
    swarm = SwarmClient(args)

    while True:
        try:
            zoe_containers = swarm.list({'zoe.deployment_name': get_conf().deployment_name})
            for c in zoe_containers:
                if 'Exited' not in c['status']:
                    continue
                zoe_id = c['labels']['zoe.service.id']
                try:
                    container_died(zoe_id)
                except ZoeAPIException:
                    log.warning('Container ' + c['name'] + ' has died, but Zoe does not know anything about it, deleting')
                    swarm.terminate_container(c['id'], delete=True)

            check_guests(swarm)
        except Exception:
            log.exception('Something bad happened')

        # Pause after every pass, failed ones included: with the sleep inside
        # the try block a persistent failure would retry immediately, spinning
        # the CPU and flooding the log.
        time.sleep(get_conf().loop_time)
def swarm_events_thread(args):
    """Listen for Swarm events forever, reconnecting when the listener stops.

    ``main_callback`` is handed to ``SwarmClient.event_listener``.  Whenever
    the listener returns or raises, the exception (if any) is logged and the
    listener is started again after a one second pause.

    :param args: configuration object handed to ``SwarmClient``
    """
    # Local import so this function does not depend on the module's import
    # block already providing ``time``.
    import time

    swarm = SwarmClient(args)
    while True:
        try:
            swarm.event_listener(main_callback)
        except Exception:
            log.exception('Something bad happened')
        # Without a pause, a listener that fails immediately would be
        # restarted in a tight loop.
        time.sleep(1)
def main():
    """
    The entrypoint for the zoe-observer script.

    Loads the configuration, configures logging (DEBUG when ``args.debug`` is
    set, INFO otherwise) and then polls Swarm until interrupted with CTRL-C.
    On every pass the Zoe containers are listed, those whose status contains
    ``'Exited'`` are reported through ``container_died`` and ``check_guests``
    is run.  Exceptions raised by a pass are logged and the loop continues.

    :return: int, always 0 (reached after a KeyboardInterrupt)
    """
    load_configuration()
    args = get_conf()
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    logging.getLogger('kazoo').setLevel(logging.WARNING)
    logging.getLogger('requests').setLevel(logging.WARNING)
    logging.getLogger('docker').setLevel(logging.INFO)

    swarm = SwarmClient(args)

    try:
        while True:
            try:
                zoe_containers = swarm.list('zoe.{}'.format(get_conf().container_name_prefix))
                for c in zoe_containers:
                    if 'Exited' in c['status']:
                        zoe_id = c['labels']['zoe.container_id']
                        container_died(zoe_id)

                check_guests(swarm)
            except Exception:
                log.exception('Something bad happened')

            # Sleep after failed passes too, otherwise a persistent error
            # turns this loop into a busy loop.
            time.sleep(get_conf().loop_time)
    except KeyboardInterrupt:
        # CTRL-C, whether during a pass or during the sleep, ends the loop.
        pass

    return 0
def loop(self):
    """Receive JSON commands from ``self.zmq_s`` and answer them, forever.

    Handled commands (``message['command']``):

    * ``execution_start``: marks the execution as scheduled, replies OK and
      submits it to the execution manager.
    * ``execution_terminate``: marks the execution as cleaning up, replies OK
      and asks the execution manager to terminate it.
    * ``execution_delete``: deletes the execution if it exists and replies OK
      either way.
    * ``service_inspect``: replies with the Swarm inspection data of the
      service's container.

    Unknown commands and unknown execution/service IDs get an error reply.
    After each message the API call is recorded through the ``metric``
    singleton.

    :raises ZoeException: if a command was processed without sending a reply
        (``self.debug_has_replied`` still False; presumably the ``_reply_*``
        helpers set it)
    """
    assert isinstance(config.singletons['sql_manager'], zoe_lib.sql_manager.SQLManager)
    while True:
        message = self.zmq_s.recv_json()
        self.debug_has_replied = False
        start_time = time.time()
        if message['command'] == 'execution_start':
            exec_id = message['exec_id']
            execution = config.singletons['sql_manager'].execution_list(id=exec_id, only_one=True)
            if execution is None:
                self._reply_error('Execution ID {} not found'.format(message['exec_id']))
            else:
                execution.set_scheduled()
                self._reply_ok()
                zoe_master.execution_manager.execution_submit(execution)
        elif message['command'] == 'execution_terminate':
            exec_id = message['exec_id']
            execution = config.singletons['sql_manager'].execution_list(id=exec_id, only_one=True)
            if execution is None:
                self._reply_error('Execution ID {} not found'.format(message['exec_id']))
            else:
                execution.set_cleaning_up()
                self._reply_ok()
                zoe_master.execution_manager.execution_terminate(execution)
        elif message['command'] == 'execution_delete':
            exec_id = message['exec_id']
            execution = config.singletons['sql_manager'].execution_list(id=exec_id, only_one=True)
            if execution is not None:
                zoe_master.execution_manager.execution_delete(execution)
            # Deleting an execution that is already gone still counts as OK.
            self._reply_ok()
        elif message['command'] == 'service_inspect':
            service_id = message['service_id']
            service = config.singletons['sql_manager'].service_list(id=service_id, only_one=True)
            if service is None:
                self._reply_error('no such service')
            else:
                swarm = SwarmClient(config.get_conf())
                info = swarm.inspect_container(service.docker_id)
                self._reply_ok(info)
        else:
            log.error('Unknown command: {}'.format(message['command']))
            self._reply_error('unknown command')

        if not self.debug_has_replied:
            self._reply_error('bug')
            # The placeholder was previously left unformatted, so the
            # offending command never showed up in the error.
            raise ZoeException('BUG: command {} does not fill a reply'.format(message['command']))

        config.singletons['metric'].metric_api_call(start_time, message['command'])
def run(self):
    """The thread loop: forward Swarm events to ``self._event_cb`` forever."""
    log.info("Monitor thread started")
    client = SwarmClient(get_conf())

    def forward_event(event):
        # Looks up the callback at call time, like the lambda it replaces.
        return self._event_cb(event)

    while True:
        try:
            client.event_listener(forward_event)
        except Exception:
            log.exception('Exception in monitor thread')
        # wait a bit before retrying the connection
        time.sleep(1)
def service_logs(self, uid, role, service_id, stream=True):
    """Fetch the logs of a service's container from Swarm.

    :param uid: ID of the requesting user
    :param role: role of the requesting user; ``'admin'`` may read any service
    :param service_id: ID of the service to look up in ``self.sql``
    :param stream: forwarded as second argument to ``SwarmClient.logs``
    :return: whatever ``SwarmClient.logs`` returns
    :raises ZoeNotFoundException: unknown service, or service without a
        Docker container ID
    :raises ZoeAuthException: the service belongs to another user and the
        requester is not an admin
    """
    service = self.sql.service_list(id=service_id, only_one=True)
    if service is None:
        raise zoe_api.exceptions.ZoeNotFoundException('No such service')

    owned_by_someone_else = service.user_id != uid
    if owned_by_someone_else and role != 'admin':
        raise zoe_api.exceptions.ZoeAuthException()

    if service.docker_id is None:
        raise zoe_api.exceptions.ZoeNotFoundException('Container is not running')

    client = SwarmClient(get_conf())
    return client.logs(service.docker_id, stream)
def terminate_execution(execution: Execution) -> None:
    """Terminate every container of an execution and mark it terminated.

    The execution is first set to "cleaning up".  Each service that has a
    Docker container ID goes through terminating -> container deleted from
    Swarm -> inactive; services without a container are left untouched.

    :param execution: the execution to shut down
    """
    execution.set_cleaning_up()
    client = SwarmClient(get_conf())

    for svc in execution.services:
        assert isinstance(svc, Service)
        if svc.docker_id is None:
            # No container was ever attached to this service.
            continue
        svc.set_terminating()
        client.terminate_container(svc.docker_id, delete=True)
        svc.set_inactive()
        log.debug('Service {} terminated'.format(svc.name))

    execution.set_terminated()
def run(self):
    """The thread loop.

    Hands ``self._event_cb`` to the Swarm event listener and restarts the
    listener, after a one second pause, every time it returns or raises.
    Only ``Exception`` subclasses are caught and logged, so ``SystemExit``
    and ``KeyboardInterrupt`` are no longer swallowed by the loop.
    """
    log.info("Monitor thread started")
    swarm = SwarmClient(get_conf())
    while True:
        try:
            swarm.event_listener(lambda x: self._event_cb(x))
        except Exception:
            log.exception('Exception in monitor thread')
        time.sleep(1)  # Usually we got disconnected, so wait a bit before retrying
def terminate_execution(execution: Execution) -> None:
    """Shut an execution down so that none of its containers stay in Swarm.

    :param execution: the execution whose services must be stopped; it is
        moved to "cleaning up" first and to "terminated" once all services
        have been processed
    """

    def stop_container(target):
        # terminating -> container removed from Swarm -> inactive
        target.set_terminating()
        swarm.terminate_container(target.docker_id, delete=True)
        target.set_inactive()
        log.debug('Service {} terminated'.format(target.name))

    execution.set_cleaning_up()
    swarm = SwarmClient(get_conf())
    for current in execution.services:
        assert isinstance(current, Service)
        has_container = current.docker_id is not None
        if has_container:
            stop_container(current)
    execution.set_terminated()
def main():
    """The main entrypoint function.

    Loads the configuration, sets up logging, reads and validates the ZApp
    description from ``args.jsonfile``, registers it as a test execution in a
    ``FakeSQLManager`` and starts its containers.  After five seconds the
    last ten log lines of every service are printed, then the function waits
    for CTRL-C.

    The execution is terminated on every way out once container start-up has
    begun, including errors and a CTRL-C before the final wait, so that no
    containers are left running.
    """
    conf = load_configuration()
    config.load_configuration(conf)
    args = config.get_conf()
    if args.debug:
        logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT)
    else:
        logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

    logging.getLogger('kazoo').setLevel(logging.WARNING)
    logging.getLogger('requests').setLevel(logging.WARNING)
    logging.getLogger('urllib3').setLevel(logging.WARNING)
    logging.getLogger('docker').setLevel(logging.INFO)
    logging.getLogger("tornado").setLevel(logging.DEBUG)

    state = FakeSQLManager()

    zapp_description = json.load(args.jsonfile)

    print('Validating zapp description...')
    zoe_lib.applications.app_validate(zapp_description)

    exec_id = state.execution_new('test', 'fake_user', zapp_description)
    e = state.execution_list(only_one=True, id=exec_id)
    _digest_application_description(state, e)

    print('Zapp digested, starting containers...')
    try:
        execution_to_containers(e)

        print('Giving the containers a few seconds to start...')
        time.sleep(5)

        swarm = SwarmClient(args)
        for service in e.services:
            print("Service {}, docker ID: {}".format(service.name, service.docker_id))
            logs = swarm.logs(service.docker_id, False)
            logs = logs.decode('utf-8').split('\n')
            for log_line in logs[-10:]:
                print(log_line)

        print("Execution as been started, press CTRL-C to terminate it")
        try:
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            pass
    finally:
        # Runs on normal exit, on errors and on an early CTRL-C alike, so
        # containers that were already started are always cleaned up.
        print('Terminating...')
        terminate_execution(e)
def loop(self):
    """Serve the commands arriving as JSON on ``self.zmq_s``, forever.

    Every message carries a ``command`` key; supported values are
    ``execution_start``, ``execution_terminate``, ``execution_delete`` and
    ``service_inspect``.  Each branch sends exactly one reply (OK or error);
    unknown commands are logged and answered with an error.  The duration of
    every call is reported through the ``metric`` singleton.

    :raises ZoeException: when a command finished without sending a reply,
        i.e. ``self.debug_has_replied`` is still False (presumably the
        ``_reply_*`` helpers set it)
    """
    assert isinstance(config.singletons['sql_manager'], zoe_lib.sql_manager.SQLManager)
    while True:
        message = self.zmq_s.recv_json()
        self.debug_has_replied = False
        start_time = time.time()
        command = message['command']
        sql_manager = config.singletons['sql_manager']

        if command == 'execution_start':
            exec_id = message['exec_id']
            execution = sql_manager.execution_list(id=exec_id, only_one=True)
            if execution is None:
                self._reply_error('Execution ID {} not found'.format(exec_id))
            else:
                execution.set_scheduled()
                self._reply_ok()
                zoe_master.execution_manager.execution_submit(execution)
        elif command == 'execution_terminate':
            exec_id = message['exec_id']
            execution = sql_manager.execution_list(id=exec_id, only_one=True)
            if execution is None:
                self._reply_error('Execution ID {} not found'.format(exec_id))
            else:
                execution.set_cleaning_up()
                self._reply_ok()
                zoe_master.execution_manager.execution_terminate(execution)
        elif command == 'execution_delete':
            exec_id = message['exec_id']
            execution = sql_manager.execution_list(id=exec_id, only_one=True)
            if execution is not None:
                zoe_master.execution_manager.execution_delete(execution)
            # An execution that no longer exists is reported as deleted.
            self._reply_ok()
        elif command == 'service_inspect':
            service_id = message['service_id']
            service = sql_manager.service_list(id=service_id, only_one=True)
            if service is None:
                self._reply_error('no such service')
            else:
                swarm = SwarmClient(config.get_conf())
                info = swarm.inspect_container(service.docker_id)
                self._reply_ok(info)
        else:
            log.error('Unknown command: {}'.format(command))
            self._reply_error('unknown command')

        if not self.debug_has_replied:
            self._reply_error('bug')
            # The message used to be raised with a literal '{}' because
            # format() was never called on it.
            raise ZoeException('BUG: command {} does not fill a reply'.format(command))

        config.singletons['metric'].metric_api_call(start_time, command)
def guest_check_thread(args):
    """Periodically report dead Zoe containers and run the guest check.

    The loop never returns.  One iteration:

    1. list the containers labelled ``zoe.deployment_name`` for this
       deployment;
    2. for each one whose status contains ``'Exited'`` call
       ``container_died`` with its ``zoe.service.id`` label; when that raises
       ``ZoeAPIException`` the container is unknown to Zoe and is deleted
       from Swarm;
    3. call ``check_guests``;
    4. sleep ``loop_time`` seconds.

    Step 4 is also executed when steps 1-3 fail, so that a lasting error is
    retried at the configured interval instead of in a tight loop.

    :param args: configuration object handed to ``SwarmClient``
    """
    swarm = SwarmClient(args)

    while True:
        try:
            label_filter = {'zoe.deployment_name': get_conf().deployment_name}
            for c in swarm.list(label_filter):
                if 'Exited' in c['status']:
                    zoe_id = c['labels']['zoe.service.id']
                    try:
                        container_died(zoe_id)
                    except ZoeAPIException:
                        log.warning('Container ' + c['name'] + ' has died, but Zoe does not know anything about it, deleting')
                        swarm.terminate_container(c['id'], delete=True)

            check_guests(swarm)
        except Exception:
            log.exception('Something bad happened')

        # Deliberately outside the try block: see step 4 in the docstring.
        time.sleep(get_conf().loop_time)
def run(self):
    """The thread loop: reconcile service states with the containers in Swarm.

    Until ``self.stop`` becomes true, every ``CHECK_INTERVAL`` seconds each
    service not already in the destroy or die status is compared with the
    containers of this deployment: no matching container means destroy
    status, a matching container with status ``'exited'`` means die status.
    """
    log.info("Checker thread started")
    client = SwarmClient(get_conf())

    while not self.stop:
        services = self.state.service_list()
        containers = client.list(only_label={'zoe.deployment_name': get_conf().deployment_name})

        for service in services:
            assert isinstance(service, Service)
            if service.docker_status in (service.DOCKER_DESTROY_STATUS, service.DOCKER_DIE_STATUS):
                # Already known to be gone, nothing to reconcile.
                continue

            matching = [entry for entry in containers if entry['id'] == service.docker_id]
            for entry in matching:
                if entry['status'] == 'exited':
                    log.info('resetting status of service {}, died with no event'.format(service.name))
                    service.set_docker_status(service.DOCKER_DIE_STATUS)

            if not matching:
                service.set_docker_status(service.DOCKER_DESTROY_STATUS)

        time.sleep(CHECK_INTERVAL)
def save(execution: Execution):
    """Write the logs of every service of the execution to ``<name>.txt`` files.

    The target directory comes from ``_init``; nothing is done when it
    returns None.  If Swarm returns no log stream for a service, saving stops
    right there.  ``_shutdown`` is called before returning in both of the
    remaining cases.

    :param execution: the execution whose service logs are saved
    """
    target_dir = _init(execution)
    if target_dir is None:
        return

    for svc in execution.services:
        log_path = os.path.join(target_dir, svc.name + '.txt')
        log_stream = SwarmClient(get_conf()).logs(svc.docker_id, stream=True, follow=False)
        if log_stream is None:
            _shutdown()
            return

        try:
            with open(log_path, 'wb') as sink:
                sink.writelines(log_stream)
        except FileNotFoundError:
            log.error("Could not create file {}".format(log_path))

    _shutdown()
class StatsManager(threading.Thread):
    """Daemon thread that refreshes a cached copy of the Swarm statistics.

    The latest value returned by ``SwarmClient.info()`` is available through
    the ``swarm_stats`` property; it is None until the first successful
    refresh.
    """

    def __init__(self):
        super().__init__(name='stats', daemon=True)
        self.swarm = SwarmClient(get_conf())
        self._swarm_stats = None

    def run(self):
        """Refresh the cached statistics every five seconds, forever.

        Errors raised by the Swarm query are logged and the previous value is
        kept.  Only ``Exception`` subclasses are caught, so ``SystemExit``
        and ``KeyboardInterrupt`` are not swallowed.
        """
        log.info("Stats manager started")
        while True:
            try:
                self._swarm_stats = self.swarm.info()
            except Exception:
                log.exception("Exception in stats thread")
            time.sleep(5)

    @property
    def swarm_stats(self):
        """The most recently fetched Swarm statistics, or None."""
        return self._swarm_stats
def _spawn_service(execution: Execution, service: Service, env_subst_dict: dict):
    """Create the Swarm container for one service of an execution.

    Container options are built from the service description (memory limit,
    labels, exposed ports, optional volumes, constraints and command), the
    configuration (GELF address, overlay network) and, when it can be
    attached, the user's filesystem workspace.  After the container has been
    spawned the service is marked active and connected to the additional
    networks listed in the description, if any.

    :param execution: the execution the service belongs to
    :param service: the service to start
    :param env_subst_dict: values substituted into the command template and
        passed to ``_gen_environment``
    :raises ZoeStartExecutionFatalException: the Swarm client cannot be
        created, or connecting to an additional network fails
    :raises ZoeStartExecutionRetryException: spawning the container raised
        ``ZoeException`` or ``ZoeLibException``
    """
    copts = DockerContainerOptions()
    copts.gelf_log_address = get_conf().gelf_address
    copts.name = service.dns_name
    copts.set_memory_limit(service.description['required_resources']['memory'])
    copts.network_name = get_conf().overlay_network_name
    copts.labels = {
        'zoe.execution.name': execution.name,
        'zoe.execution.id': str(execution.id),
        'zoe.service.name': service.name,
        'zoe.service.id': str(service.id),
        'zoe.owner': execution.user_id,
        'zoe.deployment_name': get_conf().deployment_name,
        'zoe.type': 'app_service'
    }
    if service.description['monitor']:
        copts.labels['zoe.monitor'] = 'true'
    else:
        copts.labels['zoe.monitor'] = 'false'
    copts.restart = not service.description['monitor']  # Monitor containers should not restart

    _gen_environment(service, env_subst_dict, copts)

    for p in service.description['ports']:
        if p['expose']:
            copts.ports.append(p['port_number'])  # FIXME UDP ports?

    if 'volumes' in service.description:
        for path, mount_point, readonly in service.description['volumes']:
            copts.add_volume_bind(path, mount_point, readonly)

    if 'constraints' in service.description:
        for constraint in service.description['constraints']:
            copts.add_constraint(constraint)

    fswk = ZoeFSWorkspace()
    if fswk.can_be_attached():
        copts.add_volume_bind(fswk.get_path(execution.user_id), fswk.get_mountpoint(), False)

    # The same dictionary is used for templates in the command
    if 'command' in service.description:
        copts.set_command(service.description['command'].format(**env_subst_dict))

    try:
        swarm = SwarmClient(get_conf())
    except Exception as e:
        raise ZoeStartExecutionFatalException(str(e)) from e

    try:
        cont_info = swarm.spawn_container(service.description['docker_image'], copts)
    except (ZoeException, ZoeLibException) as e:
        # Both kinds of failure ask the caller to retry; chaining keeps the
        # original traceback attached as the cause.
        raise ZoeStartExecutionRetryException(str(e)) from e

    service.set_active(cont_info["docker_id"])

    if 'networks' in service.description:
        for net in service.description['networks']:
            try:
                # presumably set_active() stored the ID in service.docker_id
                swarm.connect_to_network(service.docker_id, net)
            except ZoeException as e:
                raise ZoeStartExecutionFatalException(str(e)) from e
def __init__(self):
    """Set up the daemon thread named 'stats' with its own Swarm client.

    The cached statistics start out as None.
    """
    thread_options = {'name': 'stats', 'daemon': True}
    super().__init__(**thread_options)
    current_conf = get_conf()
    self.swarm = SwarmClient(current_conf)
    self._swarm_stats = None
def __init__(self, sched_policy_class):
    """Create the Swarm client and the scheduler.

    :param sched_policy_class: forwarded to ``ZoeScheduler`` together with
        this object
    """
    swarm_client = SwarmClient(get_conf())
    # The client is stored before the scheduler is built, as the scheduler
    # receives this object.
    self.swarm = swarm_client
    scheduler = ZoeScheduler(self, sched_policy_class)
    self.scheduler = scheduler
    # No state manager yet; it is initialised to None here.
    self.state_manager = None
class PlatformManager:
    """
    Starts and stops containers, gateways and user networks on Swarm on behalf
    of executions and users, and reconciles the stored state with Swarm.

    :type swarm: SwarmClient
    :type scheduler: ZoeScheduler
    :type state_manager: StateManager
    """

    # ID of a network that every spawned process and gateway container is
    # additionally connected to.
    # NOTE(review): hard-coded value, looks deployment-specific -- confirm and
    # consider moving it to the configuration.
    _EXTRA_NETWORK_ID = 'eeef9754c16790a29d5210c5d9ad8e66614ee8a6229b6dc6f779019d46cec792'

    def __init__(self, sched_policy_class):
        self.swarm = SwarmClient(get_conf())
        self.scheduler = ZoeScheduler(self, sched_policy_class)
        # Initialised to None; the methods below use it, so it has to be
        # assigned from outside before they are called.
        self.state_manager = None

    def execution_submitted(self, execution: execution_module.Execution):
        """Mark the execution as scheduled and hand it to the scheduler."""
        execution.set_scheduled()
        self.scheduler.incoming(execution)

    def execution_start(self, execution: execution_module.Execution) -> bool:
        """Spawn the containers of an execution and mark it as started.

        If spawning raises ``ZoeException`` the execution is terminated with
        reason ``'error'`` and the exception is re-raised.

        NOTE(review): annotated ``-> bool`` but no value is returned.
        """
        try:
            self._application_to_containers(execution)
        except ZoeException:
            self.execution_terminate(execution, reason='error')
            raise
        execution.set_started()
        self.state_manager.state_updated()

    def _application_to_containers(self, execution: execution_module.Execution):
        """Spawn one container per process of the execution's application."""
        for process in execution.application.processes:
            self._spawn_process(execution, process)

    def _spawn_process(self, execution: execution_module.Execution, process_description: application_module.Process) -> bool:
        """Spawn the container for one process and register it in the state.

        :raises ZoeException: an environment value references a variable that
            is not in the substitution dictionary
        :return: True
        """
        copts = ContainerOptions()
        copts.name = get_conf().container_name_prefix + '-' + process_description.name + "-{}".format(execution.id)
        copts.set_memory_limit(process_description.required_resources['memory'])
        copts.network_name = '{}-usernet-{}'.format(get_conf().container_name_prefix, execution.owner.id)
        container_id = self.state_manager.gen_id()
        copts.labels = {
            'zoe.{}'.format(get_conf().container_name_prefix): '',
            'zoe.execution.id': str(execution.id),
            'zoe.execution.name': execution.name,
            'zoe.container.id': str(container_id),
            'zoe.container.name': process_description.name,
            'zoe.owner': execution.owner.name,
            'zoe.prefix': get_conf().container_name_prefix,
            'zoe.type': 'app_process'
        }
        if process_description.monitor:
            copts.labels['zoe.monitor'] = ''
        else:
            copts.labels['zoe.normal'] = ''
        copts.restart = not process_description.monitor  # Monitor containers should not restart

        # Values available to template strings in the environment variables
        # and in the command.
        subst_dict = {
            "execution_id": str(execution.id),
            "user_id": str(execution.owner.id),
            'user_name': execution.owner.name,
            'name_prefix': get_conf().container_name_prefix
        }
        for env_name, env_value in process_description.environment:
            try:
                env_value = env_value.format(**subst_dict)
            except KeyError:
                raise ZoeException("cannot find variable to substitute in expression {}".format(env_value))
            copts.add_env_variable(env_name, env_value)

        # The same dictionary is used for templates in the command
        if process_description.command is not None:
            copts.set_command(process_description.command.format(**subst_dict))

        cont_info = self.swarm.spawn_container(process_description.docker_image, copts)
        container = container_module.Container(self.state_manager)
        container.docker_id = cont_info["docker_id"]
        container.ip_address = cont_info["ip_address"]
        container.name = copts.name
        container.is_monitor = process_description.monitor
        container.ports = [p.to_dict() for p in process_description.ports]
        container.id = container_id
        execution.containers.append(container)
        container.execution = execution

        self.swarm.connect_to_network(container.docker_id, self._EXTRA_NETWORK_ID)

        self.state_manager.new('container', container)
        return True

    def execution_terminate(self, execution: execution_module.Execution, reason):
        """
        Terminate all containers of an execution, store their logs and set the
        final execution status.

        :param execution: The execution to be terminated
        :param reason: termination reason; ``'error'`` and ``'finished'`` select
            the matching status, anything else marks the execution terminated
        :return:
        """
        logs = []
        if execution.containers:
            # Iterate over a copy of the container list.
            containers = execution.containers.copy()
            for c in containers:
                assert isinstance(c, container_module.Container)
                container_log = self.log_get(c.id)
                if container_log is not None:
                    logs.append((c.name, container_log))
                self.swarm.terminate_container(c.docker_id, delete=True)
                self.state_manager.delete('container', c.id)
                log.info('Container {} terminated'.format(c.name))
            execution.store_logs(logs)

        if reason == 'error':
            execution.set_error()
        elif reason == 'finished':
            execution.set_finished()
        else:
            execution.set_terminated()

        self.scheduler.execution_terminate(execution)

    def start_gateway_container(self, user):
        """Spawn the gateway container of a user and record its ID and URLs.

        :raises ZoeException: Swarm returned no container information
        """
        copts = ContainerOptions()
        copts.name = '{}-gateway-{}'.format(get_conf().container_name_prefix, user.id)
        copts.network_name = '{}-usernet-{}'.format(get_conf().container_name_prefix, user.id)
        copts.ports.append(1080)
        copts.labels = {
            'zoe.{}.gateway'.format(get_conf().container_name_prefix): '',
            'zoe.owner': user.name,
            'zoe.prefix': get_conf().container_name_prefix,
            'zoe.type': 'gateway'
        }
        copts.restart = True
        if user.role == 'guest':
            image = get_conf().private_registry + '/zoerepo/guest-gateway'
        else:
            image = get_conf().private_registry + '/zoerepo/guest-gateway'  # TODO: create an image with ssh
        cont_info = self.swarm.spawn_container(image, copts)
        if cont_info is None:
            raise ZoeException('Cannot create user gateway container')
        user.gateway_docker_id = cont_info['docker_id']
        user.set_gateway_urls(cont_info)

        self.swarm.connect_to_network(user.gateway_docker_id, self._EXTRA_NETWORK_ID)

    def kill_gateway_container(self, user):
        """Delete the user's gateway container and clear its ID and URLs."""
        self.swarm.terminate_container(user.gateway_docker_id, delete=True)
        user.gateway_docker_id = None
        user.gateway_urls = []

    def create_user_network(self, user):
        """Create the user's network in Swarm and store its ID on the user."""
        log.info('Creating a new network for user {}'.format(user.id))
        net_name = '{}-usernet-{}'.format(get_conf().container_name_prefix, user.id)
        net_id = self.swarm.network_create(net_name)
        user.network_id = net_id

    def remove_user_network(self, user):
        """Remove the user's network from Swarm."""
        log.info('Removing network for user {}'.format(user.name))
        self.swarm.network_remove(user.network_id)

    def log_get(self, container_id: int) -> str:
        """Return the log of a container, or '' if it is not in the state."""
        container = self.state_manager.get_one('container', id=container_id)
        if container is None:
            return ''
        else:
            return self.swarm.log_get(container.docker_id)

    def container_stats(self, container_id: int) -> ContainerStats:
        """Return the Swarm statistics of the container with the given ID."""
        container = self.state_manager.get_one('container', id=container_id)
        return self.swarm.stats(container.docker_id)

    def is_container_alive(self, container: container_module.Container) -> bool:
        """Return the ``running`` flag reported by Swarm, False if unknown."""
        ret = self.swarm.inspect_container(container.docker_id)
        if ret is None:
            return False
        return ret["running"]

    def swarm_stats(self) -> SwarmStats:
        """Return the current Swarm information (queried on every call)."""
        # TODO implement some caching
        return self.swarm.info()

    def scheduler_stats(self) -> SchedulerStats:
        """Return the statistics of the scheduler policy."""
        return self.scheduler.scheduler_policy.stats()

    def check_state_swarm_consistency(self):
        """Compare the stored state with Swarm and repair the differences.

        * users without a valid network or gateway are re-linked to an
          existing one when found, otherwise a new one is created;
        * networks and gateways that belong to no user (or duplicate
          networks) are removed;
        * containers in the state that no longer exist in Swarm are dropped
          from the state.

        ``state_updated`` is called when something was re-linked or created.
        """
        state_changed = False
        prefix = get_conf().container_name_prefix
        usernet_prefix = '{}-usernet-'.format(prefix)
        gateway_prefix = '{}-gateway-'.format(prefix)

        users = self.state_manager.get('user')
        networks = self.swarm.network_list(usernet_prefix)
        gateways = self.swarm.list(['zoe.{}.gateway'.format(prefix)])

        # Built once: these used to be rebuilt as lists for every user.
        network_ids = {x['id'] for x in networks}
        gateway_ids = {x['id'] for x in gateways}

        users_no_network = []
        users_no_gateway = []
        networks_to_delete = []
        gateways_to_delete = []

        for u in users:
            if u.network_id is None:
                log.error('state inconsistency: user {} has no network'.format(u.name))
                users_no_network.append(u)
            elif u.network_id not in network_ids:
                log.error('state inconsistency: user {} has an invalid network'.format(u.name))
                u.network_id = None
                users_no_network.append(u)

            if u.gateway_docker_id is None:
                log.error('state inconsistency: user {} has no gateway'.format(u.name))
                users_no_gateway.append(u)
            elif u.gateway_docker_id not in gateway_ids:
                log.error('state inconsistency: user {} has an invalid gateway container ID'.format(u.name))
                u.gateway_docker_id = None
                users_no_gateway.append(u)

        duplicate_check = set()
        for n in networks:
            try:
                # The user ID is whatever follows the prefix in the name.
                uid = int(n['name'][len(usernet_prefix):])
            except ValueError:
                log.error('network {} does not belong to Zoe, bug?'.format(n['name']))
                networks_to_delete.append(n['id'])
                continue
            if uid in duplicate_check:
                log.warning('state inconsistency: found two networks for the same user')
                networks_to_delete.append(n['id'])
                continue
            duplicate_check.add(uid)
            user = self.state_manager.get_one('user', id=uid)
            if user is not None and user in users_no_network:
                user.network_id = n['id']
                users_no_network.remove(user)
                log.error('fixed: user {} linked to network {}'.format(user.name, n['name']))
                state_changed = True
                continue
            elif user is None:
                log.error('state inconsistency: found a network for user {} who no longer exists'.format(uid))
                networks_to_delete.append(n['id'])

        for g in gateways:
            try:
                uid = int(g['name'][len(gateway_prefix):])
            except ValueError:
                log.error('container {} does not belong to Zoe, bug?'.format(g['name']))
                gateways_to_delete.append(g['id'])
                continue
            user = self.state_manager.get_one('user', id=uid)
            if user is not None and user in users_no_gateway:
                user.gateway_docker_id = g['id']
                users_no_gateway.remove(user)
                cont_info = self.swarm.inspect_container(g['id'])
                user.set_gateway_urls(cont_info)
                log.error('fixed: user {} linked to gateway {}'.format(user.name, g['name']))
                state_changed = True
                continue
            elif user is None:
                log.error('state inconsistency: found a gateway for user {} who no longer exists'.format(uid))
                gateways_to_delete.append(g['id'])

        # Fix all inconsistencies found
        for g in gateways_to_delete:
            log.error('fixed: terminating orphan gateway container {}'.format(g[:8]))
            self.swarm.terminate_container(g, delete=True)
        for n in networks_to_delete:
            log.error('fixed: terminating orphan network {}'.format(n[:8]))
            self.swarm.network_remove(n)

        for u in users_no_network:
            log.error('fixed: creating network for user {}'.format(u.name))
            self.create_user_network(u)
        for u in users_no_gateway:
            log.error('fixed: creating gateway for user {}'.format(u.name))
            self.start_gateway_container(u)

        # ### Check executions and container consistency
        swarm_containers = self.swarm.list(only_label='zoe.{}'.format(prefix))
        swarm_container_ids = {x['id'] for x in swarm_containers}
        conts_state_to_delete = []
        for c_id, c in self.state_manager.containers.items():
            if c.docker_id not in swarm_container_ids:
                log.error('fixed: removing from state container {} that does not exist in Swarm'.format(c.name))
                conts_state_to_delete.append(c_id)
        # Deleted in a second pass, after the iteration over the containers.
        for c_id in conts_state_to_delete:
            self.state_manager.delete('container', c_id)

        if state_changed or users_no_gateway or users_no_network:
            self.state_manager.state_updated()
def _spawn_service(execution: Execution, service: Service, env_subst_dict: dict):
    """Create the Swarm container for one service of an execution.

    Container options are built from the service description (memory limit,
    labels, environment, exposed ports, optional volumes and command), the
    configuration (GELF address, overlay network) and every workspace manager
    that can be attached.  After the container has been spawned the service
    is marked active and connected to the additional networks listed in the
    description, if any.

    :param execution: the execution the service belongs to
    :param service: the service to start
    :param env_subst_dict: values substituted into the template strings of
        the environment variables and of the command
    :raises ZoeStartExecutionFatalException: a template references a variable
        missing from ``env_subst_dict``, the Swarm client cannot be created,
        or connecting to an additional network fails
    :raises ZoeStartExecutionRetryException: spawning the container raised
        ``ZoeException``
    """
    copts = DockerContainerOptions()
    copts.gelf_log_address = get_conf().gelf_address
    copts.name = service.dns_name
    copts.set_memory_limit(service.description['required_resources']['memory'])
    copts.network_name = get_conf().overlay_network_name
    copts.labels = {
        'zoe.execution.name': execution.name,
        'zoe.execution.id': str(execution.id),
        'zoe.service.name': service.name,
        'zoe.service.id': str(service.id),
        'zoe.owner': execution.user_id,
        'zoe.deployment_name': get_conf().deployment_name,
        'zoe.type': 'app_service'
    }
    if service.description['monitor']:
        copts.labels['zoe.monitor'] = 'true'
    else:
        copts.labels['zoe.monitor'] = 'false'
    copts.restart = not service.description['monitor']  # Monitor containers should not restart

    # env_subst_dict provides the values for the template strings found in
    # the environment variables.
    for env_name, env_value in service.description['environment']:
        try:
            env_value = env_value.format(**env_subst_dict)
        except KeyError as e:
            raise ZoeStartExecutionFatalException("unknown variable in expression {}".format(env_value)) from e
        copts.add_env_variable(env_name, env_value)

    for p in service.description['ports']:
        if p['expose']:
            copts.ports.append(p['port_number'])  # FIXME UDP ports?

    if 'volumes' in service.description:
        for path, mount_point, readonly in service.description['volumes']:
            copts.add_volume_bind(path, mount_point, readonly)

    for wks in singletons['workspace_managers']:
        assert isinstance(wks, zoe_master.workspace.base.ZoeWorkspaceBase)
        if wks.can_be_attached():
            copts.add_volume_bind(wks.get_path(execution.user_id), wks.get_mountpoint(), False)

    # The same dictionary is used for templates in the command
    if 'command' in service.description:
        command_template = service.description['command']
        try:
            command = command_template.format(**env_subst_dict)
        except KeyError as e:
            # Same treatment as an unknown variable in the environment,
            # instead of letting a bare KeyError escape.
            raise ZoeStartExecutionFatalException("unknown variable in expression {}".format(command_template)) from e
        copts.set_command(command)

    try:
        swarm = SwarmClient(get_conf())
    except Exception as e:
        raise ZoeStartExecutionFatalException(str(e)) from e

    try:
        cont_info = swarm.spawn_container(service.description['docker_image'], copts)
    except ZoeException as e:
        raise ZoeStartExecutionRetryException(str(e)) from e

    service.set_active(cont_info["docker_id"])

    if 'networks' in service.description:
        for net in service.description['networks']:
            try:
                # presumably set_active() stored the ID in service.docker_id
                swarm.connect_to_network(service.docker_id, net)
            except ZoeException as e:
                raise ZoeStartExecutionFatalException(str(e)) from e