def auth(self, username, password):
    """Authenticate the user or raise an exception.

    The connection is first bound with the service credentials (SASL when
    ``self.sasl`` is set, the configured bind user otherwise). The user entry
    is then searched under ``self.base_dn`` and the supplied password is
    compared against its ``userPassword`` attribute.

    :param username: login name, used to build the ``uid=<username>`` filter
    :param password: password to compare against the directory entry
    :return: True when the user exists and the password matches
    :raises zoe_api.exceptions.ZoeAuthException: on wrong credentials or on
        any LDAP error
    """
    # NOTE(review): username is concatenated unescaped into the search filter
    # and into the DN used by compare_s.
    search_filter = "uid=" + username
    try:
        if self.sasl:
            self.connection.sasl_interactive_bind_s('', self.sasl_auth)
        else:
            self.connection.bind_s(get_conf().ldap_bind_user, get_conf().ldap_bind_password)
    except ldap.LDAPError:
        log.error('Unknown LDAP BIND user or wrong password.')
        raise zoe_api.exceptions.ZoeAuthException('Unknown LDAP BIND user or wrong password.')

    try:
        result = self.connection.search_s(self.base_dn, ldap.SCOPE_SUBTREE, search_filter)
        if len(result) == 0:
            raise zoe_api.exceptions.ZoeAuthException('Unknown user or wrong password.')
        result = self.connection.compare_s(search_filter + ',' + self.base_dn, 'userPassword', password)
        if result == 0:
            raise zoe_api.exceptions.ZoeAuthException('Unknown user or wrong password.')
    except ldap.LDAPError as ex:
        if ex.args[0]['desc'] == 'Invalid credentials':
            raise zoe_api.exceptions.ZoeAuthException('Unknown user or wrong password.')
        else:
            log.exception("LDAP exception")
            # The exception must be raised: without it a generic LDAP failure
            # would fall through to "return True" and authenticate the user.
            raise zoe_api.exceptions.ZoeAuthException('LDAP error.')
    finally:
        self.connection.unbind_s()
    return True
def handle(self):
    """Handle one UDP packet (one GELF log line in JSON format).

    The packet is expected to be gzip-compressed JSON. Packets that cannot be
    decompressed, or that belong to another deployment, are silently dropped.
    The log line is appended to a per-service text file that is created, with
    a header line, the first time a service is seen.
    """
    data = self.rfile.read()
    try:
        data = gzip.decompress(data)
    except OSError:
        # Not gzip data: ignore the packet
        return
    data = json.loads(data.decode('utf-8'))

    deployment_name = data['_zoe_deployment_name']
    if deployment_name != get_conf().deployment_name:
        return

    execution_id = data['_zoe_execution_id']
    service_name = data['_zoe_service_name']
    host = data['host']
    timestamp = datetime.datetime.utcfromtimestamp(data['timestamp']).strftime('%Y-%m-%d %H:%M:%S')
    message = data['short_message']

    log_dir = os.path.join(get_conf().service_logs_base_path, get_conf().deployment_name, str(execution_id))
    log_file_path = os.path.join(log_dir, service_name + '.txt')
    if not os.path.exists(log_file_path):
        os.makedirs(log_dir, exist_ok=True)
        # Use a context manager so the handle is closed (and the header
        # flushed) before the file is reopened in append mode below.
        with open(log_file_path, 'w') as logfile:
            logfile.write('ZOE HEADER: log file for service {} running on host {}\n'.format(service_name, host))

    with open(log_file_path, 'a') as logfile:
        logfile.write(timestamp + ' ' + message + '\n')
def auth(self, username, password):
    """Authenticate the user or raise an exception.

    Binds to the directory as the user, then reads the user's entry to map
    its ``gidNumber`` values to a Zoe role using the configured admin, user
    and guest group IDs.

    :param username: login name, used to build the ``uid=<username>`` DN and filter
    :param password: password used for the bind
    :return: a ``(uid, role)`` tuple, role being 'admin', 'user' or 'guest'
    :raises zoe_api.exceptions.ZoeAuthException: on wrong credentials, unknown
        user or any LDAP error
    """
    search_filter = "uid=" + username
    uid = None
    role = 'guest'
    # The bind DN is the user's own entry: uid=<username>,<base dn>
    bind_user = search_filter + "," + self.base_dn
    try:
        self.connection.bind_s(bind_user, password)
        result = self.connection.search_s(self.base_dn, ldap.SCOPE_SUBTREE, search_filter)
        if len(result) == 0:
            raise zoe_api.exceptions.ZoeAuthException('Unknown user or wrong password.')
        user_dict = result[0][1]
        uid = username
        gid_numbers = [int(x) for x in user_dict['gidNumber']]
        if get_conf().ldap_admin_gid in gid_numbers:
            role = 'admin'
        elif get_conf().ldap_user_gid in gid_numbers:
            role = 'user'
        elif get_conf().ldap_guest_gid in gid_numbers:
            role = 'guest'
        else:
            log.warning('User {} has an unknown group ID ({}), using guest role'.format(username, result[0][1]['gidNumber']))
            role = 'guest'
    except ldap.LDAPError as ex:
        if ex.args[0]['desc'] == 'Invalid credentials':
            raise zoe_api.exceptions.ZoeAuthException('Unknown user or wrong password.')
        else:
            log.exception("LDAP exception")
            # Actually raise: the exception used to be built and discarded.
            raise zoe_api.exceptions.ZoeAuthException('LDAP error.')
    finally:
        self.connection.unbind_s()
    return uid, role
def zshop_list_apps(role):
    """List the ZApps of the shop that ``role`` may see, grouped by category.

    Every sub-directory of the configured shop path that contains a
    ``manifest.json`` is read with ``zshop_read_manifest``. Unless the role
    can access the full shop, ZApps disabled for the role, or not enabled for
    it (or for "all"), are left out.

    :param role: object exposing ``name`` and ``can_access_full_zapp_shop``
    :return: dict mapping a category to the list of its visible ZApps
    """
    shop_path = get_conf().zapp_shop_path
    zapps = []
    for entry in os.listdir(shop_path):
        if not os.path.isdir(os.path.join(shop_path, entry)):
            continue
        if not os.path.exists(os.path.join(shop_path, entry, "manifest.json")):
            continue
        zapps += zshop_read_manifest(entry)

    by_category = {}
    for zapp in zapps:
        if not role.can_access_full_zapp_shop:
            if role.name in zapp.disabled_for:
                continue
            if not (role.name in zapp.enabled_for or "all" in zapp.enabled_for):
                continue
        by_category.setdefault(zapp.category, []).append(zapp)
    return by_category
def get_auth_login(username, password):
    """Authenticate username and password against the configured user store.

    The fixed list of users in the plain text file is always tried first; if
    that fails, the authenticator selected by ``auth_type`` is used.

    :return: a ``(uid, role)`` tuple
    :raises zoe_api.exceptions.ZoeAuthException: when authentication fails
    :raises zoe_api.exceptions.ZoeException: when ``auth_type`` is unknown
    """
    try:
        authenticator = PlainTextAuthenticator()  # type: BaseAuthenticator
        uid, role = authenticator.auth(username, password)
        return uid, role
    except (zoe_api.exceptions.ZoeAuthException, zoe_api.exceptions.ZoeNotFoundException):
        # If it fails, continue with the normal authentication
        pass

    auth_type = get_conf().auth_type
    if auth_type not in ('ldap', 'ldapsasl'):
        raise zoe_api.exceptions.ZoeException('Configuration error, unknown authentication method: {}'.format(auth_type))
    authenticator = LDAPAuthenticator(sasl=(auth_type == 'ldapsasl'))  # type: BaseAuthenticator

    uid, role = authenticator.auth(username, password)
    if uid is None:
        raise zoe_api.exceptions.ZoeAuthException
    return uid, role
def get(self, service_id):
    """Render the log page of a service.

    Does nothing when there is no current user. When the service lookup
    fails, the HTTP status is set from the exception and nothing is rendered.
    """
    if self.current_user is None:
        return

    try:
        service = self.api_endpoint.service_by_id(self.current_user, service_id)
    except zoe_api.exceptions.ZoeException as e:
        self.set_status(e.status_code, e.message)
        return

    log_path = "{}/{}/{}/{}.txt".format(
        get_conf().log_url,
        get_conf().deployment_name,
        service.execution_id,
        service.name,
    )
    self.render(
        'service_logs.jinja2',
        service=service,
        log_path=log_path,
        websocket_base=get_conf().websocket_base + get_conf().reverse_proxy_path,
        use_websockets=get_conf().log_use_websockets,
    )
def get_auth(handler: tornado.web.RequestHandler):
    """Try to authenticate a request using HTTP Basic authentication.

    :param handler: the request handler whose ``Authorization`` header is read
    :return: a ``(uid, role)`` tuple
    :raises ZoeRestAPIException: (401) when the header is missing, malformed
        or the credentials are rejected
    :raises ZoeException: when the configured authentication type is unknown
    """
    auth_header = handler.request.headers.get('Authorization')
    if auth_header is None or not auth_header.startswith('Basic '):
        raise ZoeRestAPIException('missing or wrong authentication information', 401, {'WWW-Authenticate': 'Basic realm="Login Required"'})

    try:
        auth_decoded = base64.decodebytes(bytes(auth_header[6:], 'ascii')).decode('utf-8')
        # Split on the first colon only: the password may itself contain ':'
        username, password = auth_decoded.split(':', 1)
    except ValueError:
        # Covers invalid base64, non-ASCII/non-UTF-8 payloads and a missing ':'
        raise ZoeRestAPIException('missing or wrong authentication information', 401, {'WWW-Authenticate': 'Basic realm="Login Required"'})

    if get_conf().auth_type == 'text':
        authenticator = PlainTextAuthenticator()  # type: BaseAuthenticator
    elif get_conf().auth_type == 'ldap':
        authenticator = LDAPAuthenticator()
    else:
        raise ZoeException('Configuration error, unknown authentication method: {}'.format(get_conf().auth_type))

    uid, role = authenticator.auth(username, password)
    if uid is None:
        raise ZoeRestAPIException('missing or wrong authentication information', 401, {'WWW-Authenticate': 'Basic realm="Login Required"'})

    return uid, role
def __init__(self, sasl):
    """Open the LDAP connection described by the configuration.

    :param sasl: when true, a GSSAPI SASL credential object is prepared and
        stored in ``self.sasl_auth``
    """
    conf = get_conf()
    self.sasl = sasl
    self.base_dn = conf.ldap_base_dn
    self.connection = ldap.initialize(conf.ldap_server_uri)
    self.connection.protocol_version = ldap.VERSION3
    if sasl:
        self.sasl_auth = ldap.sasl.sasl({}, 'GSSAPI')
def proxy_address(self):
    """Get proxy address path.

    :return: ``<name>-<execution id>-<deployment name>.<proxy path>``, or
        None when the service exposes no ports
    """
    if len(self.ports) == 0:
        return None
    host_part = "-".join([self.name, str(self.execution_id), get_conf().deployment_name])
    return host_part + "." + get_conf().proxy_path
def terminate(self, name):
    """Terminate a service.

    Deletes the Service and the ReplicationController called ``name``, then
    every Pod in the configured namespace matching the Zoe labels plus
    ``service_name=name``. Errors are logged and not propagated.

    :param name: name of the Kubernetes objects to delete
    """
    del_obj = {
        'apiVersion': 'v1',
        'kind': '',
        'metadata': {
            'name': name,
            'namespace': get_conf().kube_namespace
        }
    }
    try:
        del_obj['kind'] = 'Service'
        pykube.Service(self.api, del_obj).delete()

        del_obj['kind'] = 'ReplicationController'
        pykube.ReplicationController(self.api, del_obj).delete()

        del_obj['kind'] = 'Pod'
        # Work on a copy: adding the service name to ZOE_LABELS itself would
        # leak it into every later use of the shared module-level labels.
        pod_selector = dict(ZOE_LABELS)
        pod_selector['service_name'] = name
        pods = pykube.Pod.objects(self.api).filter(namespace=get_conf().kube_namespace, selector=pod_selector).iterator()
        for pod in pods:
            del_obj['metadata']['name'] = str(pod)
            pykube.Pod(self.api, del_obj).delete()

        log.info('Service deleted on Kubernetes cluster')
    except Exception as ex:
        # Best effort: termination problems are logged, never raised
        log.error(ex)
def gen_environment(execution: Execution, service: Service, env_subst_dict: Dict):
    """Generate the list of environment variables for a service.

    The variables declared by the service are expanded with ``str.format``
    using ``env_subst_dict``, then the Zoe-defined variables are appended.

    :param execution: the execution the service belongs to
    :param service: the service being started
    :param env_subst_dict: values available to the environment templates
    :return: list of ``(name, value)`` tuples
    :raises ZoeStartExecutionFatalException: when a template references an
        unknown variable; the error is also recorded on the service
    """
    env_list = []
    for env_name, env_value in service.environment:
        try:
            env_value = env_value.format(**env_subst_dict)
        except KeyError:
            error_msg = "Unknown variable in environment expression '{}', known variables are: {}".format(env_value, list(env_subst_dict.keys()))
            service.set_error(error_msg)
            # Fill in the placeholder, it used to be emitted as a literal "{}"
            raise ZoeStartExecutionFatalException("Service {} has wrong environment expression".format(service.name))
        env_list.append((env_name, env_value))

    env_list.append(('EXECUTION_ID', str(execution.id)))
    env_list.append(('DEPLOY_NAME', get_conf().deployment_name))
    env_list.append(('ZOE_UID', execution.owner.fs_uid))
    env_list.append(('ZOE_GID', get_conf().fs_group_id))
    env_list.append(('ZOE_USER', execution.owner.username))
    env_list.append(('SERVICE_NAME', service.name))
    if get_conf().traefik_zk_ips is not None:
        # One reverse proxy path per exposed port
        for port in service.ports:
            env_list.append(('REVERSE_PROXY_PATH_{}'.format(port.internal_number), '{}/{}'.format(get_conf().traefik_base_url, port.proxy_key())))

    wk_vol = ZoeFSWorkspace().get(execution.owner)
    env_list.append(('ZOE_WORKSPACE', wk_vol.mount_point))
    return env_list
def ip_address(self):
    """Getter for the service IP address.

    Swarm is queried on every call, as the IP address changes outside our
    control.

    :return: the address on the configured overlay network, or an empty dict
        when the container is not in the started state
    """
    if self.docker_status == self.DOCKER_START_STATUS:
        container_info = SwarmClient(get_conf()).inspect_container(self.docker_id)
        return container_info['ip_address'][get_conf().overlay_network_name]
    return {}
def get(self, zapp_id):
    """Render the start page of a ZApp.

    ``zapp_id`` is ``<manifest id>-<index>``: the text after the last dash is
    the index of the ZApp inside the manifest named by the text before it.
    Unauthenticated users are redirected to the login page.
    """
    uid, role = get_auth(self)
    if uid is None:
        return self.redirect(self.get_argument('next', u'/login'))

    manifest_id, _, index_text = zapp_id.rpartition('-')
    manifest_index = int(index_text)
    zapp = zapp_shop.zshop_read_manifest(manifest_id)[manifest_index]

    conf = get_conf()
    # Admins can always edit limits; users only when not disabled in the config
    customizable = role == "admin" or (role == "user" and not conf.no_user_edit_limits_web)

    self.render(
        'zapp_start.html',
        uid=uid,
        role=role,
        zapp=zapp,
        max_core_limit=conf.max_core_limit,
        max_memory_limit=conf.max_memory_limit,
        resources_are_customizable=customizable,
        additional_volumes=conf.additional_volumes,
    )
def gen_environment(execution: Execution, service: Service, env_subst_dict: Dict):
    """Generate the list of environment variables for a service.

    The variables declared by the service are expanded with ``str.format``
    using ``env_subst_dict``, then the Zoe-defined variables are appended.

    :param execution: the execution the service belongs to
    :param service: the service being started
    :param env_subst_dict: values available to the environment templates
    :return: list of ``(name, value)`` tuples
    :raises ZoeStartExecutionFatalException: when a template references an
        unknown variable; the error is also recorded on the service
    """
    env_list = []
    for env_name, env_value in service.environment:
        try:
            env_value = env_value.format(**env_subst_dict)
        except KeyError:
            error_msg = "Unknown variable in environment expression '{}', known variables are: {}".format(env_value, list(env_subst_dict.keys()))
            service.set_error(error_msg)
            # Fill in the placeholder, it used to be emitted as a literal "{}"
            raise ZoeStartExecutionFatalException("Service {} has wrong environment expression".format(service.name))
        env_list.append((env_name, env_value))

    env_list.append(('EXECUTION_ID', str(execution.id)))
    env_list.append(('DEPLOY_NAME', get_conf().deployment_name))
    env_list.append(('UID', execution.user_id))
    env_list.append(('SERVICE_NAME', service.name))
    env_list.append(('PROXY_PATH', get_conf().proxy_path))

    wk_vol = ZoeFSWorkspace().get(execution.user_id)
    env_list.append(('ZOE_WORKSPACE', wk_vol.mount_point))
    return env_list
def _update_node_state(self, host_conf: DockerHostConfig, node_stats: NodeStats, get_usage_stats: bool):
    """Refresh ``node_stats`` with the current state of one Docker host.

    :param host_conf: configuration of the host to query
    :param node_stats: statistics object updated in place
    :param get_usage_stats: when true, per-container memory and CPU usage is
        collected too (from KairosDB if enabled, from the Docker engine
        otherwise); when false the in-use counters are set to 0
    """
    # NOTE(review): this aliases host_conf.labels, so the "+=" below also
    # changes the host configuration object - confirm this is intended.
    node_stats.labels = host_conf.labels
    try:
        my_engine = DockerClient(host_conf)
    except ZoeException as e:
        log.error(str(e))
        node_stats.status = 'offline'
        log.info('Node {} is offline'.format(host_conf.name))
        return
    else:
        node_stats.status = 'online'

    try:
        container_list = my_engine.list(only_label={'zoe_deployment_name': get_conf().deployment_name})
        info = my_engine.info()
    except ZoeException:
        return

    node_stats.container_count = len(container_list)
    node_stats.cores_total = info['NCPU']
    node_stats.memory_total = info['MemTotal']
    if info['Labels'] is not None:
        node_stats.labels += set(info['Labels'])

    # A soft limit equal to the total memory is not counted as a reservation
    node_stats.memory_reserved = sum([cont['memory_soft_limit'] for cont in container_list if cont['memory_soft_limit'] != node_stats.memory_total])
    node_stats.cores_reserved = sum([cont['cpu_quota'] / cont['cpu_period'] for cont in container_list if cont['cpu_period'] != 0])

    stats = {}
    for cont in container_list:
        period = cont['cpu_period']
        stats[cont['id']] = {
            # Same guard as cores_reserved above: a zero period would raise
            # ZeroDivisionError. NOTE(review): 0 is used as "no limit" here,
            # confirm this is what consumers of core_limit expect.
            'core_limit': cont['cpu_quota'] / period if period != 0 else 0,
            'mem_limit': cont['memory_soft_limit'],
        }
    node_stats.service_stats = stats

    if not get_usage_stats:
        node_stats.memory_in_use = 0
        node_stats.cores_in_use = 0
        return

    if get_conf().kairosdb_enable:
        kdb = KairosDBInMetrics()
        for cont in container_list:
            stats[cont['id']].update(kdb.get_service_usage(cont['name']))
    else:
        for cont in container_list:
            try:
                aux = my_engine.stats(cont['id'], stream=False)  # this call is very slow (>~1sec)
                if 'usage' in aux['memory_stats']:
                    stats[cont['id']]['mem_usage'] = aux['memory_stats']['usage']
                else:
                    stats[cont['id']]['mem_usage'] = 0
                stats[cont['id']]['cpu_usage'] = self._get_core_usage(aux)
            except ZoeException:
                continue

    # Containers whose usage could not be read have no usage keys: count them
    # as 0 instead of failing with KeyError.
    node_stats.memory_in_use = sum([stat.get('mem_usage', 0) for stat in stats.values()])
    node_stats.cores_in_use = sum([stat.get('cpu_usage', 0) for stat in stats.values()])
def execution_delete(execution: Execution):
    """Remove an execution, must only be called if the execution is NOT running.

    Deletes the directory holding the service logs of the execution; errors
    while removing it (including a missing directory) are ignored.

    :param execution: the execution to clean up, must not be active
    """
    assert not execution.is_active
    # os.path.join always returns a string, so no None check is needed here
    path = os.path.join(get_conf().service_logs_base_path, get_conf().deployment_name, str(execution.id))
    shutil.rmtree(path, ignore_errors=True)
def main(test_conf=None):
    """
    The entrypoint for the zoe-master script.

    Loads the configuration, sets up logging, initializes the backend, the
    metrics thread and the scheduler, then runs the ZMQ API loop until it is
    interrupted. All components are shut down before returning.

    :param test_conf: passed to ``config.load_configuration``
    :return: int, 0 on a clean shutdown, non-zero when the configuration
        check, the backend initialization or the API loop failed
    """
    config.load_configuration(test_conf)
    args = config.get_conf()

    log_args = {
        'level': logging.DEBUG if args.debug else logging.INFO,
        'format': LOG_FORMAT
    }
    if args.log_file != "stderr":
        log_args['filename'] = args.log_file
    logging.basicConfig(**log_args)

    ret = _check_configuration_sanity()
    if ret != 0:
        return ret

    log.info("Initializing DB manager")
    state = SQLManager(args)

    try:
        zoe_master.backends.interface.initialize_backend(state)
    except ZoeException as e:
        log.error('Cannot initialize backend: {}'.format(e.message))
        return 1

    metrics = StatsManager(state)
    metrics.start()

    log.info("Initializing scheduler")
    scheduler = getattr(zoe_master.scheduler, args.scheduler_class)(state, args.scheduler_policy, metrics)

    restart_resubmit_scheduler(state, scheduler)

    log.info("Starting ZMQ API server...")
    api_server = APIManager(metrics, scheduler, state)

    if config.get_conf().gelf_listener != 0:
        gelf_listener = GELFListener()
    else:
        gelf_listener = None

    exit_code = 0
    try:
        api_server.loop()
    except KeyboardInterrupt:
        pass
    except Exception:
        log.exception('Fatal error in API loop')
        exit_code = 1
    finally:
        scheduler.quit()
        api_server.quit()
        zoe_master.backends.interface.shutdown_backend()
        metrics.quit()
        if gelf_listener is not None:
            gelf_listener.quit()
    # Always hand an int back, as documented
    return exit_code
def zoe_web_main(test_conf=None) -> int:
    """
    This is the entry point for the Zoe Web script.

    Configures logging, initializes the database, builds the Tornado
    application (web pages plus REST API) and serves it until interrupted.

    :param test_conf: passed to ``config.load_configuration``
    :return: int, 1 when LDAP is requested but not available, 0 otherwise
    """
    config.load_configuration(test_conf)
    args = config.get_conf()

    log_args = {'format': LOG_FORMAT}
    log_args['level'] = logging.DEBUG if args.debug else logging.INFO
    if args.log_file != "stderr":
        log_args['filename'] = args.log_file
    logging.basicConfig(**log_args)
    # Third-party loggers are too chatty below WARNING
    for noisy_logger in ("MARKDOWN", "tornado"):
        logging.getLogger(noisy_logger).setLevel(logging.WARNING)

    if config.get_conf().auth_type == 'ldap' and not zoe_api.auth.ldap.LDAP_AVAILABLE:
        log.error("LDAP authentication requested, but 'pyldap' module not installed.")
        return 1

    sql_manager = zoe_lib.state.SQLManager(config.get_conf())
    sql_manager.init_db()

    master_api = zoe_api.master_api.APIManager()
    api_endpoint = zoe_api.api_endpoint.APIEndpoint(master_api, sql_manager)

    web_root = os.path.join(os.path.dirname(__file__), "web")
    routes = zoe_api.web.web_init(api_endpoint) + zoe_api.rest_api.api_init(api_endpoint)
    app = Application(
        routes,
        static_path=os.path.join(web_root, "static"),
        template_path=os.path.join(web_root, "templates"),
        cookie_secret=config.get_conf().cookie_secret,
        debug=args.debug,
    )
    JinjaApp.init_app(app)

    log.info("Starting HTTP server...")
    http_server = HTTPServer(app)
    http_server.bind(args.listen_port, args.listen_address)
    http_server.start(num_processes=1)

    try:
        IOLoop.current().start()
    except KeyboardInterrupt:
        print("CTRL-C detected, terminating")

    return 0
def _check_configuration_sanity():
    """Check that the configured workspace directory exists.

    :return: 0 when the directory exists, 1 (after logging an error) otherwise
    """
    conf = config.get_conf()
    workspace_dir = os.path.join(conf.workspace_base_path, conf.workspace_deployment_path)
    if os.path.exists(workspace_dir):
        return 0
    log.error('Workspace base directory does not exist: {}'.format(workspace_dir))
    return 1
def execution_endpoints(self, user: zoe_lib.state.User, execution: zoe_lib.state.Execution):
    """Return a list of the services and public endpoints available for a certain execution.

    :param user: the user on whose behalf the services are looked up
    :param execution: the execution whose services and ports are inspected
    :return: a ``(services_info, endpoints)`` tuple; ``services_info`` holds
        the result of ``service_by_id`` for each service, ``endpoints`` holds
        ``(readable name, endpoint, external endpoint)`` tuples
    """
    services_info = []
    endpoints = []
    for service in execution.services:
        services_info.append(self.service_by_id(user, service.id))
        for port in service.ports:
            if port.external_ip is not None:
                if zoe_lib.config.get_conf(
                ).traefik_zk_ips is None or not port.enable_proxy:
                    # Direct access: the URL template only receives ip:port
                    endpoint = port.url_template.format(
                        **{
                            "ip_port":
                            port.external_ip + ":" + str(port.external_port)
                        })
                    endpoint_ext = None
                else:
                    # Proxied access: the template also receives the proxy URL
                    endpoint_ext = '{}/{}'.format(
                        zoe_lib.config.get_conf().traefik_base_url,
                        port.proxy_key())
                    endpoint = port.url_template.format(
                        **{
                            "ip_port":
                            port.external_ip + ":" + str(port.external_port),
                            "proxy_path":
                            endpoint_ext
                        })
                endpoints.append(
                    (port.readable_name, endpoint, endpoint_ext))
            # NOTE(review): assumed to be a sibling of the external_ip check
            # above (not nested in it) - confirm against the formatted source.
            if get_conf().kube_ingress_controller.upper(
            ) == 'YES' and port.enable_proxy:
                # Ingress host: <service>-<execution id>-<deployment><suffix>
                endpoint = port.url_template.format(
                    **{
                        "ip_port":
                        str(service.name) + "-" + str(execution.id) + "-" +
                        get_conf().deployment_name +
                        get_conf().kube_ingress_url_suffix,
                        "proxy_path":
                        ""
                    })
                # Built from the same expression as endpoint
                endpoint_ext = port.url_template.format(
                    **{
                        "ip_port":
                        str(service.name) + "-" + str(execution.id) + "-" +
                        get_conf().deployment_name +
                        get_conf().kube_ingress_url_suffix,
                        "proxy_path":
                        ""
                    })
                endpoints.append(
                    (port.readable_name, endpoint, endpoint_ext))
    return services_info, endpoints
def __init__(self, state):
    """Set up the 'metrics' daemon thread.

    :param state: stored as ``self.state``
    """
    super().__init__(name='metrics', daemon=True)
    conf = get_conf()
    self.state = state
    self.deployment_name = conf.deployment_name
    self.stop = threading.Event()
    self._current_platform_stats = None
    # Usage metrics are only available when KairosDB is enabled
    self.usage_metrics = KairosDBInMetrics() if conf.kairosdb_enable else None
def _set_parameters(self, app_descr, params):
    """Apply the submitted request arguments to an application description.

    Each parameter is read from the request argument ``<name>-<kind>``.
    Afterwards the optional ``<service>-resource_memory_min`` and
    ``<service>-resource_cores_min`` arguments override the minimum
    resources of the services, capped at the configured maximum limits.

    :param app_descr: application description dict, modified in place
    :param params: iterable of objects exposing ``name`` and ``kind``
    :return: the same ``app_descr`` object
    """
    for param in params:
        argument_name = param.name + '-' + param.kind
        if param.kind == 'environment':
            # Replace the value of every environment entry with this name
            for service in app_descr['services']:
                for env in service['environment']:
                    if env[0] == param.name:
                        env[1] = self.get_argument(argument_name)
        elif param.kind == 'command':
            # Only the first service with a matching name is updated
            for service in app_descr['services']:
                if service['name'] == param.name:
                    service['command'] = self.get_argument(argument_name)
                    break
        elif param.kind == 'service_count':
            for service in app_descr['services']:
                if service['name'] == param.name:
                    service['total_count'] = int(
                        self.get_argument(argument_name))
                    service['essential_count'] = int(
                        self.get_argument(argument_name))
        else:
            log.warning('Unknown parameter kind: {}, ignoring...'.format(
                param.kind))

    for service in app_descr['services']:
        argument_name = service['name'] + '-resource_memory_min'
        try:
            # Probe for the argument; a missing one leaves the service as is
            self.get_argument(argument_name)
        except MissingArgumentError:
            pass
        else:
            # The (1024**3) factor converts the argument and the limit to the
            # unit stored in the description (presumably GiB to bytes)
            if float(self.get_argument(
                    argument_name)) >= get_conf().max_memory_limit:
                val = int(get_conf().max_memory_limit * (1024**3))
            else:
                val = int(
                    float(self.get_argument(argument_name)) * (1024**3))
            service["resources"]["memory"]["min"] = val

        argument_name = service['name'] + '-resource_cores_min'
        try:
            self.get_argument(argument_name)
        except MissingArgumentError:
            pass
        else:
            if float(self.get_argument(
                    argument_name)) >= get_conf().max_core_limit:
                val = get_conf().max_core_limit
            else:
                val = float(self.get_argument(argument_name))
            service["resources"]["cores"]["min"] = val
            # NOTE(review): this break stops the loop after the first service
            # that has a cores override; its nesting level is assumed - confirm.
            break

    return app_descr
def get_auth(handler: tornado.web.RequestHandler):
    """Try to authenticate a request.

    The signed ``zoe`` cookie (``<uid>.<role>``) is used when present,
    otherwise the credentials in the ``Authorization`` header are checked
    with the configured authenticator. Guest users are always rejected.

    :param handler: the request handler carrying cookie and headers
    :return: a ``(uid, role)`` tuple
    :raises ZoeRestAPIException: (401) when credentials are missing,
        malformed or rejected, or when the user has the guest role
    :raises ZoeException: when the configured authentication type is unknown
    """
    cookie = handler.get_secure_cookie('zoe')
    if cookie:
        # The role is the text after the last dot, so that user IDs
        # containing dots are still parsed correctly.
        uid, role = cookie.decode('utf-8').rsplit('.', 1)
        log.debug('Authentication done using cookie (user {} from {} for {})'.format(uid, handler.request.remote_ip, handler.request.path))
        if role == "guest":
            raise ZoeRestAPIException('Guest users cannot use the API, ask for a role upgrade', 401, {'WWW-Authenticate': 'Basic realm="Login Required"'})
        return uid, role

    auth_header = handler.request.headers.get('Authorization')
    if auth_header is None or not (auth_header.startswith('Basic ') or auth_header.startswith('Bearer ')):
        raise ZoeRestAPIException('missing or wrong authentication information', 401, {'WWW-Authenticate': 'Basic realm="Login Required"'})

    # Process for authentication with username, password
    # NOTE(review): 'Bearer ' headers pass the check above but are decoded
    # below exactly like Basic credentials, from offset 6 - confirm intent.
    try:
        auth_decoded = base64.decodebytes(bytes(auth_header[6:], 'ascii')).decode('utf-8')
        # Split on the first colon only: the password may itself contain ':'
        username, password = auth_decoded.split(':', 1)
    except ValueError:
        # Covers invalid base64, non-ASCII/non-UTF-8 payloads and a missing ':'
        raise ZoeRestAPIException('missing or wrong authentication information', 401, {'WWW-Authenticate': 'Basic realm="Login Required"'})

    if get_conf().auth_type == 'text':
        authenticator = PlainTextAuthenticator()  # type: BaseAuthenticator
    elif get_conf().auth_type == 'ldap':
        authenticator = LDAPAuthenticator(sasl=False)  # type: BaseAuthenticator
    elif get_conf().auth_type == 'ldapsasl':
        authenticator = LDAPAuthenticator(sasl=True)  # type: BaseAuthenticator
    else:
        raise ZoeException('Configuration error, unknown authentication method: {}'.format(get_conf().auth_type))

    uid, role = authenticator.auth(username, password)
    if uid is None:
        raise ZoeRestAPIException('missing or wrong authentication information', 401, {'WWW-Authenticate': 'Basic realm="Login Required"'})

    log.debug('Authentication done using auth-mechanism (user {} from {} for {})'.format(uid, handler.request.remote_ip, handler.request.path))
    if role == "guest":
        raise ZoeRestAPIException('Guest users cannot use the API, ask for a role upgrade', 401, {'WWW-Authenticate': 'Basic realm="Login Required"'})

    return uid, role
def get(self, execution_id):
    """Gather details about an execution and render its inspection page.

    Unauthenticated users are redirected to the login page. When plots are
    enabled and the execution has started, a Grafana URL covering the
    execution time range (in epoch milliseconds) is added to the template.
    """
    uid, role = get_auth(self)
    if uid is None:
        self.redirect(self.get_argument('next', u'/login'))
        return

    e = self.api_endpoint.execution_by_id(uid, role, execution_id)

    # A single call provides both values, no need to query the endpoint twice
    services_info, endpoints = self.api_endpoint.execution_endpoints(uid, role, e)

    template_vars = {
        "uid": uid,
        "role": role,
        "e": e,
        "services_info": services_info,
        "endpoints": endpoints,
    }

    if get_conf().enable_plots and e.time_start is not None:
        grafana_url_template = 'http://bigfoot-m2.eurecom.fr/grafana/dashboard/db/zoe-executions?orgId=1&from={}&to={}&var-execution_id={}&refresh=1y'
        epoch = datetime.datetime(1970, 1, 1)

        def to_millis(moment):
            """Convert a datetime to milliseconds elapsed since the epoch."""
            return int((moment - epoch) / datetime.timedelta(seconds=1) * 1000)

        if e.time_end is None:
            # Still running: plot up to now
            e_time_end = int(time.time() * 1000)
        else:
            e_time_end = to_millis(e.time_end)
        e_time_start = to_millis(e.time_start)
        template_vars['grafana_url'] = grafana_url_template.format(e_time_start, e_time_end, execution_id)

    self.render('execution_inspect.html', **template_vars)
def inspect_service(self, name) -> Dict[str, Any]:
    """Get information of a specific service.

    :param name: name of the service in the configured namespace
    :return: dict with ``service_name``, ``port_forwarding`` (one
        ``{'port', 'nodePort'}`` dict per service port) and, when present,
        ``clusterIP``; None when anything goes wrong (the error is logged)
    """
    try:
        namespace_services = pykube.Service.objects(self.api).filter(namespace=get_conf().kube_namespace)
        spec = namespace_services.get_by_name(name).obj['spec']

        info = {'service_name': name, 'port_forwarding': []}
        if 'clusterIP' in spec:
            info['clusterIP'] = spec['clusterIP']

        info['port_forwarding'] = [
            {'port': port_spec['port'], 'nodePort': port_spec['nodePort']}
            for port_spec in spec['ports']
        ]
    except Exception as ex:
        log.error(ex)
        info = None

    return info
def run(self):
    """The thread loop.

    Every CHECK_INTERVAL seconds the containers of this deployment are
    listed: services with a container get their status updated, services
    without one are marked as destroyed.
    """
    log.info("Checker thread started")
    while not self.stop:
        try:
            swarm = SwarmClient()
        except ZoeException as e:
            log.error(str(e))
            time.sleep(CHECK_INTERVAL)
            continue
        service_list = self.state.services.select()
        try:
            container_list = swarm.list(only_label={'zoe_deployment_name': get_conf().deployment_name})
        except ZoeException as e:
            log.error(str(e))
            # Wait before retrying, do not spin on a failing backend
            time.sleep(CHECK_INTERVAL)
            continue
        containers = {cont['id']: cont for cont in container_list}

        for service in service_list:
            assert isinstance(service, Service)
            if service.backend_id in containers:
                self._update_service_status(service, containers[service.backend_id])
            elif service.backend_status != service.BACKEND_DESTROY_STATUS:
                # The container has disappeared
                service.set_backend_status(service.BACKEND_DESTROY_STATUS)

        time.sleep(CHECK_INTERVAL)
def allocate_elastic(self, execution: Execution) -> bool:
    """Try to find an allocation for elastic services.

    Services that are active and whose backend has not died are skipped.
    Each remaining service is placed on a node chosen among those it fits
    on and is marked runnable.

    :return: True when at least one elastic service has been allocated
    """
    allocated_any = False
    for service in execution.elastic_services:
        is_running = service.status == service.ACTIVE_STATUS and service.backend_status != service.BACKEND_DIE_STATUS
        if is_running:
            continue

        candidate_nodes = []
        unfit_reasons = []
        for node in self.nodes.values():
            if node.service_fits(service):
                candidate_nodes.append(node)
            else:
                unfit_reasons.append('node {}: {} ## '.format(node.name, node.service_why_unfit(service)))

        if len(candidate_nodes) == 0:  # this service does not fit anywhere
            log.info('Cannot fit elastic service {} anywhere, reasons: {}'.format(service.id, ''.join(unfit_reasons)))
            continue

        log.debug('Node selection for service {} with {} policy'.format(service.id, get_conf().placement_policy))
        chosen = self._select_node_policy(candidate_nodes)
        chosen.service_add(service)
        service.set_runnable()
        allocated_any = True

    return allocated_any
def __init__(self, execution: Execution, service: Service, env_subst_dict):
    """Describe a service instance in backend-neutral terms.

    Memory and core limits are taken from the service resource reservation
    (None when no minimum is set) and their maximum is capped at the
    configured limits. Labels, environment and volumes are generated by
    ``zoe_master.backends.common``.

    :param execution: the execution the service belongs to
    :param service: the service to instantiate
    :param env_subst_dict: values used to expand the environment templates
    """
    self.name = service.unique_name
    self.hostname = service.dns_name
    self.backend_host = service.backend_host

    if service.resource_reservation.memory.min is None:
        self.memory_limit = None
    else:
        self.memory_limit = service.resource_reservation.memory
        # (1024 ** 3) scales the configured limit to the unit of memory.max
        max_memory = get_conf().max_memory_limit * (1024 ** 3)
        if self.memory_limit.max > max_memory:
            self.memory_limit.max = max_memory

    if service.resource_reservation.cores.min is None:
        self.core_limit = None
    else:
        self.core_limit = service.resource_reservation.cores
        if self.core_limit.max > get_conf().max_core_limit:
            # Cap the maximum, as done for memory, instead of replacing the
            # whole limit object with a bare number.
            self.core_limit.max = get_conf().max_core_limit

    self.labels = zoe_master.backends.common.gen_labels(service, execution)
    self.environment = service.environment + zoe_master.backends.common.gen_environment(execution, service, env_subst_dict)
    self.volumes = zoe_master.backends.common.gen_volumes(service, execution)

    self.command = service.command
    self.work_dir = service.work_dir
    self.image_name = service.image_name

    self.ports = [BackendPort(port.internal_number, port.protocol) for port in service.ports]
def service_logs(self, user: zoe_lib.state.User, service_id):
    """Retrieve the logs for the given service.

    :param user: the requesting user; must own the service or have a role
        that can operate on other users' services
    :param service_id: ID of the service
    :return: the log file, opened for reading as UTF-8 text
    :raises zoe_api.exceptions.ZoeNotFoundException: when the service or its
        log file does not exist
    :raises zoe_api.exceptions.ZoeAuthException: when the user may not
        access the service
    """
    service = self.sql.services.select(id=service_id, only_one=True)
    if service is None:
        raise zoe_api.exceptions.ZoeNotFoundException('No such service')

    allowed = service.user_id == user.id or user.role.can_operate_others
    if not allowed:
        raise zoe_api.exceptions.ZoeAuthException()

    log_file_name = service.name + '.txt'
    path = os.path.join(get_conf().service_logs_base_path, get_conf().deployment_name, str(service.execution_id), log_file_name)
    if os.path.exists(path):
        return open(path, encoding='utf-8')
    raise zoe_api.exceptions.ZoeNotFoundException('Service log not available')
def __init__(self, state: SQLManager) -> None:
    """Create and immediately start the 'checker' daemon thread.

    :param state: stored as ``self.state``
    """
    # Name and daemon flag go through the constructor: setName() and
    # setDaemon() are deprecated.
    super().__init__(name='checker', daemon=True)
    self.stop = False
    self.state = state
    self.kube = KubernetesClient(get_conf())
    self.start()
def __init__(self, metrics: BaseMetricSender, scheduler: ZoeScheduler, state: SQLManager) -> None:
    """Create the ZMQ REP socket and bind it to the configured listen URI.

    :param metrics: stored as ``self.metrics``
    :param scheduler: stored as ``self.scheduler``
    :param state: stored as ``self.state``
    """
    self.metrics = metrics
    self.scheduler = scheduler
    self.state = state
    self.debug_has_replied = False

    self.listen_uri = config.get_conf().api_listen_uri
    self.context = zmq.Context()
    self.zmq_s = self.context.socket(zmq.REP)
    self.zmq_s.bind(self.listen_uri)
def main():
    """
    The entrypoint for the zoe-master script.

    Loads the configuration, sets up logging and the metric sender, creates
    the scheduler, monitor and checker, then runs the ZMQ API loop until it
    is interrupted. All components are shut down before returning.

    :return: int, 0 on a clean shutdown, 1 when the API loop died with an
        unexpected exception
    """
    config.load_configuration()
    args = config.get_conf()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO, format=LOG_FORMAT)

    if config.get_conf().influxdb_enable:
        metrics = InfluxDBMetricSender(config.get_conf().deployment_name, config.get_conf().influxdb_url, config.get_conf().influxdb_dbname)
    else:
        metrics = LogMetricSender(config.get_conf().deployment_name)

    log.info("Initializing DB manager")
    state = SQLManager(args)

    log.info("Initializing scheduler")
    scheduler = ZoeScheduler()

    monitor = ZoeMonitor(state)
    checker = ZoeSwarmChecker(state)

    restart_resubmit_scheduler(state, scheduler)

    log.info("Starting ZMQ API server...")
    api_server = APIManager(metrics, scheduler, state)

    exit_code = 0
    try:
        api_server.loop()
    except KeyboardInterrupt:
        pass
    except Exception:
        log.exception('fatal error')
        exit_code = 1
    finally:
        scheduler.quit()
        monitor.quit()
        checker.quit()
        api_server.quit()
        metrics.quit()
    # Always hand an int back, as documented
    return exit_code
def zoe_web_main() -> int:
    """
    This is the entry point for the Zoe Web script.

    Configures logging, initializes the database, builds the Tornado
    application (web pages plus REST API), schedules the periodic
    maintenance callbacks and serves until interrupted.

    :return: int, 1 when LDAP is requested but not available, 0 otherwise
    """
    config.load_configuration()
    args = config.get_conf()
    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=log_level, format=LOG_FORMAT)

    if config.get_conf().auth_type == 'ldap' and not zoe_api.auth.ldap.LDAP_AVAILABLE:
        log.error("LDAP authentication requested, but 'pyldap' module not installed.")
        return 1

    zoe_api.db_init.init()

    api_endpoint = zoe_api.api_endpoint.APIEndpoint()

    web_root = os.path.join(os.path.dirname(__file__), "web")
    routes = zoe_api.web.web_init(api_endpoint) + zoe_api.rest_api.api_init(api_endpoint)
    app = Application(
        routes,
        static_path=os.path.join(web_root, "static"),
        template_path=os.path.join(web_root, "templates"),
        # debug=args.debug
    )
    JinjaApp.init_app(app)

    log.info("Starting HTTP server...")
    http_server = HTTPServer(app)
    http_server.bind(args.listen_port, args.listen_address)
    http_server.start(num_processes=1)

    # Periodic maintenance: (callback, interval in milliseconds)
    for callback, interval_ms in (
            (api_endpoint.retry_submit_error_executions, 30000),
            (api_endpoint.cleanup_dead_executions, 60000)):
        PeriodicCallback(callback, interval_ms).start()

    try:
        IOLoop.current().start()
    except KeyboardInterrupt:
        print("CTRL-C detected, terminating")

    return 0
def get(self):
    """HTTP GET method.

    Writes the Zoe version, API version, application format version and
    deployment name as the response.
    """
    self.write({
        'version': ZOE_VERSION,
        'api_version': ZOE_API_VERSION,
        'application_format_version': ZOE_APPLICATION_FORMAT_VERSION,
        'deployment_name': get_conf().deployment_name,
    })
def run(self):
    """The thread loop.

    Listens for Swarm events forever, handing each one to ``_event_cb``.
    Exceptions raised by the listener are logged and the listener is
    restarted.
    """
    log.info("Monitor thread started")
    swarm = SwarmClient(get_conf())
    on_event = self._event_cb
    while True:
        try:
            swarm.event_listener(on_event)
        except Exception:
            log.exception('Exception in monitor thread')
        # wait a bit before retrying the connection
        time.sleep(1)
def _init(execution: Execution):
    """Create the log directory for an execution.

    :return: the directory path, or None when service logging is disabled
        (empty ``service_log_path``) or the directory cannot be created
    """
    if get_conf().service_log_path == '':
        return None

    base_path = _path_from_execution(execution)
    try:
        os.makedirs(base_path, exist_ok=True)
    except OSError:  # PermissionError is a subclass of OSError
        log.exception('Error creating the directory at path: {}'.format(base_path))
        return None
    else:
        return base_path
def service_logs(self, uid, role, service_id, stream=True):
    """Retrieve the logs for the given service.

    :param uid: ID of the requesting user
    :param role: role of the requesting user; 'admin' may read any service
    :param service_id: ID of the service
    :param stream: passed through to ``SwarmClient.logs``
    :return: whatever ``SwarmClient.logs`` returns for the container
    :raises zoe_api.exceptions.ZoeNotFoundException: when the service does
        not exist or has no container
    :raises zoe_api.exceptions.ZoeAuthException: when the user may not
        access the service
    """
    service = self.sql.service_list(id=service_id, only_one=True)
    if service is None:
        raise zoe_api.exceptions.ZoeNotFoundException('No such service')

    allowed = service.user_id == uid or role == 'admin'
    if not allowed:
        raise zoe_api.exceptions.ZoeAuthException()

    container_id = service.docker_id
    if container_id is None:
        raise zoe_api.exceptions.ZoeNotFoundException('Container is not running')

    return SwarmClient(get_conf()).logs(container_id, stream)
def get_auth(handler: tornado.web.RequestHandler):
    """Try to authenticate a request using HTTP Basic authentication.

    :param handler: the request handler whose ``Authorization`` header is checked
    :return: the ``(uid, role)`` pair returned by the configured authenticator
    :raises ZoeRestAPIException: (401) when the header is missing, is not a Basic
            header, cannot be decoded into ``username:password``, or when the
            authenticator returns no uid
    :raises ZoeException: when the configured ``auth_type`` is neither 'text' nor 'ldap'
    """
    def _unauthorized():
        """Build the 401 error that asks the client for Basic credentials."""
        return ZoeRestAPIException('missing or wrong authentication information', 401, {'WWW-Authenticate': 'Basic realm="Login Required"'})

    auth_header = handler.request.headers.get('Authorization')
    if auth_header is None or not auth_header.startswith('Basic '):
        raise _unauthorized()

    try:
        auth_decoded = base64.decodebytes(bytes(auth_header[6:], 'ascii')).decode('utf-8')
        # Split on the first colon only: the password itself may contain ':'.
        username, password = auth_decoded.split(':', 1)
    except ValueError as ex:
        # Non-ASCII header data, invalid base64, invalid UTF-8 and a missing ':'
        # separator all surface as ValueError (or one of its subclasses).
        raise _unauthorized() from ex

    if get_conf().auth_type == 'text':
        authenticator = PlainTextAuthenticator()  # type: BaseAuthenticator
    elif get_conf().auth_type == 'ldap':
        authenticator = LDAPAuthenticator()
    else:
        raise ZoeException('Configuration error, unknown authentication method: {}'.format(get_conf().auth_type))

    uid, role = authenticator.auth(username, password)
    if uid is None:
        raise _unauthorized()

    return uid, role
def get_auth(handler: ZoeRequestHandler):
    """Try to authenticate a request using HTTP Basic authentication.

    :param handler: the request handler whose ``Authorization`` header is checked
    :return: the ``(uid, role)`` pair returned by the configured authenticator
    :raises ZoeAuthException: when the header is missing, is not a Basic header,
            cannot be decoded into ``username:password``, or when the
            authenticator returns no uid
    :raises ZoeException: when the configured ``auth_type`` is neither 'text' nor 'ldap'
    """
    auth_header = handler.request.headers.get('Authorization')
    if auth_header is None or not auth_header.startswith('Basic '):
        raise zoe_api.exceptions.ZoeAuthException

    try:
        auth_decoded = base64.decodebytes(bytes(auth_header[6:], 'ascii')).decode('utf-8')
        # Split on the first colon only: the password itself may contain ':'.
        username, password = auth_decoded.split(':', 1)
    except ValueError as ex:
        # Non-ASCII header data, invalid base64, invalid UTF-8 and a missing ':'
        # separator all surface as ValueError (or one of its subclasses).
        raise zoe_api.exceptions.ZoeAuthException from ex

    if get_conf().auth_type == 'text':
        authenticator = PlainTextAuthenticator()  # type: BaseAuthenticator
    elif get_conf().auth_type == 'ldap':
        authenticator = LDAPAuthenticator()
    else:
        raise zoe_api.exceptions.ZoeException('Configuration error, unknown authentication method: {}'.format(get_conf().auth_type))

    uid, role = authenticator.auth(username, password)
    if uid is None:
        raise zoe_api.exceptions.ZoeAuthException

    return uid, role
def run(self):
    """The thread loop.

    Until ``self.stop`` is set, periodically compares the services known to the
    state with the containers listed by Swarm for this deployment:

    * a service whose container is listed as 'exited' is marked as dead;
    * a service with no matching container is marked as destroyed.

    Services already marked as dead or destroyed are left alone.
    """
    log.info("Checker thread started")
    swarm = SwarmClient(get_conf())
    while not self.stop:
        service_list = self.state.service_list()
        container_list = swarm.list(only_label={'zoe.deployment_name': get_conf().deployment_name})

        for service in service_list:
            assert isinstance(service, Service)
            if service.docker_status in (service.DOCKER_DESTROY_STATUS, service.DOCKER_DIE_STATUS):
                continue

            matching = [entry for entry in container_list if entry['id'] == service.docker_id]
            if not matching:
                service.set_docker_status(service.DOCKER_DESTROY_STATUS)
                continue

            for entry in matching:
                if entry['status'] == 'exited':
                    log.info('resetting status of service {}, died with no event'.format(service.name))
                    service.set_docker_status(service.DOCKER_DIE_STATUS)

        time.sleep(CHECK_INTERVAL)
def main():
    """The main entrypoint function.

    Loads the configuration, validates the ZApp description read from
    ``args.jsonfile``, registers it as an execution in a FakeSQLManager, starts
    its containers, prints the last log lines of each service and then waits
    for CTRL-C before terminating the execution.
    """
    conf = load_configuration()
    config.load_configuration(conf)
    args = config.get_conf()

    root_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=root_level, format=LOG_FORMAT)

    # Per-library log levels.
    library_levels = (
        ('kazoo', logging.WARNING),
        ('requests', logging.WARNING),
        ('urllib3', logging.WARNING),
        ('docker', logging.INFO),
        ("tornado", logging.DEBUG),
    )
    for logger_name, level in library_levels:
        logging.getLogger(logger_name).setLevel(level)

    state = FakeSQLManager()

    zapp_description = json.load(args.jsonfile)

    print('Validating zapp description...')
    zoe_lib.applications.app_validate(zapp_description)

    exec_id = state.execution_new('test', 'fake_user', zapp_description)
    execution = state.execution_list(only_one=True, id=exec_id)
    _digest_application_description(state, execution)

    print('Zapp digested, starting containers...')
    execution_to_containers(execution)

    print('Giving the containers a few seconds to start...')
    time.sleep(5)

    swarm = SwarmClient(args)
    for service in execution.services:
        print("Service {}, docker ID: {}".format(service.name, service.docker_id))
        raw_logs = swarm.logs(service.docker_id, False)
        log_lines = raw_logs.decode('utf-8').split('\n')
        # Show only the tail of the log.
        for log_line in log_lines[-10:]:
            print(log_line)

    print("Execution as been started, press CTRL-C to terminate it")
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        pass

    print('Terminating...')
    terminate_execution(execution)
def init():
    """DB init entrypoint.

    Opens a psycopg2 connection using the database settings returned by
    get_conf(), calls version_table(), selects the deployment schema and creates
    the tables when check_schema_version() returns a false value. The changes
    are committed and the cursor and connection are closed before returning.
    """
    # NOTE(review): the user/password part of this DSN appears to be masked
    # ('******') in this copy of the source; restore the real configuration
    # lookups before relying on this code.
    dsn = 'dbname=' + get_conf().dbname + \
        ' user='******' password='******' host=' + get_conf().dbhost + \
        ' port=' + str(get_conf().dbport)

    conn = psycopg2.connect(dsn)
    cur = conn.cursor()

    # presumably makes sure the schema version table exists -- confirm in version_table()
    version_table(cur)
    # The deployment name is interpolated directly into the statement: it is
    # used as the schema name, searched before "public".
    cur.execute('SET search_path TO {},public'.format(get_conf().deployment_name))
    # Tables are created only when the schema version check fails.
    if not check_schema_version(cur, get_conf().deployment_name):
        create_tables(cur)

    conn.commit()
    cur.close()
    conn.close()
    return
def _container_event(self, event: dict): if 'zoe.deployment_name' not in event['Actor']['Attributes']: return if event['Actor']['Attributes']['zoe.deployment_name'] != get_conf().deployment_name: return service_id = event['Actor']['Attributes']['zoe.service.id'] # type: int service = self.state.service_list(only_one=True, id=service_id) if 'exec' in event['Action']: pass elif 'create' in event['Action']: service.set_docker_status(service.DOCKER_CREATE_STATUS) elif 'start' in event['Action']: service.set_docker_status(service.DOCKER_START_STATUS) elif 'die' in event['Action'] or 'kill' in event['Action'] or 'stop' in event['Action']: service.set_docker_status(service.DOCKER_DIE_STATUS) elif 'oom' in event['Action']: service.set_docker_status(service.DOCKER_OOM_STATUS) log.warning('Service {} got killed by an OOM condition'.format(service.id)) elif 'destroy' in event['Action']: service.set_docker_status(service.DOCKER_DESTROY_STATUS) else: log.debug('Unmanaged container action: {}'.format(event['Action']))
def save(execution: Execution):
    """Save the logs of the services of the execution specified as argument.

    One ``<service name>.txt`` file is written per service inside the directory
    returned by ``_init``. Nothing is done when that directory is unavailable.
    If the logs of a service cannot be obtained, the remaining services are
    skipped.
    """
    base_dir = _init(execution)
    if base_dir is None:
        return

    for service in execution.services:
        target = os.path.join(base_dir, service.name + '.txt')

        log_gen = SwarmClient(get_conf()).logs(service.docker_id, stream=True, follow=False)
        if log_gen is None:
            _shutdown()
            return

        try:
            with open(target, 'wb') as out_fp:
                out_fp.writelines(log_gen)
        except FileNotFoundError:
            log.error("Could not create file {}".format(target))

    _shutdown()
def __init__(self):
    """Build the base path from the two workspace settings in the configuration."""
    root = config.get_conf().workspace_base_path
    deployment_dir = config.get_conf().workspace_deployment_path
    self.base_path = os.path.join(root, deployment_dir)
def __init__(self):
    """Create the ZeroMQ context and poller, then connect to the configured master URL."""
    # The socket is left unset here; self._connect() is called last.
    self.zmq_s = None
    self.context = zmq.Context(1)
    self.poll = zmq.Poller()
    master_url = config.get_conf().master_url
    self.master_uri = master_url  # type: str
    self._connect()
def __init__(self):
    """Create the master API manager and the SQL manager used by this object."""
    api_manager = zoe_api.master_api.APIManager()
    self.master = api_manager
    sql_manager = zoe_lib.sql_manager.SQLManager(get_conf())
    self.sql = sql_manager
def dns_name(self):
    """Getter for the DNS name of this service as it will be registered in Docker's DNS.

    The name is ``<service name>-<execution id>-<deployment name>``.
    """
    components = (self.name, self.execution_id, get_conf().deployment_name)
    return "-".join(str(component) for component in components)
def __init__(self):
    """Initialize the LDAP connection object and store the base DN from the configuration."""
    server_uri = get_conf().ldap_server_uri
    self.connection = ldap.initialize(server_uri)
    base_dn = get_conf().ldap_base_dn
    self.base_dn = base_dn
def _path_from_execution(execution: Execution):
    """Return ``<service_log_path>/<deployment_name>/<execution id>`` for the given execution."""
    components = [
        get_conf().service_log_path,
        get_conf().deployment_name,
        str(execution.id),
    ]
    return os.path.join(*components)
def __init__(self):
    """Store the path of the password file and check that it is readable.

    :raises ZoeNotFoundException: when the configured file cannot be read
    """
    passwd_file = get_conf().auth_file
    self.passwd_file = passwd_file
    readable = os.access(passwd_file, os.R_OK)
    if not readable:
        raise zoe_api.exceptions.ZoeNotFoundException('Password file not found at: {}'.format(passwd_file))