Пример #1
0
 def ip_address(self):
     """Return the service's current IP address on the configured overlay network.

     The address is read from Swarm on every call because it can change outside
     our control. When the service's Docker status is not the started status,
     an empty dict is returned instead of an address.
     """
     if self.docker_status == self.DOCKER_START_STATUS:
         container_info = SwarmClient(get_conf()).inspect_container(self.docker_id)
         addresses = container_info['ip_address']
         # Addresses are keyed by network name; pick the configured overlay network.
         return addresses[get_conf().overlay_network_name]
     return {}
Пример #2
0
 def ip_address(self):
     """Look up the service IP address in Swarm.

     Swarm is queried each time since the address may change outside our
     control. A service whose Docker status differs from the started status
     yields an empty dict.
     """
     not_started = self.docker_status != self.DOCKER_START_STATUS
     if not_started:
         return {}
     client = SwarmClient(get_conf())
     inspected = client.inspect_container(self.docker_id)
     per_network = inspected['ip_address']
     # The inspection result maps network names to addresses.
     network_name = get_conf().overlay_network_name
     return per_network[network_name]
Пример #3
0
    def loop(self):
        """Serve API requests forever: receive a JSON message, dispatch on its command, reply.

        Every message must carry a 'command' key. Supported commands:

        * 'execution_start' (needs 'exec_id'): mark the execution scheduled and submit it
        * 'execution_terminate' (needs 'exec_id'): mark it cleaning up and terminate it
        * 'execution_delete' (needs 'exec_id'): delete it if it exists; always replies ok
        * 'service_inspect' (needs 'service_id'): reply with the Swarm inspection data

        Any other command is logged and answered with an error reply. After each
        request the call is reported to the 'metric' singleton.

        :raises ZoeException: if a command branch completed without sending a reply
        """
        assert isinstance(config.singletons['sql_manager'],
                          zoe_lib.sql_manager.SQLManager)
        while True:
            message = self.zmq_s.recv_json()
            # Reset per request; expected to be flipped by the _reply_* helpers — TODO confirm.
            self.debug_has_replied = False
            start_time = time.time()
            if message['command'] == 'execution_start':
                exec_id = message['exec_id']
                execution = config.singletons['sql_manager'].execution_list(
                    id=exec_id, only_one=True)
                if execution is None:
                    self._reply_error('Execution ID {} not found'.format(
                        message['exec_id']))
                else:
                    execution.set_scheduled()
                    # The reply is sent before the execution is handed to the execution manager.
                    self._reply_ok()
                    zoe_master.execution_manager.execution_submit(execution)
            elif message['command'] == 'execution_terminate':
                exec_id = message['exec_id']
                execution = config.singletons['sql_manager'].execution_list(
                    id=exec_id, only_one=True)
                if execution is None:
                    self._reply_error('Execution ID {} not found'.format(
                        message['exec_id']))
                else:
                    execution.set_cleaning_up()
                    self._reply_ok()
                    zoe_master.execution_manager.execution_terminate(execution)
            elif message['command'] == 'execution_delete':
                exec_id = message['exec_id']
                execution = config.singletons['sql_manager'].execution_list(
                    id=exec_id, only_one=True)
                # Deleting an unknown execution is not an error: ok is replied either way.
                if execution is not None:
                    zoe_master.execution_manager.execution_delete(execution)
                self._reply_ok()
            elif message['command'] == 'service_inspect':
                service_id = message['service_id']
                service = config.singletons['sql_manager'].service_list(
                    id=service_id, only_one=True)
                if service is None:
                    self._reply_error('no such service')
                else:
                    swarm = SwarmClient(config.get_conf())
                    info = swarm.inspect_container(service.docker_id)
                    self._reply_ok(info)
            else:
                log.error('Unknown command: {}'.format(message['command']))
                self._reply_error('unknown command')

            if not self.debug_has_replied:
                # Still answer the client, then fail loudly naming the offending command.
                self._reply_error('bug')
                raise ZoeException(
                    'BUG: command {} does not fill a reply'.format(
                        message['command']))

            config.singletons['metric'].metric_api_call(
                start_time, message['command'])
Пример #4
0
    def loop(self):
        """Process API messages in an endless loop, answering each one.

        A message is a JSON object with a 'command' key and the identifier that
        command requires ('exec_id' for the execution_* commands, 'service_id'
        for 'service_inspect'). Unknown commands are logged and answered with an
        error. Each handled call is reported to the 'metric' singleton.

        :raises ZoeException: if a command branch finished without sending a reply
        """
        assert isinstance(config.singletons['sql_manager'], zoe_lib.sql_manager.SQLManager)
        while True:
            message = self.zmq_s.recv_json()
            # Reset per request; expected to be flipped by the _reply_* helpers — TODO confirm.
            self.debug_has_replied = False
            start_time = time.time()
            command = message['command']
            if command == 'execution_start':
                exec_id = message['exec_id']
                execution = config.singletons['sql_manager'].execution_list(id=exec_id, only_one=True)
                if execution is None:
                    self._reply_error('Execution ID {} not found'.format(exec_id))
                else:
                    execution.set_scheduled()
                    # The reply is sent before the execution is handed to the execution manager.
                    self._reply_ok()
                    zoe_master.execution_manager.execution_submit(execution)
            elif command == 'execution_terminate':
                exec_id = message['exec_id']
                execution = config.singletons['sql_manager'].execution_list(id=exec_id, only_one=True)
                if execution is None:
                    self._reply_error('Execution ID {} not found'.format(exec_id))
                else:
                    execution.set_cleaning_up()
                    self._reply_ok()
                    zoe_master.execution_manager.execution_terminate(execution)
            elif command == 'execution_delete':
                exec_id = message['exec_id']
                execution = config.singletons['sql_manager'].execution_list(id=exec_id, only_one=True)
                # Deleting an unknown execution is not an error: ok is replied either way.
                if execution is not None:
                    zoe_master.execution_manager.execution_delete(execution)
                self._reply_ok()
            elif command == 'service_inspect':
                service_id = message['service_id']
                service = config.singletons['sql_manager'].service_list(id=service_id, only_one=True)
                if service is None:
                    self._reply_error('no such service')
                else:
                    swarm = SwarmClient(config.get_conf())
                    info = swarm.inspect_container(service.docker_id)
                    self._reply_ok(info)
            else:
                log.error('Unknown command: {}'.format(command))
                self._reply_error('unknown command')

            if not self.debug_has_replied:
                # Still answer the client, then fail loudly naming the offending command.
                self._reply_error('bug')
                raise ZoeException('BUG: command {} does not fill a reply'.format(command))

            config.singletons['metric'].metric_api_call(start_time, command)
Пример #5
0
class PlatformManager:
    """
    Glue between the Zoe state (users, executions, containers) and the Swarm cluster.

    It spawns and terminates containers for executions, manages per-user networks and
    gateway containers, and can reconcile the stored state with what Swarm reports.

    :type swarm: SwarmClient
    :type scheduler: ZoeScheduler
    :type state_manager: StateManager
    """
    # ID of an additional network that every spawned container and gateway is connected to.
    # NOTE(review): hard-coded, deployment-specific value — presumably belongs in the configuration.
    _EXTRA_NETWORK_ID = 'eeef9754c16790a29d5210c5d9ad8e66614ee8a6229b6dc6f779019d46cec792'

    def __init__(self, sched_policy_class):
        """
        :param sched_policy_class: scheduling policy class handed to ZoeScheduler
        """
        self.swarm = SwarmClient(get_conf())
        self.scheduler = ZoeScheduler(self, sched_policy_class)
        # Not set here; it has to be assigned from outside before the methods that use it are called.
        self.state_manager = None

    def execution_submitted(self, execution: execution_module.Execution):
        """Mark the execution as scheduled and pass it to the scheduler."""
        execution.set_scheduled()
        self.scheduler.incoming(execution)

    def execution_start(self, execution: execution_module.Execution) -> bool:
        """
        Spawn the containers of the execution and mark it as started.

        If spawning raises ZoeException the execution is terminated with reason 'error'
        and the exception is re-raised.

        NOTE(review): annotated as returning bool but no value is returned on success;
        left unchanged because callers may rely on the current (None) result.
        """
        try:
            self._application_to_containers(execution)
        except ZoeException:
            self.execution_terminate(execution, reason='error')
            raise
        execution.set_started()
        self.state_manager.state_updated()

    def _application_to_containers(self, execution: execution_module.Execution):
        """Spawn one container for each process of the execution's application."""
        for process in execution.application.processes:
            self._spawn_process(execution, process)

    def _spawn_process(self, execution: execution_module.Execution, process_description: application_module.Process) -> bool:
        """
        Start one container for a process of the execution and register it in the state.

        :param execution: the execution the new container belongs to
        :param process_description: the process to run (name, image, resources, environment, command, ports)
        :return: True once the container has been spawned and registered
        :raises ZoeException: if an environment or command template references an unknown
                              variable, or if Swarm returns no information for the new container
        """
        name_prefix = get_conf().container_name_prefix

        copts = ContainerOptions()
        copts.name = name_prefix + '-' + process_description.name + "-{}".format(execution.id)
        copts.set_memory_limit(process_description.required_resources['memory'])
        copts.network_name = '{}-usernet-{}'.format(name_prefix, execution.owner.id)
        container_id = self.state_manager.gen_id()
        copts.labels = {
            'zoe.{}'.format(name_prefix): '',
            'zoe.execution.id': str(execution.id),
            'zoe.execution.name': execution.name,
            'zoe.container.id': str(container_id),
            'zoe.container.name': process_description.name,
            'zoe.owner': execution.owner.name,
            'zoe.prefix': name_prefix,
            'zoe.type': 'app_process'
        }
        if process_description.monitor:
            copts.labels['zoe.monitor'] = ''
        else:
            copts.labels['zoe.normal'] = ''
        copts.restart = not process_description.monitor  # Monitor containers should not restart

        # Generate a dictionary containing the current cluster status (before the new container is spawned)
        # This information is used to substitute template strings in the environment variables
        subst_dict = {
            "execution_id": str(execution.id),
            "user_id": str(execution.owner.id),
            'user_name': execution.owner.name,
            'name_prefix': name_prefix
        }
        for env_name, env_value in process_description.environment:
            try:
                env_value = env_value.format(**subst_dict)
            except KeyError:
                raise ZoeException("cannot find variable to substitute in expression {}".format(env_value))
            copts.add_env_variable(env_name, env_value)

        # The same dictionary is used for templates in the command
        if process_description.command is not None:
            try:
                command = process_description.command.format(**subst_dict)
            except KeyError:
                # Same failure mode as for the environment: report it as a ZoeException so that
                # execution_start() can clean up instead of leaking a bare KeyError.
                raise ZoeException("cannot find variable to substitute in expression {}".format(process_description.command))
            copts.set_command(command)

        cont_info = self.swarm.spawn_container(process_description.docker_image, copts)
        if cont_info is None:
            # start_gateway_container() guards against the same None result.
            raise ZoeException('Cannot create container for process {}'.format(process_description.name))
        container = container_module.Container(self.state_manager)
        container.docker_id = cont_info["docker_id"]
        container.ip_address = cont_info["ip_address"]
        container.name = copts.name
        container.is_monitor = process_description.monitor
        container.ports = [p.to_dict() for p in process_description.ports]

        container.id = container_id
        execution.containers.append(container)
        container.execution = execution

        self.swarm.connect_to_network(container.docker_id, self._EXTRA_NETWORK_ID)

        self.state_manager.new('container', container)
        return True

    def execution_terminate(self, execution: execution_module.Execution, reason):
        """
        Terminate all containers of the execution, store their logs and set the final status.

        :param execution: The execution to be terminated
        :param reason: termination reason: 'error' and 'finished' set the matching status,
                       any other value marks the execution as terminated
        :return:
        """
        logs = []
        if execution.containers:
            # Iterate over a copy of the list, as in the original code; presumably deleting a
            # container from the state alters execution.containers — TODO confirm.
            containers = execution.containers.copy()
            for c in containers:
                assert isinstance(c, container_module.Container)
                container_log = self.log_get(c.id)
                if container_log is not None:
                    logs.append((c.name, container_log))
                self.swarm.terminate_container(c.docker_id, delete=True)
                self.state_manager.delete('container', c.id)
                log.info('Container {} terminated'.format(c.name))
            execution.store_logs(logs)

        if reason == 'error':
            execution.set_error()
        elif reason == 'finished':
            execution.set_finished()
        else:
            execution.set_terminated()
        self.scheduler.execution_terminate(execution)

    def start_gateway_container(self, user):
        """
        Spawn the gateway container of a user and record its Docker ID and URLs on the user.

        :param user: the user who owns the gateway
        :raises ZoeException: if Swarm returns no information for the new container
        """
        name_prefix = get_conf().container_name_prefix

        copts = ContainerOptions()
        copts.name = '{}-gateway-{}'.format(name_prefix, user.id)
        copts.network_name = '{}-usernet-{}'.format(name_prefix, user.id)
        copts.ports.append(1080)
        copts.labels = {
            'zoe.{}.gateway'.format(name_prefix): '',
            'zoe.owner': user.name,
            'zoe.prefix': name_prefix,
            'zoe.type': 'gateway'
        }
        copts.restart = True
        if user.role == 'guest':
            image = get_conf().private_registry + '/zoerepo/guest-gateway'
        else:
            image = get_conf().private_registry + '/zoerepo/guest-gateway'  # TODO: create an image with ssh
        cont_info = self.swarm.spawn_container(image, copts)
        if cont_info is None:
            raise ZoeException('Cannot create user gateway container')
        user.gateway_docker_id = cont_info['docker_id']
        user.set_gateway_urls(cont_info)
        self.swarm.connect_to_network(user.gateway_docker_id, self._EXTRA_NETWORK_ID)

    def kill_gateway_container(self, user):
        """Terminate and delete the user's gateway container and clear its references on the user."""
        self.swarm.terminate_container(user.gateway_docker_id, delete=True)
        user.gateway_docker_id = None
        user.gateway_urls = []

    def create_user_network(self, user):
        """Create the network of the user in Swarm and store its ID on the user."""
        log.info('Creating a new network for user {}'.format(user.id))
        net_name = '{}-usernet-{}'.format(get_conf().container_name_prefix, user.id)
        net_id = self.swarm.network_create(net_name)
        user.network_id = net_id

    def remove_user_network(self, user):
        """Remove the network of the user from Swarm."""
        log.info('Removing network for user {}'.format(user.name))
        self.swarm.network_remove(user.network_id)

    def log_get(self, container_id: int) -> str:
        """
        Fetch the log of a container from Swarm.

        :param container_id: ID of the container in the state
        :return: the log as returned by Swarm, or an empty string if the container is not in the state
        """
        container = self.state_manager.get_one('container', id=container_id)
        if container is None:
            return ''
        else:
            return self.swarm.log_get(container.docker_id)

    def container_stats(self, container_id: int) -> ContainerStats:
        """
        Fetch the statistics of a container from Swarm.

        NOTE(review): unlike log_get() there is no check for an unknown container ID here.
        """
        container = self.state_manager.get_one('container', id=container_id)
        return self.swarm.stats(container.docker_id)

    def is_container_alive(self, container: container_module.Container) -> bool:
        """Return the 'running' flag Swarm reports for the container, False if Swarm returns nothing for it."""
        ret = self.swarm.inspect_container(container.docker_id)
        if ret is None:
            return False
        return ret["running"]

    def swarm_stats(self) -> SwarmStats:
        """Return the cluster information reported by Swarm."""
        # TODO implement some caching
        return self.swarm.info()

    def scheduler_stats(self) -> SchedulerStats:
        """Return the statistics of the active scheduler policy."""
        return self.scheduler.scheduler_policy.stats()

    def check_state_swarm_consistency(self):
        """
        Compare the stored state with what Swarm reports and repair the differences.

        * users without a valid network or gateway are re-linked to an existing one when a
          matching network/gateway is found, otherwise a new one is created
        * networks and gateways that cannot be attributed to an existing user are removed,
          as are duplicate networks for the same user
        * containers in the state that Swarm does not list are dropped from the state
        """
        state_changed = False
        users = self.state_manager.get('user')

        name_prefix = get_conf().container_name_prefix
        network_name_prefix = '{}-usernet-'.format(name_prefix)
        gateway_name_prefix = '{}-gateway-'.format(name_prefix)

        networks = self.swarm.network_list(network_name_prefix)
        gateways = self.swarm.list(['zoe.{}.gateway'.format(name_prefix)])

        # Built once: membership is tested for every user below.
        network_ids = {x['id'] for x in networks}
        gateway_ids = {x['id'] for x in gateways}

        users_no_network = []
        users_no_gateway = []
        networks_to_delete = []
        gateways_to_delete = []

        for u in users:
            if u.network_id is None:
                log.error('state inconsistency: user {} has no network'.format(u.name))
                users_no_network.append(u)
            elif u.network_id not in network_ids:
                log.error('state inconsistency: user {} has an invalid network'.format(u.name))
                u.network_id = None
                users_no_network.append(u)

            if u.gateway_docker_id is None:
                log.error('state inconsistency: user {} has no gateway'.format(u.name))
                users_no_gateway.append(u)
            elif u.gateway_docker_id not in gateway_ids:
                log.error('state inconsistency: user {} has an invalid gateway container ID'.format(u.name))
                u.gateway_docker_id = None
                users_no_gateway.append(u)

        duplicate_check = set()
        for n in networks:
            # The user ID is the part of the network name that follows the prefix.
            try:
                uid = int(n['name'][len(network_name_prefix):])
            except ValueError:
                log.error('network {} does not belong to Zoe, bug?'.format(n['name']))
                networks_to_delete.append(n['id'])
                continue
            if uid in duplicate_check:
                log.warning('state inconsistency: found two networks for the same user')
                networks_to_delete.append(n['id'])
                continue
            duplicate_check.add(uid)
            user = self.state_manager.get_one('user', id=uid)
            if user is not None and user in users_no_network:
                user.network_id = n['id']
                users_no_network.remove(user)
                log.error('fixed: user {} linked to network {}'.format(user.name, n['name']))
                state_changed = True
                continue
            elif user is None:
                log.error('state inconsistency: found a network for user {} who no longer exists'.format(uid))
                networks_to_delete.append(n['id'])

        for g in gateways:
            # The user ID is the part of the gateway name that follows the prefix.
            try:
                uid = int(g['name'][len(gateway_name_prefix):])
            except ValueError:
                log.error('container {} does not belong to Zoe, bug?'.format(g['name']))
                gateways_to_delete.append(g['id'])
                continue
            user = self.state_manager.get_one('user', id=uid)
            if user is not None and user in users_no_gateway:
                user.gateway_docker_id = g['id']
                users_no_gateway.remove(user)
                cont_info = self.swarm.inspect_container(g['id'])
                user.set_gateway_urls(cont_info)
                log.error('fixed: user {} linked to gateway {}'.format(user.name, g['name']))
                state_changed = True
                continue
            elif user is None:
                log.error('state inconsistency: found a gateway for user {} who no longer exists'.format(uid))
                gateways_to_delete.append(g['id'])

        # Fix all inconsistencies found
        for g in gateways_to_delete:
            log.error('fixed: terminating orphan gateway container {}'.format(g[:8]))
            self.swarm.terminate_container(g, delete=True)
        for n in networks_to_delete:
            log.error('fixed: terminating orphan network {}'.format(n[:8]))
            self.swarm.network_remove(n)

        for u in users_no_network:
            log.error('fixed: creating network for user {}'.format(u.name))
            self.create_user_network(u)
        for u in users_no_gateway:
            log.error('fixed: creating gateway for user {}'.format(u.name))
            self.start_gateway_container(u)

        # ### Check executions and container consistency
        swarm_containers = self.swarm.list(only_label='zoe.{}'.format(name_prefix))
        swarm_container_ids = {x['id'] for x in swarm_containers}
        # Collect first, delete afterwards: the containers mapping is not modified while iterating it.
        conts_state_to_delete = []
        for c_id, c in self.state_manager.containers.items():
            if c.docker_id not in swarm_container_ids:
                log.error('fixed: removing from state container {} that does not exist in Swarm'.format(c.name))
                conts_state_to_delete.append(c_id)
        for c_id in conts_state_to_delete:
            self.state_manager.delete('container', c_id)

        # The two lists still hold the users for whom a network/gateway was just created.
        if state_changed or users_no_gateway or users_no_network:
            self.state_manager.state_updated()