Пример #1
0
 def ip_address(self):
     """Return the service's address on the configured overlay network.

     Swarm is queried on every call because the address can change outside our control.
     While the Docker status is not ``DOCKER_START_STATUS`` an empty dict is returned instead.
     """
     if self.docker_status == self.DOCKER_START_STATUS:
         container_info = SwarmClient(get_conf()).inspect_container(self.docker_id)
         addresses = container_info['ip_address']
         return addresses[get_conf().overlay_network_name]
     return {}
Пример #2
0
def guest_check_thread(args):
    """Poll Swarm forever, reporting exited Zoe containers and checking guests.

    Each pass lists the containers labelled with this deployment's name. Every
    container whose status contains ``'Exited'`` is reported through
    ``container_died``; if that raises ``ZoeAPIException`` a warning is logged
    and the container is terminated and deleted. ``check_guests`` runs at the
    end of each pass.

    :param args: configuration object handed to ``SwarmClient``
    """
    swarm = SwarmClient(args)

    while True:
        try:
            zoe_containers = swarm.list(
                {'zoe.deployment_name': get_conf().deployment_name})
            for c in zoe_containers:
                if 'Exited' not in c['status']:
                    continue
                zoe_id = c['labels']['zoe.service.id']
                try:
                    container_died(zoe_id)
                except ZoeAPIException:
                    log.warning(
                        'Container ' + c['name'] +
                        ' has died, but Zoe does not know anything about it, deleting'
                    )
                    swarm.terminate_container(c['id'], delete=True)

            check_guests(swarm)
        except Exception:
            log.exception('Something bad happened')

        # Sleep after failed passes as well: with the sleep inside the try block
        # a persistent error made this loop spin without any delay.
        time.sleep(get_conf().loop_time)
Пример #3
0
def swarm_events_thread(args):
    """Keep a Swarm event listener running forever with ``main_callback`` as its callback.

    Whenever ``event_listener`` returns or raises, the listener is started again
    after a one second pause.

    :param args: configuration object handed to ``SwarmClient``
    """
    import time  # kept local so this block does not rely on a module-level import

    swarm = SwarmClient(args)
    while True:
        try:
            swarm.event_listener(main_callback)
        except Exception:
            log.exception('Something bad happened')
        # Wait a bit before reconnecting, otherwise an unreachable Swarm turns
        # this into a tight retry loop that floods the log.
        time.sleep(1)
Пример #4
0
 def ip_address(self):
     """Look up the service IP address in Swarm.

     The address is never cached since it may change outside our control. An empty dict is
     returned when the Docker status differs from ``DOCKER_START_STATUS``.
     """
     not_started = self.docker_status != self.DOCKER_START_STATUS
     if not_started:
         return {}

     client = SwarmClient(get_conf())
     details = client.inspect_container(self.docker_id)
     per_network = details['ip_address']
     network_name = get_conf().overlay_network_name
     return per_network[network_name]
Пример #5
0
def main():
    """
    The entrypoint for the zoe-observer script.

    Loads the configuration, sets up logging and then polls Swarm until CTRL-C is
    pressed. Each pass reports the containers whose status contains ``'Exited'``
    through ``container_died`` and then runs ``check_guests``.

    :return: None
    """
    load_configuration()
    args = get_conf()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    logging.getLogger('kazoo').setLevel(logging.WARNING)
    logging.getLogger('requests').setLevel(logging.WARNING)
    logging.getLogger('docker').setLevel(logging.INFO)

    swarm = SwarmClient(args)

    try:
        while True:
            try:
                zoe_containers = swarm.list('zoe.{}'.format(get_conf().container_name_prefix))
                for c in zoe_containers:
                    if 'Exited' in c['status']:
                        container_died(c['labels']['zoe.container_id'])

                check_guests(swarm)
            except Exception:
                log.exception('Something bad happened')

            # Sleep after failed passes too: with the sleep inside the inner try
            # a persistent error made this loop spin without any delay.
            time.sleep(get_conf().loop_time)
    except KeyboardInterrupt:
        # CTRL-C, whether during a pass or during the sleep, ends the script quietly.
        pass
Пример #6
0
def swarm_events_thread(args):
    """Run the Swarm event listener in an endless reconnect loop.

    ``main_callback`` is handed to ``SwarmClient.event_listener``. When the listener
    returns or raises, the exception (if any) is logged and the listener is restarted
    after one second.

    :param args: configuration object handed to ``SwarmClient``
    """
    import time  # kept local so this block does not rely on a module-level import

    swarm = SwarmClient(args)
    while True:
        try:
            swarm.event_listener(main_callback)
        except Exception:
            log.exception('Something bad happened')
        # Without a pause a permanently failing connection would be retried
        # back-to-back, burning CPU and flooding the log.
        time.sleep(1)
Пример #7
0
    def loop(self):
        """Serve the requests arriving on the ZeroMQ socket, forever.

        Every message is a JSON object with a ``command`` key. Supported commands
        are ``execution_start``, ``execution_terminate`` and ``execution_delete``
        (which also carry ``exec_id``) and ``service_inspect`` (which carries
        ``service_id``). Any other command is logged and answered with an error.
        After each message the API-call metric is updated.

        :raises ZoeException: if a message was processed but ``debug_has_replied``
            is still false afterwards
        """
        assert isinstance(config.singletons['sql_manager'],
                          zoe_lib.sql_manager.SQLManager)
        while True:
            message = self.zmq_s.recv_json()
            self.debug_has_replied = False
            start_time = time.time()
            command = message['command']
            if command == 'execution_start':
                exec_id = message['exec_id']
                execution = config.singletons['sql_manager'].execution_list(
                    id=exec_id, only_one=True)
                if execution is None:
                    self._reply_error(
                        'Execution ID {} not found'.format(exec_id))
                else:
                    execution.set_scheduled()
                    # Reply before submitting, as the original ordering did.
                    self._reply_ok()
                    zoe_master.execution_manager.execution_submit(execution)
            elif command == 'execution_terminate':
                exec_id = message['exec_id']
                execution = config.singletons['sql_manager'].execution_list(
                    id=exec_id, only_one=True)
                if execution is None:
                    self._reply_error(
                        'Execution ID {} not found'.format(exec_id))
                else:
                    execution.set_cleaning_up()
                    self._reply_ok()
                    zoe_master.execution_manager.execution_terminate(execution)
            elif command == 'execution_delete':
                exec_id = message['exec_id']
                execution = config.singletons['sql_manager'].execution_list(
                    id=exec_id, only_one=True)
                # Deleting an unknown execution is not an error: reply ok anyway.
                if execution is not None:
                    zoe_master.execution_manager.execution_delete(execution)
                self._reply_ok()
            elif command == 'service_inspect':
                service_id = message['service_id']
                service = config.singletons['sql_manager'].service_list(
                    id=service_id, only_one=True)
                if service is None:
                    self._reply_error('no such service')
                else:
                    swarm = SwarmClient(config.get_conf())
                    info = swarm.inspect_container(service.docker_id)
                    self._reply_ok(info)
            else:
                log.error('Unknown command: {}'.format(command))
                self._reply_error('unknown command')

            if not self.debug_has_replied:
                self._reply_error('bug')
                # The placeholder used to be left unformatted, hiding which
                # command triggered the bug.
                raise ZoeException(
                    'BUG: command {} does not fill a reply'.format(command))

            config.singletons['metric'].metric_api_call(start_time, command)
Пример #8
0
 def run(self):
     """Thread body: keep the Swarm event listener running, restarting it after every return or failure."""
     log.info("Monitor thread started")
     client = SwarmClient(get_conf())

     def forward_event(event):
         # Same indirection as a lambda: the handler is looked up when the event arrives.
         return self._event_cb(event)

     while True:
         try:
             client.event_listener(forward_event)
         except Exception:
             log.exception('Exception in monitor thread')
         # Give the connection a moment before trying again.
         time.sleep(1)
Пример #9
0
 def service_logs(self, uid, role, service_id, stream=True):
     """Return the Swarm logs of a service after checking that the caller may read them.

     :param uid: ID of the requesting user
     :param role: role of the requesting user; ``'admin'`` may read any service's logs
     :param service_id: ID of the service to look up
     :param stream: passed through to ``SwarmClient.logs``
     :raises ZoeNotFoundException: when the service is unknown or has no Docker ID
     :raises ZoeAuthException: when the caller neither owns the service nor is an admin
     """
     service = self.sql.service_list(only_one=True, id=service_id)
     if service is None:
         raise zoe_api.exceptions.ZoeNotFoundException('No such service')

     is_owner = service.user_id == uid
     if not (is_owner or role == 'admin'):
         raise zoe_api.exceptions.ZoeAuthException()

     if service.docker_id is None:
         raise zoe_api.exceptions.ZoeNotFoundException('Container is not running')

     return SwarmClient(get_conf()).logs(service.docker_id, stream)
Пример #10
0
def terminate_execution(execution: Execution) -> None:
    """Terminate and delete the container of every service of the execution that has a Docker ID.

    The execution is marked as cleaning up first and as terminated once all services are done.
    """
    execution.set_cleaning_up()
    client = SwarmClient(get_conf())
    for svc in execution.services:
        assert isinstance(svc, Service)
        if svc.docker_id is None:
            # No Docker ID recorded for this service: nothing to stop.
            continue
        svc.set_terminating()
        client.terminate_container(svc.docker_id, delete=True)
        svc.set_inactive()
        log.debug('Service {} terminated'.format(svc.name))
    execution.set_terminated()
Пример #11
0
 def service_logs(self, uid, role, service_id, stream=True):
     """Fetch the logs of one service from Swarm.

     Only the owner of the service or a user with the ``'admin'`` role may read them.

     :raises ZoeNotFoundException: unknown service, or service without a Docker ID
     :raises ZoeAuthException: caller is neither the owner nor an admin
     """
     found = self.sql.service_list(id=service_id, only_one=True)
     if found is None:
         raise zoe_api.exceptions.ZoeNotFoundException('No such service')
     if role != 'admin' and found.user_id != uid:
         raise zoe_api.exceptions.ZoeAuthException()
     if found.docker_id is None:
         raise zoe_api.exceptions.ZoeNotFoundException('Container is not running')

     client = SwarmClient(get_conf())
     container_logs = client.logs(found.docker_id, stream)
     return container_logs
Пример #12
0
 def run(self):
     """The thread loop.

     Keeps ``SwarmClient.event_listener`` running with ``self._event_cb`` as callback.
     Whenever the listener returns or raises, the loop waits one second and starts it again.
     """
     log.info("Monitor thread started")
     swarm = SwarmClient(get_conf())
     while True:
         try:
             swarm.event_listener(lambda x: self._event_cb(x))
         except Exception:
             # Deliberately not a bare ``except:``, which would also swallow
             # SystemExit and KeyboardInterrupt and keep the loop running.
             log.exception('Exception in monitor thread')
         time.sleep(1)  # Usually we got disconnected, so wait a bit before retrying
Пример #13
0
def terminate_execution(execution: Execution) -> None:
    """Stop an execution and remove the containers of its services from Swarm.

    Services without a Docker ID are left untouched. The execution goes through the
    cleaning-up state and ends up terminated.
    """
    execution.set_cleaning_up()
    swarm = SwarmClient(get_conf())

    def _stop_container(running_service):
        # Terminating -> container deleted -> inactive, in that order.
        running_service.set_terminating()
        swarm.terminate_container(running_service.docker_id, delete=True)
        running_service.set_inactive()
        log.debug('Service {} terminated'.format(running_service.name))

    for candidate in execution.services:
        assert isinstance(candidate, Service)
        if candidate.docker_id is not None:
            _stop_container(candidate)

    execution.set_terminated()
Пример #14
0
def main():
    """The main entrypoint function.

    Loads the configuration, validates the ZApp description read from
    ``args.jsonfile``, registers it as an execution in a ``FakeSQLManager`` and
    starts its containers. The last ten log lines of every service are printed,
    then the function waits for CTRL-C. The execution is always terminated before
    returning, including when start-up fails or is interrupted, so that no
    containers are left behind.
    """
    conf = load_configuration()
    config.load_configuration(conf)
    args = config.get_conf()
    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=log_level, format=LOG_FORMAT)

    logging.getLogger('kazoo').setLevel(logging.WARNING)
    logging.getLogger('requests').setLevel(logging.WARNING)
    logging.getLogger('urllib3').setLevel(logging.WARNING)
    logging.getLogger('docker').setLevel(logging.INFO)
    logging.getLogger("tornado").setLevel(logging.DEBUG)

    state = FakeSQLManager()

    zapp_description = json.load(args.jsonfile)

    print('Validating zapp description...')
    zoe_lib.applications.app_validate(zapp_description)

    exec_id = state.execution_new('test', 'fake_user', zapp_description)
    e = state.execution_list(only_one=True, id=exec_id)
    _digest_application_description(state, e)

    print('Zapp digested, starting containers...')
    try:
        execution_to_containers(e)

        print('Giving the containers a few seconds to start...')
        time.sleep(5)

        swarm = SwarmClient(args)
        for service in e.services:
            print("Service {}, docker ID: {}".format(service.name, service.docker_id))
            logs = swarm.logs(service.docker_id, False)
            logs = logs.decode('utf-8').split('\n')
            for log_line in logs[-10:]:
                print(log_line)

        print("Execution has been started, press CTRL-C to terminate it")
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        # CTRL-C is the normal way out, also while the containers are still starting.
        pass
    finally:
        # Runs on every exit path; terminate_execution() only touches services
        # that have a Docker ID, so it is safe after a partial start-up.
        print('Terminating...')
        terminate_execution(e)
Пример #15
0
    def loop(self):
        """Receive JSON commands from the ZeroMQ socket and answer each one, forever.

        Handled commands: ``execution_start``, ``execution_terminate``, ``execution_delete``
        (all keyed by ``exec_id``) and ``service_inspect`` (keyed by ``service_id``). Unknown
        commands are logged and answered with an error. The API-call metric is updated after
        every message.

        :raises ZoeException: if ``debug_has_replied`` is still false after a message was processed
        """
        assert isinstance(config.singletons['sql_manager'], zoe_lib.sql_manager.SQLManager)
        while True:
            message = self.zmq_s.recv_json()
            self.debug_has_replied = False
            start_time = time.time()
            command = message['command']
            if command == 'execution_start':
                exec_id = message['exec_id']
                execution = config.singletons['sql_manager'].execution_list(id=exec_id, only_one=True)
                if execution is None:
                    self._reply_error('Execution ID {} not found'.format(exec_id))
                else:
                    execution.set_scheduled()
                    # The reply is sent before the execution is handed to the execution manager.
                    self._reply_ok()
                    zoe_master.execution_manager.execution_submit(execution)
            elif command == 'execution_terminate':
                exec_id = message['exec_id']
                execution = config.singletons['sql_manager'].execution_list(id=exec_id, only_one=True)
                if execution is None:
                    self._reply_error('Execution ID {} not found'.format(exec_id))
                else:
                    execution.set_cleaning_up()
                    self._reply_ok()
                    zoe_master.execution_manager.execution_terminate(execution)
            elif command == 'execution_delete':
                exec_id = message['exec_id']
                execution = config.singletons['sql_manager'].execution_list(id=exec_id, only_one=True)
                # An unknown execution ID is silently accepted here.
                if execution is not None:
                    zoe_master.execution_manager.execution_delete(execution)
                self._reply_ok()
            elif command == 'service_inspect':
                service_id = message['service_id']
                service = config.singletons['sql_manager'].service_list(id=service_id, only_one=True)
                if service is None:
                    self._reply_error('no such service')
                else:
                    swarm = SwarmClient(config.get_conf())
                    info = swarm.inspect_container(service.docker_id)
                    self._reply_ok(info)
            else:
                log.error('Unknown command: {}'.format(command))
                self._reply_error('unknown command')

            if not self.debug_has_replied:
                self._reply_error('bug')
                # Fill in the placeholder: it used to be raised unformatted.
                raise ZoeException('BUG: command {} does not fill a reply'.format(command))

            config.singletons['metric'].metric_api_call(start_time, command)
Пример #16
0
def guest_check_thread(args):
    """Periodically look for exited Zoe containers and run the guest check, forever.

    On every pass the containers carrying this deployment's ``zoe.deployment_name`` label are
    listed. Containers whose status contains ``'Exited'`` are reported via ``container_died``;
    if that raises ``ZoeAPIException`` the container is terminated and deleted. Any other
    exception is logged and the next pass starts after the usual ``loop_time`` pause.

    :param args: configuration object handed to ``SwarmClient``
    """
    swarm = SwarmClient(args)

    while True:
        try:
            zoe_containers = swarm.list({'zoe.deployment_name': get_conf().deployment_name})
            exited = [c for c in zoe_containers if 'Exited' in c['status']]
            for c in exited:
                zoe_id = c['labels']['zoe.service.id']
                try:
                    container_died(zoe_id)
                except ZoeAPIException:
                    log.warning('Container ' + c['name'] + ' has died, but Zoe does not know anything about it, deleting')
                    swarm.terminate_container(c['id'], delete=True)

            check_guests(swarm)
        except Exception:
            log.exception('Something bad happened')

        # The pause used to live inside the try block, so a pass that failed early was
        # retried immediately, producing a busy loop while the error persisted.
        time.sleep(get_conf().loop_time)
Пример #17
0
    def run(self):
        """The thread loop.

        Until ``self.stop`` is set, compare the services known to the state with the containers
        Swarm lists for this deployment. Services already marked as destroyed or dead are
        skipped. A service whose container is listed with status ``'exited'`` is marked as
        dead; a service whose container is not listed at all is marked as destroyed.
        """
        log.info("Checker thread started")
        client = SwarmClient(get_conf())
        while not self.stop:
            services = self.state.service_list()
            containers = client.list(only_label={'zoe.deployment_name': get_conf().deployment_name})

            for svc in services:
                assert isinstance(svc, Service)
                already_gone = svc.docker_status == svc.DOCKER_DESTROY_STATUS or svc.docker_status == svc.DOCKER_DIE_STATUS
                if already_gone:
                    continue

                seen_in_swarm = False
                for cont in containers:
                    if cont['id'] != svc.docker_id:
                        continue
                    seen_in_swarm = True
                    if cont['status'] != 'exited':
                        continue
                    # The container stopped but no event told us about it.
                    log.info('resetting status of service {}, died with no event'.format(svc.name))
                    svc.set_docker_status(svc.DOCKER_DIE_STATUS)

                if not seen_in_swarm:
                    svc.set_docker_status(svc.DOCKER_DESTROY_STATUS)

            time.sleep(CHECK_INTERVAL)
Пример #18
0
def save(execution: Execution):
    """Save the logs of every service of the given execution, one ``<service name>.txt`` file each.

    Nothing is done when ``_init`` returns no path. Saving stops at the first service for
    which Swarm returns no log generator. A file that cannot be created is logged and
    skipped. ``_shutdown`` is called on every exit path that follows a successful ``_init``.

    :param execution: the execution whose service logs are saved
    """
    path = _init(execution)
    if path is None:
        return

    try:
        # One client is enough for all services.
        swarm = SwarmClient(get_conf())
        for service in execution.services:
            fpath = os.path.join(path, service.name + '.txt')

            log_gen = swarm.logs(service.docker_id, stream=True, follow=False)
            if log_gen is None:
                return
            try:
                with open(fpath, 'wb') as out_fp:
                    out_fp.writelines(log_gen)
            except FileNotFoundError:
                log.error("Could not create file {}".format(fpath))
    finally:
        # Previously skipped whenever an unexpected exception (e.g. a permission
        # error or a Swarm failure) escaped the loop.
        _shutdown()
Пример #19
0
class StatsManager(threading.Thread):
    """Daemon thread that refreshes a cached copy of ``SwarmClient.info()`` every five seconds."""

    def __init__(self):
        super().__init__(name='stats', daemon=True)
        self.swarm = SwarmClient(get_conf())

        # Result of the latest successful swarm.info() call; None until one succeeds.
        self._swarm_stats = None

    def run(self):
        """Thread body: poll Swarm forever, keeping the previous value when a poll fails."""
        log.info("Stats manager started")
        while True:
            try:
                self._swarm_stats = self.swarm.info()
            except Exception:
                # Not a bare ``except:``, which would also swallow SystemExit
                # and KeyboardInterrupt.
                log.exception("Exception in stats thread")
            time.sleep(5)

    @property
    def swarm_stats(self):
        """The most recently fetched Swarm statistics, or None if none were fetched yet."""
        return self._swarm_stats
Пример #20
0
def _spawn_service(execution: Execution, service: Service,
                   env_subst_dict: dict):
    """Create the Swarm container for one service and mark the service as active.

    The container options are filled from the service description and the
    configuration: GELF address, name, memory limit, overlay network, labels,
    restart policy, environment, exposed ports, volumes, constraints, the
    workspace volume and the command (a template expanded with
    ``env_subst_dict``). After the container is spawned it is connected to every
    network listed under ``networks`` in the description.

    :raises ZoeStartExecutionFatalException: the Swarm client cannot be created
        or connecting to an additional network fails
    :raises ZoeStartExecutionRetryException: spawning the container fails
    """
    options = DockerContainerOptions()
    options.gelf_log_address = get_conf().gelf_address
    options.name = service.dns_name
    options.set_memory_limit(service.description['required_resources']['memory'])
    options.network_name = get_conf().overlay_network_name
    options.labels = {
        'zoe.execution.name': execution.name,
        'zoe.execution.id': str(execution.id),
        'zoe.service.name': service.name,
        'zoe.service.id': str(service.id),
        'zoe.owner': execution.user_id,
        'zoe.deployment_name': get_conf().deployment_name,
        'zoe.type': 'app_service'
    }
    options.labels['zoe.monitor'] = 'true' if service.description['monitor'] else 'false'
    # Monitor containers should not restart
    options.restart = not service.description['monitor']

    _gen_environment(service, env_subst_dict, options)

    for port in service.description['ports']:
        if not port['expose']:
            continue
        options.ports.append(port['port_number'])  # FIXME UDP ports?

    if 'volumes' in service.description:
        for volume in service.description['volumes']:
            host_path, mount_point, readonly = volume
            options.add_volume_bind(host_path, mount_point, readonly)

    if 'constraints' in service.description:
        for item in service.description['constraints']:
            options.add_constraint(item)

    workspace = ZoeFSWorkspace()
    if workspace.can_be_attached():
        workspace_path = workspace.get_path(execution.user_id)
        options.add_volume_bind(workspace_path, workspace.get_mountpoint(), False)

    # The command is a template expanded with the same dictionary
    if 'command' in service.description:
        command_line = service.description['command'].format(**env_subst_dict)
        options.set_command(command_line)

    try:
        client = SwarmClient(get_conf())
    except Exception as e:
        raise ZoeStartExecutionFatalException(str(e))

    try:
        image = service.description['docker_image']
        spawned = client.spawn_container(image, options)
    except (ZoeException, ZoeLibException) as e:
        raise ZoeStartExecutionRetryException(str(e))

    service.set_active(spawned["docker_id"])

    if 'networks' in service.description:
        for extra_network in service.description['networks']:
            try:
                client.connect_to_network(service.docker_id, extra_network)
            except ZoeException as e:
                raise ZoeStartExecutionFatalException(str(e))
Пример #21
0
    def __init__(self):
        """Set up the daemon thread named 'stats' with its own Swarm client and no stats yet."""
        super().__init__(daemon=True, name='stats')
        swarm_client = SwarmClient(get_conf())
        self.swarm = swarm_client
        # Nothing has been fetched yet.
        self._swarm_stats = None
Пример #22
0
 def __init__(self, sched_policy_class):
     """Create the Swarm client and the scheduler; the state manager is left unset.

     :param sched_policy_class: forwarded to ``ZoeScheduler`` together with this object
     """
     self.swarm = SwarmClient(get_conf())
     scheduler = ZoeScheduler(self, sched_policy_class)
     self.scheduler = scheduler
     # Starts out as None; nothing in this method provides a state manager.
     self.state_manager = None
Пример #23
0
class PlatformManager:
    """
    Ties together the Swarm client, the scheduler and the state manager: starts and
    terminates the containers of executions, manages per-user gateways and networks
    and reconciles the stored state with what Swarm reports.

    :type swarm: SwarmClient
    :type scheduler: ZoeScheduler
    :type state_manager: StateManager
    """

    # Every spawned process container and every gateway container is additionally
    # connected to this network.
    # NOTE(review): hard-coded Docker network ID, presumably specific to one
    # deployment - it probably belongs in the configuration.
    EXTRA_NETWORK_ID = 'eeef9754c16790a29d5210c5d9ad8e66614ee8a6229b6dc6f779019d46cec792'

    def __init__(self, sched_policy_class):
        self.swarm = SwarmClient(get_conf())
        self.scheduler = ZoeScheduler(self, sched_policy_class)
        # Not provided by the constructor; the methods below use it, so it must be
        # assigned before they are called.
        self.state_manager = None

    def execution_submitted(self, execution: execution_module.Execution):
        """Mark the execution as scheduled and hand it to the scheduler."""
        execution.set_scheduled()
        self.scheduler.incoming(execution)

    def execution_start(self, execution: execution_module.Execution) -> None:
        """Spawn all containers of the execution and mark it as started.

        Nothing is returned: failure is signalled by the exception only.

        :raises ZoeException: re-raised after the execution has been terminated with reason 'error'
        """
        try:
            self._application_to_containers(execution)
        except ZoeException:
            self.execution_terminate(execution, reason='error')
            raise
        execution.set_started()
        self.state_manager.state_updated()

    def _application_to_containers(self, execution: execution_module.Execution):
        """Spawn one container per process of the execution's application."""
        for process in execution.application.processes:
            self._spawn_process(execution, process)

    def _spawn_process(self, execution: execution_module.Execution, process_description: application_module.Process) -> bool:
        """Spawn the container of one process and register it in the state.

        :return: True once the container is spawned and registered
        :raises ZoeException: a template variable is unknown or Swarm returned no container info
        """
        copts = ContainerOptions()
        copts.name = get_conf().container_name_prefix + '-' + process_description.name + "-{}".format(execution.id)
        copts.set_memory_limit(process_description.required_resources['memory'])
        copts.network_name = '{}-usernet-{}'.format(get_conf().container_name_prefix, execution.owner.id)
        container_id = self.state_manager.gen_id()
        copts.labels = {
            'zoe.{}'.format(get_conf().container_name_prefix): '',
            'zoe.execution.id': str(execution.id),
            'zoe.execution.name': execution.name,
            'zoe.container.id': str(container_id),
            'zoe.container.name': process_description.name,
            'zoe.owner': execution.owner.name,
            'zoe.prefix': get_conf().container_name_prefix,
            'zoe.type': 'app_process'
        }
        if process_description.monitor:
            copts.labels['zoe.monitor'] = ''
        else:
            copts.labels['zoe.normal'] = ''
        copts.restart = not process_description.monitor  # Monitor containers should not restart

        # Values available for substitution in the environment variables and in the command
        subst_dict = {
            "execution_id": str(execution.id),
            "user_id": str(execution.owner.id),
            'user_name': execution.owner.name,
            'name_prefix': get_conf().container_name_prefix
        }
        for env_name, env_value in process_description.environment:
            try:
                env_value = env_value.format(**subst_dict)
            except KeyError:
                raise ZoeException("cannot find variable to substitute in expression {}".format(env_value))
            copts.add_env_variable(env_name, env_value)

        # The same dictionary is used for templates in the command
        if process_description.command is not None:
            try:
                command = process_description.command.format(**subst_dict)
            except KeyError:
                # Same conversion as for the environment, so that execution_start() cleans up.
                raise ZoeException("cannot find variable to substitute in expression {}".format(process_description.command))
            copts.set_command(command)

        cont_info = self.swarm.spawn_container(process_description.docker_image, copts)
        if cont_info is None:
            # start_gateway_container() checks for None as well; without this check
            # the subscript below would fail with a TypeError that execution_start() does not handle.
            raise ZoeException('Cannot create container {}'.format(copts.name))
        container = container_module.Container(self.state_manager)
        container.docker_id = cont_info["docker_id"]
        container.ip_address = cont_info["ip_address"]
        container.name = copts.name
        container.is_monitor = process_description.monitor
        container.ports = [p.to_dict() for p in process_description.ports]

        container.id = container_id
        execution.containers.append(container)
        container.execution = execution

        self.swarm.connect_to_network(container.docker_id, self.EXTRA_NETWORK_ID)

        self.state_manager.new('container', container)
        return True

    def execution_terminate(self, execution: execution_module.Execution, reason):
        """
        Terminate all containers of the execution, store their logs and set its final state.

        :param execution: The execution to be terminated
        :param reason: termination reason: 'error', 'finished' or anything else for "terminated"
        :return:
        """
        logs = []
        if execution.containers:
            # Iterate over a copy of the list.
            # NOTE(review): presumably because deleting a container from the state also
            # removes it from execution.containers - confirm.
            containers = execution.containers.copy()
            for c in containers:
                assert isinstance(c, container_module.Container)
                container_log = self.log_get(c.id)
                if container_log is not None:
                    logs.append((c.name, container_log))
                self.swarm.terminate_container(c.docker_id, delete=True)
                self.state_manager.delete('container', c.id)
                log.info('Container {} terminated'.format(c.name))
            execution.store_logs(logs)

        if reason == 'error':
            execution.set_error()
        elif reason == 'finished':
            execution.set_finished()
        else:
            execution.set_terminated()
        self.scheduler.execution_terminate(execution)

    def start_gateway_container(self, user):
        """Spawn the gateway container of a user and record its Docker ID and URLs on the user.

        :raises ZoeException: Swarm returned no container info
        """
        copts = ContainerOptions()
        copts.name = '{}-gateway-{}'.format(get_conf().container_name_prefix, user.id)
        copts.network_name = '{}-usernet-{}'.format(get_conf().container_name_prefix, user.id)
        copts.ports.append(1080)
        copts.labels = {
            'zoe.{}.gateway'.format(get_conf().container_name_prefix): '',
            'zoe.owner': user.name,
            'zoe.prefix': get_conf().container_name_prefix,
            'zoe.type': 'gateway'
        }
        copts.restart = True
        if user.role == 'guest':
            image = get_conf().private_registry + '/zoerepo/guest-gateway'
        else:
            image = get_conf().private_registry + '/zoerepo/guest-gateway'  # TODO: create an image with ssh
        cont_info = self.swarm.spawn_container(image, copts)
        if cont_info is None:
            raise ZoeException('Cannot create user gateway container')
        user.gateway_docker_id = cont_info['docker_id']
        user.set_gateway_urls(cont_info)
        self.swarm.connect_to_network(user.gateway_docker_id, self.EXTRA_NETWORK_ID)

    def kill_gateway_container(self, user):
        """Terminate and delete the user's gateway container and clear the gateway fields of the user."""
        self.swarm.terminate_container(user.gateway_docker_id, delete=True)
        user.gateway_docker_id = None
        user.gateway_urls = []

    def create_user_network(self, user):
        """Create the ``<prefix>-usernet-<user id>`` network and store its ID on the user."""
        log.info('Creating a new network for user {}'.format(user.id))
        net_name = '{}-usernet-{}'.format(get_conf().container_name_prefix, user.id)
        net_id = self.swarm.network_create(net_name)
        user.network_id = net_id

    def remove_user_network(self, user):
        """Remove the network whose ID is stored on the user."""
        log.info('Removing network for user {}'.format(user.name))
        self.swarm.network_remove(user.network_id)

    def log_get(self, container_id: int) -> str:
        """Return the Swarm log of the container, or '' when the state has no such container."""
        container = self.state_manager.get_one('container', id=container_id)
        if container is None:
            return ''
        else:
            return self.swarm.log_get(container.docker_id)

    def container_stats(self, container_id: int) -> ContainerStats:
        """Return the Swarm statistics of the container.

        :raises ZoeException: the state has no container with this ID
        """
        container = self.state_manager.get_one('container', id=container_id)
        if container is None:
            # Used to fail with an AttributeError on None.
            raise ZoeException('Container {} not found'.format(container_id))
        return self.swarm.stats(container.docker_id)

    def is_container_alive(self, container: container_module.Container) -> bool:
        """Return the ``running`` flag Swarm reports for the container, False when Swarm returns nothing."""
        ret = self.swarm.inspect_container(container.docker_id)
        if ret is None:
            return False
        return ret["running"]

    def swarm_stats(self) -> SwarmStats:
        """Return ``SwarmClient.info()``; Swarm is queried on every call."""
        # TODO implement some caching
        return self.swarm.info()

    def scheduler_stats(self) -> SchedulerStats:
        """Return the statistics of the scheduler policy."""
        return self.scheduler.scheduler_policy.stats()

    def check_state_swarm_consistency(self):
        """Reconcile users, networks, gateways and containers in the state with Swarm.

        Users without a valid network or gateway are re-linked to a matching one found in
        Swarm, or get a new one. Networks and gateways that belong to no known user (or that
        are duplicates, for networks) are removed from Swarm. Containers that exist in the
        state but not in Swarm are deleted from the state. The state manager is notified
        when anything was changed.
        """
        state_changed = False
        name_prefix = get_conf().container_name_prefix
        usernet_prefix = '{}-usernet-'.format(name_prefix)
        gateway_prefix = '{}-gateway-'.format(name_prefix)

        users = self.state_manager.get('user')
        networks = self.swarm.network_list(usernet_prefix)
        gateways = self.swarm.list(['zoe.{}.gateway'.format(name_prefix)])
        # Built once: the per-user membership tests below used to rebuild these lists every time.
        network_ids = {x['id'] for x in networks}
        gateway_ids = {x['id'] for x in gateways}

        users_no_network = []
        users_no_gateway = []
        networks_to_delete = []
        gateways_to_delete = []

        for u in users:
            if u.network_id is None:
                log.error('state inconsistency: user {} has no network'.format(u.name))
                users_no_network.append(u)
            elif u.network_id not in network_ids:
                log.error('state inconsistency: user {} has an invalid network'.format(u.name))
                u.network_id = None
                users_no_network.append(u)

            if u.gateway_docker_id is None:
                log.error('state inconsistency: user {} has no gateway'.format(u.name))
                users_no_gateway.append(u)
            elif u.gateway_docker_id not in gateway_ids:
                log.error('state inconsistency: user {} has an invalid gateway container ID'.format(u.name))
                u.gateway_docker_id = None
                users_no_gateway.append(u)

        duplicate_check = set()
        for n in networks:
            try:
                # The user ID is whatever follows the prefix in the network name.
                uid = int(n['name'][len(usernet_prefix):])
            except ValueError:
                log.error('network {} does not belong to Zoe, bug?'.format(n['name']))
                networks_to_delete.append(n['id'])
                continue
            if uid in duplicate_check:
                log.warning('state inconsistency: found two networks for the same user')
                networks_to_delete.append(n['id'])
                continue
            duplicate_check.add(uid)
            user = self.state_manager.get_one('user', id=uid)
            if user is not None and user in users_no_network:
                user.network_id = n['id']
                users_no_network.remove(user)
                log.error('fixed: user {} linked to network {}'.format(user.name, n['name']))
                state_changed = True
                continue
            elif user is None:
                log.error('state inconsistency: found a network for user {} who no longer exists'.format(uid))
                networks_to_delete.append(n['id'])

        for g in gateways:
            try:
                uid = int(g['name'][len(gateway_prefix):])
            except ValueError:
                log.error('container {} does not belong to Zoe, bug?'.format(g['name']))
                gateways_to_delete.append(g['id'])
                continue
            user = self.state_manager.get_one('user', id=uid)
            if user is not None and user in users_no_gateway:
                user.gateway_docker_id = g['id']
                users_no_gateway.remove(user)
                cont_info = self.swarm.inspect_container(g['id'])
                user.set_gateway_urls(cont_info)
                log.error('fixed: user {} linked to gateway {}'.format(user.name, g['name']))
                state_changed = True
                continue
            elif user is None:
                log.error('state inconsistency: found a gateway for user {} who no longer exists'.format(uid))
                gateways_to_delete.append(g['id'])

        # Fix all inconsistencies found
        for g in gateways_to_delete:
            log.error('fixed: terminating orphan gateway container {}'.format(g[:8]))
            self.swarm.terminate_container(g, delete=True)
        for n in networks_to_delete:
            log.error('fixed: terminating orphan network {}'.format(n[:8]))
            self.swarm.network_remove(n)

        for u in users_no_network:
            log.error('fixed: creating network for user {}'.format(u.name))
            self.create_user_network(u)
        for u in users_no_gateway:
            log.error('fixed: creating gateway for user {}'.format(u.name))
            self.start_gateway_container(u)

        # ### Check executions and container consistency
        swarm_containers = self.swarm.list(only_label='zoe.{}'.format(name_prefix))
        swarm_container_ids = {x['id'] for x in swarm_containers}
        # Collect first, delete afterwards: the dictionary must not change while it is iterated.
        conts_state_to_delete = []
        for c_id, c in self.state_manager.containers.items():
            if c.docker_id not in swarm_container_ids:
                log.error('fixed: removing from state container {} that does not exist in Swarm'.format(c.name))
                conts_state_to_delete.append(c_id)
        for c_id in conts_state_to_delete:
            self.state_manager.delete('container', c_id)

        # The two lists still hold the users that just got a new network or gateway.
        if state_changed or users_no_gateway or users_no_network:
            self.state_manager.state_updated()
Пример #24
0
def _spawn_service(execution: Execution, service: Service, env_subst_dict: dict):
    """Create the Swarm container for one service of an execution and mark the service active.

    The container options come from the service description and the configuration. The
    environment values and the command are templates expanded with ``env_subst_dict``.
    Workspaces that can be attached are bound as volumes. After the container is spawned it
    is connected to every network listed under ``networks`` in the description.

    :param execution: the execution the service belongs to
    :param service: the service to start
    :param env_subst_dict: values substituted into the environment and command templates
    :raises ZoeStartExecutionFatalException: a template references an unknown variable, the
        Swarm client cannot be created or connecting to an additional network fails
    :raises ZoeStartExecutionRetryException: spawning the container fails with a ZoeException
    """
    copts = DockerContainerOptions()
    copts.gelf_log_address = get_conf().gelf_address
    copts.name = service.dns_name
    copts.set_memory_limit(service.description['required_resources']['memory'])
    copts.network_name = get_conf().overlay_network_name
    copts.labels = {
        'zoe.execution.name': execution.name,
        'zoe.execution.id': str(execution.id),
        'zoe.service.name': service.name,
        'zoe.service.id': str(service.id),
        'zoe.owner': execution.user_id,
        'zoe.deployment_name': get_conf().deployment_name,
        'zoe.type': 'app_service'
    }
    if service.description['monitor']:
        copts.labels['zoe.monitor'] = 'true'
    else:
        copts.labels['zoe.monitor'] = 'false'
    copts.restart = not service.description['monitor']  # Monitor containers should not restart

    # env_subst_dict supplies the values substituted into the environment variable templates
    for env_name, env_value in service.description['environment']:
        try:
            env_value = env_value.format(**env_subst_dict)
        except KeyError as e:
            raise ZoeStartExecutionFatalException("unknown variable in expression {}".format(env_value)) from e
        copts.add_env_variable(env_name, env_value)

    for p in service.description['ports']:
        if p['expose']:
            copts.ports.append(p['port_number'])  # FIXME UDP ports?

    if 'volumes' in service.description:
        for path, mount_point, readonly in service.description['volumes']:
            copts.add_volume_bind(path, mount_point, readonly)

    for wks in singletons['workspace_managers']:
        assert isinstance(wks, zoe_master.workspace.base.ZoeWorkspaceBase)
        if wks.can_be_attached():
            copts.add_volume_bind(wks.get_path(execution.user_id), wks.get_mountpoint(), False)

    # The same dictionary is used for templates in the command
    if 'command' in service.description:
        command_template = service.description['command']
        try:
            command = command_template.format(**env_subst_dict)
        except KeyError as e:
            # Report it like an unknown variable in the environment instead of
            # letting a raw KeyError escape.
            raise ZoeStartExecutionFatalException("unknown variable in expression {}".format(command_template)) from e
        copts.set_command(command)

    try:
        swarm = SwarmClient(get_conf())
    except Exception as e:
        raise ZoeStartExecutionFatalException(str(e)) from e

    try:
        cont_info = swarm.spawn_container(service.description['docker_image'], copts)
    except ZoeException as e:
        raise ZoeStartExecutionRetryException(str(e)) from e

    service.set_active(cont_info["docker_id"])

    if 'networks' in service.description:
        for net in service.description['networks']:
            try:
                swarm.connect_to_network(service.docker_id, net)
            except ZoeException as e:
                raise ZoeStartExecutionFatalException(str(e)) from e