class MarsCupidClient(object):
    """
    Client that starts a Mars cluster inside a Cupid kubernetes instance and
    talks to the cluster's web API.

    Typical flow: :meth:`submit` starts the instance (or reuses the one passed
    to the constructor), :meth:`wait_for_success` polls until the web service
    answers and enough workers are reported, then a Mars session is created and
    exposed through :attr:`session`.
    """

    def __init__(self, odps, inst=None, project=None):
        """
        :param odps: ODPS entry object; its ``account`` and ``project`` are read
            when building the request session.
        :param inst: already running kubernetes instance to attach to. When
            given, :meth:`submit` does not start a new one.
        :param project: forwarded to ``CupidSession``.
        """
        self._odps = odps
        self._cupid_session = CupidSession(odps, project=project)
        self._kube_instance = inst
        self._kube_url = None
        self._kube_client = None
        self._kube_namespace = None

        self._scheduler_key = None
        self._scheduler_config = None
        self._worker_config = None
        self._web_config = None
        self._endpoint = None
        self._with_notebook = False
        self._notebook_endpoint = None

        self._mars_session = None
        self._req_session = None

    @property
    def endpoint(self):
        """Proxied URL of the Mars web service, ``None`` until resolved."""
        return self._endpoint

    @property
    def notebook_endpoint(self):
        """Proxied URL of the notebook service, ``None`` until resolved."""
        return self._notebook_endpoint

    @property
    def session(self):
        """Mars session created by :meth:`wait_for_success`, or ``None``."""
        return self._mars_session

    @property
    def instance_id(self):
        """Id of the underlying kubernetes instance."""
        return self._kube_instance.id

    def submit(self, image=None, scheduler_num=1, scheduler_cpu=8, scheduler_mem=32 * 1024 ** 3,
               worker_num=1, worker_cpu=8, worker_mem=32 * 1024 ** 3, worker_cache_mem=None,
               min_worker_num=None, worker_disk_num=1, worker_disk_size=100 * 1024 ** 3,
               web_num=1, web_cpu=1, web_mem=1024 ** 3, with_notebook=False, notebook_cpu=1,
               notebook_mem=2 * 1024 ** 3, timeout=None, extra_env=None, extra_modules=None,
               resources=None, create_session=True, priority=None, running_cluster=None,
               task_name=None, **kw):
        """
        Start the Mars cluster (unless an instance is already attached) and,
        unless ``async_`` is set, block until it is ready.

        The sizing arguments (``scheduler_*``, ``worker_*``, ``web_*``,
        ``notebook_*``, ``min_worker_num``, ``extra_env``, ``timeout``) are not
        interpreted here: they are dumped to JSON, base64-encoded and handed to
        the application entry point on its command line.
        Memory / disk sizes are presumably in bytes -- TODO confirm.

        :param image: image name put into the cluster arguments. Falls back to
            the legacy ``mars_image`` keyword, then ``build_image_name('mars')``.
        :param worker_cache_mem: defaults to ``int(worker_mem * 0.48)``.
        :param min_worker_num: worker count to wait for; ``worker_num`` is used
            when this is falsy.
        :param with_notebook: whether a notebook is expected. ``None`` means
            "use ``options.mars.launch_notebook``".
        :param extra_modules: a module name or a list/tuple of names;
            ``'odps.mars_extension'`` is always appended.
        :param resources: a resource or a list/tuple of resources; the default
            resources are appended.
        :param create_session: forwarded to :meth:`wait_for_success`.
        :param priority: forwarded to ``start_kubernetes``.
        :param running_cluster: forwarded to ``start_kubernetes``.
        :param task_name: forwarded to ``start_kubernetes``.
        :param kw: ``async_``, ``mars_image``, ``default_resources`` and
            ``instance_idle_timeout`` are consumed here; everything else is
            forwarded to ``start_kubernetes``.
        :return: this client. On ``KeyboardInterrupt`` the instance is stopped
            and the client is still returned.
        """
        try:
            async_ = kw.pop('async_', None)

            # compatible with early version
            mars_image = kw.pop('mars_image', None)
            default_resources = kw.pop('default_resources', None) or DEFAULT_RESOURCES
            instance_idle_timeout = kw.pop('instance_idle_timeout', None)
            if with_notebook is not None:
                self._with_notebook = bool(with_notebook)
            else:
                self._with_notebook = options.mars.launch_notebook
            if self._kube_instance is None:
                image = image or mars_image or build_image_name('mars')

                extra_modules = extra_modules or []
                if isinstance(extra_modules, (tuple, list)):
                    extra_modules = list(extra_modules) + ['odps.mars_extension']
                else:
                    extra_modules = [extra_modules, 'odps.mars_extension']

                if resources is None:
                    resources = default_resources
                elif isinstance(resources, (tuple, list)):
                    resources = list(resources) + list(default_resources)
                else:
                    # list() so that tuple defaults do not break concatenation
                    resources = [resources] + list(default_resources)

                if worker_cache_mem is None:
                    worker_cache_mem = int(worker_mem * 0.48)

                # Send the resolved notebook flag rather than the raw argument:
                # with ``with_notebook=None`` the option value decides, and
                # wait_for_success() waits for a notebook based on that same
                # resolved value.
                cluster_args = dict(
                    image=image, scheduler_num=scheduler_num, scheduler_cpu=scheduler_cpu,
                    scheduler_mem=scheduler_mem, worker_num=worker_num, worker_cpu=worker_cpu,
                    worker_mem=worker_mem, worker_cache_mem=worker_cache_mem,
                    min_worker_num=min_worker_num, worker_disk_num=worker_disk_num,
                    worker_disk_size=worker_disk_size, web_num=web_num, web_cpu=web_cpu,
                    web_mem=web_mem, with_notebook=self._with_notebook, notebook_cpu=notebook_cpu,
                    notebook_mem=notebook_mem, extra_env=extra_env, extra_modules=extra_modules,
                    instance_idle_timeout=instance_idle_timeout, timeout=timeout)

                # entry point receives "<package>.app" and the encoded arguments
                command = '/srv/entrypoint.sh %s %s' % (
                    __name__.rsplit('.', 1)[0] + '.app',
                    base64.b64encode(json.dumps(cluster_args).encode()).decode()
                )

                self._kube_instance = self._cupid_session.start_kubernetes(
                    async_=True, running_cluster=running_cluster, priority=priority,
                    app_image=build_image_name('mars'), app_command=command, resources=resources,
                    task_name=task_name, **kw)
                write_log(self._kube_instance.get_logview_address())

            if not async_:
                self.wait_for_success(create_session=create_session,
                                      min_worker_num=min_worker_num or worker_num)
            return self

        except KeyboardInterrupt:
            # user aborted while starting / waiting: do not leave the instance running
            self.stop_server()
            return self

    def check_service_ready(self, timeout=1):
        """
        Probe ``<endpoint>/api``.

        :param timeout: request timeout passed to the request session.
        :return: ``False`` on connection error, timeout or a status code
            >= 400, ``True`` otherwise.
        """
        try:
            resp = self._req_session.get(self._endpoint + '/api', timeout=timeout)
        except (requests.ConnectionError, requests.Timeout):
            return False
        return resp.status_code < 400

    def count_workers(self):
        """Return the JSON-decoded body of ``<endpoint>/api/worker?action=count``."""
        resp = self._req_session.get(self._endpoint + '/api/worker?action=count', timeout=1)
        return json.loads(resp.text)

    def get_logview_address(self):
        """Return the logview address reported by the kubernetes instance."""
        return self._kube_instance.get_logview_address()

    def get_mars_endpoint(self):
        """Return the proxied URL of the Mars application for this instance."""
        return self._cupid_session.get_proxied_url(self._kube_instance.id, CUPID_APP_NAME)

    def get_notebook_endpoint(self):
        """Return the proxied URL of the notebook application for this instance."""
        return self._cupid_session.get_proxied_url(self._kube_instance.id, NOTEBOOK_NAME)

    def get_req_session(self):
        """
        Build the HTTP session used for the web API: a ``RestClient`` when
        ``options.mars.use_common_proxy`` is set, a plain ``requests.Session``
        otherwise.
        """
        from ...rest import RestClient

        if options.mars.use_common_proxy:
            return RestClient(self._odps.account, self._endpoint, self._odps.project)
        return requests.Session()

    def check_instance_status(self):
        """
        Raise if the instance has terminated with a task that did not succeed.

        For a ``FAILED`` task the error parsed from the task result is raised;
        for any other non-``SUCCESS`` status an ``ODPSError`` is raised. The
        raised exception carries ``instance_id``. Nothing happens while the
        instance is not terminated.
        """
        if not self._kube_instance.is_terminated():
            return
        for task_name, task in self._kube_instance.get_task_statuses().items():
            exc = None
            if task.status == Instance.Task.TaskStatus.FAILED:
                exc = errors.parse_instance_error(self._kube_instance.get_task_result(task_name))
            elif task.status != Instance.Task.TaskStatus.SUCCESS:
                exc = errors.ODPSError('%s, status=%s' % (task_name, task.status.value))
            if exc:
                exc.instance_id = self._kube_instance.id
                raise exc

    def wait_for_success(self, min_worker_num=0, create_session=True):
        """
        Poll until the Mars web service answers and reports at least
        ``min_worker_num`` workers, then optionally create the Mars session.

        Failures while resolving endpoints or querying the service are retried
        once per second. ``KeyboardInterrupt`` always propagates, and
        :meth:`check_instance_status` errors abort the wait.

        :param min_worker_num: worker count required before returning.
        :param create_session: when true, create a Mars session and make it the
            default one. If that fails while the instance is ``RUNNING``, the
            instance is stopped before the error is re-raised.
        """
        while True:
            self.check_instance_status()
            try:
                if self._endpoint is None:
                    self._endpoint = self.get_mars_endpoint()
                    write_log('Mars UI: ' + self._endpoint)
                    self._req_session = self.get_req_session()

                    self._req_session.post(self._endpoint.rstrip('/') + '/api/logger', data=dict(
                        content='Mars UI from client: ' + self._endpoint
                    ))
                if self._with_notebook and self._notebook_endpoint is None:
                    self._notebook_endpoint = self.get_notebook_endpoint()
                    write_log('Notebook UI: ' + self._notebook_endpoint)

                    self._req_session.post(self._endpoint.rstrip('/') + '/api/logger', data=dict(
                        content='Notebook UI from client: ' + self._notebook_endpoint
                    ))
            except Exception:
                # KeyboardInterrupt / SystemExit are not Exception subclasses
                # and therefore still propagate
                time.sleep(1)
                continue

            if not self.check_service_ready():
                # back off instead of hammering the service in a tight loop
                time.sleep(1)
                continue
            try:
                if self.count_workers() >= min_worker_num:
                    break
            except Exception:
                # best-effort retry; Ctrl-C must not be swallowed here
                pass
            time.sleep(1)

        if create_session:
            try:
                self._mars_session = new_session(self._endpoint, req_session=self._req_session).as_default()
            except Exception:
                if self._kube_instance and self._kube_instance.status == self._kube_instance.Status.RUNNING:
                    self._kube_instance.stop()
                raise

    def restart_session(self):
        """Close the current Mars session (if any) and create a new default one."""
        if self._mars_session is not None:
            self._mars_session.close()
        self._mars_session = new_session(self._endpoint, req_session=self._req_session).as_default()

    def stop_server(self):
        """Stop the kubernetes instance, if one is attached, and forget it."""
        if self._kube_instance:
            self._kube_instance.stop()
            self._kube_instance = None
# Example #2
# 0
class MarsCupidClient(object):
    """
    Client that starts a Mars application inside a Cupid kubernetes instance
    and talks to its web API.

    :meth:`submit` starts the instance (or reuses the one passed to the
    constructor); :meth:`wait_for_success` polls until the web service answers
    and enough workers are reported, then creates a Mars session exposed via
    :attr:`session`.
    """

    def __init__(self, odps, inst=None):
        """
        :param odps: ODPS entry object; its ``account`` and ``project`` are read
            when building the request session.
        :param inst: already running kubernetes instance to attach to. When
            given, :meth:`submit` does not start a new one.
        """
        self._odps = odps
        self._cupid_session = CupidSession(odps)
        self._kube_instance = inst
        self._kube_url = None
        self._kube_client = None
        self._kube_namespace = None

        self._scheduler_key = None
        self._scheduler_config = None
        self._worker_config = None
        self._web_config = None
        self._endpoint = None
        self._has_notebook = False
        self._notebook_endpoint = None

        self._mars_session = None
        self._req_session = None

    @property
    def endpoint(self):
        """Proxied URL of the Mars web service, ``None`` until resolved."""
        return self._endpoint

    @property
    def notebook_endpoint(self):
        """Proxied URL of the notebook service, ``None`` until resolved."""
        return self._notebook_endpoint

    @property
    def session(self):
        """Mars session created by :meth:`wait_for_success`, or ``None``."""
        return self._mars_session

    @property
    def instance_id(self):
        """Id of the underlying kubernetes instance."""
        return self._kube_instance.id

    def submit(self,
               worker_num=1,
               worker_cpu=8,
               worker_mem=32,
               disk_num=1,
               min_worker_num=None,
               cache_mem=None,
               resources=None,
               module_path=None,
               create_session=True,
               priority=None,
               running_cluster=None,
               scheduler_num=1,
               notebook=None,
               **kw):
        """
        Start the Mars application (unless an instance is already attached)
        and, unless ``async_`` is set, block until it is ready.

        The sizing arguments are collected into an ``app_config`` dict and
        passed to ``start_kubernetes``; they are not interpreted here.

        :param worker_mem: worker memory; presumably in GB given the ``'G'``
            suffix used for the cache size -- TODO confirm.
        :param cache_mem: cache size; sent as ``str(value) + 'G'``. Defaults to
            ``worker_mem * 0.48``.
        :param min_worker_num: worker count to wait for; ``worker_num`` is used
            when this is falsy.
        :param resources: a resource or a list/tuple of resources; the default
            resources are appended.
        :param module_path: a module path or a list/tuple of them;
            ``'odps.mars_extension'`` is always included.
        :param create_session: forwarded to :meth:`wait_for_success`.
        :param priority: forwarded to ``start_kubernetes``.
        :param running_cluster: forwarded to ``start_kubernetes``.
        :param notebook: whether a notebook is expected. ``None`` means "use
            ``options.mars.launch_notebook``".
        :param kw: ``async_`` and ``default_resources`` are consumed here;
            ``mars_app_image``, ``mars_image``, ``proxy_endpoint``,
            ``major_task_version``, ``scheduler_mem`` (default 32) and
            ``scheduler_cpu`` (default 8) go into the app config; everything
            else is forwarded to ``start_kubernetes``.
        :return: this client. On ``KeyboardInterrupt`` the instance is stopped
            and the client is still returned.
        """
        try:
            async_ = kw.pop('async_', None)
            default_resources = kw.pop('default_resources',
                                       None) or DEFAULT_RESOURCES
            if notebook is not None:
                self._has_notebook = bool(notebook)
            else:
                self._has_notebook = options.mars.launch_notebook
            if self._kube_instance is None:
                if module_path is None:
                    module_path = ['odps.mars_extension']
                elif isinstance(module_path, (tuple, list)):
                    module_path = list(module_path) + ['odps.mars_extension']
                else:
                    module_path = [module_path, 'odps.mars_extension']

                if resources is None:
                    resources = default_resources
                elif isinstance(resources, (tuple, list)):
                    resources = list(resources) + list(default_resources)
                else:
                    # list() so that tuple defaults do not break concatenation
                    resources = [resources] + list(default_resources)

                if cache_mem is None:
                    cache_mem = str(worker_mem * 0.48) + 'G'
                else:
                    cache_mem = str(cache_mem) + 'G'

                mars_config = {
                    'scheduler_num': scheduler_num,
                    'worker_num': worker_num,
                    'worker_cpu': worker_cpu,
                    'worker_mem': worker_mem,
                    'cache_mem': cache_mem,
                    'disk_num': disk_num,
                    'resources': resources,
                    'module_path': module_path,
                }
                # optional settings are only present when the caller gave them
                for key in ('mars_app_image', 'mars_image', 'proxy_endpoint',
                            'major_task_version'):
                    if key in kw:
                        mars_config[key] = kw.pop(key)
                mars_config['scheduler_mem'] = kw.pop('scheduler_mem', 32)
                mars_config['scheduler_cpu'] = kw.pop('scheduler_cpu', 8)

                if self._has_notebook:
                    mars_config['notebook'] = True
                self._kube_instance = self._cupid_session.start_kubernetes(
                    async_=True,
                    running_cluster=running_cluster,
                    priority=priority,
                    app_name='mars',
                    app_config=mars_config,
                    **kw)
                write_log(self._kube_instance.get_logview_address())

            if not async_:
                self.wait_for_success(create_session=create_session,
                                      min_worker_num=min_worker_num
                                      or worker_num)
            return self

        except KeyboardInterrupt:
            # user aborted while starting / waiting: do not leave the instance running
            self.stop_server()
            return self

    def check_service_ready(self, timeout=1):
        """
        Probe ``<endpoint>/api``.

        :param timeout: request timeout passed to the request session.
        :return: ``False`` on connection error, timeout or a status code
            >= 400, ``True`` otherwise.
        """
        try:
            resp = self._req_session.get(self._endpoint + '/api',
                                         timeout=timeout)
        except (requests.ConnectionError, requests.Timeout):
            return False
        return resp.status_code < 400

    def count_workers(self):
        """Return the JSON-decoded body of ``<endpoint>/api/worker?action=count``."""
        resp = self._req_session.get(self._endpoint +
                                     '/api/worker?action=count',
                                     timeout=1)
        return json.loads(resp.text)

    def get_logview_address(self):
        """Return the logview address reported by the kubernetes instance."""
        return self._kube_instance.get_logview_address()

    def get_mars_endpoint(self):
        """Return the proxied URL of the Mars application for this instance."""
        return self._cupid_session.get_proxied_url(self._kube_instance.id,
                                                   CUPID_APP_NAME)

    def get_notebook_endpoint(self):
        """Return the proxied URL of the notebook application for this instance."""
        return self._cupid_session.get_proxied_url(self._kube_instance.id,
                                                   NOTEBOOK_NAME)

    def get_req_session(self):
        """
        Build the HTTP session used for the web API: a ``RestClient`` when
        ``options.mars.use_common_proxy`` is set, a plain ``requests.Session``
        otherwise.
        """
        from ...rest import RestClient

        if options.mars.use_common_proxy:
            return RestClient(self._odps.account, self._endpoint,
                              self._odps.project)
        return requests.Session()

    def check_instance_status(self):
        """
        Raise if the instance has terminated with a task that did not succeed.

        For a ``FAILED`` task the error parsed from the task result is raised;
        for any other non-``SUCCESS`` status an ``ODPSError`` is raised. The
        raised exception carries ``instance_id``. Nothing happens while the
        instance is not terminated.
        """
        if not self._kube_instance.is_terminated():
            return
        for task_name, task in self._kube_instance.get_task_statuses().items():
            exc = None
            if task.status == Instance.Task.TaskStatus.FAILED:
                exc = errors.parse_instance_error(
                    self._kube_instance.get_task_result(task_name))
            elif task.status != Instance.Task.TaskStatus.SUCCESS:
                exc = errors.ODPSError('%s, status=%s' %
                                       (task_name, task.status.value))
            if exc:
                exc.instance_id = self._kube_instance.id
                raise exc

    def wait_for_success(self, min_worker_num=0, create_session=True):
        """
        Poll until the Mars web service answers and reports at least
        ``min_worker_num`` workers, then optionally create the Mars session.

        Failures while resolving endpoints or querying the service are retried
        once per second. ``KeyboardInterrupt`` always propagates, and
        :meth:`check_instance_status` errors abort the wait.

        :param min_worker_num: worker count required before returning.
        :param create_session: when true, create a Mars session and make it the
            default one. If that fails while the instance is ``RUNNING``, the
            instance is stopped before the error is re-raised.
        """
        while True:
            self.check_instance_status()
            try:
                if self._endpoint is None:
                    self._endpoint = self.get_mars_endpoint()
                    write_log('Mars UI: ' + self._endpoint)
                    self._req_session = self.get_req_session()
                if self._has_notebook and self._notebook_endpoint is None:
                    self._notebook_endpoint = self.get_notebook_endpoint()
                    write_log('Notebook UI: ' + self._notebook_endpoint)
            except Exception:
                # KeyboardInterrupt / SystemExit are not Exception subclasses
                # and therefore still propagate
                time.sleep(1)
                continue

            if not self.check_service_ready():
                # back off instead of hammering the service in a tight loop
                time.sleep(1)
                continue
            try:
                if self.count_workers() >= min_worker_num:
                    break
            except Exception:
                # best-effort retry; Ctrl-C must not be swallowed here
                pass
            time.sleep(1)

        if create_session:
            try:
                self._mars_session = new_session(
                    self._endpoint,
                    req_session=self._req_session).as_default()
            except Exception:
                if self._kube_instance and self._kube_instance.status == self._kube_instance.Status.RUNNING:
                    self._kube_instance.stop()
                raise

    def stop_server(self):
        """Stop the kubernetes instance, if one is attached, and forget it."""
        if self._kube_instance:
            self._kube_instance.stop()
            self._kube_instance = None