def virtual_clusters(self, user_info: dict = None):
    """Return the list of virtual clusters the user belongs to.

    If *user_info* is not given, it is fetched via ``rest_api_user``.
    The REST payload may report ``virtualCluster`` either as a list or a
    comma-separated string; both forms are normalized to a list.

    Raises:
        AssertionError: when no user information could be retrieved.
    """
    user_info = na(user_info, self.rest_api_user())
    assert user_info, f'failed to get user information from {self.alias}'
    clusters = user_info["virtualCluster"]
    # Older API versions return a comma-joined string instead of a list.
    return clusters.split(",") if isinstance(clusters, str) else clusters
def single_task_logs(status: dict, task_role: str = 'main', index: int = 0, log_type: dict = None, return_urls: bool = False):
    """Fetch (or link to) the stdout/stderr logs of one task container.

    Looks up the ``containerLog`` base URL for the *index*-th container
    of *task_role* in *status* and joins it with each suffix in
    *log_type*. Returns a dict of URLs when *return_urls* is true;
    otherwise downloads each page, converting it to plain text when
    ``html2text`` is installed (raw HTML otherwise). Returns ``None``
    when the container or its log location is missing.
    """
    log_type = na(log_type, {
        "stdout": "user.pai.stdout/?start=0",
        "stderr": "user.pai.stderr/?start=0"
    })
    task_statuses = status.get("taskRoles", {}).get(task_role, {}).get("taskStatuses", [])
    if index >= len(task_statuses):
        return None
    log_root = task_statuses[index].get("containerLog", None)
    if not log_root:
        return None
    urls = {name: "{}{}".format(log_root, suffix) for name, suffix in log_type.items()}
    if return_urls:
        return urls
    # NOTE(review): the argument order ('GET', url) differs from other call
    # sites in this file which use get_response(url, method='GET') — confirm
    # which matches the helper's signature.
    pages = {name: get_response('GET', url).text for name, url in urls.items()}
    try:
        from html2text import html2text
    except ImportError:
        # best effort: html2text is optional, fall back to raw HTML
        return pages
    return {name: html2text(page) for name, page in pages.items()}
def submit(self, cluster_alias: str = None, virtual_cluster: str = None):
    """Validate, pre-process and submit this job to a cluster.

    Falls back to the job's ``cluster_alias`` parameter when none is
    given. Returns a dict with the job's name and its web-portal link.

    Raises:
        Exception: whatever the REST submission raises — after echoing
            the failure and the offending job config for easier
            debugging (consistent with the other ``submit``
            implementation in this file).
    """
    cluster_alias = na(cluster_alias, self.param("cluster_alias", None))
    self.select_cluster(cluster_alias, virtual_cluster)
    self.validate().local_process()
    to_screen("submit job %s to cluster %s" % (self.name, cluster_alias))
    try:
        self.client.rest_api_submit(self.get_config())
        job_link = self.client.get_job_link(self.name)
        return {"job_link": job_link, "job_name": self.name}
    except Exception as identifier:
        # surface the failure and the config that caused it, then re-raise
        to_screen(f"submit failed due to {repr(identifier)}", _type="error")
        to_screen(self.get_config())
        raise identifier
def get_logs_url(self, status: dict = None, task_role: str = 'main', index: int = 0, logs: dict = None):
    """Build the stdout/stderr log URLs for one task container.

    Joins the container's ``containerLog`` base URL with each suffix in
    *logs* (defaults to PAI's stdout/stderr pages). Returns ``None``
    when the requested container or its log location cannot be found.
    """
    logs = na(logs, {
        "stdout": "user.pai.stdout",
        "stderr": "user.pai.stderr"
    })
    status = na(status, self.status())
    task_statuses = status.get("taskRoles", {}).get(task_role, {}).get("taskStatuses", [])
    if index >= len(task_statuses):
        return None
    log_root = task_statuses[index].get("containerLog", None)
    if not log_root:
        return None
    return {name: "{}{}".format(log_root, suffix) for name, suffix in logs.items()}
def logs(self, status: dict = None, task_role: str = 'main', index: int = 0, logs: dict = None):
    """Download the logs of one task container as plain text.

    Resolves the log URLs via ``get_logs_url`` and converts each fetched
    HTML page with ``html2text``. Returns ``None`` when no URLs resolve.
    """
    status = na(status, self.status())
    urls = self.get_logs_url(status, task_role, index, logs)
    if not urls:
        return None
    texts = {}
    for name, url in urls.items():
        texts[name] = html2text(get_response(url, method='GET').text)
    return texts
def connect_jupyter_batch(self, status: dict = None):
    """Fetch the executed-notebook HTML result once the batch job succeeds.

    Returns a dict with the job ``state`` and, when the job finished
    successfully, a ``file://`` URI of the downloaded HTML; otherwise
    the ``notebook`` entry is ``None``.
    """
    status = na(status, self.status())
    state = self.state(status)
    if state not in __job_states__["successful"]:
        return dict(state=state, notebook=None)
    html_file = self.param("notebook_file") + ".html"
    remote_path = '{}/output/{}'.format(self.param("work_directory"), html_file)
    # download next to the current directory, then expose it as a file:// URI
    self.client.get_storage().download(remote_path=remote_path, local_path=html_file)
    url = pathlib.Path(os.path.abspath(html_file)).as_uri()
    return dict(state=state, notebook=url)
def do_action_notebook(self, args):
    """CLI handler: build a notebook job from *args*, submit it and,
    unless previewing, merge in the notebook connection info."""
    mode = "interactive" if args.interactive else "silent"
    cluster_args = extract_args(args, ["cluster_alias", "virtual_cluster", "workspace"])
    resource_args = extract_args(args, ["gpu", "cpu", "memoryMB", "mem"])
    self.__job__.new(args.job_name).from_notebook(
        nb_file=args.notebook,
        mode=mode,
        token=args.token,
        image=args.image,
        cluster=cluster_args,
        resources=resource_args,
        sources=args.sources,
        pip_installs=args.pip_installs,
    )
    self.__job__.protocol["parameters"]["python_path"] = args.python
    result = self.submit_it(args)
    if not args.preview:
        result.update(na(self.connect_notebook(), {}))
    return result
def submit(self, cluster_alias: str = None, virtual_cluster: str = None):
    """Validate, pre-process and submit this job to a cluster.

    Returns a dict carrying the job name and its web-portal link. On
    failure, echoes the error and the generated job config before
    re-raising the original exception.
    """
    cluster_alias = na(cluster_alias, self.param("cluster_alias", None))
    self.select_cluster(cluster_alias, virtual_cluster)
    self.validate().local_process()
    to_screen("submit job %s to cluster %s" % (self.name, cluster_alias))
    try:
        self.client.rest_api_submit(self.get_config())
        link = self.client.get_job_link(self.name)
    except Exception as identifier:
        # dump the failing config so the user can see what was submitted
        to_screen(f"submit failed due to {repr(identifier)}", _type="error")
        to_screen(self.get_config())
        raise identifier
    return {"job_link": link, "job_name": self.name}
def __init__(self, name: str, include: list = None, exclude: list = None, file: str = None, values: dict = None, allow_unknown: bool = True):
    """Create a named settings layer.

    Values come from *file* when given (silently defaulting to ``{}`` on
    read failure), otherwise from the *values* dict. The known variable
    definitions are taken from the global flags and narrowed by the
    *include* / *exclude* name lists.
    """
    self.name = name
    self.file = file
    if file:
        self.values = from_file(file, {}, silent=True)
    else:
        self.values = na(values, {})
    # NOTE(review): allow_unknown is accepted but not used in this
    # constructor — presumably consumed elsewhere; confirm.
    definitions = OrganizedList(__flags__.default_var_definitions(), _key="name")
    self.definitions = definitions.filter(None, include, exclude)  # type: OrganizedList
def get_storage(self, alias: str = None):
    """Return a ``Storage`` client for the cluster storage named *alias*.

    Defaults to the builtin storage. Only the ``hdfs`` protocol is
    supported: the URI is routed through Pylon's ``/webhdfs`` proxy when
    enabled, otherwise directly to the WebHDFS port (default 50070).

    Raises:
        AssertionError: when no storage config exists for *alias*.
        NotImplementedError: for any protocol other than ``hdfs``.
    """
    # ! every cluster should have a builtin storage
    storage_cfg = self.config.get("storages", {}).get(na(alias, "builtin"), None)
    assert storage_cfg, alias
    if storage_cfg["protocol"] == "hdfs":
        # rstrip (not strip): only a trailing slash should be trimmed
        # before appending the path suffix — strip("/") would also eat a
        # leading slash that is part of the URI.
        uri = storage_cfg.get("uri", self.pai_uri).rstrip("/")
        if self.config.get("pylon_enabled", True):
            uri += "/webhdfs"
        else:
            uri += ":%d" % storage_cfg.get("ports", {}).get("webhdfs", 50070)
        return Storage(protocol='webHDFS', url=uri, user=storage_cfg.get('user', self.user))
    raise NotImplementedError
def check(self):
    """Probe the cluster and cache its info, storages and virtual clusters.

    Each storage without an explicit alias gets one derived from its
    protocol and position. In OIDC (AAD) mode a token must be
    configured, since password authentication is not accepted there.
    Returns self for chaining.
    """
    to_screen("try to connect cluster {}".format(self.alias))
    storages = self.rest_api_storages()
    for position, storage in enumerate(storages):
        storage.setdefault("storage_alias", storage["protocol"] + f'-{position}')
    cluster_info = na(self.rest_api_cluster_info(), {})
    # ! will check authentication types according to AAD enabled or not
    if cluster_info.get("authnMethod", "basic") == "OIDC":
        assert self.config["token"], "must use authentication token (instead of password) in OIDC mode"
    self.config.update(
        info=cluster_info,
        storages=storages,
        virtual_clusters=self.virtual_clusters(),
    )
    return self
def load(self, fname: str = None, job_name: str = None, cluster_alias: str = None):
    """Populate ``self.protocol`` from a cluster or a local protocol file.

    With *cluster_alias*, the job config is fetched over REST (using
    *job_name* or this job's own name). Otherwise the protocol is read
    from *fname*, defaulting to the job's standard protocol file; a
    missing file leaves the current protocol untouched. Ensures a
    ``protocolVersion`` key is present. Returns self for chaining.
    """
    if not cluster_alias:
        # load from a local file (default: the job's own protocol file)
        path = fname or Job(job_name).protocol_file
        if os.path.isfile(path):
            self.protocol = from_file(path, default="==FATAL==")
    else:
        # load job config from the cluster via the REST api
        self.protocol = get_cluster(cluster_alias).rest_api_job_info(na(job_name, self.name), 'config')
    # v1 protocol (json) has no protocolVersion
    self.protocol.setdefault('protocolVersion', '1')
    return self
def connect_jupyter_interactive(self, status: dict = None):
    """Get the URL of the interactive notebook server once it is ready.

    While the job is RUNNING, scans the main task's stderr log for
    Jupyter's startup banner; when found, builds the notebook URL from
    the container's IP and its 'jupyter' port. Returns a dict with the
    job ``state`` and the ``notebook`` URL (``None`` if not ready).
    """
    status = na(status, self.status())
    state = self.state(status)
    url = None
    if state == "RUNNING":
        stderr_url = self.get_logs_url(status)["stderr"]
        log_text = html2text(get_response(stderr_url, method='GET').text)
        started = any(
            re.search("The Jupyter Notebook is running at:", line)
            for line in log_text.split('\n')
        )
        if started:
            container = status["taskRoles"]["main"]["taskStatuses"][0]
            url = "http://{}:{}/notebooks/{}".format(
                container["containerIp"],
                container["containerPorts"]["jupyter"],
                self.param("notebook_file") + ".ipynb",
            )
    return dict(state=state, notebook=url)
def load(self, fname: str = None):
    """(Re)load the cluster list from *fname* (defaults to the standard
    cluster config file). Returns self for chaining."""
    path = na(fname, self.default_config_file)
    entries = from_file(path, default=[])
    self.clusters = OrganizedList(entries, _key="cluster_alias")
    return self
def from_notebook(self, nb_file: str, mode: str = "interactive", token: str = "abcd", image: str = None, cluster: dict = None, resources: dict = None, sources: list = None, pip_installs: list = None):
    """Configure this job to run a Jupyter notebook.

    mode: interactive / silent / script
        - interactive: start a notebook server inside the container
        - silent: execute the notebook and upload the HTML result
        - script: convert the notebook to a script and run it

    The notebook file is added to the job's source list (the caller's
    *sources* list is copied, not mutated — previously repeated calls
    kept appending to the caller's list). ``jupyter`` is always added to
    the pip installs. Returns self for chaining.
    """
    assert mode in ["interactive", "silent", "script"], "unsupported mode %s" % mode
    if not nb_file:
        mode, nb_file = "interactive", ""
    else:
        assert os.path.isfile(
            nb_file), "cannot read the ipython notebook {}".format(nb_file)
    # fix: copy so the caller's list is not mutated by the append below
    sources = list(na(sources, []))
    sources.append(nb_file)
    self.set_param(
        "notebook_file",
        os.path.splitext(os.path.basename(nb_file))[0] if nb_file else "")
    resources = JobResource(resources)
    if mode == "interactive":
        resources.add_port("jupyter")
        self.set_secret("token", token)
        cmds = [
            " ".join([
                "jupyter notebook",
                "--no-browser",
                "--ip 0.0.0.0",
                "--port $PAI_CONTAINER_HOST_jupyter_PORT_LIST",
                "--NotebookApp.token=<% $secrets.token %>",
                "--allow-root --NotebookApp.file_to_run=<% $parameters.notebook_file %>.ipynb",
            ]),
        ]
    elif mode == "silent":
        cmds = [
            " ".join([
                "jupyter nbconvert --ExecutePreprocessor.timeout=-1 --ExecutePreprocessor.allow_errors=True",
                "--to html --execute <% $parameters.notebook_file %>.ipynb",
            ]),
            "opai storage upload <% $parameters.notebook_file %>.html <% $parameters.work_directory %>/output/<% $parameters.notebook_file %>.html",
        ]
    else:
        cmds = [
            "jupyter nbconvert --to script <% $parameters.notebook_file %>.ipynb --output openpai_submitter_entry",
            "echo ======================== Python Script Starts ========================",
            # execute notebook by iPython. To remove color information,
            # we use "--no-term-title" and sed below
            """ipython --no-term-title openpai_submitter_entry.py | sed -r "s/\\x1B\\[([0-9]{1,2}(;[0-9]{1,2})?)?[mGK]//g" | tr -dc '[[:print:]]\\n'""",
        ]
    self.one_liner(cmds, image, cluster, resources.as_dict, sources,
                   na(pip_installs, []) + ["jupyter"])
    mode_to_tag = {
        "interactive": "interactive_nb",
        "silent": "batch_nb",
        "script": "script_nb"
    }
    self.add_tag(__internal_tags__[mode_to_tag[mode]])
    return self
def sdk_job_template(self, cluster_alias_lst: str = [], workspace: str = None, sources: list = None, pip_installs: list = None): "generate the job template for a sdk-submitted job" # secrets clusters = [ get_cluster(alias, get_client=False) for alias in cluster_alias_lst ] workspace = na(workspace, LayeredSettings.get("workspace")) workspace = na(workspace, f"{__flags__.storage_root}/{clusters[0]['user']}") self.set_secret("clusters", json.dumps(clusters)) self.set_param("cluster_alias", cluster_alias_lst[0] if cluster_alias_lst else None) self.set_param( "work_directory", '{}/jobs/{}'.format(workspace, self.name) if workspace else None) # parameters self.set_param("python_path", "python") # signature self.add_tag(__internal_tags__["sdk"]) # sdk.plugins sdk_install_uri = "-U {}".format(get_install_uri()) c_dir = '~/{}'.format(__flags__.cache) c_file = '%s/%s' % (c_dir, __flags__.cluster_cfg_file) plugins = [] if sources: plugins.append({ "plugin": "local.uploadFiles", "parameters": { "files": list(set([os.path.relpath(s) for s in sources])), }, }) plugins.extend([ { "plugin": "container.preCommands", # commands to install essential pip packages "parameters": { "commands": [ "<% $parameters.python_path %> -m pip install {}". format(p) for p in [sdk_install_uri] + na(pip_installs, []) ] } }, { "plugin": "container.preCommands", # copy cluster information "parameters": { "commands": [ "mkdir %s" % c_dir, "echo \"write config to {}\"".format(c_file), "echo <% $secrets.clusters %> > {}".format(c_file), "opai cluster select <% $parameters.cluster_alias %>", ] } } ]) if sources: a_file = os.path.basename(self.temp_archive) plugins.append({ "plugin": "container.preCommands", "parameters": { "commands": [ "opai storage download <% $parameters.work_directory %>/source/{} {}" .format(a_file, a_file), "tar xvfz {}".format(a_file) ] } }) self.set_extra("sdk.plugins", plugins) return self
def state(self, status: dict = None):
    """Return the job's state string (e.g. 'RUNNING'), or ``None`` when
    absent. Fetches a fresh status when *status* is not supplied."""
    status = na(status, self.status())
    job_status = status.get("jobStatus", {})
    return job_status.get("state", None)
def virtual_clusters(self, user_info: dict = None):
    """Return the list of virtual clusters the user belongs to.

    If *user_info* is not given, it is fetched via ``rest_api_user``.
    Normalizes the REST payload's ``virtualCluster`` field, which may be
    a list or a comma-separated string.

    Raises:
        AssertionError: when no user information could be retrieved —
            guard added for consistency with the other
            ``virtual_clusters`` variant in this file; previously a
            ``None`` payload surfaced as an opaque TypeError on the
            subscript below.
    """
    user_info = na(user_info, self.rest_api_user())
    assert user_info, f'failed to get user information from {self.alias}'
    my_virtual_clusters = user_info["virtualCluster"]
    if isinstance(my_virtual_clusters, str):
        my_virtual_clusters = my_virtual_clusters.split(",")
    return my_virtual_clusters