def step3_job_monitoring(self):
    to_screen("""\
testing REST APIs related to querying a job, including
- rest_api_job_list
- rest_api_job_info
    """)
    client = ClusterList().load().get_client(get_defaults()["cluster-alias"])
    self.cmd_exec(['opai', 'job', 'list'])
    job_list = client.rest_api_job_list(client.user)  # ! only jobs from current user to reduce time
    job_list = [job['name'] for job in job_list]
    assert self.job_name in job_list, job_list
    to_screen(f"testing job monitoring with {self.job_name}")
    status = client.rest_api_job_info(self.job_name)
    to_screen(f"retrieved job status; its state is {JobStatusParser.state(status)}")
    client.rest_api_job_info(self.job_name, 'config')
    to_screen("retrieved job config")
    logs = JobStatusParser.all_tasks_logs(status)
    assert logs, f"failed to read logs from status \n{status}"
    for k, v in logs.items():
        for t, content in v.items():
            to_screen(f"read log {k} for {t}: {len(content)} bytes")
def check(self):
    cluster_info = self.rest_api_cluster_info()
    cluster_info["virtual_clusters"] = self.virtual_clusters()
    self.config.update(cluster_info)  # ! will check authentication types according to AAD enabled or not
    to_screen("succeeded to connect cluster {}".format(self.alias))
    return self
def wait(self, t_sleep: float = 10, timeout: float = 3600, silent: bool = False):
    """for a jupyter notebook job, wait until it is ready to connect;
    for a normal job, wait until it is completed"""
    exit_states = __job_states__["completed"]
    repeater = Retry(timeout=timeout, t_sleep=t_sleep, silent=silent)
    interactive_nb = self.has_tag(__internal_tags__["interactive_nb"])
    batch_nb = self.has_tag(__internal_tags__["batch_nb"])
    if interactive_nb or batch_nb:
        if interactive_nb:
            to_screen("{} is recognized to be an interactive jupyter notebook job".format(self.name))
            to_screen("notebook job needs to be in RUNNING state with the kernel started")
        if batch_nb:
            to_screen("{} is recognized to be a silent jupyter notebook job".format(self.name))
            to_screen("notebook job needs to be in SUCCEEDED state with the output ready")
        return repeater.retry(
            lambda x: x.get('state', None) in exit_states or x.get("notebook", None) is not None,
            self.connect_jupyter)
    to_screen("wait until job is completed ({})".format(exit_states))
    return repeater.retry(
        lambda x: JobStatusParser.state(x) in exit_states,  # x: job status
        self.get_status)
def download_webhdfs(self, remote_path: str, local_path: str, **kwargs):
    mkdir_for(local_path)
    to_screen("download %s -> %s" % (remote_path, local_path))
    return self.client.download(local_path=local_path, hdfs_path=remote_path, overwrite=True, **kwargs)
def run_steps(self):
    for name, func in self.get_steps():
        try:
            to_screen(f"\n==== begin to test {name} ====")
            func()
        except Exception as identifier:
            self.fail("test {} failed ({}: {})".format(name, type(identifier), repr(identifier)))
def submit(self, cluster_alias: str = None, virtual_cluster: str = None):
    cluster_alias = na(cluster_alias, self.param("cluster_alias", None))
    self.select_cluster(cluster_alias, virtual_cluster)
    self.validate().local_process()
    to_screen("submit job %s to cluster %s" % (self.name, cluster_alias))
    self.client.rest_api_submit(self.get_config())
    job_link = self.client.get_job_link(self.name)
    return {"job_link": job_link, "job_name": self.name}
def do_action_list(self, args):
    client = self.__clusters__.get_client(args.cluster_alias)
    if not args.user:
        args.user = client.user
        to_screen("if not set, only your jobs will be listed; use `--user __all__` to list jobs of all users")
    if args.user == '__all__':
        args.user = None
    jobs = client.rest_api_job_list(user=args.user)
    return ["%s [%s]" % (j["name"], j.get("state", "UNKNOWN")) for j in jobs]
def check_arguments_notebook(self, args):
    self.check_essentials(args)
    assert args.notebook or args.interactive, "must specify a notebook name unless in interactive mode"
    if not args.job_name:
        assert args.notebook or args.interactive, "must specify a notebook if no job name defined"
        args.job_name = (os.path.splitext(os.path.basename(args.notebook))[0] + "_" + randstr().hex
                         if args.notebook else "jupyter_server_{}".format(randstr().hex))
    if args.interactive and not args.token:
        to_screen("no authentication token is set", _type="warn")
def get_random_var_name(self):
    import random
    from openpaisdk import LayeredSettings
    lst = [x for x in LayeredSettings.keys() if not LayeredSettings.act_append(x)]
    ret = lst[random.randint(0, len(lst) - 1)]
    to_screen(f"random select {ret} in {lst}")
    return ret
def wrapper(*args, **kwargs):
    try:
        return fn(*args, **kwargs)
    except err_type as e:
        if not err_msg:
            to_screen(repr(e), _type="warn")
        else:
            to_screen(err_msg, _type="warn")
        return default
    except Exception as e:
        raise e
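# A hedged sketch of the decorator factory that would produce the wrapper above: it is
# assumed to take the exception type to swallow, a default return value, and an optional
# message. The name `exception_free` and the exact parameter order are illustrative
# assumptions, not necessarily this library's API.
def exception_free(err_type, default, err_msg: str = None):
    def decorator(fn):
        def wrapper(*args, **kwargs):
            try:
                return fn(*args, **kwargs)
            except err_type as e:
                # warn and fall back to the default instead of propagating
                to_screen(err_msg if err_msg else repr(e), _type="warn")
                return default
        return wrapper
    return decorator

# usage sketch (hypothetical): fall back to an empty dict instead of raising
# @exception_free(FileNotFoundError, {}, "config not found, using defaults")
# def load_config(path): ...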
def remove(self, target):
    indexes = self.filter_index(target)
    if not indexes:
        to_screen(f"OrganizedList: {self._key} = {target} cannot be deleted due to non-existence")
        return self
    for index in sorted(indexes, reverse=True):
        del self[index]
    to_screen(f"OrganizedList: {self._key} = {target} removed")
    return self
def func(*args, **kwargs):
    dir_name = 'utdir_' + method.__name__
    os.makedirs(dir_name, exist_ok=True)
    try:
        with safe_chdir(dir_name):
            method(*args, **kwargs)
    except Exception as identifier:
        raise identifier
    finally:
        to_screen(f"trying to remove {dir_name}")
        # ! rmtree does not work on windows
        os.system(f'rm -rf {dir_name}')
def tabulate_resources(dic: dict):
    to_screen(
        [[c, i.get("uri", None), i.get("user", None), v, i["GPUs"], i["vCores"], i["memory"]]
         for c in dic.keys() for v, i in dic[c].items()],
        _type="table",
        headers=["cluster", "uri", "user", "virtual-cluster", "GPUs", "vCores", "memory"])
    return dic
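# The nested dict shape tabulate_resources expects can be read off the key accesses above:
# {cluster_alias: {virtual_cluster: info_dict}}. The concrete aliases and values below are
# made-up placeholders for illustration only.
resources = {
    "my-cluster": {           # hypothetical cluster alias
        "default": {          # hypothetical virtual cluster name
            "uri": "http://x.x.x.x", "user": "alice",
            "GPUs": 4, "vCores": 24, "memory": "96 GB",
        },
    },
}
# tabulate_resources(resources)  # prints one table row per (cluster, virtual-cluster) pair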
def step2_submit_job(self):
    import time
    to_screen("""\
testing REST APIs related to submitting a job, including
- rest_api_submit
    """)
    self.job_name = 'ut_test_' + randstr(10)
    self.cmd_exec([
        'opai', 'job', 'sub', '-i', 'python:3', '-j', self.job_name,
        'opai cluster resources'
    ])
    time.sleep(10)
def step1_init_clusters(self):
    to_screen("""\
testing REST APIs related to retrieving cluster info, including
- rest_api_cluster_info
- rest_api_user
- rest_api_token
- rest_api_virtual_clusters
    """)
    with open(self.ut_init_shell) as fn:
        for line in fn:
            if line.startswith('#'):
                continue
            self.cmd_exec(line)
    alias = get_defaults()["cluster-alias"]
    self.assertTrue(alias, "no cluster specified")
    self.cmd_exec('opai cluster resources')
def job_spider(cluster, jobs: list = None):
    jobs = na_lazy(jobs, cluster.rest_api_job_list)
    to_screen("{} jobs to be captured in the cluster {}".format(len(jobs), cluster.alias))
    job_statuses = concurrent_map(
        lambda j: cluster.rest_api_job_info(j['name'], info=None, user=j['username']),
        jobs)
    job_configs = concurrent_map(
        lambda j: cluster.rest_api_job_info(j['name'], info='config', user=j['username']),
        jobs)
    job_logs = concurrent_map(JobStatusParser.all_tasks_logs, job_statuses)
    for job, sta, cfg, logs in zip(jobs, job_statuses, job_configs, job_logs):
        job['status'] = sta
        job['config'] = cfg
        job['logs'] = logs
    return jobs
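# Usage sketch: job_spider expects a cluster client exposing rest_api_job_list and
# rest_api_job_info, i.e. the same object obtained via ClusterList().load().get_client(...)
# elsewhere in this code. "my-cluster" is a placeholder alias.
# client = ClusterList().load().get_client("my-cluster")
# jobs = job_spider(client)             # crawl status, config and logs of every visible job
# jobs = job_spider(client, jobs[:5])   # or restrict the crawl to a pre-filtered job list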
def check(self):
    to_screen("try to connect cluster {}".format(self.alias))
    storages = self.rest_api_storages()
    for i, s in enumerate(storages):
        s.setdefault("storage_alias", s["protocol"] + f'-{i}')
    cluster_info = na(self.rest_api_cluster_info(), {})
    if cluster_info.get("authnMethod", "basic") == "OIDC":
        assert self.config["token"], "must use authentication token (instead of password) in OIDC mode"
    self.config.update(
        info=cluster_info,
        storages=storages,
        virtual_clusters=self.virtual_clusters(),
    )  # ! will check authentication types according to AAD enabled or not
    return self
def retry(self, f_exit, func, *args, **kwargs):
    t, i = 0, 0
    while True:
        try:
            x = func(*args, **kwargs)
            if f_exit(x):
                if not self.silent:
                    to_screen("ready: {}".format(x))
                return x
        except NotReadyError as identifier:
            __logger__.debug("condition not satisfied: %s", identifier)
            if not self.silent:
                # report the exception itself: `x` may be unbound if func() raised
                to_screen("not ready yet: {}".format(repr(identifier)))
        i, t = i + 1, t + self.t_sleep
        if self.max_try and i >= self.max_try or self.timeout and t >= self.timeout:
            return None
        if self.t_sleep:
            time.sleep(self.t_sleep)
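# Usage sketch, mirroring how wait() drives this method: poll a status getter until the
# predicate says the result is final. Constructor arguments follow the call in wait()
# (timeout, t_sleep, silent); `job` here is a hypothetical Job instance.
# repeater = Retry(timeout=3600, t_sleep=10, silent=False)
# status = repeater.retry(
#     lambda x: JobStatusParser.state(x) in __job_states__["completed"],  # exit condition
#     job.get_status)                                                     # polled callable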
def add(lst: list, key: str, elem: dict, getter=dict.get, silent: bool = False) -> list:
    "update the first element whose `key` matches that of `elem`, otherwise append `elem`; return the list"
    target = getter(elem, key)
    m = OrganizedList.filter(lst, key, target)  # type: dict, matches
    for x in m["matches"]:
        x.update(elem)
        if not silent:
            to_screen("%s = %s already exists, update it" % (key, elem[key]))
        return lst
    lst.append(elem)
    if not silent:
        to_screen("%s = %s added" % (key, elem[key]))
    return lst
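# Usage sketch: maintain a list of dicts keyed by one field. The cluster entries are made-up
# placeholders; exposing add as a static method on OrganizedList is an assumption based on
# the OrganizedList.filter call above.
# clusters = [{"cluster_alias": "a", "pai_uri": "http://a"}]
# OrganizedList.add(clusters, "cluster_alias", {"cluster_alias": "a", "user": "alice"})  # updates existing entry
# OrganizedList.add(clusters, "cluster_alias", {"cluster_alias": "b"})                   # appends a new entry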
def plugin_uploadFiles(self, plugin: dict):
    import tarfile
    to_screen("archiving and uploading ...")
    work_directory = self.param("work_directory")
    assert work_directory, "must specify a storage to upload"
    with safe_open(self.temp_archive, "w:gz", func=tarfile.open) as fn:
        for src in plugin["parameters"]["files"]:
            src = os.path.relpath(src)
            if os.path.dirname(src) != "":
                __logger__.warn(
                    "files not in the current folder may end up at a wrong location in the container, please check {}".format(src))
            fn.add(src)
            to_screen("{} archived and waiting to be uploaded".format(src))
    self.client.get_storage().upload(
        local_path=self.temp_archive,
        remote_path="{}/source/{}".format(work_directory, os.path.basename(self.temp_archive)),
        overwrite=True)
def submit(self, cluster_alias: str = None, virtual_cluster: str = None):
    cluster_alias = na(cluster_alias, self.param("cluster_alias", None))
    self.select_cluster(cluster_alias, virtual_cluster)
    self.validate().local_process()
    to_screen("submit job %s to cluster %s" % (self.name, cluster_alias))
    try:
        self.client.rest_api_submit(self.get_config())
        job_link = self.client.get_job_link(self.name)
        return {"job_link": job_link, "job_name": self.name}
    except Exception as identifier:
        to_screen(f"submit failed due to {repr(identifier)}", _type="error")
        to_screen(self.get_config())
        raise identifier
def main():
    try:
        eng = Engine()
        result = eng.process(sys.argv[1:])
        if result:
            to_screen(result)
        return 0
    except AssertionError as identifier:
        to_screen(f"Value error: {repr(identifier)}", _type="error")
        return 1
    except Exception as identifier:
        to_screen(f"Error: {repr(identifier)}", _type="error")
        return 2
    else:
        return -1
def add(self, elem: dict, getter=dict.get, silent: bool = False, replace: bool = False):
    for i in self.filter_index(self._fn_get(elem)):
        if replace:
            self[i] = elem
            if not silent:
                to_screen(f"OrganizedList: {self._key} = {self._fn_get(elem)} already exists, replace it")
        else:
            self[i].update(elem)
            if not silent:
                to_screen(f"OrganizedList: {self._key} = {self._fn_get(elem)} already exists, update it")
        return self  # ~ return
    self.append(elem)
    if not silent:
        to_screen(f"OrganizedList: {self._key} = {self._fn_get(elem)} added")
    return self
def do_action_connect(self, args):
    to_screen("retrieving job config from cluster")
    self.__job__.load(job_name=args.job_name, cluster_alias=args.cluster_alias)
    return self.connect_notebook()
def upload_webhdfs(self, local_path: str, remote_path: str, **kwargs):
    to_screen("upload %s -> %s" % (local_path, remote_path))
    return self.client.upload(local_path=local_path, hdfs_path=remote_path, **kwargs)
def update(self, key: str, value=None, delete: bool = False):
    if not self.allow(key):
        to_screen(f"{key} is not a recognized default variable, ignored")
        return
    dic = self.values
    if delete:
        if key not in dic:
            to_screen(f"key {key} not found in {self.name}, ignored")
        elif not self.act_append(key) or not value:  # delete the key when not append action
            del dic[key]
            to_screen(f"key {key} removed completely from {self.name} successfully")
        else:
            dic[key].remove(value)
            to_screen(f"{value} removed in {key} under {self.name} successfully")
    else:
        if self.act_append(key):
            def _append(dic, key, value):
                dic.setdefault(key, [])
                if value not in dic[key]:
                    dic[key].append(value)
            _append(dic, key, value)
            to_screen(f"{value} added to {key} under {self.name} successfully")
        else:
            dic[key] = value
            to_screen(f"{key} set to {value} under {self.name} successfully")
    if self.file:
        to_file(self.values, self.file)
def do_action_delete(self, args):
    if self.__clusters__.delete(args.cluster_alias):
        to_screen("cluster %s deleted" % args.cluster_alias)
    return None
def process_args(self, args):
    to_screen(f'Parsed arguments {args}', _type="debug")
    if not args.scene:
        self.parser.print_help()
        return
    return self.scenes[args.scene].process(args)
def process(self, a: list):
    to_screen(f'Received arguments {a}', _type="debug")
    args = self.parser.parse_args(a)
    return self.process_args(args)
def do_action_stop(self, args):
    client = self.__clusters__.get_client(args.cluster_alias)
    for job_name in args.job_names:
        to_screen(client.rest_api_execute_job(job_name, "STOP"))