def replace_initializing_runs(run_list: List[Run]):
    """
    Creates a list of runs with initializing runs replaced by fake runs created based
    on experiment data.

    A run is treated as initializing when its state is None or an empty string. The
    first initializing run met for an experiment is replaced with
    create_fake_run(experiment) and the experiment is remembered; every later run of
    that experiment in run_list is dropped. Runs of that experiment placed earlier in
    run_list than its first initializing run are kept.

    :param run_list: list of runs to be checked
    :return: list without runs that are initialized at the moment
    """
    initializing_experiments: set = set()
    ret_list = []

    for run in run_list:
        exp_name = run.experiment_name

        # The experiment is already represented by a fake run - drop this run
        # without querying for the experiment, the result would be discarded anyway.
        if exp_name in initializing_experiments:
            continue

        experiment = Experiment.get(name=exp_name, namespace=run.namespace)

        if run.state is None or run.state == '':
            ret_list.append(create_fake_run(experiment))
            initializing_experiments.add(exp_name)
        else:
            # Regular run - complete it with a template version taken from its experiment.
            run.template_version = experiment.template_version if experiment else None
            ret_list.append(run)

    return ret_list
def generate_exp_name_and_labels(script_name: str, namespace: str, name: str = None,
                                 run_kind: RunKinds = RunKinds.TRAINING) -> Tuple[str, Dict[str, str]]:
    """
    Chooses a name of an experiment that is going to be submitted and prepares its labels.

    :param script_name: path or name of a script; only its last path component is used
    :param namespace: namespace where experiments are looked up
    :param name: name given explicitly by a user, may be empty
    :param run_kind: kind of a run passed to functions preparing labels
    :return: tuple (experiment name, labels)
    :raises SubmitExperimentError: when name was given and an experiment with this name
        already exists (with or without runs) or pods labelled with this name are still present
    """
    if script_name:
        script_name = Path(script_name).name

    if not name:
        # CASE 2: no name given, but experiments based on the same script may exist -
        # reuse their name with a next index
        reused_name, reused_labels = generate_name_for_existing_exps(script_name, namespace, run_kind=run_kind)
        if reused_name:
            return reused_name, reused_labels

        # CASE 3: no name given and nothing to reuse - generate a new name and add
        # a numeric post-fix if experiments matching it are already there
        fresh_name = generate_name(script_name)
        matching = Experiment.list(namespace=namespace, name_filter=fresh_name)
        matching_count = len(matching) if matching else 0
        if matching_count > 0:
            fresh_name = f'{fresh_name}-{matching_count}'
        return fresh_name, prepare_label(script_name, fresh_name, run_kind=run_kind)

    # CASE 1: name given by a user - it is used as is, unless it is already taken
    existing = Experiment.get(namespace=namespace, name=name)
    if existing:
        if existing.get_runs():
            raise SubmitExperimentError(Texts.EXPERIMENT_ALREADY_EXISTS_ERROR_MSG.format(name=name))
        # subcase when experiment has no associated runs
        raise SubmitExperimentError(Texts.EXPERIMENT_INVALID_STATE_MSG.format(name=name))

    # artifacts of a previous experiment with the same name may still be there
    if list_pods(namespace=namespace, label_selector=f'runName={name}'):
        raise SubmitExperimentError(Texts.EXPERIMENT_PREV_EXP_STILL_TERMINATING)

    return name, prepare_label(script_name, name, name, run_kind=run_kind)
def cancel_experiment(exp_name: str, runs_to_cancel: List[Run], namespace: str) -> Tuple[List[Run], List[Run]]:
    """
    Cancels the given runs of an experiment and, if they are all of its not yet
    cancelled runs, marks the experiment itself as cancelled.

    The experiment is switched to CANCELLING before runs are cancelled and to CANCELLED
    afterwards, but only when the number of runs passed equals the number of runs of the
    experiment that are not in CANCELLED state. Otherwise experiment's state remains intact.

    :param exp_name: name of an experiment owning runs from runs_to_cancel
    :param runs_to_cancel: runs that should be cancelled, they have to belong to exp_name experiment
    :param namespace: namespace where experiment is located
    :return: two lists - runs cancelled successfully and runs that weren't cancelled;
        both are empty when cancelling of runs ended with an exception
    :raises RuntimeError: when the experiment cannot be found
    """
    logger.debug(f"Cancelling {exp_name} experiment ...")

    experiment = Experiment.get(name=exp_name, namespace=namespace)
    if not experiment:
        raise RuntimeError(Texts.GET_EXPERIMENT_ERROR_MSG)

    cancelled: List[Run] = []
    not_cancelled: List[Run] = []

    active_runs = Run.list(namespace=namespace, exp_name_filter=[exp_name], excl_state=RunStatus.CANCELLED)
    # the whole experiment is cancelled only when every active run is on the list
    whole_experiment = len(runs_to_cancel) == len(active_runs)

    if whole_experiment:
        experiment.state = ExperimentStatus.CANCELLING
        experiment.update()

    try:
        cancelled, not_cancelled = cancel_experiment_runs(runs_to_cancel=runs_to_cancel, namespace=namespace)

        if whole_experiment and not not_cancelled:
            try:
                experiment.state = ExperimentStatus.CANCELLED
                experiment.update()
            except Exception:
                # failure of the experiment update is only logged - runs are already
                # cancelled and a user has no way of removing them
                logger.exception("Error during cancelling Experiment resource.")
    except Exception:
        logger.exception("Error during cancelling experiment.")

    return cancelled, not_cancelled
def interact(ctx: click.Context, name: str, filename: str, pack_param: List[Tuple[str, str]], no_launch: bool,
             port_number: int, env: List[str], template: str):
    """
    Starts an interactive session with Jupyter Notebook.

    When an experiment called `name` already exists, the command connects to it; otherwise
    a new experiment of JUPYTER kind is submitted. In both cases the command waits for pods
    of the session to reach RUNNING status and then calls launch_app.

    Every failure path ends with sys.exit(1); a refusal given by a user on a confirmation
    prompt ends with sys.exit(0).

    :param ctx: click context (not referenced in the body, which uses click.get_current_context())
    :param name: name of a session; may be empty - then a name is taken from a created run
    :param filename: script passed to submit_experiment as script_location; giving it together
        with a name of an existing session is an error
    :param pack_param: passed to submit_experiment as pack_params
    :param no_launch: passed to launch_app
    :param port_number: passed to launch_app as port
    :param env: passed to submit_experiment as env_variables
    :param template: passed to submit_experiment as template
    """
    current_namespace = get_kubectl_current_context_namespace()

    # ask for a confirmation if a limit of running notebooks is exceeded
    jupyters_number = calculate_number_of_running_jupyters(current_namespace)
    if jupyters_number > ACCEPTED_NUMBER_OF_NOTEBOOKS:
        if not click.confirm(
                Texts.TOO_MANY_JUPYTERS.format(
                    jupyter_number=str(jupyters_number))):
            click.echo(Texts.INTERACT_ABORT_MSG)
            sys.exit(0)

    create_new_notebook = True
    jupyter_experiment = None

    if name:
        try:
            jupyter_experiment = Experiment.get(name=name,
                                                namespace=current_namespace)

            # a file cannot be given when the session already exists
            if jupyter_experiment and filename:
                handle_error(user_msg=Texts.FILENAME_BUT_SESSION_EXISTS)
                sys.exit(1)

            # for an existing session a script name is read from experiment's labels
            if jupyter_experiment:
                metadata = jupyter_experiment.metadata
                if metadata and metadata.get("labels") and metadata.get(
                        "labels").get("script_name"):
                    filename = metadata.get("labels").get("script_name")
        except Exception:
            handle_error(logger, Texts.EXPERIMENT_GET_ERROR_MSG,
                         Texts.EXPERIMENT_GET_ERROR_MSG)
            sys.exit(1)

        # if experiment exists and is not based on jupyter image - we need to ask a user to choose another name
        if jupyter_experiment and jupyter_experiment.template_name not in JUPYTER_NOTEBOOK_TEMPLATES_NAMES:
            handle_error(user_msg=Texts.NAME_ALREADY_USED.format(name=name))
            sys.exit(1)

        # if experiment exists but its state is neither SUBMITTED nor CREATING - display info about
        # a need of purging of this experiment
        if jupyter_experiment and jupyter_experiment.state not in \
                [ExperimentStatus.SUBMITTED, ExperimentStatus.CREATING]:
            handle_error(
                user_msg=Texts.EXP_WITH_THE_SAME_NAME_MUST_BE_PURGED.format(
                    name=name))
            sys.exit(1)

        # creation of a new session has to be confirmed unless the force flag is set
        if not jupyter_experiment and (
                not click.get_current_context().obj.force
                and not click.confirm(Texts.CONFIRM_EXPERIMENT_CREATION)):
            sys.exit(0)

        if jupyter_experiment:
            create_new_notebook = False
        else:
            # a name of a new session has to be validated
            try:
                check_experiment_name(value=name)
            except click.BadParameter as exe:
                handle_error(user_msg=str(exe))
                sys.exit(1)

    number_of_retries = 0
    if create_new_notebook:
        number_of_retries = 5
        try:
            exp_name = name
            # neither a name nor a file given - a name is generated from the "jup" prefix
            if not name and not filename:
                exp_name = generate_name("jup")

            click.echo(Texts.SUBMITTING_EXPERIMENT_USER_MSG)
            # NOTE: filename is overwritten with a value returned by submit_experiment
            runs, runs_errors, filename = submit_experiment(
                run_kind=RunKinds.JUPYTER,
                script_location=filename,
                script_folder_location=None,
                template=template,
                name=exp_name,
                parameter_range=[],
                parameter_set=(),
                script_parameters=(),
                pack_params=pack_param,
                env_variables=env)
            click.echo(
                tabulate(
                    {
                        RUN_NAME:
                        [run.cli_representation.name for run in runs],
                        RUN_PARAMETERS:
                        [run.cli_representation.parameters for run in runs],
                        RUN_STATUS:
                        [run.cli_representation.status for run in runs],
                        RUN_MESSAGE:
                        [runs_errors.get(run.name, "") for run in runs]
                    },
                    headers=[
                        RUN_NAME, RUN_PARAMETERS, RUN_STATUS, RUN_MESSAGE
                    ],
                    tablefmt=TBLT_TABLE_FORMAT))
            if runs:
                # from now on the session is identified by a name of the first run
                name = runs[0].name
            else:
                # run wasn't created - error
                raise RuntimeError("Run wasn't created")

        except K8sProxyCloseError as exe:
            handle_error(user_msg=exe.message)
            sys.exit(1)
        except SubmitExperimentError as exe:
            handle_error(
                logger,
                Texts.SUBMIT_ERROR_MSG.format(exception_message=exe.message),
                Texts.SUBMIT_ERROR_MSG.format(exception_message=exe.message))
            sys.exit(1)
        except Exception:
            # this handler also covers the RuntimeError raised above
            handle_error(logger, Texts.SUBMIT_OTHER_ERROR_MSG,
                         Texts.SUBMIT_OTHER_ERROR_MSG)
            sys.exit(1)
    else:
        # if jupyter service exists - the system only connects to it
        click.echo(Texts.SESSION_EXISTS_MSG)

    url_end = ""
    if filename:
        # only Jupyter notebooks are opened directly, other files are opened in edit mode
        url_end = f"/notebooks/output/experiment/"
        # for an existing session a .py name is replaced with a name of an .ipynb file
        if jupyter_experiment and filename.endswith(".py"):
            filename = filename[:filename.index(".py", -3)] + ".ipynb"
        if not filename.endswith(".ipynb"):
            url_end = "/edit/"
        url_end = url_end + Path(filename).name

    # wait until all jupyter pods are ready
    for i in range(JUPYTER_CHECK_POD_READY_TRIES):
        try:
            if check_pods_status(run_name=name,
                                 namespace=current_namespace,
                                 status=PodStatus.RUNNING):
                break
        except Exception:
            handle_error(logger, Texts.NOTEBOOK_STATE_CHECK_ERROR_MSG)
            sys.exit(1)
        time.sleep(1)
    else:
        # the loop ended without break - pods didn't reach RUNNING status in all tries
        handle_error(user_msg=Texts.NOTEBOOK_NOT_READY_ERROR_MSG)
        sys.exit(1)

    try:
        launch_app(k8s_app_name=NAUTAAppNames.JUPYTER,
                   app_name=name,
                   no_launch=no_launch,
                   number_of_retries=number_of_retries,
                   url_end=url_end,
                   port=port_number)
    except LaunchError as exe:
        handle_error(logger, exe.message, exe.message)
        sys.exit(1)
    except ProxyClosingError:
        handle_error(user_msg=Texts.PROXY_CLOSING_ERROR_MSG)
        sys.exit(1)
    except Exception:
        handle_error(logger, Texts.SESSION_LAUNCH_OTHER_ERROR_MSG,
                     Texts.SESSION_LAUNCH_OTHER_ERROR_MSG)
        sys.exit(1)
def cancel(state: State, name: str, match: str, purge: bool, pod_ids: str, pod_status: str,
           listed_runs_kinds: List[RunKinds] = None):
    """
    Cancels chosen experiments based on a name provided as a parameter.

    Exactly one of `name` and `match` has to be given. When `pod_ids` or `pod_status` is
    given, the work is delegated to cancel_pods_mode and the command ends with exit code 0.
    Otherwise matching runs are listed, a user is asked for a confirmation and runs are
    cancelled (or purged when `purge` is set) experiment by experiment.

    The command ends with exit code 1 on errors and when at least one run wasn't
    cancelled, and with exit code 0 when a user refuses the confirmation.

    :param state: state object of the command (not referenced in the body)
    :param name: name of an experiment; if no experiment of a listed kind has this name,
        it is used as an exact-match (^name$) filter of run names
    :param match: filter of run names, used when name is not given
    :param purge: if True runs are purged and runs in FAILED, COMPLETE and CANCELLED
        states are taken into account too
    :param pod_ids: passed to cancel_pods_mode
    :param pod_status: passed to cancel_pods_mode
    :param listed_runs_kinds: kinds of runs taken into account, TRAINING and JUPYTER by default
    """
    if not listed_runs_kinds:
        listed_runs_kinds = [RunKinds.TRAINING, RunKinds.JUPYTER]

    # check whether we have runs with a given name
    if name and match:
        handle_error(user_msg=Texts.NAME_M_BOTH_GIVEN_ERROR_MSG)
        sys.exit(1)

    if not name and not match:
        handle_error(user_msg=Texts.NAME_M_NONE_GIVEN_ERROR_MSG)
        sys.exit(1)

    current_namespace = get_current_namespace()

    if pod_ids or pod_status:
        if not name:
            name = match

        cancel_pods_mode(namespace=current_namespace, run_name=name, pod_ids=pod_ids, pod_status=pod_status)
        sys.exit(0)

    search_for_experiment = False
    exp_to_be_cancelled = None

    if name:
        exp_to_be_cancelled = Experiment.get(namespace=current_namespace, name=name)
        exp_to_be_cancelled_kind = RunKinds(exp_to_be_cancelled.metadata['labels'].get('runKind')) \
            if exp_to_be_cancelled else None
        # an experiment of a kind that isn't listed is treated as if it didn't exist
        exp_to_be_cancelled = exp_to_be_cancelled if exp_to_be_cancelled_kind in listed_runs_kinds else None

        if exp_to_be_cancelled:
            search_for_experiment = True
        else:
            # no such experiment - look for a run with exactly this name
            name = f"^{name}$"
    else:
        name = match

    list_of_all_runs = None

    list_of_applicable_states = [RunStatus.QUEUED, RunStatus.RUNNING]

    if purge:
        list_of_applicable_states.extend(
            [RunStatus.FAILED, RunStatus.COMPLETE, RunStatus.CANCELLED])

    try:
        if search_for_experiment:
            list_of_all_runs = Run.list(namespace=current_namespace, exp_name_filter=[name],
                                        run_kinds_filter=listed_runs_kinds)
        else:
            list_of_all_runs = Run.list(namespace=current_namespace, name_filter=name,
                                        run_kinds_filter=listed_runs_kinds)
    except Exception:
        list_runs_error_msg = Texts.LIST_RUNS_ERROR_MSG.format(
            experiment_name_plural=experiment_name_plural)
        handle_error(logger, list_runs_error_msg, list_runs_error_msg)
        sys.exit(1)

    # Handle cancellation of experiments with no associated Runs
    if exp_to_be_cancelled and not list_of_all_runs:
        cancel_uninitialized_experiment(experiment=exp_to_be_cancelled, namespace=current_namespace,
                                        purge=purge)

    if not list_of_all_runs:
        handle_error(user_msg=Texts.LACK_OF_EXPERIMENTS_ERROR_MSG.format(
            experiment_name_plural=experiment_name_plural,
            experiment_name=experiment_name))
        sys.exit(1)
    elif not purge and not any(run.state in (RunStatus.QUEUED, RunStatus.RUNNING)
                               for run in list_of_all_runs):
        handle_error(
            user_msg=Texts.LACK_OF_EXP_TO_BE_CANCELLED_ERROR_MSG.format(
                experiment_name_plural=experiment_name_plural))
        sys.exit(1)

    # Words describing the operation are the same in all messages below, so they
    # are chosen once: "delete" wording for pods, "cancel" wording otherwise.
    if experiment_name_plural == 'pods':
        done_word = Texts.DELETE_OPERATION["deleted"]
        action_word = Texts.DELETE_OPERATION["deletion"]
    else:
        done_word = Texts.CANCEL_OPERATION["cancelled"]
        action_word = Texts.CANCEL_OPERATION["cancellation"]

    list_of_runs_to_be_deleted: List[Run] = []
    names_of_cancelled_runs: List[str] = []

    if not purge:
        # check whether we have at least one experiment in state other than CANCELLED
        for run in list_of_all_runs:
            if run.state in list_of_applicable_states:
                list_of_runs_to_be_deleted.append(run)
            else:
                names_of_cancelled_runs.append(run.name)

        if not list_of_runs_to_be_deleted:
            handle_error(
                user_msg=Texts.EXPERIMENTS_ALREADY_CANCELLED_ERROR_MSG.format(
                    experiment_name_plural=experiment_name_plural,
                    operation_word=done_word))
            sys.exit(1)
        elif len(list_of_runs_to_be_deleted) != len(list_of_all_runs):
            click.echo(
                Texts.ALREADY_CANCELLED_LIST_HEADER.format(
                    experiment_name_plural=experiment_name_plural,
                    operation_word=done_word))
            # a separate loop variable - the `name` parameter must not be rebound here
            for cancelled_run_name in names_of_cancelled_runs:
                click.echo(f" - {cancelled_run_name}")
            click.echo(
                Texts.CAN_BE_CANCELLED_LIST_HEADER.format(
                    experiment_name_plural=experiment_name_plural,
                    operation_word=done_word))
            for run in list_of_runs_to_be_deleted:
                click.echo(f" - {run.name}")
        else:
            click.echo(
                Texts.WILL_BE_CANCELLED_LIST_HEADER.format(
                    experiment_name_plural=experiment_name_plural,
                    operation_word=done_word))
            for run in list_of_runs_to_be_deleted:
                click.echo(f" - {run.name}")
    else:
        list_of_runs_to_be_deleted = list_of_all_runs
        click.echo(
            Texts.WILL_BE_PURGED_LIST_HEADER.format(
                experiment_name_plural=experiment_name_plural,
                operation_word=done_word))
        for run in list_of_runs_to_be_deleted:
            click.echo(f" - {run.name}")

    if not click.confirm(
            Texts.CONFIRM_CANCEL_MSG.format(
                experiment_name_plural=experiment_name_plural,
                operation_word=action_word)):
        handle_error(user_msg=Texts.CANCELLATION_ABORTED_MSG.format(
            experiment_name_plural=experiment_name_plural,
            operation_word=action_word))
        sys.exit(0)

    # group runs by experiments
    exp_with_runs: defaultdict = defaultdict(list)

    for run in list_of_runs_to_be_deleted:
        exp_with_runs[run.experiment_name].append(run)

    deleted_runs = []
    not_deleted_runs = []

    if purge:
        # Connect to elasticsearch in order to purge run logs
        try:
            with K8sProxy(NAUTAAppNames.ELASTICSEARCH) as proxy:
                es_client = K8sElasticSearchClient(
                    host="127.0.0.1",
                    port=proxy.tunnel_port,
                    verify_certs=False,
                    use_ssl=False,
                    with_admin_privledges=is_current_user_administrator())
                for exp_name, run_list in exp_with_runs.items():
                    try:
                        exp_del_runs, exp_not_del_runs = purge_experiment(
                            exp_name=exp_name,
                            runs_to_purge=run_list,
                            namespace=current_namespace,
                            k8s_es_client=es_client)
                        deleted_runs.extend(exp_del_runs)
                        not_deleted_runs.extend(exp_not_del_runs)
                    except Exception:
                        # a failure of one experiment doesn't stop processing of the others
                        handle_error(logger, Texts.OTHER_CANCELLING_ERROR_MSG)
                        not_deleted_runs.extend(run_list)
        except K8sProxyCloseError:
            handle_error(logger, Texts.PROXY_CLOSING_ERROR_LOG_MSG,
                         Texts.PROXY_CLOSING_ERROR_USER_MSG)
            sys.exit(1)
        except LocalPortOccupiedError as exe:
            handle_error(
                logger, Texts.PORT_OCCUPIED_ERROR_LOG_MSG,
                Texts.PORT_OCCUPIED_ERROR_USER_MSG.format(
                    exception_message=exe.message))
            sys.exit(1)
        except K8sProxyOpenError:
            handle_error(logger, Texts.PROXY_OPEN_ERROR_MSG,
                         Texts.PROXY_OPEN_ERROR_MSG)
            sys.exit(1)
    else:
        for exp_name, run_list in exp_with_runs.items():
            try:
                exp_del_runs, exp_not_del_runs = cancel_experiment(
                    exp_name=exp_name,
                    runs_to_cancel=run_list,
                    namespace=current_namespace)
                deleted_runs.extend(exp_del_runs)
                not_deleted_runs.extend(exp_not_del_runs)
            except Exception:
                # a failure of one experiment doesn't stop processing of the others
                handle_error(logger, Texts.OTHER_CANCELLING_ERROR_MSG)
                not_deleted_runs.extend(run_list)

    if deleted_runs:
        click.echo(
            Texts.SUCCESSFULLY_CANCELLED_LIST_HEADER.format(
                experiment_name_plural=experiment_name_plural,
                operation_word=done_word))
        for run in deleted_runs:
            click.echo(f" - {run.name}")

    if not_deleted_runs:
        click.echo(
            Texts.FAILED_TO_CANCEL_LIST_HEADER.format(
                experiment_name_plural=experiment_name_plural,
                operation_word=done_word))
        for run in not_deleted_runs:
            click.echo(f" - {run.name}")
        sys.exit(1)
def purge_experiment(exp_name: str, runs_to_purge: List[Run],
                     k8s_es_client: K8sElasticSearchClient,
                     namespace: str) -> Tuple[List[Run], List[Run]]:
    """
    Purges the given runs of an experiment and, if they are all of its runs, the
    experiment itself.

    Runs are cancelled first; for every cancelled run its helm release and its "run"
    object are deleted and - for an administrator - its logs are removed. When the number
    of runs passed equals the number of all runs of the experiment, the experiment is
    additionally switched to CANCELLING, its workflows and its git tag are removed and -
    if no run failed to be purged - the "experiment" object is deleted. Otherwise
    experiment's state remains intact.

    :param exp_name: name of an experiment owning runs from runs_to_purge
    :param runs_to_purge: list of runs that should be purged, they have to belong to exp_name experiment
    :param k8s_es_client: Kubernetes ElasticSearch client
    :param namespace: namespace where experiment is located
    :return: two lists - runs purged successfully and runs that weren't purged
    :raises RuntimeError: when the experiment cannot be found
    """
    logger.debug(f"Purging {exp_name} experiment ...")

    experiment = Experiment.get(name=exp_name, namespace=namespace)
    if not experiment:
        raise RuntimeError(Texts.GET_EXPERIMENT_ERROR_MSG)

    purged: List[Run] = []
    not_purged: List[Run] = []

    all_runs = Run.list(namespace=namespace, exp_name_filter=[exp_name])
    # the whole experiment is purged only when every its run is on the list
    whole_experiment = len(runs_to_purge) == len(all_runs)

    if whole_experiment:
        experiment.state = ExperimentStatus.CANCELLING
        experiment.update()

    try:
        # runs that weren't cancelled are reported as not purged
        cancelled, not_purged = cancel_experiment_runs(runs_to_cancel=runs_to_purge, namespace=namespace)

        if whole_experiment:
            # Collect workflows of the experiment first and delete them afterwards
            workflows_to_delete = []
            for workflow in ArgoWorkflow.list(namespace=namespace):
                if workflow.labels.get('experimentName') == experiment.name:
                    workflows_to_delete.append(workflow)
            for workflow in workflows_to_delete:
                workflow.delete()

            # Remove tags from git repo manager
            try:
                delete_exp_tag_from_git_repo_manager(experiment_name=experiment.name,
                                                     username=namespace,
                                                     experiments_workdir=get_run_environment_path(''))
            except Exception:
                handle_error(logger, Texts.GIT_REPO_MANAGER_ERROR_MSG, Texts.GIT_REPO_MANAGER_ERROR_MSG)
                raise

        for run in cancelled:
            run_name = run.name
            logger.debug(f"Purging {run_name} run ...")
            click.echo(Texts.PURGING_START_MSG.format(run_name=run_name))

            try:
                with spinner(text=Texts.PURGING_PROGRESS_MSG.format(run_name=run_name)):
                    # helm release goes first, the run object afterwards
                    delete_helm_release(run_name, namespace=namespace, purge=True)
                    kubectl.delete_k8s_object("run", run_name)
                    purged.append(run)
            except Exception as purge_error:
                not_purged.append(run)
                logger.exception("Error during purging runs.")
                # a NotFound error may mean that the run has been removed earlier -
                # only other errors stop the whole operation
                if "NotFound" not in str(purge_error):
                    # NOTE(review): `experiment_name` is not defined in this function -
                    # presumably a module-level name; confirm
                    click.echo(Texts.INCOMPLETE_PURGE_ERROR_MSG.format(experiment_name=experiment_name))
                    raise purge_error

            try:
                # logs are cleared for administrators only
                if is_current_user_administrator():
                    logger.debug(f"Clearing logs for {run_name} run.")
                    with spinner(text=Texts.PURGING_LOGS_PROGRESS_MSG.format(run_name=run_name)):
                        k8s_es_client.delete_logs_for_run(run=run_name, namespace=namespace)
            except Exception:
                logger.exception("Error during clearing run logs.")

            # CAN-1099 - docker garbage collector has errors that prevent from correct
            # removal of images, so images of a run are not removed from docker registry
            # here (the disabled call was delete_images_for_experiment(exp_name=run.name)).

        if whole_experiment and not not_purged:
            try:
                kubectl.delete_k8s_object("experiment", exp_name)
            except Exception:
                # failure of experiment's removal is only logged - runs are already
                # cancelled and a user has no way of removing them
                logger.exception("Error during purging experiment.")
    except Exception:
        logger.exception("Error during purging experiment.")

    return purged, not_purged
def view(context, state: State, experiment_name: str, tensorboard: bool, username: str):
    """
    Displays details of an experiment.

    Prints a table describing the run, a table of pods participating in it (with their
    conditions or events and details of containers) and a summary of cpu/memory requests
    and limits of all containers. If there are pending pods whose events mention
    insufficient cpu or memory, names of top resource consumers are printed too.

    Exit codes: 2 - run not found, 1 - unexpected error, 0 - pending pods exist but none
    of their events mentions insufficient resources.

    :param context: click context used to invoke the tensorboard command
    :param state: state object of the command (not referenced in the body)
    :param experiment_name: name of a run/experiment to be displayed
    :param tensorboard: if True the tensorboard command is invoked for the experiment
    :param username: namespace to be searched; namespace of the current kubectl context
        is used when empty
    """
    try:
        if username:
            namespace = username
        else:
            namespace = get_kubectl_current_context_namespace()

        run = Run.get(name=experiment_name, namespace=namespace)
        if not run:
            handle_error(user_msg=Texts.EXPERIMENT_NOT_FOUND_ERROR_MSG.format(
                experiment_name=experiment_name))
            exit(2)

        experiment = Experiment.get(name=experiment_name, namespace=namespace)
        if experiment:
            run.template_version = experiment.template_version

        click.echo(
            tabulate([run.cli_representation],
                     headers=EXPERIMENTS_LIST_HEADERS,
                     tablefmt=TBLT_TABLE_FORMAT))

        click.echo(Texts.PODS_PARTICIPATING_LIST_HEADER)

        pods = get_namespaced_pods(label_selector="runName=" + experiment_name,
                                   namespace=namespace)

        tabular_output = []
        containers_resources = []
        pending_pods = []

        for pod in pods:
            status_string = ""

            if pod.status.conditions:
                for cond in pod.status.conditions:
                    msg = "\n" if not cond.reason else "\n reason: " + \
                        wrap_text(cond.reason, width=POD_CONDITIONS_MAX_WIDTH)
                    msg = msg + ", \n message: " + wrap_text(cond.message, width=POD_CONDITIONS_MAX_WIDTH) \
                        if cond.message else msg
                    status_string += wrap_text(
                        cond.type + ": " + cond.status,
                        width=POD_CONDITIONS_MAX_WIDTH) + msg + "\n"
            else:
                # no conditions reported - events of the pod are shown instead
                pod_events = get_pod_events(namespace=namespace,
                                            name=pod.metadata.name)

                for event in pod_events:
                    msg = "\n" if not event.reason else "\n reason: " + \
                        wrap_text(event.reason, width=POD_CONDITIONS_MAX_WIDTH)
                    msg = msg + ", \n message: " + wrap_text(event.message, width=POD_CONDITIONS_MAX_WIDTH) \
                        if event.message else msg
                    status_string += msg + "\n"

            if pod.status.phase.upper() == PodStatus.PENDING.value:
                pending_pods.append(pod.metadata.name)

            # containers without a reported status are mapped to None
            container_statuses = defaultdict(lambda: None)  # type: ignore
            if pod.status.container_statuses:
                for container_status in pod.status.container_statuses:
                    container_statuses[
                        container_status.name] = container_status.state

            container_details = []

            for container in pod.spec.containers:
                container_description = Texts.CONTAINER_DETAILS_MSG.format(
                    name=container.name,
                    status=container_status_to_msg(
                        container_statuses[container.name]),
                    volumes=container_volume_mounts_to_msg(
                        container.volume_mounts, spaces=2),
                    resources=container_resources_to_msg(container.resources,
                                                         spaces=4))
                container_details.append(container_description)
                containers_resources.append(container.resources)

            container_details_string = ''.join(container_details)

            tabular_output.append([
                pod.metadata.name,
                wrap_text(pod.metadata.uid, width=UID_MAX_WIDTH, spaces=0),
                status_string, container_details_string
            ])
        click.echo(
            tabulate(tabular_output,
                     Texts.PODS_TABLE_HEADERS,
                     tablefmt=TBLT_TABLE_FORMAT))

        try:
            cpu_requests_sum = sum_cpu_resources([
                container_resource.requests["cpu"]
                for container_resource in containers_resources
                if container_resource.requests
                and container_resource.requests.get("cpu")
            ])
            mem_requests_sum = sum_mem_resources([
                container_resource.requests["memory"]
                for container_resource in containers_resources
                if container_resource.requests
                and container_resource.requests.get("memory")
            ])
            cpu_limits_sum = sum_cpu_resources([
                container_resource.limits["cpu"]
                for container_resource in containers_resources
                if container_resource.limits
                and container_resource.limits.get("cpu")
            ])
            mem_limits_sum = sum_mem_resources([
                container_resource.limits["memory"]
                for container_resource in containers_resources
                if container_resource.limits
                and container_resource.limits.get("memory")
            ])
        except ValueError as exception:
            parsing_error_msg = Texts.RESOURCES_SUM_PARSING_ERROR_MSG.format(
                error_msg=str(exception))
            handle_error(logger, parsing_error_msg, parsing_error_msg)
        else:
            # The summary is printed only when all sums were computed. Printing it
            # after a parsing error would read unassigned variables and turn the
            # already reported problem into a generic failure of the whole command.
            click.echo(Texts.RESOURCES_SUM_LIST_HEADER)
            click.echo(
                tabulate(list(
                    zip(Texts.RESOURCES_SUM_TABLE_ROWS_HEADERS, [
                        cpu_requests_sum, mem_requests_sum, cpu_limits_sum,
                        mem_limits_sum
                    ])),
                         Texts.RESOURCES_SUM_TABLE_HEADERS,
                         tablefmt=TBLT_TABLE_FORMAT))

        if tensorboard:
            click.echo()
            context.invoke(tensorboard_command,
                           experiment_name=[experiment_name])

        if pending_pods:
            click.echo()
            try:
                cpu = False
                memory = False
                for pod in pending_pods:
                    events_list = get_pod_events(namespace=namespace, name=pod)
                    for event in events_list:
                        # an event may carry no message at all
                        event_message = (event.message or "").lower()
                        if "insufficient cpu" in event_message:
                            cpu = True
                        elif "insufficient memory" in event_message:
                            memory = True
                        if cpu and memory:
                            break
                    if cpu and memory:
                        break

                if not cpu and not memory:
                    exit(0)

                if cpu and memory:
                    resources = "number of cpus and amount of memory"
                elif cpu:
                    resources = "number of cpus"
                else:
                    resources = "amount of memory"

                click.echo(
                    Texts.INSUFFICIENT_RESOURCES_MESSAGE.format(
                        resources=resources))
                click.echo()
                top_cpu_users, top_mem_users = get_highest_usage()
                # at most three top consumers of each resource are shown
                click.echo(
                    Texts.TOP_CPU_CONSUMERS.format(consumers=", ".join(
                        [res.user_name for res in top_cpu_users[:3]])))
                click.echo(
                    Texts.TOP_MEMORY_CONSUMERS.format(consumers=", ".join(
                        [res.user_name for res in top_mem_users[:3]])))
            except Exception:
                click.echo(Texts.PROBLEMS_WHILE_GATHERING_USAGE_DATA)
                logger.exception(
                    Texts.PROBLEMS_WHILE_GATHERING_USAGE_DATA_LOGS)

    except Exception:
        handle_error(logger, Texts.VIEW_OTHER_ERROR_MSG,
                     Texts.VIEW_OTHER_ERROR_MSG)
        exit(1)