def test_show_logs_match(mocker):
    es_client_mock = mocker.patch('commands.common.logs_utils.K8sElasticSearchClient')
    es_client_instance = es_client_mock.return_value
    es_client_instance.get_experiment_logs_generator.return_value = TEST_LOG_ENTRIES
    get_kubectl_host_mock = mocker.patch('commands.common.logs_utils.get_kubectl_host')
    get_api_key_mock = mocker.patch('commands.common.logs_utils.get_api_key')
    get_current_namespace_mock = mocker.patch('commands.common.logs_utils.get_kubectl_current_context_namespace')

    fake_experiment_1_name = 'fake-experiment-1'
    fake_experiment_2_name = 'fake-experiment-2'
    list_runs_mock = mocker.patch('commands.common.logs_utils.Run.list')
    list_runs_mock.return_value = [
        Run(name=fake_experiment_1_name, experiment_name=fake_experiment_1_name),
        Run(name=fake_experiment_2_name, experiment_name=fake_experiment_2_name)
    ]

    runner = CliRunner()
    result = runner.invoke(logs.logs, ['-m', 'fake-experiment'])

    assert get_kubectl_host_mock.call_count == 1, 'kubectl host was not retrieved'
    assert get_api_key_mock.call_count == 1, 'k8s api key was not retrieved'
    assert get_current_namespace_mock.call_count == 1, 'namespace was not retrieved'
    assert list_runs_mock.call_count == 1, 'run was not retrieved'
    assert es_client_instance.get_experiment_logs_generator.call_count == 2, 'Experiment logs were not retrieved'
    assert fake_experiment_1_name in result.output
    assert fake_experiment_2_name in result.output
def prepare_list_of_runs(parameter_range: List[Tuple[str, str]], experiment_name: str,
                         parameter_set: Tuple[str, ...], template_name: str) -> List[Run]:
    run_list: List[Run] = []

    if not parameter_range and not parameter_set:
        run_list = [Run(name=experiment_name,
                        experiment_name=experiment_name,
                        pod_selector={'matchLabels': {'app': template_name,
                                                      'release': experiment_name}})]
    else:
        list_of_range_parameters: List[Tuple[str, ...]] = [("",)]
        list_of_set_parameters = [("",)]

        if parameter_range:
            list_of_range_parameters = analyze_pr_parameters_list(parameter_range)

        if parameter_set:
            list_of_set_parameters = analyze_ps_parameters_list(parameter_set)

        run_index = 1
        for set_param in list_of_set_parameters:
            for range_param in list_of_range_parameters:
                current_run_name = experiment_name + "-" + str(run_index)
                current_params: Tuple[str, ...] = ()

                if len(set_param) >= 1 and set_param[0]:
                    current_params = set_param

                if len(range_param) >= 1 and range_param[0]:
                    current_params = current_params + range_param

                run_list.append(Run(name=current_run_name,
                                    experiment_name=experiment_name,
                                    parameters=current_params,
                                    pod_selector={'matchLabels': {'app': template_name,
                                                                  'release': current_run_name}}))
                run_index += 1

    return run_list
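
# Illustrative usage sketch (not part of the original module; the literal
# parameter formats mirror the tests in this section, and the exact expansion
# is produced by analyze_pr_parameters_list/analyze_ps_parameters_list):
#
#   runs = prepare_list_of_runs(parameter_range=[("param1", "{0, 1}")],
#                               experiment_name="exp",
#                               parameter_set=("{param2:0}", "{param2:1}"),
#                               template_name="tf-training")
#   # Expected: 4 runs named exp-1 .. exp-4, covering the cross product
#   # ("param2=0"/"param2=1") x ("param1=0"/"param1=1"), with set parameters
#   # placed before range parameters in each Run's parameters tuple.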
def generate_name_for_existing_exps(script_name: str, namespace: str,
                                    run_kind: RunKinds = RunKinds.TRAINING) -> Tuple[Optional[str], Dict[str, str]]:
    exp_list = list_k8s_experiments_by_label(namespace=namespace,
                                             label_selector=f"script_name={script_name},name_origin")
    if not exp_list:
        return None, {}

    # 1. Find the newest experiment
    newest_exp = None
    for exp in exp_list:
        if not newest_exp or exp.metadata.creation_timestamp > newest_exp.metadata.creation_timestamp:
            newest_exp = exp

    name_origin = newest_exp.metadata.labels['name_origin']

    names_of_experiments_with_the_same_origin = []
    for exp in exp_list:
        if exp.metadata.labels['name_origin'] == name_origin:
            names_of_experiments_with_the_same_origin.append(exp.metadata.name)

    # 2. Count experiments (runs) matching the same origin name of an experiment
    runs_of_exp_list = Run.list(namespace=namespace,
                                exp_name_filter=names_of_experiments_with_the_same_origin)
    counter = 1
    if runs_of_exp_list:
        counter = len(runs_of_exp_list) + 1

    calculated_name = f"{name_origin}-{counter}"
    return calculated_name, prepare_label(script_name, calculated_name, name_origin, run_kind=run_kind)
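
# Hedged example of the naming scheme implemented above: if the newest
# experiment for script "mnist.py" carries the label name_origin=mnist and
# experiments with that origin already own three Runs in the namespace, the
# returned name is "mnist-4" (counter = number of existing runs + 1). The
# label values here are illustrative only.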
def list(cls, namespace: str = None, custom_objects_api: CustomObjectsApi = None):
    """
    Return a list of users.
    :param namespace: optional namespace filter (currently unused - users are listed cluster-wide)
    :param custom_objects_api: API client to use; defaults to the platform resource API client
    :return: list of User objects
    """
    logger.debug('Listing users.')
    k8s_custom_object_api = custom_objects_api if custom_objects_api else PlatformResourceApiClient.get()

    raw_users = k8s_custom_object_api.list_cluster_custom_object(group=cls.api_group_name,
                                                                 plural=cls.crd_plural_name,
                                                                 version=cls.crd_version)
    users = [User.from_k8s_response_dict(user_dict) for user_dict in raw_users['items']]

    # Get experiment runs for each user
    # TODO: CHANGE IMPLEMENTATION TO USE AGGREGATED USER DATA AFTER CAN-366
    runs = Run.list(custom_objects_api=k8s_custom_object_api)
    user_map = {user.name: user for user in users}

    for run in runs:
        if user_map.get(run.namespace):
            user_map[run.namespace].experiment_runs.append(run)
        else:
            logger.error(f"Run exists for non-existing user {run.namespace}")

    return users
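
# Hedged usage sketch (assumes a configured cluster connection; the attribute
# names match those referenced above):
#
#   for user in User.list():
#       print(user.name, len(user.experiment_runs))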
def test_show_logs_failure(mocker):
    es_client_mock = mocker.patch('commands.common.logs_utils.K8sElasticSearchClient')
    es_client_instance = es_client_mock.return_value
    es_client_instance.get_experiment_logs_generator.side_effect = RuntimeError
    get_kubectl_host_mock = mocker.patch('commands.common.logs_utils.get_kubectl_host')
    get_api_key_mock = mocker.patch('commands.common.logs_utils.get_api_key')
    get_current_namespace_mock = mocker.patch('commands.common.logs_utils.get_kubectl_current_context_namespace')

    fake_experiment_name = 'fake-experiment'
    list_runs_mock = mocker.patch('commands.common.logs_utils.Run.list')
    list_runs_mock.return_value = [Run(name=fake_experiment_name, experiment_name=fake_experiment_name)]

    runner = CliRunner()
    result = runner.invoke(logs.logs, [fake_experiment_name])

    assert get_kubectl_host_mock.call_count == 1, 'kubectl host was not retrieved'
    assert get_api_key_mock.call_count == 1, 'k8s api key was not retrieved'
    assert get_current_namespace_mock.call_count == 1, 'namespace was not retrieved'
    assert list_runs_mock.call_count == 1, 'run was not retrieved'
    assert es_client_instance.get_experiment_logs_generator.call_count == 1, \
        'Experiment logs retrieval was not called'
    assert result.exit_code == 1
def test_show_logs_to_file_failure(mocker):
    es_client_mock = mocker.patch('commands.common.logs_utils.K8sElasticSearchClient')
    es_client_instance = es_client_mock.return_value
    es_client_instance.get_experiment_logs_generator.return_value = TEST_LOG_ENTRIES
    get_kubectl_host_mock = mocker.patch('commands.common.logs_utils.get_kubectl_host')
    get_api_key_mock = mocker.patch('commands.common.logs_utils.get_api_key')
    get_current_namespace_mock = mocker.patch('commands.common.logs_utils.get_kubectl_current_context_namespace')

    fake_experiment_name = 'fake-experiment'
    list_runs_mock = mocker.patch('commands.common.logs_utils.Run.list')
    list_runs_mock.return_value = [Run(name=fake_experiment_name, experiment_name=fake_experiment_name)]

    runner = CliRunner()
    m = mock_open()
    with patch("builtins.open", m) as open_mock:
        exception = RuntimeError()
        exception.message = "Cause of an error"
        open_mock.return_value.__enter__.side_effect = exception
        result = runner.invoke(logs.logs, ['fake-experiment', '-o'], input='y')

    assert CmdsCommonTexts.LOGS_STORING_ERROR.format(exception_message=exception.message) in result.output
    assert get_kubectl_host_mock.call_count == 1, 'kubectl host was not retrieved'
    assert get_api_key_mock.call_count == 1, 'k8s api key was not retrieved'
    assert get_current_namespace_mock.call_count == 1, 'namespace was not retrieved'
    assert list_runs_mock.call_count == 1, 'run was not retrieved'
    assert es_client_instance.get_experiment_logs_generator.call_count == 1, 'Experiment logs were not retrieved'
def create_fake_run(experiment: Experiment) -> Run:
    return Run(name=experiment.name,
               experiment_name=experiment.name,
               metrics={},
               parameters=experiment.parameters_spec,
               pod_count=0,
               pod_selector={},
               state=RunStatus.CREATING,
               namespace=experiment.namespace,
               creation_timestamp=experiment.creation_timestamp,
               template_name=experiment.template_name,
               template_version=experiment.template_version)
def test_show_logs_to_file_success(mocker):
    es_client_mock = mocker.patch('commands.common.logs_utils.K8sElasticSearchClient')
    es_client_instance = es_client_mock.return_value
    es_client_instance.get_experiment_logs_generator.return_value = TEST_LOG_ENTRIES
    get_kubectl_host_mock = mocker.patch('commands.common.logs_utils.get_kubectl_host')
    get_api_key_mock = mocker.patch('commands.common.logs_utils.get_api_key')
    get_current_namespace_mock = mocker.patch('commands.common.logs_utils.get_kubectl_current_context_namespace')

    fake_experiment_name = 'fake-experiment'
    list_runs_mock = mocker.patch('commands.common.logs_utils.Run.list')
    list_runs_mock.return_value = [Run(name=fake_experiment_name, experiment_name=fake_experiment_name)]

    runner = CliRunner()
    m = mock_open()
    with patch("builtins.open", m) as open_mock:
        runner.invoke(logs.logs, ['fake-experiment', '-o'], input='y')

    assert get_kubectl_host_mock.call_count == 1, 'kubectl host was not retrieved'
    assert get_api_key_mock.call_count == 1, 'k8s api key was not retrieved'
    assert get_current_namespace_mock.call_count == 1, 'namespace was not retrieved'
    assert list_runs_mock.call_count == 1, 'run was not retrieved'
    assert es_client_instance.get_experiment_logs_generator.call_count == 1, 'Experiment logs were not retrieved'
    assert open_mock.call_count == 1, "File wasn't saved."
def test_show_logs_failure_proxy_problem(mocker, exception):
    es_client_mock = mocker.patch('commands.experiment.logs.K8sElasticSearchClient')
    es_client_instance = es_client_mock.return_value
    es_client_instance.get_experiment_logs_generator.side_effect = RuntimeError
    proxy_mock = mocker.patch.object(logs, 'K8sProxy')
    proxy_mock.side_effect = exception
    get_current_namespace_mock = mocker.patch('commands.experiment.logs.get_kubectl_current_context_namespace')

    fake_experiment_name = 'fake-experiment'
    list_runs_mock = mocker.patch('commands.experiment.logs.Run.list')
    list_runs_mock.return_value = [Run(name=fake_experiment_name, experiment_name=fake_experiment_name)]

    runner = CliRunner()
    result = runner.invoke(logs.logs, [fake_experiment_name])

    assert proxy_mock.call_count == 1, 'port forwarding was not initiated'
    assert get_current_namespace_mock.call_count == 0, 'namespace was retrieved'
    assert list_runs_mock.call_count == 0, 'run was retrieved'
    assert es_client_instance.get_experiment_logs_generator.call_count == 0, \
        'Experiment logs retrieval was called'
    assert result.exit_code == 1
def stream(state: State, name: str, data: str, method_verb: InferenceVerb):
    """
    Perform stream inference task on launched prediction instance.
    """
    method_verb = InferenceVerb(method_verb)
    try:
        namespace = get_kubectl_current_context_namespace()
        # TODO: check if kind field of inference instance Run is correct
        inference_instance = Run.get(name=name, namespace=namespace)
        if not inference_instance:
            handle_error(user_msg=Texts.INSTANCE_NOT_EXISTS_ERROR_MSG.format(name=name))
            exit(1)
        if inference_instance.state != RunStatus.RUNNING:
            handle_error(user_msg=Texts.INSTANCE_NOT_RUNNING_ERROR_MSG.format(
                name=name, running_code=RunStatus.RUNNING.value))
            exit(1)
        inference_instance_url = get_inference_instance_url(inference_instance=inference_instance)
        stream_url = f'{inference_instance_url}:{method_verb.value}'
    except Exception:
        handle_error(logger,
                     Texts.INSTANCE_GET_FAIL_ERROR_MSG.format(name=name),
                     Texts.INSTANCE_GET_FAIL_ERROR_MSG.format(name=name),
                     add_verbosity_msg=state.verbosity == 0)
        exit(1)

    try:
        with open(data, 'r', encoding='utf-8') as data_file:
            stream_data = json.load(data_file)
    except (json.JSONDecodeError, IOError):
        handle_error(logger,
                     Texts.JSON_LOAD_ERROR_MSG.format(data=data),
                     Texts.JSON_LOAD_ERROR_MSG.format(data=data))
        exit(1)

    try:
        api_key = get_api_key()
        headers = {'Authorization': api_key,
                   'Accept': 'application/json',
                   'Content-Type': 'application/json'}
        with spinner(text=Texts.WAITING_FOR_RESPONSE_MSG):
            stream_response = requests.post(stream_url,
                                            data=json.dumps(stream_data),
                                            verify=False,  # nosec - request to k8s cluster
                                            headers=headers)
        stream_response.raise_for_status()
        click.echo(stream_response.text)
    except Exception as e:
        error_msg = Texts.INFERENCE_OTHER_ERROR_MSG.format(exception=e)
        if hasattr(e, 'response'):
            error_msg += Texts.INFERENCE_ERROR_RESPONSE_MSG.format(response_text=e.response.text)  # type: ignore
        handle_error(logger, error_msg, error_msg)
        exit(1)
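
# A minimal invocation sketch, assuming a TF Serving-compatible endpoint; the
# payload schema, file name, command path and flag spellings below are
# illustrative assumptions, not taken from the original source:
#
#   $ cat data.json
#   {"instances": [[0.0, 0.1, 0.2]]}
#   $ nctl predict stream --name my-inference --data data.json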
def test_list_runs_from_namespace(mock_k8s_api_client: CustomObjectsApi):
    raw_runs_single_namespace = dict(LIST_RUNS_RESPONSE_RAW)
    raw_runs_single_namespace['items'] = [raw_runs_single_namespace['items'][0]]
    mock_k8s_api_client.list_namespaced_custom_object.return_value = raw_runs_single_namespace

    runs = Run.list(namespace='namespace-1')

    assert [TEST_RUNS[0]] == runs
def test_create_list_of_runs_ps_only(mocker):
    experiment_name = "experiment_name"
    template_name = "template_name"
    mocker.patch("platform_resources.experiment_utils.generate_exp_name_and_labels",
                 side_effect=[(experiment_name, {})])

    multiple_two_params = ("{param1:0, param2:1}", "{param1:2,param3:3}")
    multiple_two_params_list_result = \
        [Run(name=experiment_name + "-1", experiment_name=experiment_name,
             parameters=("param1=0", "param2=1")),
         Run(name=experiment_name + "-2", experiment_name=experiment_name,
             parameters=("param1=2", "param3=3"))]

    output = prepare_list_of_runs(parameter_range=[], experiment_name=experiment_name,
                                  parameter_set=multiple_two_params, template_name=template_name)

    assert len(output) == 2
    for expected_run, result_run in zip(multiple_two_params_list_result, output):
        assert expected_run.parameters == result_run.parameters
def test_create_list_of_runs_pr_only(mocker):
    experiment_name = "experiment_name"
    template_name = "template_name"
    mocker.patch("platform_resources.experiment_utils.generate_exp_name_and_labels",
                 side_effect=[(experiment_name, {})])

    two_params_list = [("param1", "{0, 1}"), ("param2", "{0...2:1}")]
    two_params_list_result = \
        [Run(name=experiment_name + "-1", experiment_name=experiment_name,
             parameters=("param1=0", "param2=0")),
         Run(name=experiment_name + "-2", experiment_name=experiment_name,
             parameters=("param1=0", "param2=1")),
         Run(name=experiment_name + "-3", experiment_name=experiment_name,
             parameters=("param1=0", "param2=2")),
         Run(name=experiment_name + "-4", experiment_name=experiment_name,
             parameters=("param1=1", "param2=0")),
         Run(name=experiment_name + "-5", experiment_name=experiment_name,
             parameters=("param1=1", "param2=1")),
         Run(name=experiment_name + "-6", experiment_name=experiment_name,
             parameters=("param1=1", "param2=2"))]

    output = prepare_list_of_runs(parameter_range=two_params_list, experiment_name=experiment_name,
                                  parameter_set=(), template_name=template_name)

    assert len(output) == 6
    for expected_run, result_run in zip(two_params_list_result, output):
        assert expected_run.parameters == result_run.parameters
def list_runs_in_cli(verbosity_lvl: int, all_users: bool, name: str, status: RunStatus,
                     listed_runs_kinds: List[RunKinds], runs_list_headers: List[str],
                     with_metrics: bool, count: int = None, brief: bool = False):
    """
    Display a list of selected runs in the cli.

    :param verbosity_lvl: level at which error messages should be logged or displayed
    :param all_users: whether to display runs regardless of their owner or not
    :param name: regular expression which names of the shown runs have to match
    :param status: display only runs with this status
    :param listed_runs_kinds: list of kinds of runs that will be listed
    :param runs_list_headers: headers which will be displayed on top of a table shown in the cli
    :param with_metrics: whether to show the metrics column or not
    :param count: number of rows displayed in the list; if not given, the list is not limited
    :param brief: when True, only the experiment name, submission date, owner and state will be printed
    """
    try:
        namespace = None if all_users else get_kubectl_current_context_namespace()
        status = RunStatus[status] if status else None

        # The list experiments command actually lists Run resources instead of Experiment
        # resources, with one exception - if a run is initializing, nctl displays the data
        # of the experiment instead of the data of the run
        runs = replace_initializing_runs(
            Run.list(namespace=namespace, state_list=[status], name_filter=name,
                     run_kinds_filter=listed_runs_kinds))
        runs_representations = [run.cli_representation for run in runs]
        if brief:
            runs_table_data = [(run_representation.name,
                                run_representation.submission_date,
                                run_representation.submitter,
                                run_representation.status)
                               for run_representation in runs_representations]
        elif with_metrics:
            runs_table_data = runs_representations
        else:
            runs_table_data = [(run_representation.name,
                                run_representation.parameters,
                                run_representation.submission_date,
                                run_representation.start_date,
                                run_representation.duration,
                                run_representation.submitter,
                                run_representation.status,
                                run_representation.template_name)
                               for run_representation in runs_representations]
        click.echo(tabulate(runs_table_data if not count else runs_table_data[-count:],
                            headers=runs_list_headers, tablefmt="orgtbl"))
    except InvalidRegularExpressionError:
        handle_error(logger, Texts.INVALID_REGEX_ERROR_MSG, Texts.INVALID_REGEX_ERROR_MSG,
                     add_verbosity_msg=verbosity_lvl == 0)
        exit(1)
    except Exception:
        handle_error(logger, Texts.OTHER_ERROR_MSG, Texts.OTHER_ERROR_MSG,
                     add_verbosity_msg=verbosity_lvl == 0)
        exit(1)
def cancel_experiment(exp_name: str, runs_to_cancel: List[Run],
                      namespace: str) -> Tuple[List[Run], List[Run]]:
    """
    Cancel the experiment with a given name by cancelling the runs given as a parameter. If the
    experiment contains more runs than are in the list of runs - the experiment's state remains intact.

    :param exp_name: name of the experiment to which the runs passed in runs_to_cancel belong
    :param runs_to_cancel: list of runs that should be deleted; they have to belong to the exp_name experiment
    :param namespace: namespace where the experiment is located
    :return: two lists - the first contains runs that were cancelled successfully, the second those which weren't
    """
    logger.debug(f"Cancelling {exp_name} experiment ...")

    deleted_runs: List[Run] = []
    not_deleted_runs: List[Run] = []

    experiment = Experiment.get(name=exp_name, namespace=namespace)
    if not experiment:
        raise RuntimeError(Texts.GET_EXPERIMENT_ERROR_MSG)

    experiment_runs = Run.list(namespace=namespace, exp_name_filter=[exp_name],
                               excl_state=RunStatus.CANCELLED)
    # check whether the experiment has more runs than the ones that should be cancelled
    cancel_whole_experiment = (len(experiment_runs) == len(runs_to_cancel))
    if cancel_whole_experiment:
        experiment.state = ExperimentStatus.CANCELLING
        experiment.update()

    try:
        deleted_runs, not_deleted_runs = cancel_experiment_runs(runs_to_cancel=runs_to_cancel,
                                                                namespace=namespace)
        if cancel_whole_experiment and not not_deleted_runs:
            try:
                # change the experiment state to CANCELLED
                experiment.state = ExperimentStatus.CANCELLED
                experiment.update()
            except Exception:
                # problems during deleting experiments are hidden because if runs were
                # cancelled the user no longer has a possibility to remove them
                logger.exception("Error during cancelling Experiment resource.")
    except Exception:
        logger.exception("Error during cancelling experiment.")
        return deleted_runs, not_deleted_runs

    return deleted_runs, not_deleted_runs
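
# Sketch of consuming the two result lists (hypothetical caller; `runs` is
# assumed to hold Run objects belonging to the "mnist" experiment in
# namespace "user1"):
#
#   cancelled, failed = cancel_experiment(exp_name="mnist",
#                                         runs_to_cancel=runs,
#                                         namespace="user1")
#   for run in failed:
#       logger.warning(f"could not cancel {run.name}")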
def list_unitialized_experiments_in_cli(verbosity_lvl: int, all_users: bool, name: str,
                                        headers: List[str], listed_runs_kinds: List[RunKinds] = None,
                                        count: int = None, brief: bool = False):
    """
    Display a list of uninitialized experiments in the cli.

    :param verbosity_lvl: level at which error messages should be logged or displayed
    :param all_users: whether to display experiments regardless of their owner or not
    :param name: regular expression which names of the shown experiments have to match
    :param headers: headers which will be displayed on top of a table shown in the cli
    :param listed_runs_kinds: list of kinds of runs that will be listed; defaults to training and Jupyter runs
    :param count: number of rows displayed in the list; if not given, the list is not limited
    :param brief: when True, a shortened listing is requested (currently unused)
    """
    if not listed_runs_kinds:
        listed_runs_kinds = [RunKinds.TRAINING, RunKinds.JUPYTER]

    try:
        namespace = None if all_users else get_kubectl_current_context_namespace()

        creating_experiments = Experiment.list(namespace=namespace,
                                               state=ExperimentStatus.CREATING,
                                               run_kinds_filter=listed_runs_kinds,
                                               name_filter=name)
        runs = Run.list(namespace=namespace, name_filter=name, run_kinds_filter=listed_runs_kinds)

        # Get Experiments without associated Runs
        names_of_experiment_with_runs = set()
        for run in runs:
            names_of_experiment_with_runs.add(run.experiment_name)

        uninitialized_experiments = [experiment for experiment in creating_experiments
                                     if experiment.name not in names_of_experiment_with_runs]

        displayed_items_count = count if count else len(uninitialized_experiments)
        click.echo(tabulate([uninitialized_experiment_cli_representation(experiment)
                             for experiment in uninitialized_experiments][-displayed_items_count:],
                            headers=headers, tablefmt="orgtbl"))
    except InvalidRegularExpressionError:
        handle_error(logger, Texts.INVALID_REGEX_ERROR_MSG, Texts.INVALID_REGEX_ERROR_MSG,
                     add_verbosity_msg=verbosity_lvl == 0)
        exit(1)
    except Exception:
        handle_error(logger, Texts.OTHER_ERROR_MSG, Texts.OTHER_ERROR_MSG,
                     add_verbosity_msg=verbosity_lvl == 0)
        exit(1)
def test_show_logs_success(mocker):
    es_client_mock = mocker.patch('commands.common.K8sElasticSearchClient')
    es_client_instance = es_client_mock.return_value
    es_client_instance.get_experiment_logs_generator.return_value = TEST_LOG_ENTRIES
    proxy_mock = mocker.patch.object(common, 'K8sProxy')
    get_current_namespace_mock = mocker.patch('commands.common.get_kubectl_current_context_namespace')

    fake_experiment_name = 'fake-experiment'
    list_runs_mock = mocker.patch('commands.common.Run.list')
    list_runs_mock.return_value = [Run(name=fake_experiment_name, experiment_name=fake_experiment_name)]

    runner = CliRunner()
    runner.invoke(logs.logs, [fake_experiment_name])

    assert proxy_mock.call_count == 1, 'port forwarding was not initiated'
    assert get_current_namespace_mock.call_count == 1, 'namespace was not retrieved'
    assert list_runs_mock.call_count == 1, 'run was not retrieved'
    assert es_client_instance.get_experiment_logs_generator.call_count == 1, \
        'Experiment logs were not retrieved'
def test_list_runs_name_filter(mock_k8s_api_client: CustomObjectsApi):
    mock_k8s_api_client.list_cluster_custom_object.return_value = LIST_RUNS_RESPONSE_RAW
    runs = Run.list(name_filter=TEST_RUNS[1].name)
    assert [TEST_RUNS[1]] == runs
def get_logs(experiment_name: str, min_severity: SeverityLevel, start_date: str, end_date: str,
             pod_ids: str, pod_status: PodStatus, match: str, output: bool, pager: bool,
             follow: bool, runs_kinds: List[RunKinds], instance_type: str):
    """
    Show logs for a given experiment.
    """
    # check that exactly one of the name/match selectors was given
    if experiment_name and match:
        handle_error(user_msg=Texts.NAME_M_BOTH_GIVEN_ERROR_MSG.format(instance_type=instance_type))
        exit(1)
    elif not experiment_name and not match:
        handle_error(user_msg=Texts.NAME_M_NONE_GIVEN_ERROR_MSG.format(instance_type=instance_type))
        exit(1)

    try:
        with K8sProxy(NAUTAAppNames.ELASTICSEARCH) as proxy:
            es_client = K8sElasticSearchClient(host="127.0.0.1", port=proxy.tunnel_port,
                                               verify_certs=False, use_ssl=False)
            namespace = get_kubectl_current_context_namespace()
            if match:
                experiment_name = match
                name_filter = match
            else:
                name_filter = f'^{experiment_name}$'
            runs = Run.list(namespace=namespace, name_filter=name_filter, run_kinds_filter=runs_kinds)
            if not runs:
                raise ValueError(f'Run with given name: {experiment_name} does not exist '
                                 f'in namespace {namespace}.')

            pod_ids = pod_ids.split(',') if pod_ids else None
            min_severity = SeverityLevel[min_severity] if min_severity else None
            pod_status = PodStatus[pod_status] if pod_status else None
            follow_logs = follow and not output

            if output and len(runs) > 1:
                click.echo(Texts.MORE_EXP_LOGS_MESSAGE)

            for run in runs:
                start_date = start_date if start_date else run.creation_timestamp
                run_logs_generator = es_client.get_experiment_logs_generator(run=run,
                                                                             namespace=namespace,
                                                                             min_severity=min_severity,
                                                                             start_date=start_date,
                                                                             end_date=end_date,
                                                                             pod_ids=pod_ids,
                                                                             pod_status=pod_status,
                                                                             follow=follow_logs)
                if output:
                    save_logs_to_file(run=run, run_logs_generator=run_logs_generator,
                                      instance_type=instance_type)
                else:
                    if len(runs) > 1:
                        click.echo(f'Experiment : {run.name}')
                    print_logs(run_logs_generator=run_logs_generator, pager=pager)
    except K8sProxyCloseError:
        handle_error(logger, Texts.PROXY_CLOSE_LOG_ERROR_MSG, Texts.PROXY_CLOSE_USER_ERROR_MSG)
        exit(1)
    except LocalPortOccupiedError as exe:
        handle_error(logger,
                     Texts.LOCAL_PORT_OCCUPIED_ERROR_MSG.format(exception_message=exe.message),
                     Texts.LOCAL_PORT_OCCUPIED_ERROR_MSG.format(exception_message=exe.message))
        exit(1)
    except K8sProxyOpenError:
        handle_error(logger, Texts.PROXY_CREATION_ERROR_MSG, Texts.PROXY_CREATION_ERROR_MSG)
        exit(1)
    except ValueError:
        handle_error(logger,
                     Texts.EXPERIMENT_NOT_EXISTS_ERROR_MSG.format(
                         experiment_name=experiment_name,
                         instance_type=instance_type.capitalize()),
                     Texts.EXPERIMENT_NOT_EXISTS_ERROR_MSG.format(
                         experiment_name=experiment_name,
                         instance_type=instance_type.capitalize()))
        exit(1)
    except Exception:
        handle_error(logger,
                     Texts.LOGS_GET_OTHER_ERROR_MSG.format(instance_type=instance_type),
                     Texts.LOGS_GET_OTHER_ERROR_MSG.format(instance_type=instance_type))
        exit(1)
from unittest.mock import MagicMock

from kubernetes.client import V1Pod, V1PodStatus, V1Event, V1ObjectReference, V1ObjectMeta

from commands.experiment import view
from platform_resources.run import Run, RunStatus
from platform_resources.experiment import Experiment
from cli_text_consts import ExperimentViewCmdTexts as Texts
from util.k8s.k8s_statistics import ResourceUsage
from util.k8s.k8s_info import PodStatus

TEST_RUNS = [
    Run(name='test-experiment',
        parameters=['a 1', 'b 2'],
        creation_timestamp='2018-04-26T13:43:01Z',
        namespace='namespace-1',
        state=RunStatus.RUNNING,
        template_name='test-ex-template',
        metrics={'any metrics': 'a'},
        experiment_name='experiment_name',
        pod_count=1,
        pod_selector={}),
    Run(name='test-experiment-2',
        parameters=['a 1', 'b 2'],
        creation_timestamp='2018-05-08T13:05:04Z',
        namespace='namespace-2',
        state=RunStatus.COMPLETE,
        template_name='test-ex-template',
        metrics={'any metrics': 'a'},
        experiment_name='experiment_name',
        pod_count=1,
        pod_selector={})
]
def get_logs(experiment_name: str, min_severity: SeverityLevel, start_date: str, end_date: str,
             pod_ids: str, pod_status: PodStatus, match: str, output: bool, pager: bool,
             follow: bool, runs_kinds: List[RunKinds], instance_type: str):
    """
    Show logs for a given experiment.
    """
    # check that exactly one of the name/match selectors was given
    if experiment_name and match:
        handle_error(user_msg=Texts.NAME_M_BOTH_GIVEN_ERROR_MSG.format(instance_type=instance_type))
        exit(1)
    elif not experiment_name and not match:
        handle_error(user_msg=Texts.NAME_M_NONE_GIVEN_ERROR_MSG.format(instance_type=instance_type))
        exit(1)

    try:
        es_client = K8sElasticSearchClient(
            host=f'{get_kubectl_host(with_port=True)}'
                 f'/api/v1/namespaces/nauta/services/nauta-elasticsearch:nauta/proxy',
            verify_certs=False, use_ssl=True,
            headers={'Authorization': get_api_key()})
        namespace = get_kubectl_current_context_namespace()
        if match:
            experiment_name = match
            name_filter = match
        else:
            name_filter = f'^{experiment_name}$'
        runs = Run.list(namespace=namespace, name_filter=name_filter, run_kinds_filter=runs_kinds)
        if not runs:
            raise ValueError(f'Run with given name: {experiment_name} does not exist '
                             f'in namespace {namespace}.')

        pod_ids = pod_ids.split(',') if pod_ids else None  # type: ignore
        follow_logs = follow and not output

        if output and len(runs) > 1:
            click.echo(Texts.MORE_EXP_LOGS_MESSAGE)

        for run in runs:
            start_date = start_date if start_date else run.creation_timestamp
            run_logs_generator = es_client.get_experiment_logs_generator(run=run,
                                                                         namespace=namespace,
                                                                         min_severity=min_severity,
                                                                         start_date=start_date,
                                                                         end_date=end_date,
                                                                         pod_ids=pod_ids,
                                                                         pod_status=pod_status,
                                                                         follow=follow_logs)
            if output:
                save_logs_to_file(logs_generator=run_logs_generator,
                                  instance_name=run.name,
                                  instance_type=instance_type)
            else:
                if len(runs) > 1:
                    click.echo(f'Experiment : {run.name}')
                print_logs(run_logs_generator=run_logs_generator, pager=pager)
    except ValueError:
        handle_error(logger,
                     Texts.EXPERIMENT_NOT_EXISTS_ERROR_MSG.format(
                         experiment_name=experiment_name,
                         instance_type=instance_type.capitalize()),
                     Texts.EXPERIMENT_NOT_EXISTS_ERROR_MSG.format(
                         experiment_name=experiment_name,
                         instance_type=instance_type.capitalize()))
        exit(1)
    except Exception:
        handle_error(logger,
                     Texts.LOGS_GET_OTHER_ERROR_MSG.format(instance_type=instance_type),
                     Texts.LOGS_GET_OTHER_ERROR_MSG.format(instance_type=instance_type))
        exit(1)
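
# For reference: unlike the K8sProxy-based variant above, this client reaches
# Elasticsearch through the Kubernetes apiserver service proxy, so the
# effective base URL looks like the following (host and port come from the
# local kubeconfig):
#
#   https://<k8s-host>:<port>/api/v1/namespaces/nauta/services/nauta-elasticsearch:nauta/proxy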
def cancel(state: State, name: str, match: str, purge: bool, pod_ids: str,
           pod_status: str, listed_runs_kinds: List[RunKinds] = None):
    """
    Cancels chosen experiments based on a name provided as a parameter.
    """
    if not listed_runs_kinds:
        listed_runs_kinds = [RunKinds.TRAINING, RunKinds.JUPYTER]

    # check that exactly one of the name/match selectors was given
    if name and match:
        handle_error(user_msg=Texts.NAME_M_BOTH_GIVEN_ERROR_MSG)
        exit(1)
    if not name and not match:
        handle_error(user_msg=Texts.NAME_M_NONE_GIVEN_ERROR_MSG)
        exit(1)

    current_namespace = get_current_namespace()

    if pod_ids or pod_status:
        if not name:
            name = match
        cancel_pods_mode(namespace=current_namespace, run_name=name,
                         pod_ids=pod_ids, pod_status=pod_status)
        exit(0)

    search_for_experiment = False
    exp_to_be_cancelled = None

    if name:
        exp_to_be_cancelled = Experiment.get(namespace=current_namespace, name=name)
        exp_to_be_cancelled_kind = RunKinds(exp_to_be_cancelled.metadata['labels'].get('runKind')) \
            if exp_to_be_cancelled else None
        exp_to_be_cancelled = exp_to_be_cancelled if exp_to_be_cancelled_kind in listed_runs_kinds else None

        if exp_to_be_cancelled:
            search_for_experiment = True
        else:
            name = f"^{name}$"
    else:
        name = match

    list_of_all_runs = None
    list_of_applicable_states = [RunStatus.QUEUED, RunStatus.RUNNING]

    if purge:
        list_of_applicable_states.extend([RunStatus.FAILED, RunStatus.COMPLETE, RunStatus.CANCELLED])

    try:
        if search_for_experiment:
            list_of_all_runs = Run.list(namespace=current_namespace, exp_name_filter=[name],
                                        run_kinds_filter=listed_runs_kinds)
        else:
            list_of_all_runs = Run.list(namespace=current_namespace, name_filter=name,
                                        run_kinds_filter=listed_runs_kinds)
    except Exception:
        handle_error(logger,
                     Texts.LIST_RUNS_ERROR_MSG.format(experiment_name_plural=experiment_name_plural),
                     Texts.LIST_RUNS_ERROR_MSG.format(experiment_name_plural=experiment_name_plural))
        exit(1)

    # Handle cancellation of experiments with no associated Runs
    if exp_to_be_cancelled and not list_of_all_runs:
        cancel_uninitialized_experiment(experiment=exp_to_be_cancelled,
                                        namespace=current_namespace, purge=purge)

    if not list_of_all_runs:
        handle_error(user_msg=Texts.LACK_OF_EXPERIMENTS_ERROR_MSG.format(
            experiment_name_plural=experiment_name_plural, experiment_name=experiment_name))
        exit(1)
    elif not purge and not [run for run in list_of_all_runs
                            if run.state in [RunStatus.QUEUED, RunStatus.RUNNING]]:
        handle_error(user_msg=Texts.LACK_OF_EXP_TO_BE_CANCELLED_ERROR_MSG.format(
            experiment_name_plural=experiment_name_plural))
        exit(1)

    list_of_runs_to_be_deleted: List[Run] = []
    names_of_cancelled_runs: List[str] = []

    if not purge:
        # check whether we have at least one experiment in a state other than CANCELLED
        for run in list_of_all_runs:
            if run.state in list_of_applicable_states:
                list_of_runs_to_be_deleted.append(run)
            else:
                names_of_cancelled_runs.append(run.name)

        if not list_of_runs_to_be_deleted:
            handle_error(user_msg=Texts.EXPERIMENTS_ALREADY_CANCELLED_ERROR_MSG.format(
                experiment_name_plural=experiment_name_plural,
                operation_word=Texts.DELETE_OPERATION["deleted"]
                if experiment_name_plural == 'pods' else Texts.CANCEL_OPERATION["cancelled"]))
            exit(1)
        elif len(list_of_runs_to_be_deleted) != len(list_of_all_runs):
            click.echo(Texts.ALREADY_CANCELLED_LIST_HEADER.format(
                experiment_name_plural=experiment_name_plural,
                operation_word=Texts.DELETE_OPERATION["deleted"]
                if experiment_name_plural == 'pods' else Texts.CANCEL_OPERATION["cancelled"]))
            for name in names_of_cancelled_runs:
                click.echo(f" - {name}")
            click.echo(Texts.CAN_BE_CANCELLED_LIST_HEADER.format(
                experiment_name_plural=experiment_name_plural,
                operation_word=Texts.DELETE_OPERATION["deleted"]
                if experiment_name_plural == 'pods' else Texts.CANCEL_OPERATION["cancelled"]))
            for run in list_of_runs_to_be_deleted:
                click.echo(f" - {run.name}")
        else:
            click.echo(Texts.WILL_BE_CANCELLED_LIST_HEADER.format(
                experiment_name_plural=experiment_name_plural,
                operation_word=Texts.DELETE_OPERATION["deleted"]
                if experiment_name_plural == 'pods' else Texts.CANCEL_OPERATION["cancelled"]))
            for run in list_of_runs_to_be_deleted:
                click.echo(f" - {run.name}")
    else:
        list_of_runs_to_be_deleted = list_of_all_runs
        click.echo(Texts.WILL_BE_PURGED_LIST_HEADER.format(
            experiment_name_plural=experiment_name_plural,
            operation_word=Texts.DELETE_OPERATION["deleted"]
            if experiment_name_plural == 'pods' else Texts.CANCEL_OPERATION["cancelled"]))
        for run in list_of_runs_to_be_deleted:
            click.echo(f" - {run.name}")

    if not click.confirm(Texts.CONFIRM_CANCEL_MSG.format(
            experiment_name_plural=experiment_name_plural,
            operation_word=Texts.DELETE_OPERATION["deletion"]
            if experiment_name_plural == 'pods' else Texts.CANCEL_OPERATION["cancellation"])):
        handle_error(user_msg=Texts.CANCELLATION_ABORTED_MSG.format(
            experiment_name_plural=experiment_name_plural,
            operation_word=Texts.DELETE_OPERATION["deletion"]
            if experiment_name_plural == 'pods' else Texts.CANCEL_OPERATION["cancellation"]))
        exit(0)

    # group runs by experiments
    exp_with_runs: defaultdict = defaultdict(list)
    for run in list_of_runs_to_be_deleted:
        exp_with_runs[run.experiment_name].append(run)

    deleted_runs = []
    not_deleted_runs = []

    if purge:
        # Connect to elasticsearch in order to purge run logs
        try:
            with K8sProxy(NAUTAAppNames.ELASTICSEARCH) as proxy:
                es_client = K8sElasticSearchClient(
                    host="127.0.0.1", port=proxy.tunnel_port,
                    verify_certs=False, use_ssl=False,
                    with_admin_privledges=is_current_user_administrator())
                for exp_name, run_list in exp_with_runs.items():
                    try:
                        exp_del_runs, exp_not_del_runs = purge_experiment(
                            exp_name=exp_name, runs_to_purge=run_list,
                            namespace=current_namespace, k8s_es_client=es_client)
                        deleted_runs.extend(exp_del_runs)
                        not_deleted_runs.extend(exp_not_del_runs)
                    except Exception:
                        handle_error(logger, Texts.OTHER_CANCELLING_ERROR_MSG)
                        not_deleted_runs.extend(run_list)
        except K8sProxyCloseError:
            handle_error(logger, Texts.PROXY_CLOSING_ERROR_LOG_MSG,
                         Texts.PROXY_CLOSING_ERROR_USER_MSG)
            exit(1)
        except LocalPortOccupiedError as exe:
            handle_error(logger, Texts.PORT_OCCUPIED_ERROR_LOG_MSG,
                         Texts.PORT_OCCUPIED_ERROR_USER_MSG.format(exception_message=exe.message))
            exit(1)
        except K8sProxyOpenError:
            handle_error(logger, Texts.PROXY_OPEN_ERROR_MSG, Texts.PROXY_OPEN_ERROR_MSG)
            exit(1)
    else:
        for exp_name, run_list in exp_with_runs.items():
            try:
                exp_del_runs, exp_not_del_runs = cancel_experiment(
                    exp_name=exp_name, runs_to_cancel=run_list, namespace=current_namespace)
                deleted_runs.extend(exp_del_runs)
                not_deleted_runs.extend(exp_not_del_runs)
            except Exception:
                handle_error(logger, Texts.OTHER_CANCELLING_ERROR_MSG)
                not_deleted_runs.extend(run_list)

    if deleted_runs:
        click.echo(Texts.SUCCESSFULLY_CANCELLED_LIST_HEADER.format(
            experiment_name_plural=experiment_name_plural,
            operation_word=Texts.DELETE_OPERATION["deleted"]
            if experiment_name_plural == 'pods' else Texts.CANCEL_OPERATION["cancelled"]))
        for run in deleted_runs:
            click.echo(f" - {run.name}")

    if not_deleted_runs:
        click.echo(Texts.FAILED_TO_CANCEL_LIST_HEADER.format(
            experiment_name_plural=experiment_name_plural,
            operation_word=Texts.DELETE_OPERATION["deleted"]
            if experiment_name_plural == 'pods' else Texts.CANCEL_OPERATION["cancelled"]))
        for run in not_deleted_runs:
            click.echo(f" - {run.name}")
        sys.exit(1)
def purge_experiment(exp_name: str, runs_to_purge: List[Run],
                     k8s_es_client: K8sElasticSearchClient,
                     namespace: str) -> Tuple[List[Run], List[Run]]:
    """
    Purge the experiment with a given name by cancelling the runs given as a parameter. If the
    experiment contains more runs than are in the list of runs - the experiment's state remains intact.

    :param exp_name: name of the experiment to which the runs passed in runs_to_purge belong
    :param runs_to_purge: list of runs that should be purged; they have to belong to the exp_name experiment
    :param k8s_es_client: Kubernetes ElasticSearch client
    :param namespace: namespace where the experiment is located
    :return: two lists - the first contains runs that were purged successfully, the second those which weren't
    """
    logger.debug(f"Purging {exp_name} experiment ...")

    purged_runs: List[Run] = []
    not_purged_runs: List[Run] = []

    experiment = Experiment.get(name=exp_name, namespace=namespace)
    if not experiment:
        raise RuntimeError(Texts.GET_EXPERIMENT_ERROR_MSG)

    experiment_runs = Run.list(namespace=namespace, exp_name_filter=[exp_name])
    # check whether the experiment has more runs than the ones that should be purged
    cancel_whole_experiment = (len(experiment_runs) == len(runs_to_purge))
    if cancel_whole_experiment:
        experiment.state = ExperimentStatus.CANCELLING
        experiment.update()

    try:
        cancelled_runs, not_cancelled_runs = cancel_experiment_runs(runs_to_cancel=runs_to_purge,
                                                                    namespace=namespace)
        not_purged_runs = not_cancelled_runs

        if cancel_whole_experiment:
            # Delete associated workflows
            experiment_associated_workflows = [wf for wf in ArgoWorkflow.list(namespace=namespace)
                                               if wf.labels.get('experimentName') == experiment.name]
            for wf in experiment_associated_workflows:
                wf.delete()

            # Remove tags from git repo manager
            try:
                delete_exp_tag_from_git_repo_manager(experiment_name=experiment.name,
                                                     username=namespace,
                                                     experiments_workdir=get_run_environment_path(''))
            except Exception:
                handle_error(logger, Texts.GIT_REPO_MANAGER_ERROR_MSG, Texts.GIT_REPO_MANAGER_ERROR_MSG)
                raise

        for run in cancelled_runs:
            logger.debug(f"Purging {run.name} run ...")
            click.echo(Texts.PURGING_START_MSG.format(run_name=run.name))
            try:
                with spinner(text=Texts.PURGING_PROGRESS_MSG.format(run_name=run.name)):
                    # purge helm release
                    delete_helm_release(run.name, namespace=namespace, purge=True)
                    # delete run
                    kubectl.delete_k8s_object("run", run.name)
                    purged_runs.append(run)
            except Exception as exe:
                not_purged_runs.append(run)
                logger.exception("Error during purging runs.")
                # occurrence of a NotFound error may mean that the run has been removed earlier
                if "NotFound" not in str(exe):
                    click.echo(Texts.INCOMPLETE_PURGE_ERROR_MSG.format(experiment_name=experiment_name))
                    raise exe
            try:
                # clear run logs
                if is_current_user_administrator():
                    logger.debug(f"Clearing logs for {run.name} run.")
                    with spinner(text=Texts.PURGING_LOGS_PROGRESS_MSG.format(run_name=run.name)):
                        k8s_es_client.delete_logs_for_run(run=run.name, namespace=namespace)
            except Exception:
                logger.exception("Error during clearing run logs.")

            # CAN-1099 - docker garbage collector has errors that prevent correct removal of images
            # try:
            #     # try to remove images from docker registry
            #     delete_images_for_experiment(exp_name=run.name)
            # except Exception:
            #     logger.exception("Error during removing images.")

        if cancel_whole_experiment and not not_purged_runs:
            try:
                kubectl.delete_k8s_object("experiment", exp_name)
            except Exception:
                # problems during deleting experiments are hidden because if runs were
                # cancelled the user no longer has a possibility to remove them
                logger.exception("Error during purging experiment.")
    except Exception:
        logger.exception("Error during purging experiment.")
        return purged_runs, not_purged_runs

    return purged_runs, not_purged_runs
        state=ExperimentStatus.CREATING,
        template_name='test-ex-template',
        template_namespace='test-ex-namespace',
        metadata={'labels': {'runKind': 'training'}})

RUN_QUEUED = Run(
    name="exp-mnist-single-node.py-18.05.17-16.05.45-1-tf-training",
    parameters=['mnist_single_node.py', '--data_dir', '/app'],
    state=RunStatus.QUEUED,
    metrics={'accuracy': 52.322},
    experiment_name="experiment-1",
    pod_count=1,
    pod_selector={'matchLabels': {'app': 'tf-training',
                                  'draft': 'exp-mnist-single-node.py-18.05.17-16.05.45-1',
                                  'release': 'exp-mnist-single-node.py-18.05.17-16.05.45-1'}},
    namespace="mciesiel-dev",
    creation_timestamp="2018-05-17T14:05:52Z",
    template_name="tf-training")

RUN_CANCELLED = Run(
    name="exp-mnist-single-node.py-18.05.17-16.05.45-1-tf-training",
    parameters=['mnist_single_node.py', '--data_dir', '/app'],
    state=RunStatus.CANCELLED,
    metrics={'accuracy': 52.322},
    experiment_name="experiment-name-will-be-added-soon",
    pod_count=1,
def test_list_runs(mock_k8s_api_client):
    mock_k8s_api_client.list_cluster_custom_object.return_value = LIST_RUNS_RESPONSE_RAW
    runs = Run.list()
    assert runs == TEST_RUNS
    creation_timestamp='2018-04-26T13:43:01Z',
    namespace='namespace-1',
    state=ExperimentStatus.CREATING,
    template_name='jupyter',
    template_namespace='test-ex-namespace')

NON_JUPYTER_EXPERIMENT = Experiment(name='test-experiment-2',
                                    parameters_spec=['a 1', 'b 2'],
                                    creation_timestamp='2018-05-08T13:05:04Z',
                                    namespace='namespace-2',
                                    state=ExperimentStatus.SUBMITTED,
                                    template_name='test-ex-template',
                                    template_namespace='test-ex-namespace')

SUBMITTED_RUNS = [Run(name="exp-mnist-single-node.py-18.05.17-16.05.45-1-tf-training",
                      experiment_name=CORRECT_INTERACT_NAME,
                      state=RunStatus.QUEUED)]

KO_EXPERIMENT = KubernetesObject(spec=JUPYTER_EXPERIMENT, metadata=client.V1ObjectMeta())


class InteractMocks:
    def __init__(self, mocker):
        self.mocker = mocker
        self.get_namespace = mocker.patch(
            "commands.experiment.interact.get_kubectl_current_context_namespace",
            side_effect=[EXPERIMENT_NAMESPACE, EXPERIMENT_NAMESPACE])
        self.get_experiment = mocker.patch(
            "commands.experiment.interact.Experiment.get", return_value=None)
def test_create_list_of_runs_pr_and_ps(mocker):
    experiment_name = "experiment_name"
    template_name = "template_name"
    mocker.patch("platform_resources.experiment_utils.generate_exp_name_and_labels",
                 side_effect=[(experiment_name, {})])

    two_params_list = [("param1", "{0, 1}"), ("param2", "{0...2:1}")]
    multiple_two_params = ("{param3:0, param4:1}", "{param3:2,param4:3}")
    expected_result = [
        Run(name=experiment_name + "-1", experiment_name=experiment_name,
            parameters=("param3=0", "param4=1", "param1=0", "param2=0")),
        Run(name=experiment_name + "-2", experiment_name=experiment_name,
            parameters=("param3=0", "param4=1", "param1=0", "param2=1")),
        Run(name=experiment_name + "-3", experiment_name=experiment_name,
            parameters=("param3=0", "param4=1", "param1=0", "param2=2")),
        Run(name=experiment_name + "-4", experiment_name=experiment_name,
            parameters=("param3=0", "param4=1", "param1=1", "param2=0")),
        Run(name=experiment_name + "-5", experiment_name=experiment_name,
            parameters=("param3=0", "param4=1", "param1=1", "param2=1")),
        Run(name=experiment_name + "-6", experiment_name=experiment_name,
            parameters=("param3=0", "param4=1", "param1=1", "param2=2")),
        Run(name=experiment_name + "-7", experiment_name=experiment_name,
            parameters=("param3=2", "param4=3", "param1=0", "param2=0")),
        Run(name=experiment_name + "-8", experiment_name=experiment_name,
            parameters=("param3=2", "param4=3", "param1=0", "param2=1")),
        Run(name=experiment_name + "-9", experiment_name=experiment_name,
            parameters=("param3=2", "param4=3", "param1=0", "param2=2")),
        Run(name=experiment_name + "-10", experiment_name=experiment_name,
            parameters=("param3=2", "param4=3", "param1=1", "param2=0")),
        Run(name=experiment_name + "-11", experiment_name=experiment_name,
            parameters=("param3=2", "param4=3", "param1=1", "param2=1")),
        Run(name=experiment_name + "-12", experiment_name=experiment_name,
            parameters=("param3=2", "param4=3", "param1=1", "param2=2"))
    ]

    output = prepare_list_of_runs(two_params_list, experiment_name, multiple_two_params,
                                  template_name=template_name)

    assert len(output) == 12
    for expected_run, result_run in zip(expected_result, output):
        assert expected_run.parameters == result_run.parameters
def test_list_runs_filter_status(mock_k8s_api_client: CustomObjectsApi):
    mock_k8s_api_client.list_cluster_custom_object.return_value = LIST_RUNS_RESPONSE_RAW
    runs = Run.list(state_list=[RunStatus.QUEUED])
    assert [TEST_RUNS[0]] == runs
# limitations under the License.
#

import dateutil

from commands.common import list_utils
from platform_resources.experiment import Experiment
from platform_resources.run import Run, RunStatus

TEST_RUNS = [
    Run(name='test-experiment',
        parameters=('a 1', 'b 2'),
        metrics={'acc': 52.2, 'loss': 1.62345},
        creation_timestamp='2018-04-26T13:43:01Z',
        namespace='namespace-1',
        state=RunStatus.QUEUED,
        experiment_name='test-experiment',
        pod_count=0,
        pod_selector={}),
    Run(name='test-experiment-2',
        parameters=('a 1', 'b 2'),
        metrics={'acc': 52.2, 'loss': 1.62345},
        creation_timestamp='2018-05-08T13:05:04Z',
        namespace='namespace-2',
        state=RunStatus.COMPLETE,
        experiment_name='test-experiment',
def view(context, state: State, experiment_name: str, tensorboard: bool, username: str):
    """
    Displays details of an experiment.
    """
    try:
        if username:
            namespace = username
        else:
            namespace = get_kubectl_current_context_namespace()

        run = Run.get(name=experiment_name, namespace=namespace)
        if not run:
            handle_error(user_msg=Texts.EXPERIMENT_NOT_FOUND_ERROR_MSG.format(
                experiment_name=experiment_name))
            exit(2)

        click.echo(tabulate([run.cli_representation], headers=EXPERIMENTS_LIST_HEADERS,
                            tablefmt="orgtbl"))

        click.echo(Texts.PODS_PARTICIPATING_LIST_HEADER)

        pods = get_namespaced_pods(label_selector="runName=" + experiment_name,
                                   namespace=namespace)

        tabular_output = []
        containers_resources = []
        pending_pods = []

        for pod in pods:
            status_string = ""

            if pod.status.conditions:
                for cond in pod.status.conditions:
                    msg = "\n" if not cond.reason else "\n reason: " + \
                        wrap_text(cond.reason, width=POD_CONDITIONS_MAX_WIDTH)
                    msg = msg + ", \n message: " + wrap_text(cond.message, width=POD_CONDITIONS_MAX_WIDTH) \
                        if cond.message else msg
                    status_string += wrap_text(cond.type + ": " + cond.status,
                                               width=POD_CONDITIONS_MAX_WIDTH) + msg + "\n"
            else:
                pod_events = get_pod_events(namespace=namespace, name=pod.metadata.name)
                for event in pod_events:
                    msg = "\n" if not event.reason else "\n reason: " + \
                        wrap_text(event.reason, width=POD_CONDITIONS_MAX_WIDTH)
                    msg = msg + ", \n message: " + wrap_text(event.message, width=POD_CONDITIONS_MAX_WIDTH) \
                        if event.message else msg
                    status_string += msg + "\n"

            if pod.status.phase.upper() == PodStatus.PENDING.value:
                pending_pods.append(pod.metadata.name)

            container_statuses = defaultdict(lambda: None)
            if pod.status.container_statuses:
                for container_status in pod.status.container_statuses:
                    container_statuses[container_status.name] = container_status.state

            container_details = []
            for container in pod.spec.containers:
                container_description = Texts.CONTAINER_DETAILS_MSG.format(
                    name=container.name,
                    status=container_status_to_msg(container_statuses[container.name]),
                    volumes=container_volume_mounts_to_msg(container.volume_mounts, spaces=2),
                    resources=container_resources_to_msg(container.resources, spaces=4))
                container_details.append(container_description)
                containers_resources.append(container.resources)

            container_details = ''.join(container_details)

            tabular_output.append([pod.metadata.name,
                                   wrap_text(pod.metadata.uid, width=UID_MAX_WIDTH, spaces=0),
                                   status_string, container_details])

        click.echo(tabulate(tabular_output, Texts.PODS_TABLE_HEADERS, tablefmt="orgtbl"))

        try:
            cpu_requests_sum = sum_cpu_resources(
                [container_resource.requests["cpu"]
                 for container_resource in containers_resources
                 if container_resource.requests and container_resource.requests.get("cpu")])
            mem_requests_sum = sum_mem_resources(
                [container_resource.requests["memory"]
                 for container_resource in containers_resources
                 if container_resource.requests and container_resource.requests.get("memory")])
            cpu_limits_sum = sum_cpu_resources(
                [container_resource.limits["cpu"]
                 for container_resource in containers_resources
                 if container_resource.limits and container_resource.limits.get("cpu")])
            mem_limits_sum = sum_mem_resources(
                [container_resource.limits["memory"]
                 for container_resource in containers_resources
                 if container_resource.limits and container_resource.limits.get("memory")])
        except ValueError as exception:
            handle_error(logger,
                         Texts.RESOURCES_SUM_PARSING_ERROR_MSG.format(error_msg=str(exception)),
                         Texts.RESOURCES_SUM_PARSING_ERROR_MSG.format(error_msg=str(exception)))

        click.echo(Texts.RESOURCES_SUM_LIST_HEADER)
        click.echo(tabulate(list(zip(Texts.RESOURCES_SUM_TABLE_ROWS_HEADERS,
                                     [cpu_requests_sum, mem_requests_sum,
                                      cpu_limits_sum, mem_limits_sum])),
                            Texts.RESOURCES_SUM_TABLE_HEADERS, tablefmt="orgtbl"))

        if tensorboard:
            click.echo()
            context.invoke(tensorboard_command, experiment_name=[experiment_name])

        if pending_pods:
            click.echo()
            try:
                cpu = False
                memory = False

                for pod in pending_pods:
                    events_list = get_pod_events(namespace=namespace, name=pod)
                    for event in events_list:
                        if "insufficient cpu" in event.message.lower():
                            cpu = True
                        elif "insufficient memory" in event.message.lower():
                            memory = True
                        if cpu and memory:
                            break
                    if cpu and memory:
                        break

                if not cpu and not memory:
                    exit(0)

                if cpu and memory:
                    resources = "number of cpus and amount of memory"
                elif cpu:
                    resources = "number of cpus"
                else:
                    resources = "amount of memory"

                click.echo(Texts.INSUFFICIENT_RESOURCES_MESSAGE.format(resources=resources))
                click.echo()
                top_cpu_users, top_mem_users = get_highest_usage()
                click.echo(Texts.TOP_CPU_CONSUMERS.format(
                    consumers=", ".join([res.user_name for res in top_cpu_users[:3]])))
                click.echo(Texts.TOP_MEMORY_CONSUMERS.format(
                    consumers=", ".join([res.user_name for res in top_mem_users[:3]])))
            except Exception:
                click.echo(Texts.PROBLEMS_WHILE_GATHERING_USAGE_DATA)
                logger.exception(Texts.PROBLEMS_WHILE_GATHERING_USAGE_DATA_LOGS)
    except Exception:
        handle_error(logger, Texts.VIEW_OTHER_ERROR_MSG, Texts.VIEW_OTHER_ERROR_MSG)
        exit(1)