def test_wait_for_connection_readiness(mocker):
    """A single successful probe results in exactly one GET against the tunnel URL."""
    mocker.patch('requests.get')
    address, port = 'localhost', 1234
    # noinspection PyProtectedMember
    K8sProxy._wait_for_connection_readiness(address, port)
    # noinspection PyUnresolvedReferences
    requests.get.assert_called_once_with(f'http://{address}:{port}')
def test_wait_for_connection_readiness_many_tries_failure(mocker):
    """If every probe raises ConnectionError, the helper fails after the requested number of tries."""
    address, port, tries = 'localhost', 1234, 15
    mocker.patch('requests.get', side_effect=ConnectionError)
    mocker.patch('time.sleep')
    with pytest.raises(TunnelSetupError):
        # noinspection PyProtectedMember
        K8sProxy._wait_for_connection_readiness(address, port, tries)
    # noinspection PyUnresolvedReferences
    assert requests.get.call_count == tries
def test_wait_for_connection_readiness(mocker):
    pass  # placeholder removed below
def tensorboard(state: State, no_launch: bool,
                tensorboard_service_client_port: Optional[int],
                port_number: Optional[int], experiment_name: List[str]):
    """ Subcommand for launching tensorboard with credentials """
    current_namespace = get_kubectl_current_context_namespace()

    with spinner(Texts.TB_WAITING_MSG) as proxy_spinner, \
        K8sProxy(nauta_app_name=NAUTAAppNames.TENSORBOARD_SERVICE,
                 app_name='tensorboard-service',
                 namespace=current_namespace,
                 port=tensorboard_service_client_port) as proxy:
        tensorboard_service_client = TensorboardServiceClient(
            address=f'http://127.0.0.1:{proxy.tunnel_port}')

        requested_runs = build_tensorboard_run_list(
            exp_list=experiment_name, current_namespace=current_namespace)

        # noinspection PyBroadException
        try:
            tb = tensorboard_service_client.create_tensorboard(requested_runs)
            if tb.invalid_runs:
                list_of_invalid_runs = ', '.join([
                    f'{item.get("owner")}/{item.get("name")}'
                    for item in tb.invalid_runs
                ])
                click.echo(
                    Texts.TB_INVALID_RUNS_MSG.format(
                        invalid_runs=list_of_invalid_runs))
        except Exception as exe:
            err_message = Texts.TB_CREATE_ERROR_MSG
            if hasattr(
                    exe, 'error_code'
            ) and exe.error_code == HTTPStatus.UNPROCESSABLE_ENTITY:  # type: ignore
                err_message = str(exe)
            handle_error(logger,
                         err_message,
                         err_message,
                         add_verbosity_msg=state.verbosity == 0)
            sys.exit(1)

        # Remember the id up front: get_tensorboard() may return a falsy value,
        # in which case dereferencing tb.id on the next iteration would raise
        # AttributeError (bug in the previous version).
        tb_id = tb.id
        for _ in range(TENSORBOARD_TRIES_COUNT):
            tb = tensorboard_service_client.get_tensorboard(tb_id)
            if not tb:
                # Instance not visible yet - back off instead of busy-spinning.
                sleep(TENSORBOARD_CHECK_BACKOFF_SECONDS)
                continue
            if tb.status == TensorboardStatus.RUNNING:
                proxy_spinner.hide()
                launch_app_with_proxy(k8s_app_name=NAUTAAppNames.TENSORBOARD,
                                      no_launch=no_launch,
                                      namespace=current_namespace,
                                      port=port_number,
                                      app_name=f"tensorboard-{tb.id}")
                return
            logger.warning(
                Texts.TB_WAITING_FOR_TB_MSG.format(
                    tb_id=tb.id, tb_status_value=tb.status.value))
            sleep(TENSORBOARD_CHECK_BACKOFF_SECONDS)

        # Tensorboard never reached RUNNING state within the allotted tries.
        click.echo(Texts.TB_TIMEOUT_ERROR_MSG)
        sys.exit(2)
def upgrade(ctx: click.Context):
    """ Upgrade users after Nauta upgrade. """
    with spinner(text=Texts.UPGRADE_IN_PROGRESS):
        # noinspection PyBroadException
        try:
            # noinspection PyTypeChecker
            nauta_users: List[User] = User.list()
            with K8sProxy(NAUTAAppNames.GIT_REPO_MANAGER,
                          number_of_retries_wait_for_readiness=60) as proxy:
                grm_client = GitRepoManagerClient(host='127.0.0.1',
                                                  port=proxy.tunnel_port)
                # Ensure every Nauta user is also present in the git repo manager.
                for nauta_user in nauta_users:
                    if not grm_client.get_user(nauta_user.name):
                        grm_client.add_nauta_user(nauta_user.name)
        except Exception:
            handle_error(logger, Texts.UPGRADE_FAILED, Texts.UPGRADE_FAILED,
                         add_verbosity_msg=ctx.obj.verbosity == 0)
            sys.exit(1)
    click.echo(Texts.UPGRADE_SUCCEEDED)
def logs(state: State, workflow_name: str):
    """
    Print all log entries of a single Argo workflow to stdout.

    :param state: CLI state object (verbosity etc. — only passed through here)
    :param workflow_name: name of the workflow whose logs should be shown
    Exits with 0 when the workflow does not exist, 1 on any error.
    """
    try:
        namespace = get_kubectl_current_context_namespace()
        workflow: ArgoWorkflow = ArgoWorkflow.get(namespace=namespace,
                                                  name=workflow_name)
        if not workflow:
            # Missing workflow is not treated as an error — informational exit.
            click.echo(Texts.NOT_FOUND_MSG.format(workflow_name=workflow_name))
            exit(0)
        # Logs are stored in Elasticsearch inside the cluster; reach it via a
        # temporary kubectl port-forward tunnel.
        with K8sProxy(NAUTAAppNames.ELASTICSEARCH) as proxy:
            es_client = K8sElasticSearchClient(host="127.0.0.1",
                                               port=proxy.tunnel_port,
                                               verify_certs=False,
                                               use_ssl=False)
            start_date = workflow.started_at
            workflow_logs_generator = es_client.get_argo_workflow_logs_generator(workflow=workflow,
                                                                                 namespace=namespace,
                                                                                 start_date=start_date)
            for log_entry in workflow_logs_generator:
                # Skip whitespace-only entries to keep the output readable.
                if not log_entry.content.isspace():
                    click.echo(f'{log_entry.date} {log_entry.pod_name} {log_entry.content}')
    except K8sProxyCloseError:
        handle_error(logger, Texts.PROXY_CLOSE_LOG_ERROR_MSG,
                     Texts.PROXY_CLOSE_USER_ERROR_MSG)
        exit(1)
    except LocalPortOccupiedError as exe:
        handle_error(logger,
                     Texts.LOCAL_PORT_OCCUPIED_ERROR_MSG.format(exception_message=exe.message),
                     Texts.LOCAL_PORT_OCCUPIED_ERROR_MSG.format(exception_message=exe.message))
        exit(1)
    except K8sProxyOpenError:
        handle_error(logger, Texts.PROXY_CREATION_ERROR_MSG,
                     Texts.PROXY_CREATION_ERROR_MSG)
        exit(1)
    except Exception:
        # Catch-all boundary for the CLI command.
        handle_error(logger, Texts.OTHER_ERROR_MSG, Texts.OTHER_ERROR_MSG,
                     add_verbosity_msg=True)
        exit(1)
def add_user_to_git_repo_manager(username: str, state):
    """Register *username* in the git repo manager through a temporary proxy; re-raise on failure."""
    try:
        with K8sProxy(NAUTAAppNames.GIT_REPO_MANAGER,
                      number_of_retries_wait_for_readiness=30) as proxy:
            client = GitRepoManagerClient(host='127.0.0.1',
                                          port=proxy.tunnel_port)
            client.add_nauta_user(username=username)
    except Exception:
        handle_error(logger,
                     Texts.GIT_REPO_MANAGER_ERROR_MSG,
                     Texts.GIT_REPO_MANAGER_ERROR_MSG,
                     add_verbosity_msg=state.verbosity == 0)
        raise
def launch_app(k8s_app_name: NAUTAAppNames,
               no_launch: bool = False,
               port: int = None,
               app_name: str = None,
               number_of_retries: int = 0,
               url_end: str = "",
               namespace: str = None):
    """
    Open a tunnel to an in-cluster web application, optionally launch a browser
    pointed at it, and keep the tunnel alive until the user presses Ctrl-C.

    :param k8s_app_name: which Nauta application to tunnel to
    :param no_launch: when True, only print the URL instead of opening a browser
    :param port: preferred local port; a random port is used if it is occupied
    :param app_name: application name passed to the proxy
    :param number_of_retries: proxy setup retry count
    :param url_end: suffix appended to the forwarded URL
    :param namespace: kubernetes namespace to use
    Raises ProxyClosingError or LaunchError on failure.
    """
    try:
        with spinner(text=Texts.LAUNCHING_APP_MSG) as proxy_spinner, \
                K8sProxy(nauta_app_name=k8s_app_name,
                         port=port,
                         app_name=app_name,
                         number_of_retries=number_of_retries,
                         namespace=namespace) as proxy:
            url = FORWARDED_URL.format(proxy.tunnel_port, url_end)

            # The ingress app requires the user's bearer token in the URL.
            if k8s_app_name == NAUTAAppNames.INGRESS:
                config.load_kube_config()
                user_token = configuration.Configuration().api_key.get(
                    'authorization')
                prepared_user_token = user_token.replace('Bearer ', '')
                url = f'{url}?token={prepared_user_token}'

            if not no_launch:
                if is_gui_browser_available():
                    # Only open the browser once the tunnel actually answers.
                    wait_for_connection(url)
                    webbrowser.open_new(url)
                    proxy_spinner.stop()
                else:
                    click.echo(Texts.NO_WEB_BROWSER_ERROR_MSG)

            # Inform the user when the requested port could not be used.
            if port and port != proxy.tunnel_port:
                click.echo(
                    Texts.CANNOT_USE_PORT.format(
                        required_port=port,
                        random_port=proxy.tunnel_port))

            proxy_spinner.stop()
            click.echo(Texts.GO_TO_MSG.format(url=url))
            click.echo(Texts.PROXY_CREATED_MSG)
            # Block here; the tunnel is torn down when the user interrupts.
            wait_for_ctrl_c()
    except K8sProxyCloseError:
        err_message = Texts.PROXY_CLOSE_ERROR_MSG.format(app_name=k8s_app_name)
        raise ProxyClosingError(err_message)
    except LocalPortOccupiedError as exe:
        err_message = Texts.PROXY_CREATED_EXTENDED_ERROR_MSG.format(
            app_name=k8s_app_name, reason=exe.message)
        raise LaunchError(err_message)
    except K8sProxyOpenError:
        error_msg = Texts.PROXY_CREATED_ERROR_MSG.format(app_name=k8s_app_name)
        logger.exception(error_msg)
        raise LaunchError(error_msg)
    except LaunchError as e:
        # Already a LaunchError (e.g. from the browser path) — propagate as-is.
        raise e
    except Exception:
        err_message = Texts.WEB_APP_LAUCH_FAIL_MSG
        logger.exception(err_message)
        raise LaunchError(err_message)
def test_set_up_proxy(mocker):
    """Entering the proxy context starts port forwarding and waits for readiness exactly once."""
    popen_mock = mocker.patch("subprocess.Popen")
    forwarding_mock = mocker.patch(
        "util.k8s.k8s_proxy_context_manager.kubectl.start_port_forwarding",
        return_value=(popen_mock, "1000", "1001"))
    mocker.patch("util.k8s.k8s_proxy_context_manager.K8sProxy._wait_for_connection_readiness")
    mocker.patch("psutil.Process")
    with K8sProxy(NAUTAAppNames.ELASTICSEARCH):
        pass
    assert forwarding_mock.call_count == 1
    # noinspection PyProtectedMember,PyUnresolvedReferences
    assert K8sProxy._wait_for_connection_readiness.call_count == 1
def test_set_up_proxy_open_failure(mocker):
    """If port forwarding never starts, no process exists to kill and readiness is never checked."""
    forwarding_mock = mocker.patch(
        "util.k8s.k8s_proxy_context_manager.kubectl.start_port_forwarding",
        side_effect=RuntimeError())
    kill_mock = mocker.patch("subprocess.Popen.kill", side_effect=RuntimeError())
    mocker.patch("util.k8s.k8s_proxy_context_manager.K8sProxy._wait_for_connection_readiness")
    with pytest.raises(K8sProxyOpenError):
        with K8sProxy(NAUTAAppNames.ELASTICSEARCH):
            pass
    assert forwarding_mock.call_count == 1
    assert kill_mock.call_count == 0
    # noinspection PyProtectedMember,PyUnresolvedReferences
    assert K8sProxy._wait_for_connection_readiness.call_count == 0
def delete_images_for_experiment(exp_name: str):
    """
    Deletes image related to experiment with a given name.

    :param exp_name: name of an experiment for which image should be removed
    In case of any problems it raises an error
    """
    with K8sProxy(NAUTAAppNames.DOCKER_REGISTRY) as proxy:
        # Talk to the in-cluster registry through the tunnel's local port.
        registry_address = f"127.0.0.1:{proxy.tunnel_port}"
        for tag in get_tags_list(server_address=registry_address,
                                 image_name=exp_name):
            delete_tag(server_address=registry_address,
                       image_name=exp_name, tag=tag)
def _debug_workflow_logs(workflow: ArgoWorkflow, namespace: str):
    """
    Dump all log entries of *workflow* at DEBUG level.

    Best-effort: any failure is logged via log.exception and never propagated.

    :param workflow: workflow whose logs should be dumped
    :param namespace: kubernetes namespace the workflow lives in
    """
    try:
        with K8sProxy(NAUTAAppNames.ELASTICSEARCH) as proxy:
            es_client = K8sElasticSearchClient(host="127.0.0.1",
                                               port=proxy.tunnel_port,
                                               verify_certs=False,
                                               use_ssl=False)
            start_date = workflow.started_at
            workflow_logs_generator = es_client.get_argo_workflow_logs_generator(workflow=workflow,
                                                                                 namespace=namespace,
                                                                                 start_date=start_date)
            log.debug(f'=== Workflow {workflow.name} logs ===')
            for log_entry in workflow_logs_generator:
                # Skip whitespace-only entries.
                if not log_entry.content.isspace():
                    log.debug(f'{log_entry.date} {log_entry.pod_name} {log_entry.content}')
            log.debug(f'=== Workflow {workflow.name} logs ===')
    except Exception:
        # Fixed typo in the log message: "worklfow" -> "workflow".
        log.exception(f'Failed to get {workflow.name} workflow logs.')
def test_set_up_proxy_open_readiness_failure(mocker):
    """A readiness failure after forwarding started must terminate the spawned process."""
    forwarded_process = mocker.patch("subprocess.Popen")
    mocker.patch("subprocess.Popen.kill")
    mocker.patch("subprocess.Popen.terminate")
    mocker.patch("util.k8s.k8s_proxy_context_manager.kubectl.start_port_forwarding",
                 return_value=(forwarded_process, 1000, 1001))
    mocker.patch("util.k8s.k8s_proxy_context_manager.K8sProxy._wait_for_connection_readiness",
                 side_effect=TunnelSetupError)
    mocker.patch("psutil.Process",
                 return_value=mocker.MagicMock(children=lambda **kwargs: []))
    mocker.patch("psutil.wait_procs")
    with pytest.raises(K8sProxyOpenError):
        with K8sProxy(NAUTAAppNames.ELASTICSEARCH):
            pass
    # Either a hard kill or a graceful terminate is acceptable cleanup.
    # noinspection PyUnresolvedReferences
    assert subprocess.Popen.kill.call_count == 1 or subprocess.Popen.terminate.call_count == 1
def submit_experiment(template: str, name: str = None, run_kind: RunKinds = RunKinds.TRAINING,
                      script_location: str = None, script_parameters: Tuple[str, ...] = None,
                      pack_params: List[Tuple[str, str]] = None,
                      parameter_range: List[Tuple[str, str]] = None,
                      parameter_set: Tuple[str, ...] = None,
                      script_folder_location: str = None,
                      env_variables: List[str] = None,
                      requirements_file: str = None) -> (List[Run], Dict[str, str], str):
    """
    Submit an experiment: prepare run environments, push packs through the
    local docker registry tunnel, create the Experiment and Run objects and
    return what was submitted.

    :param template: name of the pack template to use
    :param name: optional experiment name; generated when not given
    :param run_kind: kind of runs produced by this experiment
    :param script_location: path of the training script (may be None)
    :param script_parameters: parameters passed to the script
    :param pack_params: pack parameters stored as Run annotations
    :param parameter_range: `-pr` specs producing multiple runs
    :param parameter_set: `-ps` specs producing multiple runs
    :param script_folder_location: folder shipped alongside the script
    :param env_variables: environment variables for the runs
    :param requirements_file: pip requirements file for the runs
    :return: (list of runs, per-run error messages, effective script location)
    :raises SubmitExperimentError, K8sProxyCloseError on failure
    """
    # Normalize optional collection arguments.
    script_parameters = script_parameters if script_parameters else ()
    parameter_set = parameter_set if parameter_set else ()
    parameter_range = parameter_range if parameter_range else []
    log.debug("Submit experiment - start")
    try:
        namespace = get_kubectl_current_context_namespace()
        # Stored globally so the Ctrl-C handler can clean up the right namespace.
        global submitted_namespace
        submitted_namespace = namespace
    except Exception:
        message = Texts.GET_NAMESPACE_ERROR_MSG
        log.exception(message)
        raise SubmitExperimentError(message)

    try:
        with spinner(text=Texts.PREPARING_RESOURCE_DEFINITIONS_MSG):
            experiment_name, labels = generate_exp_name_and_labels(script_name=script_location,
                                                                   namespace=namespace,
                                                                   name=name,
                                                                   run_kind=run_kind)
            runs_list = prepare_list_of_runs(experiment_name=experiment_name,
                                             parameter_range=parameter_range,
                                             parameter_set=parameter_set,
                                             template_name=template)
    except SubmitExperimentError as exe:
        log.exception(str(exe))
        raise exe
    except Exception:
        message = Texts.SUBMIT_PREPARATION_ERROR_MSG
        log.exception(message)
        raise SubmitExperimentError(message)

    # Stored globally for the Ctrl-C handler as well.
    global submitted_experiment
    submitted_experiment = experiment_name

    # Ctrl-C handling
    signal.signal(signal.SIGINT, ctrl_c_handler_for_submit)
    signal.signal(signal.SIGTERM, ctrl_c_handler_for_submit)

    try:
        config = Config()

        # start port forwarding
        # noinspection PyBroadException
        with K8sProxy(NAUTAAppNames.DOCKER_REGISTRY, port=config.local_registry_port) as proxy:
            # Save port that was actually used in configuration
            if proxy.tunnel_port != config.local_registry_port:
                config.local_registry_port = proxy.tunnel_port

            experiment_run_folders = []  # List of local directories used by experiment's runs
            try:
                # run socat if on Windows or Mac OS
                if get_current_os() in (OS.WINDOWS, OS.MACOS):
                    # noinspection PyBroadException
                    try:
                        with spinner(text=Texts.CLUSTER_CONNECTION_MSG):
                            socat.start(proxy.tunnel_port)
                    except Exception:
                        error_msg = Texts.LOCAL_DOCKER_TUNNEL_ERROR_MSG
                        log.exception(error_msg)
                        raise SubmitExperimentError(error_msg)

                cluster_registry_port = get_app_service_node_port(nauta_app_name=NAUTAAppNames.DOCKER_REGISTRY)

                # prepare environments for all experiment's runs
                for experiment_run in runs_list:
                    # Combine global script parameters with per-run parameters.
                    if script_parameters and experiment_run.parameters:
                        current_script_parameters = script_parameters + experiment_run.parameters
                    elif script_parameters:
                        current_script_parameters = script_parameters
                    elif experiment_run.parameters:
                        current_script_parameters = experiment_run.parameters
                    else:
                        current_script_parameters = ""
                    run_folder, script_location, pod_count = \
                        prepare_experiment_environment(experiment_name=experiment_name,
                                                       run_name=experiment_run.name,
                                                       local_script_location=script_location,
                                                       script_folder_location=script_folder_location,  # noqa: E501
                                                       script_parameters=current_script_parameters,
                                                       pack_type=template,
                                                       pack_params=pack_params,
                                                       local_registry_port=proxy.tunnel_port,
                                                       cluster_registry_port=cluster_registry_port,
                                                       env_variables=env_variables,
                                                       requirements_file=requirements_file)
                    # Set correct pod count
                    if not pod_count or pod_count < 1:
                        raise SubmitExperimentError('Unable to determine pod count: make sure that values.yaml '
                                                    'file in your pack has podCount field with positive integer value.')
                    experiment_run.pod_count = pod_count

                    experiment_run_folders.append(run_folder)
                    script_name = None
                    if script_location is not None:
                        script_name = os.path.basename(script_location)

                    # Prepend script_name parameter to run description only for display purposes.
                    experiment_run.parameters = script_parameters if not experiment_run.parameters \
                        else experiment_run.parameters + script_parameters
                    if experiment_run.parameters and script_name:
                        experiment_run.parameters = (script_name, ) + experiment_run.parameters
                    elif script_name:
                        experiment_run.parameters = (script_name, )
            except SubmitExperimentError as e:
                log.exception(Texts.ENV_CREATION_ERROR_MSG)
                e.message += f' {Texts.ENV_CREATION_ERROR_MSG}'
                raise
            except Exception:
                # any error in this step breaks execution of this command
                # NOTE(review): this branch does not re-raise, so submission
                # continues after cleanup — confirm this is intentional.
                message = Texts.ENV_CREATION_ERROR_MSG
                log.exception(message)
                # just in case - remove folders that were created with a success
                for experiment_run_folder in experiment_run_folders:
                    delete_environment(experiment_run_folder)

            # if ps or pr option is used - first ask whether experiment(s) should be submitted
            if parameter_range or parameter_set:
                click.echo(Texts.CONFIRM_SUBMIT_MSG)
                click.echo(tabulate({RUN_NAME: [run.name for run in runs_list],
                                     RUN_PARAMETERS: ["\n".join(run.parameters) if run.parameters
                                                      else "" for run in runs_list]},
                                    headers=[RUN_NAME, RUN_PARAMETERS],
                                    tablefmt="orgtbl"))
                if not click.confirm(Texts.CONFIRM_SUBMIT_QUESTION_MSG, default=True):
                    for experiment_run_folder in experiment_run_folders:
                        delete_environment(experiment_run_folder)
                    exit()

            # create Experiment model
            # TODO template_name & template_namespace should be filled after Template implementation
            parameter_range_spec = [f'-pr {param_name} {param_value}' for param_name, param_value in parameter_range]
            parameter_set_spec = [f'-ps {ps_spec}' for ps_spec in parameter_set]
            experiment_parameters_spec = list(script_parameters) + parameter_range_spec + parameter_set_spec
            experiment = experiments_model.Experiment(name=experiment_name,
                                                      template_name=template,
                                                      parameters_spec=experiment_parameters_spec,
                                                      template_namespace="template-namespace")
            experiment.create(namespace=namespace, labels=labels)

            # submit runs
            run_errors: Dict[str, str] = {}
            for run, run_folder in zip(runs_list, experiment_run_folders):
                try:
                    run.state = RunStatus.QUEUED
                    with spinner(text=Texts.CREATING_RESOURCES_MSG.format(run_name=run.name)):
                        # Add Run object with runKind label and pack params as annotations
                        run.create(namespace=namespace,
                                   labels={'runKind': run_kind.value},
                                   annotations={pack_param_name: pack_param_value
                                                for pack_param_name, pack_param_value in pack_params})
                        submitted_runs.append(run)
                        submit_draft_pack(run_folder, namespace=namespace)
                except Exception as exe:
                    delete_environment(run_folder)
                    try:
                        run.state = RunStatus.FAILED
                        run_errors[run.name] = str(exe)
                        run.update()
                    except Exception as rexe:
                        # update of non-existing run may fail
                        log.debug(Texts.ERROR_DURING_PATCHING_RUN.format(str(rexe)))

            # Delete experiment if no Runs were submitted
            if not submitted_runs:
                click.echo(Texts.SUBMISSION_FAIL_ERROR_MSG)
                delete_k8s_object("experiment", experiment_name)

            # Change experiment status to submitted
            experiment.state = experiments_model.ExperimentStatus.SUBMITTED
            experiment.update()
    except LocalPortOccupiedError as exe:
        click.echo(exe.message)
        raise SubmitExperimentError(exe.message)
    except K8sProxyCloseError:
        log.exception('Error during closing of a proxy for a {}'.format(NAUTAAppNames.DOCKER_REGISTRY))
        raise K8sProxyCloseError(Texts.PROXY_CLOSE_ERROR_MSG)
    except K8sProxyOpenError:
        error_msg = Texts.PROXY_OPEN_ERROR_MSG
        log.exception(error_msg)
        raise SubmitExperimentError(error_msg)
    except SubmitExperimentError:
        raise
    except Exception as exe:
        error_msg = Texts.SUBMIT_OTHER_ERROR_MSG
        log.exception(error_msg)
        raise SubmitExperimentError(error_msg) from exe
    finally:
        # Always try to shut down the socat tunnel, even on failure paths.
        with spinner(text=Texts.CLUSTER_CONNECTION_CLOSING_MSG):
            # noinspection PyBroadException
            try:
                socat.stop()
            except Exception:
                log.exception("Error during closing of a proxy for a local docker-host tunnel")
                raise K8sProxyCloseError(Texts.DOCKER_TUNNEL_CLOSE_ERROR_MSG)
        # remove semaphores from all exp folders
        remove_sempahore(experiment_name)

    log.debug("Submit - finish")
    return runs_list, run_errors, script_location
def get_logs(operation_name: str, start_date: str, end_date: str, match: str,
             output: bool, pager: bool, follow: bool):
    """
    Show logs for a given model export operation.

    :param operation_name: exact operation name (mutually exclusive with match)
    :param start_date: only logs at/after this date; defaults per workflow
    :param end_date: only logs up to this date
    :param match: regex matching multiple operations (mutually exclusive with operation_name)
    :param output: when True, save logs to files instead of printing
    :param pager: page the printed logs
    :param follow: stream logs continuously (ignored when output is set)
    """
    # check whether we have operations with a given name
    if operation_name and match:
        handle_error(user_msg=Texts.NAME_M_BOTH_GIVEN_ERROR_MSG)
        exit(1)
    elif not operation_name and not match:
        handle_error(user_msg=Texts.NAME_M_NONE_GIVEN_ERROR_MSG)
        exit(1)

    try:
        with K8sProxy(NAUTAAppNames.ELASTICSEARCH) as proxy:
            es_client = K8sElasticSearchClient(host="127.0.0.1",
                                               port=proxy.tunnel_port,
                                               verify_certs=False,
                                               use_ssl=False)
            namespace = get_kubectl_current_context_namespace()
            if match:
                operation_name = match
                name_filter = match
            else:
                # Anchor the name so only the exact operation matches.
                name_filter = f'^{operation_name}$'
            workflows = ArgoWorkflow.list(namespace=namespace,
                                          name_filter=name_filter)
            if not workflows:
                raise ValueError(
                    f'Operation with given name: {operation_name} does not '
                    f'exists in namespace {namespace}.')

            follow_logs = True if follow and not output else False

            if output and len(workflows) > 1:
                click.echo(Texts.MORE_EXP_LOGS_MESSAGE)

            for workflow in workflows:
                start_date = start_date if start_date else workflow.started_at
                ops_logs_generator = es_client.get_argo_workflow_logs_generator(
                    workflow=workflow,
                    namespace=namespace,
                    start_date=start_date,
                    end_date=end_date,
                    follow=follow_logs)
                if output:
                    save_logs_to_file(logs_generator=ops_logs_generator,
                                      instance_name=workflow.name,
                                      instance_type="operation")
                else:
                    if len(workflows) > 1:
                        click.echo(f'Operation : {workflow.name}')
                    print_logs(run_logs_generator=ops_logs_generator,
                               pager=pager)
    except K8sProxyCloseError:
        handle_error(logger, Texts.PROXY_CLOSE_LOG_ERROR_MSG,
                     Texts.PROXY_CLOSE_LOG_ERROR_MSG)
        exit(1)
    except LocalPortOccupiedError as exe:
        handle_error(
            logger,
            Texts.LOCAL_PORT_OCCUPIED_ERROR_MSG.format(
                exception_message=exe.message),
            Texts.LOCAL_PORT_OCCUPIED_ERROR_MSG.format(
                exception_message=exe.message))
        exit(1)
    except K8sProxyOpenError:
        handle_error(logger,
                     Texts.PROXY_CREATION_ERROR_MSG,
                     Texts.PROXY_CREATION_ERROR_MSG)
        exit(1)
    except ValueError:
        # Fix: the user-facing message previously used `experiment_name=`,
        # which raises KeyError when the template expects {operation_name}.
        handle_error(
            logger,
            Texts.OPERATION_NOT_EXISTS_ERROR_MSG.format(
                operation_name=operation_name),
            Texts.OPERATION_NOT_EXISTS_ERROR_MSG.format(
                operation_name=operation_name))
        exit(1)
    except Exception:
        handle_error(logger, Texts.LOGS_GET_OTHER_ERROR_MSG,
                     Texts.LOGS_GET_OTHER_ERROR_MSG)
        exit(1)
def get_logs(experiment_name: str, min_severity: SeverityLevel, start_date: str,
             end_date: str, pod_ids: str, pod_status: PodStatus, match: str,
             output: bool, pager: bool, follow: bool, runs_kinds: List[RunKinds],
             instance_type: str):
    """
    Show logs for a given experiment.

    :param experiment_name: exact experiment name (mutually exclusive with match)
    :param min_severity: minimum log severity to show
    :param start_date: only logs at/after this date; defaults per run
    :param end_date: only logs up to this date
    :param pod_ids: comma-separated list of pod ids to filter on
    :param pod_status: only logs from pods in this status
    :param match: regex matching multiple experiments (mutually exclusive with experiment_name)
    :param output: when True, save logs to files instead of printing
    :param pager: page the printed logs
    :param follow: stream logs continuously (ignored when output is set)
    :param runs_kinds: kinds of runs taken into account
    :param instance_type: label ("experiment"/...) used in messages
    """
    # check whether we have runs with a given name
    if experiment_name and match:
        handle_error(user_msg=Texts.NAME_M_BOTH_GIVEN_ERROR_MSG.format(instance_type=instance_type))
        exit(1)
    elif not experiment_name and not match:
        handle_error(user_msg=Texts.NAME_M_NONE_GIVEN_ERROR_MSG.format(instance_type=instance_type))
        exit(1)

    try:
        with K8sProxy(NAUTAAppNames.ELASTICSEARCH) as proxy:
            es_client = K8sElasticSearchClient(host="127.0.0.1",
                                               port=proxy.tunnel_port,
                                               verify_certs=False,
                                               use_ssl=False)
            namespace = get_kubectl_current_context_namespace()
            if match:
                experiment_name = match
                name_filter = match
            else:
                # Anchor the name so only the exact experiment matches.
                name_filter = f'^{experiment_name}$'
            runs = Run.list(namespace=namespace, name_filter=name_filter,
                            run_kinds_filter=runs_kinds)
            if not runs:
                raise ValueError(f'Run with given name: {experiment_name} does not exists in namespace {namespace}.')

            # Convert CLI string options into their typed counterparts.
            pod_ids = pod_ids.split(',') if pod_ids else None
            min_severity = SeverityLevel[min_severity] if min_severity else None
            pod_status = PodStatus[pod_status] if pod_status else None
            follow_logs = True if follow and not output else False

            if output and len(runs) > 1:
                click.echo(Texts.MORE_EXP_LOGS_MESSAGE)

            for run in runs:
                start_date = start_date if start_date else run.creation_timestamp
                run_logs_generator = es_client.get_experiment_logs_generator(run=run,
                                                                             namespace=namespace,
                                                                             min_severity=min_severity,
                                                                             start_date=start_date,
                                                                             end_date=end_date,
                                                                             pod_ids=pod_ids,
                                                                             pod_status=pod_status,
                                                                             follow=follow_logs)
                if output:
                    save_logs_to_file(run=run,
                                      run_logs_generator=run_logs_generator,
                                      instance_type=instance_type)
                else:
                    if len(runs) > 1:
                        click.echo(f'Experiment : {run.name}')
                    print_logs(run_logs_generator=run_logs_generator,
                               pager=pager)
    except K8sProxyCloseError:
        handle_error(logger, Texts.PROXY_CLOSE_LOG_ERROR_MSG,
                     Texts.PROXY_CLOSE_USER_ERROR_MSG)
        exit(1)
    except LocalPortOccupiedError as exe:
        handle_error(logger,
                     Texts.LOCAL_PORT_OCCUPIED_ERROR_MSG.format(exception_message=exe.message),
                     Texts.LOCAL_PORT_OCCUPIED_ERROR_MSG.format(exception_message=exe.message))
        exit(1)
    except K8sProxyOpenError:
        handle_error(logger, Texts.PROXY_CREATION_ERROR_MSG,
                     Texts.PROXY_CREATION_ERROR_MSG)
        exit(1)
    except ValueError:
        handle_error(logger,
                     Texts.EXPERIMENT_NOT_EXISTS_ERROR_MSG.format(experiment_name=experiment_name,
                                                                  instance_type=instance_type.capitalize()),
                     Texts.EXPERIMENT_NOT_EXISTS_ERROR_MSG.format(experiment_name=experiment_name,
                                                                  instance_type=instance_type.capitalize()))
        exit(1)
    except Exception:
        handle_error(logger,
                     Texts.LOGS_GET_OTHER_ERROR_MSG.format(instance_type=instance_type),
                     Texts.LOGS_GET_OTHER_ERROR_MSG.format(instance_type=instance_type))
        exit(1)
def cancel(state: State, name: str, match: str, purge: bool, pod_ids: str,
           pod_status: str, listed_runs_kinds: List[RunKinds] = None):
    """
    Cancels chosen experiments based on a name provided as a parameter.

    :param state: CLI state object
    :param name: exact experiment/run name (mutually exclusive with match)
    :param match: regex matching multiple runs (mutually exclusive with name)
    :param purge: also purge finished runs and their logs
    :param pod_ids: limit cancellation to these pod ids (pod mode)
    :param pod_status: limit cancellation to pods in this status (pod mode)
    :param listed_runs_kinds: run kinds taken into account; defaults to
        training and jupyter runs
    """
    if not listed_runs_kinds:
        listed_runs_kinds = [RunKinds.TRAINING, RunKinds.JUPYTER]

    # check whether we have runs with a given name
    if name and match:
        handle_error(user_msg=Texts.NAME_M_BOTH_GIVEN_ERROR_MSG)
        exit(1)

    if not name and not match:
        handle_error(user_msg=Texts.NAME_M_NONE_GIVEN_ERROR_MSG)
        exit(1)

    current_namespace = get_current_namespace()

    # Pod-level cancellation is a separate mode and short-circuits here.
    if pod_ids or pod_status:
        if not name:
            name = match

        cancel_pods_mode(namespace=current_namespace, run_name=name,
                         pod_ids=pod_ids, pod_status=pod_status)
        exit(0)

    search_for_experiment = False
    exp_to_be_cancelled = None

    if name:
        # Prefer an exact experiment of an allowed kind; otherwise fall back
        # to treating `name` as an anchored run-name regex.
        exp_to_be_cancelled = Experiment.get(namespace=current_namespace,
                                             name=name)
        exp_to_be_cancelled_kind = RunKinds(exp_to_be_cancelled.metadata['labels'].get('runKind')) \
            if exp_to_be_cancelled else None
        exp_to_be_cancelled = exp_to_be_cancelled if exp_to_be_cancelled_kind in listed_runs_kinds else None
        if exp_to_be_cancelled:
            search_for_experiment = True
        else:
            name = f"^{name}$"
    else:
        name = match

    list_of_all_runs = None

    # Only queued/running runs are cancellable; purge also covers finished ones.
    list_of_applicable_states = [RunStatus.QUEUED, RunStatus.RUNNING]

    if purge:
        list_of_applicable_states.extend(
            [RunStatus.FAILED, RunStatus.COMPLETE, RunStatus.CANCELLED])

    try:
        if search_for_experiment:
            list_of_all_runs = Run.list(namespace=current_namespace,
                                        exp_name_filter=[name],
                                        run_kinds_filter=listed_runs_kinds)
        else:
            list_of_all_runs = Run.list(namespace=current_namespace,
                                        name_filter=name,
                                        run_kinds_filter=listed_runs_kinds)
    except Exception:
        handle_error(
            logger,
            Texts.LIST_RUNS_ERROR_MSG.format(
                experiment_name_plural=experiment_name_plural),
            Texts.LIST_RUNS_ERROR_MSG.format(
                experiment_name_plural=experiment_name_plural))
        exit(1)

    # Handle cancellation of experiments with no associated Runs
    # NOTE(review): execution falls through to the "no experiments" error below
    # afterwards — presumably cancel_uninitialized_experiment exits itself;
    # confirm against its implementation.
    if exp_to_be_cancelled and not list_of_all_runs:
        cancel_uninitialized_experiment(experiment=exp_to_be_cancelled,
                                        namespace=current_namespace,
                                        purge=purge)

    if not list_of_all_runs:
        handle_error(user_msg=Texts.LACK_OF_EXPERIMENTS_ERROR_MSG.format(
            experiment_name_plural=experiment_name_plural,
            experiment_name=experiment_name))
        exit(1)
    elif not purge and not [
            run for run in list_of_all_runs
            if run.state in [RunStatus.QUEUED, RunStatus.RUNNING]
    ]:
        handle_error(
            user_msg=Texts.LACK_OF_EXP_TO_BE_CANCELLED_ERROR_MSG.format(
                experiment_name_plural=experiment_name_plural))
        exit(1)

    # check whether we have at least one experiment in state other than CANCELLED
    list_of_runs_to_be_deleted: List[Run] = []
    names_of_cancelled_runs: List[str] = []

    if not purge:
        # check whether we have at least one experiment in state other than CANCELLED
        for run in list_of_all_runs:
            if run.state in list_of_applicable_states:
                list_of_runs_to_be_deleted.append(run)
            else:
                names_of_cancelled_runs.append(run.name)

        if not list_of_runs_to_be_deleted:
            handle_error(
                user_msg=Texts.EXPERIMENTS_ALREADY_CANCELLED_ERROR_MSG.format(
                    experiment_name_plural=experiment_name_plural,
                    operation_word=Texts.DELETE_OPERATION["deleted"]
                    if experiment_name_plural == 'pods' else
                    Texts.CANCEL_OPERATION["cancelled"]))
            exit(1)
        elif len(list_of_runs_to_be_deleted) != len(list_of_all_runs):
            # Some runs are already finished — show both groups to the user.
            click.echo(
                Texts.ALREADY_CANCELLED_LIST_HEADER.format(
                    experiment_name_plural=experiment_name_plural,
                    operation_word=Texts.DELETE_OPERATION["deleted"]
                    if experiment_name_plural == 'pods' else
                    Texts.CANCEL_OPERATION["cancelled"]))
            for name in names_of_cancelled_runs:
                click.echo(f" - {name}")
            click.echo(
                Texts.CAN_BE_CANCELLED_LIST_HEADER.format(
                    experiment_name_plural=experiment_name_plural,
                    operation_word=Texts.DELETE_OPERATION["deleted"]
                    if experiment_name_plural == 'pods' else
                    Texts.CANCEL_OPERATION["cancelled"]))
            for run in list_of_runs_to_be_deleted:
                click.echo(f" - {run.name}")
        else:
            click.echo(
                Texts.WILL_BE_CANCELLED_LIST_HEADER.format(
                    experiment_name_plural=experiment_name_plural,
                    operation_word=Texts.DELETE_OPERATION["deleted"]
                    if experiment_name_plural == 'pods' else
                    Texts.CANCEL_OPERATION["cancelled"]))
            for run in list_of_runs_to_be_deleted:
                click.echo(f" - {run.name}")
    else:
        # Purge mode removes everything that was listed.
        list_of_runs_to_be_deleted = list_of_all_runs
        click.echo(
            Texts.WILL_BE_PURGED_LIST_HEADER.format(
                experiment_name_plural=experiment_name_plural,
                operation_word=Texts.DELETE_OPERATION["deleted"]
                if experiment_name_plural == 'pods' else
                Texts.CANCEL_OPERATION["cancelled"]))
        for run in list_of_runs_to_be_deleted:
            click.echo(f" - {run.name}")

    if not click.confirm(
            Texts.CONFIRM_CANCEL_MSG.format(
                experiment_name_plural=experiment_name_plural,
                operation_word=Texts.DELETE_OPERATION["deletion"]
                if experiment_name_plural == 'pods' else
                Texts.CANCEL_OPERATION["cancellation"])):
        handle_error(user_msg=Texts.CANCELLATION_ABORTED_MSG.format(
            experiment_name_plural=experiment_name_plural,
            operation_word=Texts.DELETE_OPERATION["deletion"]
            if experiment_name_plural == 'pods' else
            Texts.CANCEL_OPERATION["cancellation"]))
        exit(0)

    # group runs by experiments
    exp_with_runs: defaultdict = defaultdict(list)
    for run in list_of_runs_to_be_deleted:
        exp_with_runs[run.experiment_name].append(run)

    deleted_runs = []
    not_deleted_runs = []

    if purge:
        # Connect to elasticsearch in order to purge run logs
        try:
            with K8sProxy(NAUTAAppNames.ELASTICSEARCH) as proxy:
                es_client = K8sElasticSearchClient(
                    host="127.0.0.1",
                    port=proxy.tunnel_port,
                    verify_certs=False,
                    use_ssl=False,
                    with_admin_privledges=is_current_user_administrator())
                for exp_name, run_list in exp_with_runs.items():
                    try:
                        exp_del_runs, exp_not_del_runs = purge_experiment(
                            exp_name=exp_name,
                            runs_to_purge=run_list,
                            namespace=current_namespace,
                            k8s_es_client=es_client)
                        deleted_runs.extend(exp_del_runs)
                        not_deleted_runs.extend(exp_not_del_runs)
                    except Exception:
                        # Keep going: a failure for one experiment must not
                        # abort the purge of the remaining ones.
                        handle_error(logger, Texts.OTHER_CANCELLING_ERROR_MSG)
                        not_deleted_runs.extend(run_list)
        except K8sProxyCloseError:
            handle_error(logger, Texts.PROXY_CLOSING_ERROR_LOG_MSG,
                         Texts.PROXY_CLOSING_ERROR_USER_MSG)
            exit(1)
        except LocalPortOccupiedError as exe:
            handle_error(
                logger, Texts.PORT_OCCUPIED_ERROR_LOG_MSG,
                Texts.PORT_OCCUPIED_ERROR_USER_MSG.format(
                    exception_message=exe.message))
            exit(1)
        except K8sProxyOpenError:
            handle_error(logger, Texts.PROXY_OPEN_ERROR_MSG,
                         Texts.PROXY_OPEN_ERROR_MSG)
            exit(1)
    else:
        for exp_name, run_list in exp_with_runs.items():
            try:
                exp_del_runs, exp_not_del_runs = cancel_experiment(
                    exp_name=exp_name,
                    runs_to_cancel=run_list,
                    namespace=current_namespace)
                deleted_runs.extend(exp_del_runs)
                not_deleted_runs.extend(exp_not_del_runs)
            except Exception:
                handle_error(logger, Texts.OTHER_CANCELLING_ERROR_MSG)
                not_deleted_runs.extend(run_list)

    if deleted_runs:
        click.echo(
            Texts.SUCCESSFULLY_CANCELLED_LIST_HEADER.format(
                experiment_name_plural=experiment_name_plural,
                operation_word=Texts.DELETE_OPERATION["deleted"]
                if experiment_name_plural == 'pods' else
                Texts.CANCEL_OPERATION["cancelled"]))
        for run in deleted_runs:
            click.echo(f" - {run.name}")

    if not_deleted_runs:
        click.echo(
            Texts.FAILED_TO_CANCEL_LIST_HEADER.format(
                experiment_name_plural=experiment_name_plural,
                operation_word=Texts.DELETE_OPERATION["deleted"]
                if experiment_name_plural == 'pods' else
                Texts.CANCEL_OPERATION["cancelled"]))
        for run in not_deleted_runs:
            click.echo(f" - {run.name}")
        sys.exit(1)
def create(state: State, username: str, list_only: bool, filename: str):
    """
    Adds a new user with a name given as a parameter.

    :param state: CLI state object (verbosity)
    :param username: name of a new user
    :param list_only: print the generated kubeconfig instead of saving it
    :param filename: file the kubeconfig is written to (mutually exclusive
        with list_only); a default name is used when empty
    """
    if list_only and filename:
        handle_error(user_msg=Texts.F_L_OPTIONS_EXCLUSION_ERROR_MSG)
        exit(1)

    # Validate the name and make sure the user does not exist yet.
    try:
        try:
            validate_user_name(username)
        except ValueError as exe:
            handle_error(
                logger,
                Texts.NAME_VALIDATION_ERROR_MSG.format(username=username),
                str(exe),
                add_verbosity_msg=state.verbosity == 0)
            exit(1)

        user_state = check_users_presence(username)

        if user_state == UserState.ACTIVE:
            handle_error(
                logger,
                Texts.USER_ALREADY_EXISTS_ERROR_MSG.format(username=username),
                Texts.USER_ALREADY_EXISTS_ERROR_MSG.format(username=username))
            exit(1)

        if user_state == UserState.TERMINATING:
            handle_error(
                logger,
                Texts.USER_BEING_REMOVED_ERROR_MSG.format(username=username),
                Texts.USER_BEING_REMOVED_ERROR_MSG.format(username=username))
            exit(1)
    except Exception:
        handle_error(
            logger,
            Texts.USER_VERIFICATION_ERROR_MSG.format(username=username),
            Texts.USER_VERIFICATION_ERROR_MSG.format(username=username),
            add_verbosity_msg=state.verbosity == 0)
        exit(1)

    # Create the user's resources via helm, then gather credentials and
    # register the user in the git repo manager.
    try:
        with spinner(text=Texts.CREATING_USER_PROGRESS_MSG.format(
                username=username)):
            chart_location = os.path.join(Config().config_path,
                                          ADD_USER_CHART_NAME)
            nauta_config_map = NAUTAConfigMap()
            tiller_location = nauta_config_map.image_tiller
            tensorboard_service_location = nauta_config_map.image_tensorboard_service
            add_user_command = [
                "helm", "install", "--wait", "--namespace", username, "--name",
                username, chart_location, "--set", "global.nauta=nauta",
                "--set", f"username={username}", "--set",
                "TillerImage={}".format(tiller_location), "--set",
                f"TensorboardServiceImage={tensorboard_service_location}"
            ]
            env = os.environ.copy()
            env['PATH'] = Config().config_path + os.pathsep + env['PATH']
            _, err_code, log_output = execute_system_command(
                ' '.join(add_user_command), env=env, shell=True)

            # helm failed — roll the partially-created user back.
            if err_code:
                handle_error(logger, log_output, Texts.USER_ADD_ERROR_MSG,
                             add_verbosity_msg=state.verbosity == 0)
                if not delete_user(username):
                    handle_error(user_msg=Texts.REMOVE_USER_ERROR_MSG.format(
                        username=username))
                sys.exit(1)

            # Credential gathering is best-effort: failures fall back to
            # empty values and the command continues.
            try:
                users_password = get_users_token(username)
            except Exception:
                handle_error(logger, Texts.PASSWORD_GATHER_ERROR_MSG,
                             Texts.PASSWORD_GATHER_ERROR_MSG,
                             add_verbosity_msg=state.verbosity == 0)
                users_password = ""

            try:
                cert = get_certificate(username)
            except Exception:
                handle_error(logger, Texts.CERT_GATHER_ERROR_MSG,
                             Texts.CERT_GATHER_ERROR_MSG,
                             add_verbosity_msg=state.verbosity == 0)
                cert = ""

            try:
                with K8sProxy(
                        NAUTAAppNames.GIT_REPO_MANAGER,
                        number_of_retries_wait_for_readiness=60) as proxy:
                    grm_client = GitRepoManagerClient(host='127.0.0.1',
                                                      port=proxy.tunnel_port)
                    grm_client.add_nauta_user(username=username)
            except Exception:
                handle_error(logger, Texts.GIT_REPO_MANAGER_ERROR_MSG,
                             Texts.GIT_REPO_MANAGER_ERROR_MSG,
                             add_verbosity_msg=state.verbosity == 0)
                sys.exit(1)
    except Exception:
        handle_error(logger,
                     Texts.USER_ADD_ERROR_MSG.format(username=username),
                     Texts.USER_ADD_ERROR_MSG.format(username=username),
                     add_verbosity_msg=state.verbosity == 0)
        if not delete_user(username):
            handle_error(user_msg=Texts.REMOVE_USER_ERROR_MSG.format(
                username=username))
        sys.exit(1)

    if is_user_created(username, 90):
        click.echo(Texts.USER_CREATION_SUCCESS_MSG.format(username=username))
    else:
        # if during 90 seconds a user hasn't been created - app displays information about it
        # but doesn't stop processing the command - config file generated here may be useful later
        # when user has been created
        click.echo(Texts.USER_NOT_READY_ERROR_MSG.format(username=username))

    try:
        kubeconfig = generate_kubeconfig(username, username,
                                         get_kubectl_host(), users_password,
                                         cert)
    except Exception:
        handle_error(logger, Texts.CONFIG_CREATION_ERROR_MSG,
                     Texts.CONFIG_CREATION_ERROR_MSG,
                     add_verbosity_msg=state.verbosity == 0)
        exit(1)

    if list_only:
        click.echo(Texts.LIST_ONLY_HEADER)
        click.echo(kubeconfig)
    else:
        if not filename:
            filename = DEFAULT_FILENAME.format(username)
        try:
            with open(filename, "w") as file:
                file.write(kubeconfig)

            click.echo(Texts.CONFIG_SAVE_SUCCESS_MSG.format(filename=filename))
        except Exception:
            handle_error(logger, Texts.CONFIG_SAVE_FAIL_MSG,
                         Texts.CONFIG_SAVE_FAIL_MSG,
                         add_verbosity_msg=state.verbosity == 0)
            # Still show the config on stdout so the user can save it manually.
            click.echo(Texts.CONFIG_SAVE_FAIL_INSTRUCTIONS_MSG)
            click.echo(kubeconfig)
            sys.exit(1)