예제 #1
0
def test_wait_for_connection_readiness(mocker):
    """With ``requests.get`` mocked out, readiness is probed once at http://<address>:<port>."""
    mocker.patch('requests.get')
    host, port = 'localhost', 1234
    expected_url = f'http://{host}:{port}'

    # noinspection PyProtectedMember
    K8sProxy._wait_for_connection_readiness(host, port)

    # noinspection PyUnresolvedReferences
    requests.get.assert_called_once_with(expected_url)
예제 #2
0
def test_wait_for_connection_readiness_many_tries_failure(mocker):
    """When every probe raises ConnectionError, all tries are used and TunnelSetupError is raised."""
    host, port = 'localhost', 1234
    max_tries = 15

    # Every GET fails; sleeping is patched out so the retries run instantly.
    mocker.patch('requests.get', side_effect=ConnectionError)
    mocker.patch('time.sleep')

    with pytest.raises(TunnelSetupError):
        # noinspection PyProtectedMember
        K8sProxy._wait_for_connection_readiness(host, port, max_tries)

    # noinspection PyUnresolvedReferences
    assert requests.get.call_count == max_tries
예제 #3
0
def test_wait_for_connection_readiness_many_tries(mocker):
    """Ten failed probes followed by a successful one stop the retry loop after 11 calls."""
    failures_before_success = 10
    # noinspection PyTypeChecker
    probe_outcomes = [ConnectionError] * failures_before_success + [None]
    host, port = 'localhost', 1234

    mocker.patch('requests.get', side_effect=probe_outcomes)
    mocker.patch('time.sleep')

    # noinspection PyProtectedMember
    K8sProxy._wait_for_connection_readiness(host, port, 15)

    # noinspection PyUnresolvedReferences
    assert requests.get.call_count == failures_before_success + 1
예제 #4
0
def tensorboard(state: State, no_launch: bool,
                tensorboard_service_client_port: Optional[int],
                port_number: Optional[int], experiment_name: List[str]):
    """ Subcommand for launching tensorboard with credentials """
    # NOTE: the docstring above is kept verbatim because it may be surfaced as CLI help text.
    #
    # Flow:
    #   1. open a proxy to the tensorboard service and ask it to create a
    #      tensorboard instance for the requested runs (exit code 1 on failure),
    #   2. poll the instance up to TENSORBOARD_TRIES_COUNT times until it is RUNNING,
    #      then hand over to launch_app_with_proxy,
    #   3. exit with code 2 if the instance never reaches RUNNING.
    current_namespace = get_kubectl_current_context_namespace()

    with spinner(Texts.TB_WAITING_MSG) as proxy_spinner, \
            K8sProxy(nauta_app_name=NAUTAAppNames.TENSORBOARD_SERVICE, app_name='tensorboard-service',
                     namespace=current_namespace, port=tensorboard_service_client_port) as proxy:

        tensorboard_service_client = TensorboardServiceClient(
            address=f'http://127.0.0.1:{proxy.tunnel_port}')

        requested_runs = build_tensorboard_run_list(
            exp_list=experiment_name, current_namespace=current_namespace)

        # noinspection PyBroadException
        try:
            tb = tensorboard_service_client.create_tensorboard(requested_runs)
            if tb.invalid_runs:
                list_of_invalid_runs = ', '.join([
                    f'{item.get("owner")}/{item.get("name")}'
                    for item in tb.invalid_runs
                ])
                click.echo(
                    Texts.TB_INVALID_RUNS_MSG.format(
                        invalid_runs=list_of_invalid_runs))
        except Exception as exe:
            err_message = Texts.TB_CREATE_ERROR_MSG
            # Only an "unprocessable entity" error carries a message that is shown to the user as is.
            if hasattr(
                    exe, 'error_code'
            ) and exe.error_code == HTTPStatus.UNPROCESSABLE_ENTITY:  # type: ignore
                err_message = str(exe)
            handle_error(logger,
                         err_message,
                         err_message,
                         add_verbosity_msg=state.verbosity == 0)
            sys.exit(1)

        # Remember the id before polling: get_tensorboard() may return a falsy value, and
        # reading ``tb.id`` from that result on the next iteration would raise AttributeError.
        tb_id = tb.id
        for _ in range(TENSORBOARD_TRIES_COUNT):
            # noinspection PyTypeChecker
            # tb_id is str
            tb = tensorboard_service_client.get_tensorboard(tb_id)
            if not tb:
                continue
            if tb.status == TensorboardStatus.RUNNING:
                proxy_spinner.hide()
                launch_app_with_proxy(k8s_app_name=NAUTAAppNames.TENSORBOARD,
                                      no_launch=no_launch,
                                      namespace=current_namespace,
                                      port=port_number,
                                      app_name=f"tensorboard-{tb.id}")
                return
            logger.warning(
                Texts.TB_WAITING_FOR_TB_MSG.format(
                    tb_id=tb.id, tb_status_value=tb.status.value))
            sleep(TENSORBOARD_CHECK_BACKOFF_SECONDS)

        click.echo(Texts.TB_TIMEOUT_ERROR_MSG)
        sys.exit(2)
예제 #5
0
def upgrade(ctx: click.Context):
    """
    Upgrade users after Nauta upgrade.
    """
    # Every user returned by User.list() that the git repo manager does not know yet
    # is registered there; any failure is reported and ends the process with exit code 1.
    with spinner(text=Texts.UPGRADE_IN_PROGRESS):
        # noinspection PyBroadException
        try:
            # noinspection PyTypeChecker
            all_users: List[User] = User.list()

            with K8sProxy(NAUTAAppNames.GIT_REPO_MANAGER,
                          number_of_retries_wait_for_readiness=60) as proxy:
                grm_client = GitRepoManagerClient(host='127.0.0.1', port=proxy.tunnel_port)

                for nauta_user in all_users:
                    if grm_client.get_user(nauta_user.name):
                        # already present in the git repo manager
                        continue
                    grm_client.add_nauta_user(nauta_user.name)
        except Exception:
            handle_error(logger, Texts.UPGRADE_FAILED, Texts.UPGRADE_FAILED,
                         add_verbosity_msg=ctx.obj.verbosity == 0)
            sys.exit(1)

    click.echo(Texts.UPGRADE_SUCCEEDED)
예제 #6
0
파일: logs.py 프로젝트: yuanbw/nauta
def logs(state: State, workflow_name: str):
    # Prints the log entries of the Argo workflow ``workflow_name`` found in the current
    # kubectl namespace. Entries are read from Elasticsearch through a K8sProxy tunnel;
    # whitespace-only entries are skipped. An unknown workflow ends the process with
    # exit code 0, every handled error with exit code 1.
    try:
        namespace = get_kubectl_current_context_namespace()
        workflow: ArgoWorkflow = ArgoWorkflow.get(namespace=namespace, name=workflow_name)
        if not workflow:
            click.echo(Texts.NOT_FOUND_MSG.format(workflow_name=workflow_name))
            exit(0)

        with K8sProxy(NAUTAAppNames.ELASTICSEARCH) as proxy:
            es_client = K8sElasticSearchClient(host="127.0.0.1", port=proxy.tunnel_port,
                                               verify_certs=False, use_ssl=False)
            log_entries = es_client.get_argo_workflow_logs_generator(workflow=workflow,
                                                                     namespace=namespace,
                                                                     start_date=workflow.started_at)
            for entry in log_entries:
                if entry.content.isspace():
                    continue
                click.echo(f'{entry.date} {entry.pod_name} {entry.content}')
    except K8sProxyCloseError:
        handle_error(logger, Texts.PROXY_CLOSE_LOG_ERROR_MSG, Texts.PROXY_CLOSE_USER_ERROR_MSG)
        exit(1)
    except LocalPortOccupiedError as exe:
        # The same text is used for the log and for the user.
        port_occupied_msg = Texts.LOCAL_PORT_OCCUPIED_ERROR_MSG.format(exception_message=exe.message)
        handle_error(logger, port_occupied_msg, port_occupied_msg)
        exit(1)
    except K8sProxyOpenError:
        handle_error(logger, Texts.PROXY_CREATION_ERROR_MSG, Texts.PROXY_CREATION_ERROR_MSG)
        exit(1)
    except Exception:
        handle_error(logger, Texts.OTHER_ERROR_MSG, Texts.OTHER_ERROR_MSG, add_verbosity_msg=True)
        exit(1)
예제 #7
0
파일: create.py 프로젝트: yuanbw/nauta
def add_user_to_git_repo_manager(username: str, state):
    """
    Register ``username`` in the git repo manager through a K8sProxy tunnel.

    :param username: name of the user passed to ``add_nauta_user``
    :param state: object whose ``verbosity`` decides whether the verbosity hint is added
    Any failure is reported with handle_error and then re-raised to the caller.
    """
    try:
        with K8sProxy(NAUTAAppNames.GIT_REPO_MANAGER, number_of_retries_wait_for_readiness=30) as tunnel:
            client = GitRepoManagerClient(host='127.0.0.1', port=tunnel.tunnel_port)
            client.add_nauta_user(username=username)
    except Exception:
        error_text = Texts.GIT_REPO_MANAGER_ERROR_MSG
        handle_error(logger, error_text, error_text, add_verbosity_msg=state.verbosity == 0)
        raise
예제 #8
0
파일: launcher.py 프로젝트: yuanbw/nauta
def launch_app(k8s_app_name: NAUTAAppNames,
               no_launch: bool = False,
               port: int = None,
               app_name: str = None,
               number_of_retries: int = 0,
               url_end: str = "",
               namespace: str = None):
    """
    Open a K8sProxy tunnel to an application, optionally open it in a browser
    and keep the tunnel alive until wait_for_ctrl_c() returns.

    :param k8s_app_name: application the proxy is created for
    :param no_launch: when True, the browser is not opened
    :param port: requested local port; a notice is printed if the tunnel uses another one
    :param app_name: forwarded to K8sProxy
    :param number_of_retries: forwarded to K8sProxy
    :param url_end: second value formatted into FORWARDED_URL
    :param namespace: forwarded to K8sProxy
    Raises ProxyClosingError when the proxy cannot be closed and LaunchError
    for every other failure.
    """
    try:
        with spinner(text=Texts.LAUNCHING_APP_MSG) as proxy_spinner, \
                K8sProxy(nauta_app_name=k8s_app_name, port=port, app_name=app_name,
                         number_of_retries=number_of_retries, namespace=namespace) as proxy:
            url = FORWARDED_URL.format(proxy.tunnel_port, url_end)

            if k8s_app_name == NAUTAAppNames.INGRESS:
                # The ingress URL carries the user's token taken from the kube config.
                config.load_kube_config()
                bearer_token = configuration.Configuration().api_key.get('authorization')
                raw_token = bearer_token.replace('Bearer ', '')
                url = f'{url}?token={raw_token}'

            if not no_launch:
                if not is_gui_browser_available():
                    click.echo(Texts.NO_WEB_BROWSER_ERROR_MSG)
                else:
                    wait_for_connection(url)
                    webbrowser.open_new(url)
                    proxy_spinner.stop()

            if port and port != proxy.tunnel_port:
                port_notice = Texts.CANNOT_USE_PORT.format(required_port=port,
                                                           random_port=proxy.tunnel_port)
                click.echo(port_notice)

            proxy_spinner.stop()
            click.echo(Texts.GO_TO_MSG.format(url=url))
            click.echo(Texts.PROXY_CREATED_MSG)
            wait_for_ctrl_c()
    except K8sProxyCloseError:
        raise ProxyClosingError(Texts.PROXY_CLOSE_ERROR_MSG.format(app_name=k8s_app_name))
    except LocalPortOccupiedError as exe:
        raise LaunchError(Texts.PROXY_CREATED_EXTENDED_ERROR_MSG.format(app_name=k8s_app_name,
                                                                        reason=exe.message))
    except K8sProxyOpenError:
        open_error_msg = Texts.PROXY_CREATED_ERROR_MSG.format(app_name=k8s_app_name)
        logger.exception(open_error_msg)
        raise LaunchError(open_error_msg)
    except LaunchError:
        # Already the error type callers expect - pass it through untouched.
        raise
    except Exception:
        generic_msg = Texts.WEB_APP_LAUCH_FAIL_MSG
        logger.exception(generic_msg)
        raise LaunchError(generic_msg)
예제 #9
0
def test_set_up_proxy(mocker):
    """Entering K8sProxy starts port forwarding once and checks connection readiness once."""
    manager_module = "util.k8s.k8s_proxy_context_manager"
    popen_mock = mocker.patch("subprocess.Popen")
    forwarding_mock = mocker.patch(f"{manager_module}.kubectl.start_port_forwarding",
                                   return_value=(popen_mock, "1000", "1001"))
    mocker.patch(f"{manager_module}.K8sProxy._wait_for_connection_readiness")
    mocker.patch("psutil.Process")

    with K8sProxy(NAUTAAppNames.ELASTICSEARCH):
        pass

    assert forwarding_mock.call_count == 1
    # noinspection PyProtectedMember,PyUnresolvedReferences
    readiness_calls = K8sProxy._wait_for_connection_readiness.call_count
    assert readiness_calls == 1
예제 #10
0
def test_set_up_proxy_open_failure(mocker):
    """A failing port-forward start surfaces as K8sProxyOpenError, with no kill and no readiness check."""
    manager_module = "util.k8s.k8s_proxy_context_manager"
    forwarding_mock = mocker.patch(f"{manager_module}.kubectl.start_port_forwarding",
                                   side_effect=RuntimeError())
    kill_mock = mocker.patch("subprocess.Popen.kill", side_effect=RuntimeError())
    mocker.patch(f"{manager_module}.K8sProxy._wait_for_connection_readiness")

    with pytest.raises(K8sProxyOpenError), K8sProxy(NAUTAAppNames.ELASTICSEARCH):
        pass

    assert forwarding_mock.call_count == 1
    assert kill_mock.call_count == 0
    # noinspection PyProtectedMember,PyUnresolvedReferences
    readiness_calls = K8sProxy._wait_for_connection_readiness.call_count
    assert readiness_calls == 0
예제 #11
0
파일: docker.py 프로젝트: zhcf/nauta
def delete_images_for_experiment(exp_name: str):
    """
    Delete every tag of the image named after an experiment.

    The tags are listed and deleted through a K8sProxy tunnel opened for the
    docker registry application.
    :param exp_name: name of the experiment; used as the image name
    Errors are not handled here - they propagate to the caller.
    """
    with K8sProxy(NAUTAAppNames.DOCKER_REGISTRY) as proxy:
        # Address of the local end of the tunnel
        registry_address = f"127.0.0.1:{proxy.tunnel_port}"
        image_tags = get_tags_list(server_address=registry_address, image_name=exp_name)

        for image_tag in image_tags:
            delete_tag(server_address=registry_address, image_name=exp_name, tag=image_tag)
예제 #12
0
파일: common.py 프로젝트: hzjai0624/nauta
def _debug_workflow_logs(workflow: ArgoWorkflow, namespace: str):
    """
    Write the logs of an Argo workflow to the debug log.

    Log entries are fetched from Elasticsearch through a K8sProxy tunnel, starting
    at ``workflow.started_at``; whitespace-only entries are skipped.
    :param workflow: workflow whose logs are dumped
    :param namespace: namespace passed to the Elasticsearch logs query
    This is best effort: any failure is logged and never propagated.
    """
    try:
        with K8sProxy(NAUTAAppNames.ELASTICSEARCH) as proxy:
            es_client = K8sElasticSearchClient(host="127.0.0.1", port=proxy.tunnel_port,
                                               verify_certs=False, use_ssl=False)
            start_date = workflow.started_at
            workflow_logs_generator = es_client.get_argo_workflow_logs_generator(workflow=workflow,
                                                                                 namespace=namespace,
                                                                                 start_date=start_date)
            # The same marker line opens and closes the dump.
            log.debug(f'=== Workflow {workflow.name} logs ===')
            for log_entry in workflow_logs_generator:
                if not log_entry.content.isspace():
                    log.debug(f'{log_entry.date} {log_entry.pod_name} {log_entry.content}')
            log.debug(f'=== Workflow {workflow.name} logs ===')
    except Exception:
        # Typo "worklfow" fixed so the message can be found when searching the logs.
        log.exception(f'Failed to get {workflow.name} workflow logs.')
예제 #13
0
def test_set_up_proxy_open_readiness_failure(mocker):
    """A failed readiness check raises K8sProxyOpenError and the forwarding process is killed or terminated."""
    manager_module = "util.k8s.k8s_proxy_context_manager"
    popen_mock = mocker.patch("subprocess.Popen")
    mocker.patch("subprocess.Popen.kill")
    mocker.patch("subprocess.Popen.terminate")
    mocker.patch(f"{manager_module}.kubectl.start_port_forwarding",
                 return_value=(popen_mock, 1000, 1001))
    mocker.patch(f"{manager_module}.K8sProxy._wait_for_connection_readiness",
                 side_effect=TunnelSetupError)
    # psutil reports a process without children
    childless_process = mocker.MagicMock(children=lambda **kwargs: [])
    mocker.patch("psutil.Process", return_value=childless_process)
    mocker.patch("psutil.wait_procs")

    with pytest.raises(K8sProxyOpenError), K8sProxy(NAUTAAppNames.ELASTICSEARCH):
        pass

    # noinspection PyUnresolvedReferences
    kill_calls = subprocess.Popen.kill.call_count
    # noinspection PyUnresolvedReferences
    terminate_calls = subprocess.Popen.terminate.call_count
    assert kill_calls == 1 or terminate_calls == 1
예제 #14
0
def submit_experiment(template: str, name: str = None, run_kind: RunKinds = RunKinds.TRAINING,
                      script_location: str = None, script_parameters: Tuple[str, ...] = None,
                      pack_params: List[Tuple[str, str]] = None, parameter_range: List[Tuple[str, str]] = None,
                      parameter_set: Tuple[str, ...] = None,
                      script_folder_location: str = None,
                      env_variables: List[str] = None,
                      requirements_file: str = None) -> Tuple[List[Run], Dict[str, str], str]:
    """
    Prepare and submit an experiment together with all of its runs.

    Steps: resolve the current namespace, generate the experiment name and the list
    of runs, open a proxy to the docker registry, prepare a local environment for
    every run, optionally ask the user for confirmation (when parameter ranges or
    sets are used), create the Experiment object and finally create and submit
    every run.

    :param template: name of the pack/template used for the experiment
    :param name: optional experiment name passed to the name generator
    :param run_kind: kind of the runs; stored in the ``runKind`` label of each run
    :param script_location: location of the script to run
    :param script_parameters: parameters passed to the script
    :param pack_params: (name, value) pairs passed to the pack and stored as run annotations
    :param parameter_range: (name, value) pairs describing parameter ranges (``-pr``)
    :param parameter_set: parameter sets (``-ps``)
    :param script_folder_location: location of an additional folder with scripts
    :param env_variables: environment variables passed to the environment preparation
    :param requirements_file: requirements file passed to the environment preparation
    :return: tuple of (list of runs, dict mapping run name to submission error text,
             script location as returned by the last environment preparation)
    :raises SubmitExperimentError: when any step of the submission fails
    :raises K8sProxyCloseError: when the registry proxy or the docker tunnel cannot be closed
    """

    script_parameters = script_parameters if script_parameters else ()
    parameter_set = parameter_set if parameter_set else ()
    parameter_range = parameter_range if parameter_range else []

    log.debug("Submit experiment - start")
    try:
        namespace = get_kubectl_current_context_namespace()
        global submitted_namespace
        submitted_namespace = namespace
    except Exception:
        message = Texts.GET_NAMESPACE_ERROR_MSG
        log.exception(message)
        raise SubmitExperimentError(message)

    try:
        with spinner(text=Texts.PREPARING_RESOURCE_DEFINITIONS_MSG):
            experiment_name, labels = generate_exp_name_and_labels(script_name=script_location,
                                                                   namespace=namespace, name=name,
                                                                   run_kind=run_kind)
            runs_list = prepare_list_of_runs(experiment_name=experiment_name, parameter_range=parameter_range,
                                             parameter_set=parameter_set, template_name=template)
    except SubmitExperimentError as exe:
        log.exception(str(exe))
        raise exe
    except Exception:
        message = Texts.SUBMIT_PREPARATION_ERROR_MSG
        log.exception(message)
        raise SubmitExperimentError(message)

    global submitted_experiment
    submitted_experiment = experiment_name

    # Ctrl-C handling
    signal.signal(signal.SIGINT, ctrl_c_handler_for_submit)
    signal.signal(signal.SIGTERM, ctrl_c_handler_for_submit)

    try:
        config = Config()

        # start port forwarding
        # noinspection PyBroadException
        with K8sProxy(NAUTAAppNames.DOCKER_REGISTRY, port=config.local_registry_port) as proxy:
            # Save port that was actually used in configuration
            if proxy.tunnel_port != config.local_registry_port:
                config.local_registry_port = proxy.tunnel_port

            experiment_run_folders = []  # List of local directories used by experiment's runs
            try:
                # run socat if on Windows or Mac OS
                if get_current_os() in (OS.WINDOWS, OS.MACOS):
                    # noinspection PyBroadException
                    try:
                        with spinner(text=Texts.CLUSTER_CONNECTION_MSG):
                            socat.start(proxy.tunnel_port)
                    except Exception:
                        error_msg = Texts.LOCAL_DOCKER_TUNNEL_ERROR_MSG
                        log.exception(error_msg)
                        raise SubmitExperimentError(error_msg)

                cluster_registry_port = get_app_service_node_port(nauta_app_name=NAUTAAppNames.DOCKER_REGISTRY)

                # prepare environments for all experiment's runs
                for experiment_run in runs_list:
                    if script_parameters and experiment_run.parameters:
                        current_script_parameters = script_parameters + experiment_run.parameters
                    elif script_parameters:
                        current_script_parameters = script_parameters
                    elif experiment_run.parameters:
                        current_script_parameters = experiment_run.parameters
                    else:
                        current_script_parameters = ""

                    run_folder, script_location, pod_count = \
                        prepare_experiment_environment(experiment_name=experiment_name,
                                                       run_name=experiment_run.name,
                                                       local_script_location=script_location,
                                                       script_folder_location=script_folder_location,  # noqa: E501
                                                       script_parameters=current_script_parameters,
                                                       pack_type=template, pack_params=pack_params,
                                                       local_registry_port=proxy.tunnel_port,
                                                       cluster_registry_port=cluster_registry_port,
                                                       env_variables=env_variables,
                                                       requirements_file=requirements_file)
                    # Set correct pod count
                    if not pod_count or pod_count < 1:
                        raise SubmitExperimentError('Unable to determine pod count: make sure that values.yaml '
                                                    'file in your pack has podCount field with positive integer value.')
                    experiment_run.pod_count = pod_count

                    experiment_run_folders.append(run_folder)
                    script_name = None
                    if script_location is not None:
                        script_name = os.path.basename(script_location)

                    # Prepend script_name parameter to run description only for display purposes.
                    experiment_run.parameters = script_parameters if not experiment_run.parameters \
                        else experiment_run.parameters + script_parameters
                    if experiment_run.parameters and script_name:
                        experiment_run.parameters = (script_name, ) + experiment_run.parameters
                    elif script_name:
                        experiment_run.parameters = (script_name, )
            except SubmitExperimentError as e:
                log.exception(Texts.ENV_CREATION_ERROR_MSG)
                e.message += f' {Texts.ENV_CREATION_ERROR_MSG}'
                raise
            except Exception as exe:
                # any error in this step breaks execution of this command
                message = Texts.ENV_CREATION_ERROR_MSG
                log.exception(message)
                # just in case - remove folders that were created with a success
                for experiment_run_folder in experiment_run_folders:
                    delete_environment(experiment_run_folder)
                # Abort the submission: without this raise the code below would go on and
                # try to submit runs whose folders have just been deleted.
                raise SubmitExperimentError(message) from exe

            # if ps or pr option is used - first ask whether experiment(s) should be submitted
            if parameter_range or parameter_set:
                click.echo(Texts.CONFIRM_SUBMIT_MSG)
                click.echo(tabulate({RUN_NAME: [run.name for run in runs_list],
                                     RUN_PARAMETERS: ["\n".join(run.parameters) if run.parameters
                                                      else "" for run in runs_list]},
                                    headers=[RUN_NAME, RUN_PARAMETERS], tablefmt="orgtbl"))

                if not click.confirm(Texts.CONFIRM_SUBMIT_QUESTION_MSG, default=True):
                    for experiment_run_folder in experiment_run_folders:
                        delete_environment(experiment_run_folder)
                    exit()

            # create Experiment model
            # TODO template_name & template_namespace should be filled after Template implementation
            parameter_range_spec = [f'-pr {param_name} {param_value}' for param_name, param_value in parameter_range]
            parameter_set_spec = [f'-ps {ps_spec}' for ps_spec in parameter_set]
            experiment_parameters_spec = list(script_parameters) + parameter_range_spec + parameter_set_spec
            experiment = experiments_model.Experiment(name=experiment_name, template_name=template,
                                                      parameters_spec=experiment_parameters_spec,
                                                      template_namespace="template-namespace")

            experiment.create(namespace=namespace, labels=labels)

            # submit runs
            run_errors = {}
            for run, run_folder in zip(runs_list, experiment_run_folders):
                try:
                    run.state = RunStatus.QUEUED
                    with spinner(text=Texts.CREATING_RESOURCES_MSG.format(run_name=run.name)):
                        # Add Run object with runKind label and pack params as annotations.
                        # pack_params defaults to None, so fall back to no annotations instead of
                        # failing every run with "NoneType is not iterable".
                        run.create(namespace=namespace, labels={'runKind': run_kind.value},
                                   annotations={pack_param_name: pack_param_value
                                                for pack_param_name, pack_param_value in (pack_params or [])})
                        submitted_runs.append(run)
                        submit_draft_pack(run_folder, namespace=namespace)
                except Exception as exe:
                    delete_environment(run_folder)
                    try:
                        run.state = RunStatus.FAILED
                        run_errors[run.name] = str(exe)
                        run.update()
                    except Exception as rexe:
                        # update of non-existing run may fail
                        log.debug(Texts.ERROR_DURING_PATCHING_RUN.format(str(rexe)))

            # Delete experiment if no Runs were submitted
            if not submitted_runs:
                click.echo(Texts.SUBMISSION_FAIL_ERROR_MSG)
                delete_k8s_object("experiment", experiment_name)

            # Change experiment status to submitted
            # NOTE(review): this also runs right after the experiment object was deleted above -
            # confirm whether updating a deleted experiment is intended here.
            experiment.state = experiments_model.ExperimentStatus.SUBMITTED
            experiment.update()
    except LocalPortOccupiedError as exe:
        click.echo(exe.message)
        raise SubmitExperimentError(exe.message)
    except K8sProxyCloseError:
        log.exception('Error during closing of a proxy for a {}'.format(NAUTAAppNames.DOCKER_REGISTRY))
        raise K8sProxyCloseError(Texts.PROXY_CLOSE_ERROR_MSG)
    except K8sProxyOpenError:
        error_msg = Texts.PROXY_OPEN_ERROR_MSG
        log.exception(error_msg)
        raise SubmitExperimentError(error_msg)
    except SubmitExperimentError:
        raise
    except Exception as exe:
        error_msg = Texts.SUBMIT_OTHER_ERROR_MSG
        log.exception(error_msg)
        raise SubmitExperimentError(error_msg) from exe
    finally:
        with spinner(text=Texts.CLUSTER_CONNECTION_CLOSING_MSG):
            # noinspection PyBroadException
            try:
                socat.stop()
            except Exception:
                log.exception("Error during closing of a proxy for a local docker-host tunnel")
                raise K8sProxyCloseError(Texts.DOCKER_TUNNEL_CLOSE_ERROR_MSG)
        # remove semaphores from all exp folders
        remove_sempahore(experiment_name)

    log.debug("Submit - finish")
    return runs_list, run_errors, script_location
예제 #15
0
파일: common.py 프로젝트: pnijhara/nauta
def get_logs(operation_name: str, start_date: str, end_date: str, match: str,
             output: bool, pager: bool, follow: bool):
    """
    Show logs for a given model export operation.

    Exactly one of ``operation_name`` and ``match`` has to be given. The matching
    Argo workflows are listed in the current namespace and their logs are read from
    Elasticsearch through a K8sProxy tunnel. Every handled error ends the process
    with exit code 1.

    :param operation_name: exact name of the operation
    :param match: name filter used instead of an exact name
    :param start_date: start of the logs; when empty, each workflow's own start time is used
    :param end_date: end of the logs, passed to the logs query
    :param output: when truthy, logs are saved to files instead of being printed
    :param pager: passed to print_logs
    :param follow: follow the logs; ignored when ``output`` is set
    """
    # check whether we have operations with a given name
    if operation_name and match:
        handle_error(user_msg=Texts.NAME_M_BOTH_GIVEN_ERROR_MSG)
        exit(1)
    elif not operation_name and not match:
        handle_error(user_msg=Texts.NAME_M_NONE_GIVEN_ERROR_MSG)
        exit(1)

    try:
        with K8sProxy(NAUTAAppNames.ELASTICSEARCH) as proxy:
            es_client = K8sElasticSearchClient(host="127.0.0.1",
                                               port=proxy.tunnel_port,
                                               verify_certs=False,
                                               use_ssl=False)
            namespace = get_kubectl_current_context_namespace()
            if match:
                operation_name = match
                name_filter = match
            else:
                name_filter = f'^{operation_name}$'
            workflows = ArgoWorkflow.list(namespace=namespace,
                                          name_filter=name_filter)
            if not workflows:
                raise ValueError(
                    f'Operation with given name: {operation_name} does not '
                    f'exists in namespace {namespace}.')

            follow_logs = bool(follow and not output)

            if output and len(workflows) > 1:
                click.echo(Texts.MORE_EXP_LOGS_MESSAGE)

            for workflow in workflows:
                # Resolve the start date separately for every workflow. Overwriting
                # ``start_date`` here would make the first workflow's start time leak
                # into all following workflows.
                workflow_start_date = start_date if start_date else workflow.started_at

                ops_logs_generator = es_client.get_argo_workflow_logs_generator(
                    workflow=workflow,
                    namespace=namespace,
                    start_date=workflow_start_date,
                    end_date=end_date,
                    follow=follow_logs)

                if output:
                    save_logs_to_file(logs_generator=ops_logs_generator,
                                      instance_name=workflow.name,
                                      instance_type="operation")
                else:
                    if len(workflows) > 1:
                        click.echo(f'Operation : {workflow.name}')
                    print_logs(run_logs_generator=ops_logs_generator,
                               pager=pager)

    except K8sProxyCloseError:
        handle_error(logger, Texts.PROXY_CLOSE_LOG_ERROR_MSG,
                     Texts.PROXY_CLOSE_LOG_ERROR_MSG)
        exit(1)
    except LocalPortOccupiedError as exe:
        port_occupied_msg = Texts.LOCAL_PORT_OCCUPIED_ERROR_MSG.format(
            exception_message=exe.message)
        handle_error(logger, port_occupied_msg, port_occupied_msg)
        exit(1)
    except K8sProxyOpenError:
        handle_error(logger, Texts.PROXY_CREATION_ERROR_MSG,
                     Texts.PROXY_CREATION_ERROR_MSG)
        exit(1)
    except ValueError:
        # Format the template once, with a single keyword. The log and user variants used
        # to pass different keywords (operation_name / experiment_name), so one of the two
        # calls raised KeyError for any template containing a placeholder.
        not_found_msg = Texts.OPERATION_NOT_EXISTS_ERROR_MSG.format(
            operation_name=operation_name)
        handle_error(logger, not_found_msg, not_found_msg)
        exit(1)
    except Exception:
        handle_error(logger, Texts.LOGS_GET_OTHER_ERROR_MSG,
                     Texts.LOGS_GET_OTHER_ERROR_MSG)
        exit(1)
예제 #16
0
def get_logs(experiment_name: str, min_severity: SeverityLevel, start_date: str,
             end_date: str, pod_ids: str, pod_status: PodStatus, match: str, output: bool, pager: bool, follow: bool,
             runs_kinds: List[RunKinds], instance_type: str):
    """
    Show logs for a given experiment.

    Exactly one of ``experiment_name`` and ``match`` has to be given. The matching
    runs are listed in the current namespace and their logs are read from
    Elasticsearch through a K8sProxy tunnel. Every handled error ends the process
    with exit code 1.

    :param experiment_name: exact name of the experiment
    :param min_severity: key looked up in SeverityLevel; no severity filter when empty
    :param start_date: start of the logs; when empty, each run's creation timestamp is used
    :param end_date: end of the logs, passed to the logs query
    :param pod_ids: comma separated list of pod ids; no pod filter when empty
    :param pod_status: key looked up in PodStatus; no status filter when empty
    :param match: name filter used instead of an exact name
    :param output: when truthy, logs are saved to files instead of being printed
    :param pager: passed to print_logs
    :param follow: follow the logs; ignored when ``output`` is set
    :param runs_kinds: kinds of runs taken into account when listing runs
    :param instance_type: name of the instance type formatted into user messages
    """
    # check whether we have runs with a given name
    if experiment_name and match:
        handle_error(user_msg=Texts.NAME_M_BOTH_GIVEN_ERROR_MSG.format(instance_type=instance_type))
        exit(1)
    elif not experiment_name and not match:
        handle_error(user_msg=Texts.NAME_M_NONE_GIVEN_ERROR_MSG.format(instance_type=instance_type))
        exit(1)

    try:
        with K8sProxy(NAUTAAppNames.ELASTICSEARCH) as proxy:
            es_client = K8sElasticSearchClient(host="127.0.0.1", port=proxy.tunnel_port,
                                               verify_certs=False, use_ssl=False)
            namespace = get_kubectl_current_context_namespace()
            if match:
                experiment_name = match
                name_filter = match
            else:
                name_filter = f'^{experiment_name}$'
            runs = Run.list(namespace=namespace, name_filter=name_filter, run_kinds_filter=runs_kinds)
            if not runs:
                raise ValueError(f'Run with given name: {experiment_name} does not exists in namespace {namespace}.')

            pod_ids = pod_ids.split(',') if pod_ids else None
            min_severity = SeverityLevel[min_severity] if min_severity else None
            pod_status = PodStatus[pod_status] if pod_status else None
            follow_logs = bool(follow and not output)

            if output and len(runs) > 1:
                click.echo(Texts.MORE_EXP_LOGS_MESSAGE)

            for run in runs:
                # Resolve the start date separately for every run. Overwriting ``start_date``
                # here would make the first run's creation timestamp leak into all following runs.
                run_start_date = start_date if start_date else run.creation_timestamp

                run_logs_generator = es_client.get_experiment_logs_generator(run=run, namespace=namespace,
                                                                             min_severity=min_severity,
                                                                             start_date=run_start_date,
                                                                             end_date=end_date,
                                                                             pod_ids=pod_ids, pod_status=pod_status,
                                                                             follow=follow_logs)

                if output:
                    save_logs_to_file(run=run, run_logs_generator=run_logs_generator, instance_type=instance_type)
                else:
                    if len(runs) > 1:
                        click.echo(f'Experiment : {run.name}')
                    print_logs(run_logs_generator=run_logs_generator, pager=pager)

    except K8sProxyCloseError:
        handle_error(logger, Texts.PROXY_CLOSE_LOG_ERROR_MSG, Texts.PROXY_CLOSE_USER_ERROR_MSG)
        exit(1)
    except LocalPortOccupiedError as exe:
        port_occupied_msg = Texts.LOCAL_PORT_OCCUPIED_ERROR_MSG.format(exception_message=exe.message)
        handle_error(logger, port_occupied_msg, port_occupied_msg)
        exit(1)
    except K8sProxyOpenError:
        handle_error(logger, Texts.PROXY_CREATION_ERROR_MSG, Texts.PROXY_CREATION_ERROR_MSG)
        exit(1)
    except ValueError:
        not_found_msg = Texts.EXPERIMENT_NOT_EXISTS_ERROR_MSG.format(experiment_name=experiment_name,
                                                                     instance_type=instance_type.capitalize())
        handle_error(logger, not_found_msg, not_found_msg)
        exit(1)
    except Exception:
        other_error_msg = Texts.LOGS_GET_OTHER_ERROR_MSG.format(instance_type=instance_type)
        handle_error(logger, other_error_msg, other_error_msg)
        exit(1)
예제 #17
0
def cancel(state: State,
           name: str,
           match: str,
           purge: bool,
           pod_ids: str,
           pod_status: str,
           listed_runs_kinds: List[RunKinds] = None):
    """
    Cancels chosen experiments based on a name provided as a parameter.

    Exactly one of ``name`` and ``match`` has to be given. When ``pod_ids``
    or ``pod_status`` is set, the work is delegated to ``cancel_pods_mode``
    and the process exits with code 0 afterwards. Otherwise the matching
    runs are listed, the user is asked for a confirmation and the runs are
    cancelled experiment by experiment (or purged, when ``purge`` is set).
    The process exits with a non-zero code on errors and when at least one
    run could not be cancelled.

    :param state: CLI state object; it is not read by this function
    :param name: name of an experiment (or of a run) to be cancelled
    :param match: pattern used as a run name filter when ``name`` is not given
    :param purge: if set, all listed runs are purged regardless of their state
    :param pod_ids: forwarded to ``cancel_pods_mode``
    :param pod_status: forwarded to ``cancel_pods_mode``
    :param listed_runs_kinds: run kinds taken into account; defaults to
        ``RunKinds.TRAINING`` and ``RunKinds.JUPYTER``
    """
    if not listed_runs_kinds:
        listed_runs_kinds = [RunKinds.TRAINING, RunKinds.JUPYTER]

    # name and match are mutually exclusive, but one of them is required
    if name and match:
        handle_error(user_msg=Texts.NAME_M_BOTH_GIVEN_ERROR_MSG)
        sys.exit(1)

    if not name and not match:
        handle_error(user_msg=Texts.NAME_M_NONE_GIVEN_ERROR_MSG)
        sys.exit(1)

    current_namespace = get_current_namespace()

    if pod_ids or pod_status:
        if not name:
            name = match

        cancel_pods_mode(namespace=current_namespace,
                         run_name=name,
                         pod_ids=pod_ids,
                         pod_status=pod_status)
        sys.exit(0)

    search_for_experiment = False
    exp_to_be_cancelled = None

    if name:
        exp_to_be_cancelled = Experiment.get(namespace=current_namespace,
                                             name=name)
        exp_to_be_cancelled_kind = RunKinds(exp_to_be_cancelled.metadata['labels'].get('runKind')) \
            if exp_to_be_cancelled else None
        # an experiment of a kind that is not listed is treated as not found
        exp_to_be_cancelled = exp_to_be_cancelled if exp_to_be_cancelled_kind in listed_runs_kinds else None

        if exp_to_be_cancelled:
            search_for_experiment = True
        else:
            # no such experiment - anchor the name so that the run name
            # filter below matches it as a whole
            # (presumably name_filter is a regular expression - TODO confirm)
            name = f"^{name}$"
    else:
        name = match

    list_of_all_runs = None

    list_of_applicable_states = [RunStatus.QUEUED, RunStatus.RUNNING]

    if purge:
        list_of_applicable_states.extend(
            [RunStatus.FAILED, RunStatus.COMPLETE, RunStatus.CANCELLED])

    try:
        if search_for_experiment:
            list_of_all_runs = Run.list(namespace=current_namespace,
                                        exp_name_filter=[name],
                                        run_kinds_filter=listed_runs_kinds)
        else:
            list_of_all_runs = Run.list(namespace=current_namespace,
                                        name_filter=name,
                                        run_kinds_filter=listed_runs_kinds)
    except Exception:
        list_runs_error_msg = Texts.LIST_RUNS_ERROR_MSG.format(
            experiment_name_plural=experiment_name_plural)
        handle_error(logger, list_runs_error_msg, list_runs_error_msg)
        sys.exit(1)

    # Handle cancellation of experiments with no associated Runs
    if exp_to_be_cancelled and not list_of_all_runs:
        cancel_uninitialized_experiment(experiment=exp_to_be_cancelled,
                                        namespace=current_namespace,
                                        purge=purge)

    if not list_of_all_runs:
        handle_error(user_msg=Texts.LACK_OF_EXPERIMENTS_ERROR_MSG.format(
            experiment_name_plural=experiment_name_plural,
            experiment_name=experiment_name))
        sys.exit(1)
    elif not purge and not any(
            run.state in [RunStatus.QUEUED, RunStatus.RUNNING]
            for run in list_of_all_runs):
        handle_error(
            user_msg=Texts.LACK_OF_EXP_TO_BE_CANCELLED_ERROR_MSG.format(
                experiment_name_plural=experiment_name_plural))
        sys.exit(1)

    # word used in all the list headers below; it depends only on what
    # kind of objects is being processed
    done_word = (Texts.DELETE_OPERATION["deleted"]
                 if experiment_name_plural == 'pods' else
                 Texts.CANCEL_OPERATION["cancelled"])

    list_of_runs_to_be_deleted: List[Run] = []
    names_of_cancelled_runs: List[str] = []

    if not purge:
        # check whether we have at least one experiment in state other than CANCELLED
        for run in list_of_all_runs:
            if run.state in list_of_applicable_states:
                list_of_runs_to_be_deleted.append(run)
            else:
                names_of_cancelled_runs.append(run.name)

        if not list_of_runs_to_be_deleted:
            # defensive check - the guard above already exits when no run
            # is queued or running
            handle_error(
                user_msg=Texts.EXPERIMENTS_ALREADY_CANCELLED_ERROR_MSG.format(
                    experiment_name_plural=experiment_name_plural,
                    operation_word=done_word))
            sys.exit(1)
        elif len(list_of_runs_to_be_deleted) != len(list_of_all_runs):
            # only a part of the runs can be cancelled - show both groups
            click.echo(
                Texts.ALREADY_CANCELLED_LIST_HEADER.format(
                    experiment_name_plural=experiment_name_plural,
                    operation_word=done_word))
            for cancelled_run_name in names_of_cancelled_runs:
                click.echo(f"     - {cancelled_run_name}")
            click.echo(
                Texts.CAN_BE_CANCELLED_LIST_HEADER.format(
                    experiment_name_plural=experiment_name_plural,
                    operation_word=done_word))
            for run in list_of_runs_to_be_deleted:
                click.echo(f"     - {run.name}")
        else:
            click.echo(
                Texts.WILL_BE_CANCELLED_LIST_HEADER.format(
                    experiment_name_plural=experiment_name_plural,
                    operation_word=done_word))
            for run in list_of_runs_to_be_deleted:
                click.echo(f"     - {run.name}")
    else:
        # in purge mode every listed run is removed, whatever its state is
        list_of_runs_to_be_deleted = list_of_all_runs
        click.echo(
            Texts.WILL_BE_PURGED_LIST_HEADER.format(
                experiment_name_plural=experiment_name_plural,
                operation_word=done_word))
        for run in list_of_runs_to_be_deleted:
            click.echo(f"     - {run.name}")

    action_word = (Texts.DELETE_OPERATION["deletion"]
                   if experiment_name_plural == 'pods' else
                   Texts.CANCEL_OPERATION["cancellation"])

    if not click.confirm(
            Texts.CONFIRM_CANCEL_MSG.format(
                experiment_name_plural=experiment_name_plural,
                operation_word=action_word)):
        handle_error(user_msg=Texts.CANCELLATION_ABORTED_MSG.format(
            experiment_name_plural=experiment_name_plural,
            operation_word=action_word))
        sys.exit(0)

    # group runs by experiments
    exp_with_runs: defaultdict = defaultdict(list)

    for run in list_of_runs_to_be_deleted:
        exp_with_runs[run.experiment_name].append(run)

    deleted_runs = []
    not_deleted_runs = []

    if purge:
        # Connect to elasticsearch in order to purge run logs
        try:
            with K8sProxy(NAUTAAppNames.ELASTICSEARCH) as proxy:
                es_client = K8sElasticSearchClient(
                    host="127.0.0.1",
                    port=proxy.tunnel_port,
                    verify_certs=False,
                    use_ssl=False,
                    with_admin_privledges=is_current_user_administrator())
                for exp_name, run_list in exp_with_runs.items():
                    try:
                        exp_del_runs, exp_not_del_runs = purge_experiment(
                            exp_name=exp_name,
                            runs_to_purge=run_list,
                            namespace=current_namespace,
                            k8s_es_client=es_client)
                        deleted_runs.extend(exp_del_runs)
                        not_deleted_runs.extend(exp_not_del_runs)
                    except Exception:
                        # a failure of one experiment must not stop
                        # processing of the remaining ones
                        handle_error(logger, Texts.OTHER_CANCELLING_ERROR_MSG)
                        not_deleted_runs.extend(run_list)
        except K8sProxyCloseError:
            handle_error(logger, Texts.PROXY_CLOSING_ERROR_LOG_MSG,
                         Texts.PROXY_CLOSING_ERROR_USER_MSG)
            sys.exit(1)
        except LocalPortOccupiedError as exe:
            handle_error(
                logger, Texts.PORT_OCCUPIED_ERROR_LOG_MSG,
                Texts.PORT_OCCUPIED_ERROR_USER_MSG.format(
                    exception_message=exe.message))
            sys.exit(1)
        except K8sProxyOpenError:
            handle_error(logger, Texts.PROXY_OPEN_ERROR_MSG,
                         Texts.PROXY_OPEN_ERROR_MSG)
            sys.exit(1)
    else:
        for exp_name, run_list in exp_with_runs.items():
            try:
                exp_del_runs, exp_not_del_runs = cancel_experiment(
                    exp_name=exp_name,
                    runs_to_cancel=run_list,
                    namespace=current_namespace)
                deleted_runs.extend(exp_del_runs)
                not_deleted_runs.extend(exp_not_del_runs)
            except Exception:
                # a failure of one experiment must not stop processing of
                # the remaining ones
                handle_error(logger, Texts.OTHER_CANCELLING_ERROR_MSG)
                not_deleted_runs.extend(run_list)

    if deleted_runs:
        click.echo(
            Texts.SUCCESSFULLY_CANCELLED_LIST_HEADER.format(
                experiment_name_plural=experiment_name_plural,
                operation_word=done_word))
        for run in deleted_runs:
            click.echo(f"     - {run.name}")

    if not_deleted_runs:
        click.echo(
            Texts.FAILED_TO_CANCEL_LIST_HEADER.format(
                experiment_name_plural=experiment_name_plural,
                operation_word=done_word))
        for run in not_deleted_runs:
            click.echo(f"     - {run.name}")
        sys.exit(1)
예제 #18
0
파일: create.py 프로젝트: hzjai0624/nauta
def create(state: State, username: str, list_only: bool, filename: str):
    """
    Adds a new user with a name given as a parameter.

    The user name is validated and checked against already existing users,
    then the user's helm chart is installed, the user is added to the git
    repo manager and a kubeconfig for the user is generated. The kubeconfig
    is either printed or written to a file. The process exits with code 1
    on every error that prevents the user or the config from being created.

    :param state: CLI state object; only its ``verbosity`` is read here
    :param username: name of a new user
    :param list_only: if set, the generated kubeconfig is printed instead of
        being written to a file; cannot be combined with ``filename``
    :param filename: path of a file the kubeconfig is written to; when empty,
        ``DEFAULT_FILENAME`` formatted with the user name is used
    """

    if list_only and filename:
        handle_error(user_msg=Texts.F_L_OPTIONS_EXCLUSION_ERROR_MSG)
        sys.exit(1)

    try:
        try:
            validate_user_name(username)
        except ValueError as exe:
            handle_error(
                logger,
                Texts.NAME_VALIDATION_ERROR_MSG.format(username=username),
                str(exe),
                add_verbosity_msg=state.verbosity == 0)
            sys.exit(1)

        user_state = check_users_presence(username)

        if user_state == UserState.ACTIVE:
            handle_error(
                logger,
                Texts.USER_ALREADY_EXISTS_ERROR_MSG.format(username=username),
                Texts.USER_ALREADY_EXISTS_ERROR_MSG.format(username=username))
            sys.exit(1)

        if user_state == UserState.TERMINATING:
            handle_error(
                logger,
                Texts.USER_BEING_REMOVED_ERROR_MSG.format(username=username),
                Texts.USER_BEING_REMOVED_ERROR_MSG.format(username=username))
            sys.exit(1)

    except Exception:
        # SystemExit raised above is not an Exception subclass, so only
        # unexpected verification failures end up here
        handle_error(
            logger,
            Texts.USER_VERIFICATION_ERROR_MSG.format(username=username),
            Texts.USER_VERIFICATION_ERROR_MSG.format(username=username),
            add_verbosity_msg=state.verbosity == 0)
        sys.exit(1)

    try:
        with spinner(text=Texts.CREATING_USER_PROGRESS_MSG.format(
                username=username)):
            chart_location = os.path.join(Config().config_path,
                                          ADD_USER_CHART_NAME)

            nauta_config_map = NAUTAConfigMap()

            tiller_location = nauta_config_map.image_tiller
            tensorboard_service_location = nauta_config_map.image_tensorboard_service

            add_user_command = [
                "helm", "install", "--wait", "--namespace", username, "--name",
                username, chart_location, "--set", "global.nauta=nauta",
                "--set", f"username={username}", "--set",
                f"TillerImage={tiller_location}", "--set",
                f"TensorboardServiceImage={tensorboard_service_location}"
            ]
            # the config directory is put first on PATH for the helm call
            env = os.environ.copy()
            env['PATH'] = Config().config_path + os.pathsep + env['PATH']
            # NOTE(review): the command is joined into one string and run
            # with shell=True, so a chart path or an image name containing
            # spaces or shell metacharacters would break it - confirm that
            # these values are always safe.
            _, err_code, log_output = execute_system_command(
                ' '.join(add_user_command), env=env, shell=True)

            if err_code:
                handle_error(logger,
                             log_output,
                             Texts.USER_ADD_ERROR_MSG.format(username=username),
                             add_verbosity_msg=state.verbosity == 0)

                # try to clean up whatever has been created so far
                if not delete_user(username):
                    handle_error(user_msg=Texts.REMOVE_USER_ERROR_MSG.format(
                        username=username))
                sys.exit(1)

            # a missing token or certificate is reported, but it does not
            # stop the command - kubeconfig is then generated with an empty
            # value
            try:
                users_password = get_users_token(username)
            except Exception:
                handle_error(logger,
                             Texts.PASSWORD_GATHER_ERROR_MSG,
                             Texts.PASSWORD_GATHER_ERROR_MSG,
                             add_verbosity_msg=state.verbosity == 0)
                users_password = ""

            try:
                cert = get_certificate(username)
            except Exception:
                handle_error(logger,
                             Texts.CERT_GATHER_ERROR_MSG,
                             Texts.CERT_GATHER_ERROR_MSG,
                             add_verbosity_msg=state.verbosity == 0)
                cert = ""

            try:
                with K8sProxy(
                        NAUTAAppNames.GIT_REPO_MANAGER,
                        number_of_retries_wait_for_readiness=60) as proxy:
                    grm_client = GitRepoManagerClient(host='127.0.0.1',
                                                      port=proxy.tunnel_port)
                    grm_client.add_nauta_user(username=username)
            except Exception:
                handle_error(logger,
                             Texts.GIT_REPO_MANAGER_ERROR_MSG,
                             Texts.GIT_REPO_MANAGER_ERROR_MSG,
                             add_verbosity_msg=state.verbosity == 0)
                sys.exit(1)

    except Exception:
        handle_error(logger,
                     Texts.USER_ADD_ERROR_MSG.format(username=username),
                     Texts.USER_ADD_ERROR_MSG.format(username=username),
                     add_verbosity_msg=state.verbosity == 0)
        if not delete_user(username):
            handle_error(user_msg=Texts.REMOVE_USER_ERROR_MSG.format(
                username=username))
        sys.exit(1)

    if is_user_created(username, 90):
        click.echo(Texts.USER_CREATION_SUCCESS_MSG.format(username=username))
    else:
        # if during 90 seconds a user hasn't been created - app displays information about it
        # but doesn't stop processing the command - config file generated here may be useful later
        # when user has been created
        click.echo(Texts.USER_NOT_READY_ERROR_MSG.format(username=username))

    try:
        kubeconfig = generate_kubeconfig(username, username,
                                         get_kubectl_host(), users_password,
                                         cert)
    except Exception:
        handle_error(logger,
                     Texts.CONFIG_CREATION_ERROR_MSG,
                     Texts.CONFIG_CREATION_ERROR_MSG,
                     add_verbosity_msg=state.verbosity == 0)
        sys.exit(1)

    if list_only:
        click.echo(Texts.LIST_ONLY_HEADER)
        click.echo(kubeconfig)
    else:
        if not filename:
            filename = DEFAULT_FILENAME.format(username)
        try:
            with open(filename, "w") as file:
                file.write(kubeconfig)

            click.echo(Texts.CONFIG_SAVE_SUCCESS_MSG.format(filename=filename))
        except Exception:
            handle_error(logger,
                         Texts.CONFIG_SAVE_FAIL_MSG,
                         Texts.CONFIG_SAVE_FAIL_MSG,
                         add_verbosity_msg=state.verbosity == 0)
            # the config could not be saved - print it, so it is not lost
            click.echo(Texts.CONFIG_SAVE_FAIL_INSTRUCTIONS_MSG)
            click.echo(kubeconfig)
            sys.exit(1)