Пример #1
0
def purge_user(username: str):
    """
    Remove all system artifacts that belonged to a deleted user.
    K8s objects themselves are removed together with the user's namespace.
    :param username: name of the user whose artifacts should be removed
    Raises an exception when any problem is detected during the removal.
    """
    try:
        # wipe the user's experiment logs kept in elasticsearch
        with spinner(text=TextsDel.DELETION_DELETING_USERS_EXPERIMENTS):
            elasticsearch_host = (f'{get_kubectl_host(with_port=True)}'
                                  f'/api/v1/namespaces/nauta/services/nauta-elasticsearch:nauta/proxy')
            elasticsearch = K8sElasticSearchClient(
                host=elasticsearch_host,
                verify_certs=False,
                use_ssl=True,
                headers={'Authorization': get_api_key()})
            elasticsearch.delete_logs_for_namespace(username)

        # drop the user's repository from the git repo manager
        with k8s_proxy_context_manager.K8sProxy(NAUTAAppNames.GIT_REPO_MANAGER) as proxy, \
                spinner(text=TextsDel.DELETION_DELETING_USERS_REPOSITORY):
            repo_manager = GitRepoManagerClient(host='127.0.0.1',
                                                port=proxy.tunnel_port)
            repo_manager.delete_nauta_user(username=username)
    except K8sProxyCloseError:
        logger.exception("Error during closing of a proxy.")
        raise
    except Exception:
        logger.exception(f"Error during removal of {username} user data")
        raise
Пример #2
0
def delete_user(username: str):
    """
    Delete the given user together with all objects owned by him/her.

    :param username: name of the user to be removed
    Raises an exception in case of any errors.
    """
    # each removal step is a (progress message, action) pair executed in order
    removal_steps = (
        (TextsDel.DELETION_DELETING_NAMESPACE,
         lambda: delete_namespace(username)),
        (TextsDel.DELETION_DELETING_USERS_OBJECTS,
         lambda: delete_helm_release(username, purge=True)),
    )
    for progress_text, perform_step in removal_steps:
        with spinner(text=progress_text):
            perform_step()
Пример #3
0
def tensorboard(state: State, no_launch: bool,
                tensorboard_service_client_port: Optional[int],
                port_number: Optional[int], experiment_name: List[str]):
    """ Subcommand for launching tensorboard with credentials """
    current_namespace = get_kubectl_current_context_namespace()

    with spinner(Texts.TB_WAITING_MSG) as proxy_spinner, \
            K8sProxy(nauta_app_name=NAUTAAppNames.TENSORBOARD_SERVICE, app_name='tensorboard-service',
                     namespace=current_namespace, port=tensorboard_service_client_port) as proxy:

        tensorboard_service_client = TensorboardServiceClient(
            address=f'http://127.0.0.1:{proxy.tunnel_port}')

        requested_runs = build_tensorboard_run_list(
            exp_list=experiment_name, current_namespace=current_namespace)

        # noinspection PyBroadException
        try:
            tb = tensorboard_service_client.create_tensorboard(requested_runs)
            if tb.invalid_runs:
                list_of_invalid_runs = ', '.join([
                    f'{item.get("owner")}/{item.get("name")}'
                    for item in tb.invalid_runs
                ])
                click.echo(
                    Texts.TB_INVALID_RUNS_MSG.format(
                        invalid_runs=list_of_invalid_runs))
        except Exception as exe:
            err_message = Texts.TB_CREATE_ERROR_MSG
            # the service reports validation problems with 422 - show its own
            # message in that case instead of the generic one
            if hasattr(
                    exe, 'error_code'
            ) and exe.error_code == HTTPStatus.UNPROCESSABLE_ENTITY:  # type: ignore
                err_message = str(exe)
            handle_error(logger,
                         err_message,
                         err_message,
                         add_verbosity_msg=state.verbosity == 0)
            sys.exit(1)

        # BUG FIX: remember the id of the created instance. The original code
        # overwrote `tb` with the result of get_tensorboard(); when that call
        # returned None, the next loop iteration crashed on `tb.id`
        # (AttributeError on None) instead of retrying.
        tensorboard_id = tb.id
        for _ in range(TENSORBOARD_TRIES_COUNT):
            tb = tensorboard_service_client.get_tensorboard(tensorboard_id)
            if not tb:
                # instance not reported yet - back off before the next poll
                # instead of busy-looping
                sleep(TENSORBOARD_CHECK_BACKOFF_SECONDS)
                continue
            if tb.status == TensorboardStatus.RUNNING:
                proxy_spinner.hide()
                launch_app_with_proxy(k8s_app_name=NAUTAAppNames.TENSORBOARD,
                                      no_launch=no_launch,
                                      namespace=current_namespace,
                                      port=port_number,
                                      app_name=f"tensorboard-{tensorboard_id}")
                return
            logger.warning(
                Texts.TB_WAITING_FOR_TB_MSG.format(
                    tb_id=tb.id, tb_status_value=tb.status.value))
            sleep(TENSORBOARD_CHECK_BACKOFF_SECONDS)

        # instance never reached RUNNING within the allotted tries
        click.echo(Texts.TB_TIMEOUT_ERROR_MSG)
        sys.exit(2)
Пример #4
0
def save_logs_to_file(logs_generator: Generator[LogEntry, None, None],
                      instance_name: str, instance_type: str):
    """
    Store every log entry yielded by *logs_generator* in <instance_name>.log.

    The user is asked for confirmation first (skipped when --force is set);
    a dedicated warning message is shown when the target file already exists.
    """
    filename = instance_name + ".log"
    # pick the confirmation text depending on whether the file already exists
    message_template = (Texts.LOGS_STORING_CONF_FILE_EXISTS
                        if os.path.isfile(filename)
                        else Texts.LOGS_STORING_CONF)
    confirmation_message = message_template.format(
        filename=filename,
        instance_name=instance_name,
        instance_type=instance_type)

    user_agreed = (click.get_current_context().obj.force
                   or click.confirm(confirmation_message, default=True))
    if not user_agreed:
        click.echo(Texts.LOGS_STORING_CANCEL_MESSAGE)
        return

    try:
        with open(filename, 'w') as log_file, \
                spinner(spinner=NctlSpinner,
                        text=Texts.SAVING_LOGS_TO_FILE_PROGRESS_MSG,
                        color=SPINNER_COLOR):
            for entry in logs_generator:
                # whitespace-only entries carry no information - skip them
                if entry.content.isspace():
                    continue
                log_file.write(
                    f'{format_log_date(entry.date)} {entry.pod_name} {entry.content}'
                )
        click.echo(Texts.LOGS_STORING_FINAL_MESSAGE)
    except Exception:
        handle_error(logger, Texts.LOGS_STORING_ERROR,
                     Texts.LOGS_STORING_ERROR)
        exit(1)
Пример #5
0
def status(ctx: click.Context, username: str):
    """
    Returns status of a model

    :param username; if checked - searches for model for a certain user
    """
    try:
        # a given username takes precedence over the current context namespace
        namespace = username if username else get_kubectl_current_context_namespace()

        with spinner(text=Texts.LOAD_DATA_MSG):
            # filtering out workflows used to build images with training jobs
            raw_workflows = ArgoWorkflow.list(namespace=namespace,
                                              label_selector="type!=build-workflow")
            workflows: List[ArgoWorkflow.ArgoWorkflowCliModel] = \
                [item.cli_representation for item in raw_workflows]

        rendered_table = tabulate(workflows,
                                  headers=MODEL_HEADERS,
                                  tablefmt=TBLT_TABLE_FORMAT)
        click.echo(rendered_table)
    except Exception:
        handle_error(logger,
                     Texts.OTHER_ERROR_MSG,
                     Texts.OTHER_ERROR_MSG,
                     add_verbosity_msg=True)
        exit(1)
Пример #6
0
def save_logs_to_file(run: Run, run_logs_generator: Generator[LogEntry, None, None], instance_type: str):
    """
    Save logs of a given Run to a <run name>.log file after user's confirmation.

    :param run: Run whose logs should be stored
    :param run_logs_generator: generator yielding the log entries to save
    :param instance_type: type of the instance - used in confirmation messages
    """
    filename = run.name + '.log'
    confirmation_message = Texts.LOGS_STORING_CONFIRMATION.format(filename=filename,
                                                                  experiment_name=run.name,
                                                                  instance_type=instance_type)
    if os.path.isfile(filename):
        # warn the user that an existing file will be overwritten
        confirmation_message = Texts.LOGS_STORING_CONFIRMATION_FILE_EXISTS.format(filename=filename,
                                                                                  experiment_name=run.name,
                                                                                  instance_type=instance_type)

    if click.confirm(confirmation_message, default=True):
        try:
            with open(filename, 'w') as file, spinner(spinner=NctlSpinner,
                                                      text=Texts.SAVING_LOGS_TO_FILE_PROGRESS_MSG, color=SPINNER_COLOR):
                for log_entry in run_logs_generator:
                    # skip whitespace-only entries - they carry no information
                    if not log_entry.content.isspace():
                        formatted_date = format_log_date(log_entry.date)
                        file.write(f'{formatted_date} {log_entry.pod_name} {log_entry.content}')
            click.echo(Texts.LOGS_STORING_FINAL_MESSAGE)
        except Exception as exe:
            # BUG FIX: Python 3 exceptions have no `.message` attribute -
            # accessing `exe.message` raised AttributeError inside the
            # handler and masked the real error; use str(exe) instead.
            error_text = Texts.LOGS_STORING_ERROR.format(exception_message=str(exe))
            handle_error(logger, error_text, error_text)
            exit(1)
    else:
        click.echo(Texts.LOGS_STORING_CANCEL_MESSAGE)
Пример #7
0
def upgrade(ctx: click.Context):
    """
    Upgrade users after Nauta upgrade.
    """

    with spinner(text=Texts.UPGRADE_IN_PROGRESS):
        # noinspection PyBroadException
        try:
            # noinspection PyTypeChecker
            nauta_users: List[User] = User.list()

            with K8sProxy(NAUTAAppNames.GIT_REPO_MANAGER,
                          number_of_retries_wait_for_readiness=60) as proxy:
                repo_manager = GitRepoManagerClient(host='127.0.0.1',
                                                    port=proxy.tunnel_port)

                # create a git repo manager account for every Nauta user
                # that does not have one yet
                users_without_account = (user.name for user in nauta_users
                                         if not repo_manager.get_user(user.name))
                for missing_name in users_without_account:
                    repo_manager.add_nauta_user(missing_name)
        except Exception:
            handle_error(logger,
                         Texts.UPGRADE_FAILED,
                         Texts.UPGRADE_FAILED,
                         add_verbosity_msg=ctx.obj.verbosity == 0)
            sys.exit(1)

    click.echo(Texts.UPGRADE_SUCCEEDED)
Пример #8
0
def status(state: State, model_name: str, status: PodPhase, username: str):
    """
    Returns status of a model

    :param model_name: name of a model data of which should be displayed
    :param status: status of a model step that should be displayed
    :param username; if checked - searches for model for a certain user
    """
    try:
        # a given username takes precedence over the current context namespace
        namespace = username if username else get_kubectl_current_context_namespace()

        with spinner(text=Texts.LOAD_DATA_MSG):
            workflow: ArgoWorkflow = ArgoWorkflow.get(namespace=namespace, name=model_name)

        if not workflow:
            click.echo(Texts.MODEL_NOT_FOUND.format(model_name=model_name))
            exit(0)

        click.echo('\nOperation details:\n')
        click.echo(tabulate([workflow.cli_representation], headers=MODEL_HEADERS, tablefmt=TBLT_TABLE_FORMAT))
        click.echo('\nOperation steps:\n')

        if not workflow.steps:
            click.echo(Texts.LACK_OF_STEPS)
        else:
            # when a status filter was given, show only steps in that phase
            matching_steps = [step.cli_representation for step in workflow.steps
                              if status is None or status == step.phase]
            click.echo(tabulate(matching_steps, headers=STEP_HEADERS,
                                tablefmt=TBLT_TABLE_FORMAT))
    except Exception:
        handle_error(logger, Texts.OTHER_ERROR_MSG, Texts.OTHER_ERROR_MSG, add_verbosity_msg=True)
        exit(1)
Пример #9
0
def stream(state: State, name: str, data: str, method_verb: InferenceVerb):
    """
    Perform stream inference task on launched prediction instance.
    """
    method_verb = InferenceVerb(method_verb)

    # locate the running inference instance and build the stream URL
    try:
        namespace = get_kubectl_current_context_namespace()

        # TODO: check if kind field of inference instance Run is correct
        instance = Run.get(name=name, namespace=namespace)
        if not instance:
            handle_error(user_msg=Texts.INSTANCE_NOT_EXISTS_ERROR_MSG.format(
                name=name))
            exit(1)
        if instance.state != RunStatus.RUNNING:
            handle_error(user_msg=Texts.INSTANCE_NOT_RUNNING_ERROR_MSG.format(
                name=name, running_code=RunStatus.RUNNING.value))
            exit(1)

        instance_url = get_inference_instance_url(inference_instance=instance)
        stream_url = f'{instance_url}:{method_verb.value}'
    except Exception:
        handle_error(logger,
                     Texts.INSTANCE_GET_FAIL_ERROR_MSG.format(name=name),
                     Texts.INSTANCE_GET_FAIL_ERROR_MSG.format(name=name),
                     add_verbosity_msg=state.verbosity == 0)
        exit(1)

    # read the JSON payload that should be streamed to the instance
    try:
        with open(data, 'r', encoding='utf-8') as data_file:
            stream_data = json.load(data_file)
    except (json.JSONDecodeError, IOError):
        handle_error(logger, Texts.JSON_LOAD_ERROR_MSG.format(data=data),
                     Texts.JSON_LOAD_ERROR_MSG.format(data=data))
        exit(1)

    # send the request and print the raw response body
    try:
        headers = {
            'Authorization': get_api_key(),
            'Accept': 'application/json',
            'Content-Type': 'application/json'
        }
        with spinner(text=Texts.WAITING_FOR_RESPONSE_MSG):
            stream_response = requests.post(
                stream_url,
                data=json.dumps(stream_data),  # nosec - request to k8s cluster
                verify=False,
                headers=headers)
        stream_response.raise_for_status()
        click.echo(stream_response.text)
    except Exception as e:
        error_msg = Texts.INFERENCE_OTHER_ERROR_MSG.format(exception=e)
        if hasattr(e, 'response'):
            error_msg += Texts.INFERENCE_ERROR_RESPONSE_MSG.format(
                response_text=e.response.text)  # type: ignore
        handle_error(logger, error_msg, error_msg)
        exit(1)
Пример #10
0
def install(state: State, template_name: str):
    # location where the template pack will be stored locally
    chart_file_location = os.path.join(Config.get_config_path(), "packs",
                                       template_name)

    with spinner(text=Texts.GETTING_LIST_OF_TEMPLATES_MSG):
        repository_name, access_token = get_repository_configuration()

        try:
            remote_templates = get_remote_templates(repository_name,
                                                    access_token)
        except ExceptionWithMessage as e:
            click.echo(e.message)
            sys.exit(1)

        remote_template = remote_templates.get(template_name)
        if not remote_template:
            click.echo(
                Texts.REMOTE_TEMPLATE_NOT_FOUND.format(
                    template_name=template_name))
            sys.exit(1)

    local_template = get_local_templates().get(template_name)
    if local_template:
        # a local copy exists - ask before replacing it (aborts on "no")
        click.confirm(Texts.LOCAL_VERSION_ALREADY_INSTALLED.format(
            local_version=local_template.local_version,
            template_name=local_template.name,
            remote_version=remote_template.remote_version),
                      abort=True)

        # noinspection PyBroadException
        try:
            shutil.rmtree(chart_file_location)
        except Exception:
            logger.exception("failed to remove local copy of template!")

    with spinner(text=Texts.DOWNLOADING_TEMPLATE):
        repository_name, access_token = get_repository_configuration()
        github_client = Github(repository_name, access_token)
        github_client.download_whole_directory(template_name,
                                               chart_file_location)

    click.echo("successfully installed!")
Пример #11
0
def update_resources_in_packs(cpu: str = None, memory: str = None):
    """
    Update cpu/memory settings in installed packs and persist the new
    values in the node config file.

    :param cpu: new cpu number to be used by packs; when not given, the value
                currently stored in the config file is kept
    :param memory: new memory amount to be used by packs; when not given, the
                   value currently stored in the config file is kept
    """
    config_file_location = os.path.join(Config().config_path,
                                        NODE_CONFIG_FILENAME)

    if not os.path.isfile(config_file_location):
        handle_error(logger, Texts.MISSING_CONFIG_FILE,
                     Texts.MISSING_CONFIG_FILE)
        sys.exit(1)

    with open(config_file_location, 'r+', encoding='utf-8') as config_file, \
            spinner(text=Texts.CONFIG_UPDATE):
        config_file_content = yaml.safe_load(config_file)
        cpu_number = str(config_file_content.get(CPU_NUMBER_FIELDNAME))
        memory_amount = str(config_file_content.get(MEMORY_AMOUNT_FIELDNAME))
        cpu_system_required_min = str(
            config_file_content.get(CPU_SYSTEM_REQUIRED_MIN_FIELDNAME))
        cpu_system_required_percent = str(
            config_file_content.get(CPU_SYSTEM_REQUIRED_PERCENT_FIELDNAME))
        memory_system_required_min = str(
            config_file_content.get(MEMORY_SYSTEM_REQUIRED_MIN_FIELDNAME))
        memory_system_required_percent = str(
            config_file_content.get(MEMORY_SYSTEM_REQUIRED_PERCENT_FIELDNAME))

        # both base values must be present in the config file, otherwise we
        # have nothing sane to fall back to
        if not cpu_number or cpu_number == "None" or not memory_amount or memory_amount == "None":
            handle_error(logger, Texts.CONFIG_FILE_INCORRECT,
                         Texts.CONFIG_FILE_INCORRECT)
            sys.exit(1)

        # effective values: parameters override the stored ones
        new_cpu = cpu if cpu else cpu_number
        new_memory = memory if memory else memory_amount

        try:
            override_values_in_packs(
                new_cpu_number=new_cpu,
                new_memory_amount=new_memory,
                current_cpu_number=cpu_number,
                current_mem_amount=memory_amount,
                cpu_system_required_min=cpu_system_required_min,
                cpu_system_required_percent=cpu_system_required_percent,
                mem_system_required_min=memory_system_required_min,
                mem_system_required_percent=memory_system_required_percent)
        except Exception:
            logger.exception(Texts.ERROR_DURING_UPDATE)
            handle_error(logger, Texts.ERROR_DURING_UPDATE,
                         Texts.ERROR_DURING_UPDATE)
            sys.exit(1)

        # BUG FIX: the config file must be rewritten when EITHER value
        # changed (the original required both with `and`), and the values
        # written must be the effective new_cpu/new_memory - storing the raw
        # cpu/memory parameters could write None when only one was given,
        # leaving the file out of sync with the pack overrides above.
        if new_cpu != cpu_number or new_memory != memory_amount:
            config_file.seek(0)
            config_file.truncate()
            config_file_content[CPU_NUMBER_FIELDNAME] = new_cpu
            config_file_content[MEMORY_AMOUNT_FIELDNAME] = new_memory
            yaml.safe_dump(config_file_content,
                           config_file,
                           default_flow_style=False,
                           explicit_start=True)
Пример #12
0
def launch_app(k8s_app_name: NAUTAAppNames,
               no_launch: bool = False,
               port: int = None,
               app_name: str = None,
               number_of_retries: int = 0,
               url_end: str = "",
               namespace: str = None):
    """
    Open a tunnel to the given in-cluster application and (optionally) point
    a web browser at it; blocks until the user presses Ctrl-C.

    :param k8s_app_name: identifier of the Nauta application to tunnel to
    :param no_launch: when True - only create the proxy, do not open a browser
    :param port: local port requested for the tunnel; when it cannot be used,
                 a notice with the actually assigned port is printed
    :param app_name: name of the application within the cluster
    :param number_of_retries: retry count passed to the proxy creation
    :param url_end: suffix appended to the forwarded URL
    :param namespace: namespace where the application resides
    :raises ProxyClosingError: when the proxy cannot be closed cleanly
    :raises LaunchError: for any other problem with creating the proxy or
                         launching the browser
    """
    try:
        with spinner(text=Texts.LAUNCHING_APP_MSG) as proxy_spinner, \
             K8sProxy(nauta_app_name=k8s_app_name, port=port, app_name=app_name,
                      number_of_retries=number_of_retries, namespace=namespace) as proxy:
            url = FORWARDED_URL.format(proxy.tunnel_port, url_end)

            # the ingress endpoint authenticates via the user's k8s token
            # passed as a query parameter (stripped of the 'Bearer ' prefix
            # used by the k8s API)
            if k8s_app_name == NAUTAAppNames.INGRESS:
                config.load_kube_config()
                user_token = configuration.Configuration().api_key.get(
                    'authorization')
                prepared_user_token = user_token.replace('Bearer ', '')
                url = f'{url}?token={prepared_user_token}'

            if not no_launch:

                # open a browser only when a GUI browser is available;
                # otherwise the URL is just printed below
                if is_gui_browser_available():
                    wait_for_connection(url)
                    webbrowser.open_new(url)
                    proxy_spinner.stop()
                else:
                    click.echo(Texts.NO_WEB_BROWSER_ERROR_MSG)

            # tell the user when the requested local port could not be used
            if port and port != proxy.tunnel_port:
                click.echo(
                    Texts.CANNOT_USE_PORT.format(
                        required_port=port, random_port=proxy.tunnel_port))

            proxy_spinner.stop()
            click.echo(Texts.GO_TO_MSG.format(url=url))
            click.echo(Texts.PROXY_CREATED_MSG)
            # keep the tunnel open until the user interrupts with Ctrl-C
            wait_for_ctrl_c()
    except K8sProxyCloseError:
        err_message = Texts.PROXY_CLOSE_ERROR_MSG.format(app_name=k8s_app_name)
        raise ProxyClosingError(err_message)
    except LocalPortOccupiedError as exe:
        err_message = Texts.PROXY_CREATED_EXTENDED_ERROR_MSG.format(
            app_name=k8s_app_name, reason=exe.message)
        raise LaunchError(err_message)
    except K8sProxyOpenError:
        error_msg = Texts.PROXY_CREATED_ERROR_MSG.format(app_name=k8s_app_name)
        logger.exception(error_msg)
        raise LaunchError(error_msg)
    except LaunchError as e:
        # errors already wrapped by helpers are re-raised untouched
        raise e
    except Exception:
        err_message = Texts.WEB_APP_LAUCH_FAIL_MSG
        logger.exception(err_message)
        raise LaunchError(err_message)
Пример #13
0
def list_templates(state: State):
    """ List experiments. """
    with spinner(text=Texts.GETTING_LIST_OF_TEMPLATES_MSG):
        list_of_templates, error_messages = prepare_list_of_templates()

    # show problems encountered while building the list before the table
    for single_message in error_messages:
        click.echo(single_message)

    templates_table = tabulate.tabulate(list_of_templates,
                                        headers=TEMPLATE_LIST_HEADERS,
                                        tablefmt="orgtbl")
    click.echo(templates_table)
Пример #14
0
def update_configuration(run_folder: str,
                         script_location: str,
                         script_parameters: Tuple[str, ...],
                         experiment_name: str,
                         run_name: str,
                         local_registry_port: int,
                         cluster_registry_port: int,
                         pack_type: str,
                         pack_params: List[Tuple[str, str]] = None,
                         script_folder_location: str = None,
                         env_variables: List[str] = None):
    """
    Update the configuration of a tf-training pack according to parameters
    provided by a user.

    Modified files:
    - Dockerfile - the training script name is replaced with the user's one,
      and all additional files from the experiment folder are copied into the
      image (files generated by draft are excluded)
    - charts/templates/job.yaml - the argument list is replaced with the
      arguments given by the user

    :raises RuntimeError: with a description of the problem when any of the
                          files cannot be updated
    """
    log.debug("Update configuration - start")

    try:
        modify_values_yaml(run_folder,
                           script_location,
                           script_parameters,
                           pack_params=pack_params,
                           experiment_name=experiment_name,
                           run_name=run_name,
                           pack_type=pack_type,
                           cluster_registry_port=cluster_registry_port,
                           env_variables=env_variables)
        # rewriting the Dockerfile may take a while - show progress
        with spinner(text=Texts.PREPARING_IMAGES_MSG.format(
                run_name=experiment_name)):
            modify_dockerfile(run_folder,
                              script_location,
                              local_registry_port=local_registry_port,
                              script_folder_location=script_folder_location)
        modify_draft_toml(run_folder,
                          registry=f'127.0.0.1:{local_registry_port}')
    except Exception as exe:
        log.exception("Update configuration - i/o error : {}".format(exe))
        raise RuntimeError(Texts.CONFIG_NOT_UPDATED) from exe

    log.debug("Update configuration - end")
Пример #15
0
def cancel(state: State, workflow_name: str):
    # look the workflow up in the namespace of the current kubectl context
    try:
        namespace = get_kubectl_current_context_namespace()
        workflow: ArgoWorkflow = ArgoWorkflow.get(name=workflow_name,
                                                  namespace=namespace)
        if not workflow:
            click.echo(Texts.NOT_FOUND_MSG.format(workflow_name=workflow_name))
            exit(0)

        # deleting the k8s object effectively cancels the workflow
        with spinner(text=Texts.PROGRESS_MSG.format(
                workflow_name=workflow_name)):
            workflow.delete()

        click.echo(Texts.SUCCESS_MSG.format(workflow_name=workflow_name))
    except Exception:
        handle_error(logger, Texts.OTHER_ERROR_MSG, Texts.OTHER_ERROR_MSG,
                     add_verbosity_msg=True)
        exit(1)
Пример #16
0
def cancel_experiment_runs(runs_to_cancel: List[Run],
                           namespace: str) -> Tuple[List[Run], List[Run]]:
    """
    Cancel given list of Runs belonging to a single namespace.
    :param runs_to_cancel: Runs to be cancelled
    :param namespace: namespace where Run instances reside
    :return: tuple of list containing successfully Runs and list containing Runs that were not cancelled
    """
    # NOTE(review): `experiment_name`, used in the messages below, is not
    # defined anywhere in this function - it presumably comes from module /
    # enclosing scope; confirm it is actually in scope at runtime.
    deleted_runs = []
    not_deleted_runs = []
    try:
        for run in runs_to_cancel:
            logger.debug(f"Cancelling {run.name} run ...")
            click.echo(
                Texts.CANCELING_RUNS_START_MSG.format(
                    run_name=run.name, experiment_name=experiment_name))
            try:
                # if run status is cancelled - omit the following steps
                if run.state != RunStatus.CANCELLED:
                    with spinner(text=Texts.CANCEL_SETTING_STATUS_MSG.format(
                            run_name=run.name)):
                        delete_helm_release(release_name=run.name,
                                            namespace=namespace,
                                            purge=False)
                        # change a run state to CANCELLED
                        run.state = RunStatus.CANCELLED
                        run.end_timestamp = datetime.utcnow().strftime(
                            "%Y-%m-%dT%H:%M:%SZ")
                        run.update()
                deleted_runs.append(run)
            except Exception:
                # a single failed run must not stop cancellation of the rest
                logger.exception(
                    Texts.INCOMPLETE_CANCEL_ERROR_MSG.format(
                        run_name=run.name, experiment_name=experiment_name))
                click.echo(
                    Texts.INCOMPLETE_CANCEL_ERROR_MSG.format(
                        run_name=run.name, experiment_name=experiment_name))
                not_deleted_runs.append(run)

    except Exception:
        # unexpected failure outside the per-run handling - return whatever
        # was processed so far instead of raising
        logger.exception("Error during cancelling experiments")
        return deleted_runs, not_deleted_runs

    return deleted_runs, not_deleted_runs
Пример #17
0
def purge_user(username: str):
    """
    Remove all system artifacts that belonged to a deleted user; the k8s
    objects themselves are removed together with the user's namespace.
    :param username: name of the user whose artifacts should be removed
    Raises an exception when any problem is detected during the removal.
    """
    # wipe the user's experiment logs stored in elasticsearch
    try:
        with k8s_proxy_context_manager.K8sProxy(NAUTAAppNames.ELASTICSEARCH) as proxy,\
                spinner(text=TextsDel.DELETION_DELETING_USERS_EXPERIMENTS):
            elasticsearch = K8sElasticSearchClient(host="127.0.0.1", port=proxy.tunnel_port,
                                                   verify_certs=False, use_ssl=False)
            elasticsearch.delete_logs_for_namespace(username)
    except K8sProxyCloseError:
        logger.exception("Error during closing of a proxy for elasticsearch.")
        raise
    except Exception:
        logger.exception("Error during removal of data from elasticsearch")
        raise
Пример #18
0
def ctrl_c_handler_for_submit(sig, frame):
    """
    Signal handler invoked when the user interrupts submission with Ctrl-C:
    purges the runs/experiment created so far, kills every child process and
    exits with code 1.
    """
    log.debug("ctrl-c pressed while submitting")
    try:
        with spinner(text=Texts.CTRL_C_PURGING_PROGRESS_MSG):
            for run in submitted_runs or []:
                try:
                    # remove the run object together with its helm release
                    delete_k8s_object("run", run.name)
                    delete_helm_release(run.name, namespace=submitted_namespace, purge=True)
                except Exception:
                    log.exception(Texts.ERROR_WHILE_REMOVING_RUNS)
            delete_k8s_object("experiment", submitted_experiment)
    except Exception:
        log.exception(Texts.ERROR_WHILE_REMOVING_EXPERIMENT)

    # make sure no spawned child process survives the interrupt
    for child_process in psutil.Process(os.getpid()).children(recursive=True):
        child_process.send_signal(signal.SIGKILL)

    exit(1)
Пример #19
0
def submit(state: State, workflow_path: str):
    try:
        # load the workflow spec and create it in the current namespace
        workflow: ArgoWorkflow = ArgoWorkflow.from_yaml(workflow_path)
        namespace = get_kubectl_current_context_namespace()
        with spinner(text=Texts.PROGRESS_MSG):
            workflow.create(namespace=namespace)
            workflow.namespace = namespace  # Set namespace, to properly display owner in CLI

        summary = tabulate([workflow.cli_representation],
                           headers=HEADERS,
                           tablefmt=TBLT_TABLE_FORMAT)
        click.echo(summary)
    except IOError as e:
        # spec file could not be read or parsed
        load_error = Texts.LOAD_SPEC_ERROR_MSG.format(msg=str(e))
        handle_error(logger, load_error, load_error)
        exit(1)
    except Exception:
        handle_error(logger,
                     Texts.OTHER_ERROR_MSG,
                     Texts.OTHER_ERROR_MSG,
                     add_verbosity_msg=True)
        exit(1)
Пример #20
0
def delete(state: State, username: str, purge: bool):
    """
    Deletes a user with a name given as a parameter.

    :param username: name of a user that should be deleted
    :param purge: if set - command removes also all artifacts associated with a user
    """
    # phase 1: verify that the user exists and is not already being removed
    try:
        click.echo(Texts.DELETION_CHECK_PRESENCE)
        user_state = check_users_presence(username)

        if user_state == UserState.NOT_EXISTS:
            handle_error(user_msg=Texts.USER_NOT_EXISTS_ERROR_MSG.format(
                username=username))
            exit(1)

        if user_state == UserState.TERMINATING:
            handle_error(user_msg=Texts.USER_BEING_REMOVED_ERROR_MSG)
            exit(1)

    except Exception:
        handle_error(logger,
                     Texts.USER_PRESENCE_VERIFICATION_ERROR_MSG,
                     Texts.USER_PRESENCE_VERIFICATION_ERROR_MSG,
                     add_verbosity_msg=state.verbosity == 0)
        exit(1)

    # phase 2: ask the user for confirmation before the destructive part
    click.echo()
    if not click.confirm(Texts.DELETE_CONFIRM_MSG.format(username=username)):
        click.echo(Texts.DELETE_ABORT_MSG)
        exit(0)

    click.echo()

    # phase 3: actual deletion, optional purge and verification polling
    try:
        click.echo(Texts.DELETION_START_DELETING)
        delete_user(username)

        # mark the user as "being deleted" in the deletion config map
        patch_config_map_data(name=USER_DEL_CM,
                              namespace=NAUTA_NAMESPACE,
                              key=username,
                              value="1")

        if purge:
            try:
                click.echo(Texts.DELETION_START_PURGING)
                # failure during purging a user doesn't mean that user wasn't deleted
                purge_user(username)
            except Exception:
                handle_error(logger, Texts.PURGE_ERROR_MSG,
                             Texts.PURGE_ERROR_MSG)

        # CAN-616 - wait until user has been really deleted
        with spinner(text=Texts.DELETION_VERIFICATION_OF_DELETING
                     ) as user_del_spinner:
            # poll for up to 60 seconds; the for/else below runs only when
            # the loop was never broken (i.e. deletion did not complete)
            for i in range(60):
                user_state = check_users_presence(username)

                user_del_cm_content = get_config_map_data(
                    name=USER_DEL_CM,
                    namespace=NAUTA_NAMESPACE,
                    request_timeout=1)
                # deletion is complete when the user is gone AND its entry
                # disappeared from the deletion config map
                if (not user_state or user_state == UserState.NOT_EXISTS) and \
                        (not user_del_cm_content or not user_del_cm_content.get(username)):
                    break
                time.sleep(1)
            else:
                user_del_spinner.stop()
                click.echo()
                click.echo(Texts.DELETE_IN_PROGRESS_MSG)
                exit(0)

        click.echo()
        click.echo(Texts.DELETE_SUCCESS_MSG.format(username=username))
    except K8sProxyCloseError:
        handle_error(logger,
                     Texts.PROXY_ERROR_LOG_MSG,
                     Texts.PROXY_ERROR_USER_MSG,
                     add_verbosity_msg=state.verbosity == 0)
        exit(1)
    except Exception:
        handle_error(logger,
                     Texts.OTHER_ERROR_LOG_MSG,
                     Texts.OTHER_ERROR_USER_MSG,
                     add_verbosity_msg=state.verbosity == 0)
        exit(1)
Пример #21
0
def prepare_experiment_environment(experiment_name: str, run_name: str, local_script_location: str,
                                   script_parameters: Tuple[str, ...],
                                   pack_type: str, local_registry_port: int, cluster_registry_port: int,
                                   script_folder_location: str = None,
                                   pack_params: List[Tuple[str, str]] = None,
                                   env_variables: List[str] = None,
                                   requirements_file: str = None) -> PrepareExperimentResult:
    """
    Prepares draft's environment for a certain run based on provided parameters.

    :param experiment_name: name of an experiment
    :param run_name: name of an experiment run
    :param local_script_location: location of a script used for training purposes on local machine
    :param script_parameters: parameters passed to a script
    :param pack_type: type of a pack used to start training job
    :param local_registry_port: port on which docker registry is accessible locally
    :param cluster_registry_port: port on which docker registry is accessible within nauta cluster
    :param script_folder_location: location of an additional folder used in training
    :param pack_params: additional pack params
    :param env_variables: environmental variables to be passed to training
    :param requirements_file: path to a file with experiment requirements
    :return: PrepareExperimentResult with the name of the folder with an environment created for
             this run, the name of the script used for training purposes and the count of pods
    :raises SubmitExperimentError: in case of any problems during preparation; the partially
             created environment is removed before the error is raised
    """
    log.debug(f'Prepare run {run_name} environment - start')
    run_folder = get_run_environment_path(run_name)
    try:
        # check environment directory
        check_run_environment(run_folder)
        with spinner(text=Texts.CREATING_ENVIRONMENT_MSG.format(run_name=run_name)):
            # create an environment
            create_environment(run_name, local_script_location, script_folder_location)
            # generate draft's data; the first returned value (command output) is not used here
            _, exit_code, log_output = cmd.create(working_directory=run_folder, pack_type=pack_type)

            # copy requirements file if it was provided, create empty requirements file otherwise
            dest_requirements_file = os.path.join(run_folder, 'requirements.txt')
            if requirements_file:
                shutil.copyfile(requirements_file, dest_requirements_file)
            else:
                Path(dest_requirements_file).touch()

        if exit_code:
            raise SubmitExperimentError(Texts.DRAFT_TEMPLATES_NOT_GENERATED_ERROR_MSG.format(reason=log_output))

        # Script location on experiment container
        remote_script_location = Path(local_script_location).name if local_script_location else ''

        if pack_type in JUPYTER_NOTEBOOK_TEMPLATES_NAMES and remote_script_location.endswith(".py"):
            # for interact (jupyter notebooks) try to convert .py file into .ipynb
            py_script_location = os.path.join(run_folder, FOLDER_DIR_NAME, remote_script_location)
            ipynb_file_name = convert_py_to_ipynb(py_script_location, os.path.join(run_folder, FOLDER_DIR_NAME))
            # NOTE(review): only the returned script name is updated here - remote_script_location
            # passed to update_configuration below still refers to the original .py file; confirm
            # that this is the intended behaviour
            local_script_location = ipynb_file_name

        # reconfigure draft's templates
        update_configuration(run_folder=run_folder, script_location=remote_script_location,
                             script_parameters=script_parameters,
                             experiment_name=experiment_name, run_name=run_name,
                             local_registry_port=local_registry_port, cluster_registry_port=cluster_registry_port,
                             pack_type=pack_type, pack_params=pack_params,
                             script_folder_location=script_folder_location,
                             env_variables=env_variables)

        pod_count = get_pod_count(run_folder=run_folder, pack_type=pack_type)
    except Exception as exe:
        # clean up whatever part of the environment was already created before propagating
        delete_environment(run_folder)
        raise SubmitExperimentError('Problems during creation of environments.') from exe
    log.debug(f'Prepare run {run_name} environment - finish')
    return PrepareExperimentResult(folder_name=run_folder, script_name=local_script_location, pod_count=pod_count)
Пример #22
0
def submit_experiment(template: str, name: str = None, run_kind: RunKinds = RunKinds.TRAINING,
                      script_location: str = None, script_parameters: Tuple[str, ...] = None,
                      pack_params: List[Tuple[str, str]] = None, parameter_range: List[Tuple[str, str]] = None,
                      parameter_set: Tuple[str, ...] = None,
                      script_folder_location: str = None,
                      env_variables: List[str] = None,
                      requirements_file: str = None) -> (List[Run], Dict[str, str], str):
    """
    Submits an experiment: prepares a local environment for every run, creates the Experiment
    and Run k8s objects and submits the corresponding draft packs.

    :param template: name of a template (pack) used to start training jobs
    :param name: optional name of an experiment; generated from the script name if not given
    :param run_kind: kind of runs created by this experiment
    :param script_location: location of a training script on the local machine
    :param script_parameters: parameters passed to the training script
    :param pack_params: additional pack params (name/value pairs)
    :param parameter_range: -pr options used to create multiple runs
    :param parameter_set: -ps options used to create multiple runs
    :param script_folder_location: location of an additional folder used in training
    :param env_variables: environmental variables to be passed to training
    :param requirements_file: path to a file with experiment requirements
    :return: tuple of (list of prepared runs, dict mapping run names to submission errors,
             script location - possibly changed by notebook conversion)
    :raises SubmitExperimentError: in case of any problems during submission
    """
    # normalize optional collection parameters so the code below can rely on them
    script_parameters = script_parameters if script_parameters else ()
    parameter_set = parameter_set if parameter_set else ()
    parameter_range = parameter_range if parameter_range else []
    # BUGFIX: pack_params is iterated below when building Run annotations -
    # the default None would raise a TypeError there, so normalize it too
    pack_params = pack_params if pack_params else []

    log.debug("Submit experiment - start")
    try:
        namespace = get_kubectl_current_context_namespace()
        # remember the namespace for the Ctrl-C handler registered below
        global submitted_namespace
        submitted_namespace = namespace
    except Exception:
        message = Texts.GET_NAMESPACE_ERROR_MSG
        log.exception(message)
        raise SubmitExperimentError(message)

    try:
        with spinner(text=Texts.PREPARING_RESOURCE_DEFINITIONS_MSG):
            experiment_name, labels = generate_exp_name_and_labels(script_name=script_location,
                                                                   namespace=namespace, name=name,
                                                                   run_kind=run_kind)
            runs_list = prepare_list_of_runs(experiment_name=experiment_name, parameter_range=parameter_range,
                                             parameter_set=parameter_set, template_name=template)
    except SubmitExperimentError as exe:
        log.exception(str(exe))
        raise exe
    except Exception:
        message = Texts.SUBMIT_PREPARATION_ERROR_MSG
        log.exception(message)
        raise SubmitExperimentError(message)

    global submitted_experiment
    submitted_experiment = experiment_name

    # Ctrl-C handling
    signal.signal(signal.SIGINT, ctrl_c_handler_for_submit)
    signal.signal(signal.SIGTERM, ctrl_c_handler_for_submit)

    try:
        config = Config()

        # start port forwarding to the in-cluster docker registry
        # noinspection PyBroadException
        with K8sProxy(NAUTAAppNames.DOCKER_REGISTRY, port=config.local_registry_port) as proxy:
            # Save port that was actually used in configuration
            if proxy.tunnel_port != config.local_registry_port:
                config.local_registry_port = proxy.tunnel_port

            experiment_run_folders = []  # List of local directories used by experiment's runs
            try:
                # run socat if on Windows or Mac OS
                if get_current_os() in (OS.WINDOWS, OS.MACOS):
                    # noinspection PyBroadException
                    try:
                        with spinner(text=Texts.CLUSTER_CONNECTION_MSG):
                            socat.start(proxy.tunnel_port)
                    except Exception:
                        error_msg = Texts.LOCAL_DOCKER_TUNNEL_ERROR_MSG
                        log.exception(error_msg)
                        raise SubmitExperimentError(error_msg)

                cluster_registry_port = get_app_service_node_port(nauta_app_name=NAUTAAppNames.DOCKER_REGISTRY)

                # prepare environments for all experiment's runs
                for experiment_run in runs_list:
                    # combine globally provided script parameters with per-run parameters
                    if script_parameters and experiment_run.parameters:
                        current_script_parameters = script_parameters + experiment_run.parameters
                    elif script_parameters:
                        current_script_parameters = script_parameters
                    elif experiment_run.parameters:
                        current_script_parameters = experiment_run.parameters
                    else:
                        current_script_parameters = ""

                    run_folder, script_location, pod_count = \
                        prepare_experiment_environment(experiment_name=experiment_name,
                                                       run_name=experiment_run.name,
                                                       local_script_location=script_location,
                                                       script_folder_location=script_folder_location,  # noqa: E501
                                                       script_parameters=current_script_parameters,
                                                       pack_type=template, pack_params=pack_params,
                                                       local_registry_port=proxy.tunnel_port,
                                                       cluster_registry_port=cluster_registry_port,
                                                       env_variables=env_variables,
                                                       requirements_file=requirements_file)
                    # Set correct pod count
                    if not pod_count or pod_count < 1:
                        raise SubmitExperimentError('Unable to determine pod count: make sure that values.yaml '
                                                    'file in your pack has podCount field with positive integer value.')
                    experiment_run.pod_count = pod_count

                    experiment_run_folders.append(run_folder)
                    script_name = None
                    if script_location is not None:
                        script_name = os.path.basename(script_location)

                    # Prepend script_name parameter to run description only for display purposes.
                    experiment_run.parameters = script_parameters if not experiment_run.parameters \
                        else experiment_run.parameters + script_parameters
                    if experiment_run.parameters and script_name:
                        experiment_run.parameters = (script_name, ) + experiment_run.parameters
                    elif script_name:
                        experiment_run.parameters = (script_name, )
            except SubmitExperimentError as e:
                log.exception(Texts.ENV_CREATION_ERROR_MSG)
                e.message += f' {Texts.ENV_CREATION_ERROR_MSG}'
                raise
            except Exception:
                # any error in this step breaks execution of this command
                message = Texts.ENV_CREATION_ERROR_MSG
                log.exception(message)
                # just in case - remove folders that were created with a success
                for experiment_run_folder in experiment_run_folders:
                    delete_environment(experiment_run_folder)
                # NOTE(review): the exception is swallowed here and submission continues
                # with no prepared run folders, ending in the SUBMISSION_FAIL_ERROR_MSG
                # path below - confirm this is intended rather than re-raising

            # if ps or pr option is used - first ask whether experiment(s) should be submitted
            if parameter_range or parameter_set:
                click.echo(Texts.CONFIRM_SUBMIT_MSG)
                click.echo(tabulate({RUN_NAME: [run.name for run in runs_list],
                                     RUN_PARAMETERS: ["\n".join(run.parameters) if run.parameters
                                                      else "" for run in runs_list]},
                                    headers=[RUN_NAME, RUN_PARAMETERS], tablefmt="orgtbl"))

                if not click.confirm(Texts.CONFIRM_SUBMIT_QUESTION_MSG, default=True):
                    for experiment_run_folder in experiment_run_folders:
                        delete_environment(experiment_run_folder)
                    exit()

            # create Experiment model
            # TODO template_name & template_namespace should be filled after Template implementation
            parameter_range_spec = [f'-pr {param_name} {param_value}' for param_name, param_value in parameter_range]
            parameter_set_spec = [f'-ps {ps_spec}' for ps_spec in parameter_set]
            experiment_parameters_spec = list(script_parameters) + parameter_range_spec + parameter_set_spec
            experiment = experiments_model.Experiment(name=experiment_name, template_name=template,
                                                      parameters_spec=experiment_parameters_spec,
                                                      template_namespace="template-namespace")

            experiment.create(namespace=namespace, labels=labels)

            # submit runs
            run_errors = {}
            for run, run_folder in zip(runs_list, experiment_run_folders):
                try:
                    run.state = RunStatus.QUEUED
                    with spinner(text=Texts.CREATING_RESOURCES_MSG.format(run_name=run.name)):
                        # Add Run object with runKind label and pack params as annotations
                        run.create(namespace=namespace, labels={'runKind': run_kind.value},
                                   annotations={pack_param_name: pack_param_value
                                                for pack_param_name, pack_param_value in pack_params})
                        submitted_runs.append(run)
                        submit_draft_pack(run_folder, namespace=namespace)
                except Exception as exe:
                    delete_environment(run_folder)
                    try:
                        run.state = RunStatus.FAILED
                        run_errors[run.name] = str(exe)
                        run.update()
                    except Exception as rexe:
                        # update of non-existing run may fail
                        log.debug(Texts.ERROR_DURING_PATCHING_RUN.format(str(rexe)))

            # Delete experiment if no Runs were submitted
            if not submitted_runs:
                click.echo(Texts.SUBMISSION_FAIL_ERROR_MSG)
                delete_k8s_object("experiment", experiment_name)

            # Change experiment status to submitted
            experiment.state = experiments_model.ExperimentStatus.SUBMITTED
            experiment.update()
    except LocalPortOccupiedError as exe:
        click.echo(exe.message)
        raise SubmitExperimentError(exe.message)
    except K8sProxyCloseError:
        log.exception('Error during closing of a proxy for a {}'.format(NAUTAAppNames.DOCKER_REGISTRY))
        raise K8sProxyCloseError(Texts.PROXY_CLOSE_ERROR_MSG)
    except K8sProxyOpenError:
        error_msg = Texts.PROXY_OPEN_ERROR_MSG
        log.exception(error_msg)
        raise SubmitExperimentError(error_msg)
    except SubmitExperimentError:
        raise
    except Exception as exe:
        error_msg = Texts.SUBMIT_OTHER_ERROR_MSG
        log.exception(error_msg)
        raise SubmitExperimentError(error_msg) from exe
    finally:
        with spinner(text=Texts.CLUSTER_CONNECTION_CLOSING_MSG):
            # noinspection PyBroadException
            try:
                socat.stop()
            except Exception:
                log.exception("Error during closing of a proxy for a local docker-host tunnel")
                raise K8sProxyCloseError(Texts.DOCKER_TUNNEL_CLOSE_ERROR_MSG)
        # remove semaphores from all exp folders
        remove_sempahore(experiment_name)

    log.debug("Submit - finish")
    return runs_list, run_errors, script_location
Пример #23
0
def install(ctx: click.Context, template_name: str):
    """
    Install (or update) a template from the remote template repository.

    :param ctx: click context - its obj carries verbosity and force settings
    :param template_name: name of the template to be installed
    Exits the process with a non-zero code in case of any error.
    """
    packs_location = os.path.join(Config.get_config_path(), "packs")
    chart_file_location = os.path.join(packs_location, template_name)
    repository_address = get_repository_address()

    with spinner(
            text=Texts.GETTING_LIST_OF_TEMPLATES_MSG) as templates_spinner:
        try:
            remote_template = load_remote_template(
                template_name, repository_address=repository_address)
        except Exception:
            templates_spinner.stop()
            handle_error(logger,
                         user_msg=Texts.FAILED_TO_LOAD_TEMPLATE.format(
                             template_name=template_name),
                         log_msg=Texts.FAILED_TO_LOAD_TEMPLATE.format(
                             template_name=template_name),
                         add_verbosity_msg=ctx.obj.verbosity == 0)
            sys.exit(1)

        if not remote_template:
            templates_spinner.stop()
            handle_error(logger,
                         user_msg=Texts.REMOTE_TEMPLATE_NOT_FOUND.format(
                             template_name=template_name),
                         log_msg=Texts.REMOTE_TEMPLATE_NOT_FOUND.format(
                             template_name=template_name),
                         add_verbosity_msg=ctx.obj.verbosity == 0)
            sys.exit(1)

    local_templates = get_local_templates()
    local_template_counterpart = local_templates.get(template_name)

    if local_template_counterpart:
        # use the ctx passed to this command instead of re-fetching the current
        # context - the rest of this function already relies on ctx.obj
        if (not ctx.obj.force) and (not click.confirm(
                Texts.LOCAL_VERSION_ALREADY_INSTALLED.format(
                    local_version=local_template_counterpart.local_version,
                    template_name=local_template_counterpart.name,
                    remote_version=remote_template.remote_version))):
            sys.exit(0)
        # noinspection PyBroadException
        try:
            # remove the stale local copy before downloading the new version
            shutil.rmtree(chart_file_location)
        except Exception:
            # NOTE(review): failure here is only logged - presumably the download
            # below overwrites the template files anyway; confirm
            logger.exception("failed to remove local copy of template!")

    with spinner(text=Texts.DOWNLOADING_TEMPLATE) as download_spinner:
        try:
            download_remote_template(template=remote_template,
                                     repository_address=repository_address,
                                     output_dir_path=packs_location)
        except Exception:
            download_spinner.stop()
            handle_error(logger,
                         user_msg=Texts.FAILED_TO_INSTALL_TEMPLATE.format(
                             template_name=template_name,
                             repository_name=repository_address),
                         log_msg=Texts.FAILED_TO_INSTALL_TEMPLATE.format(
                             template_name=template_name,
                             repository_name=repository_address),
                         add_verbosity_msg=ctx.obj.verbosity == 0)
            sys.exit(1)

    update_resources_in_packs()

    click.echo("successfully installed!")
Пример #24
0
def verify(state: State):
    """
    Verify the local environment: OS support, required dependencies and
    connectivity with the cluster.

    :param state: state of the command line tool - carries verbosity settings
    Exits the process with a non-zero code in case any check fails.
    """
    try:
        with spinner(text=Texts.CHECKING_OS_MSG):
            check_os()
        click.echo(Texts.OS_SUPPORTED_MSG)
    except InvalidOsError as exception:
        handle_error(logger,
                     str(exception),
                     str(exception),
                     add_verbosity_msg=True)
        exit(1)

    dependencies = get_dependency_map()
    kubectl_dependency_name = 'kubectl'
    kubectl_dependency_spec = dependencies[kubectl_dependency_name]

    # kubectl is verified first and separately - the cluster connection
    # checks below are performed before the remaining dependencies
    with spinner(text=Texts.VERIFYING_DEPENDENCY_MSG.format(
            dependency_name=kubectl_dependency_name)):
        valid, installed_version = check_dependency(
            dependency_name=kubectl_dependency_name,
            dependency_spec=kubectl_dependency_spec)

    supported_versions_sign = '>='
    logger.info(
        Texts.VERSION_CHECKING_MSG.format(
            dependency_name=kubectl_dependency_name,
            installed_version=installed_version,
            supported_versions_sign=supported_versions_sign,
            expected_version=kubectl_dependency_spec.expected_version))

    if valid:
        click.echo(
            Texts.DEPENDENCY_VERIFICATION_SUCCESS_MSG.format(
                dependency_name=kubectl_dependency_name))
    else:
        handle_error(
            logger,
            Texts.KUBECTL_INVALID_VERSION_ERROR_MSG.format(
                installed_version=installed_version,
                supported_versions_sign=supported_versions_sign,
                expected_version=  # noqa
                kubectl_dependency_spec.expected_version),
            Texts.KUBECTL_INVALID_VERSION_ERROR_MSG,
            add_verbosity_msg=state.verbosity == 0)
        exit(1)

    # kubectl has already been verified - don't verify it again in the loop below
    del dependencies[kubectl_dependency_name]

    try:
        with spinner(text=Texts.CHECKING_CONNECTION_TO_CLUSTER_MSG):
            check_connection_to_cluster()
        with spinner(text=Texts.CHECKING_PORT_FORWARDING_FROM_CLUSTER_MSG):
            check_port_forwarding()
    except KubectlConnectionError as e:
        handle_error(logger,
                     str(e),
                     str(e),
                     add_verbosity_msg=state.verbosity == 0)
        exit(1)
    except FileNotFoundError:
        handle_error(logger,
                     Texts.KUBECTL_NOT_INSTALLED_ERROR_MSG,
                     Texts.KUBECTL_NOT_INSTALLED_ERROR_MSG,
                     add_verbosity_msg=state.verbosity == 0)
        exit(1)

    try:
        namespace = 'kube-system' if is_current_user_administrator(
        ) else get_kubectl_current_context_namespace()
    except Exception:
        handle_error(logger,
                     Texts.GET_K8S_NAMESPACE_ERROR_MSG,
                     Texts.GET_K8S_NAMESPACE_ERROR_MSG,
                     add_verbosity_msg=state.verbosity == 0)
        exit(1)

    dependency_versions = {}
    for dependency_name, dependency_spec in dependencies.items():
        try:
            supported_versions_sign = '==' if dependency_spec.match_exact_version else '>='
            with spinner(text=Texts.VERIFYING_DEPENDENCY_MSG.format(
                    dependency_name=dependency_name)):
                valid, installed_version = check_dependency(
                    dependency_name=dependency_name,
                    dependency_spec=dependency_spec,
                    namespace=namespace)
            dependency_versions[dependency_name] = installed_version
            logger.info(
                Texts.VERSION_CHECKING_MSG.format(
                    dependency_name=dependency_name,
                    installed_version=installed_version,
                    supported_versions_sign=supported_versions_sign,
                    expected_version=dependency_spec.expected_version))
            if valid:
                click.echo(
                    Texts.DEPENDENCY_VERIFICATION_SUCCESS_MSG.format(
                        dependency_name=dependency_name))
            else:
                click.echo(
                    Texts.INVALID_VERSION_WARNING_MSG.format(
                        dependency_name=dependency_name,
                        installed_version=installed_version,
                        supported_versions_sign=supported_versions_sign,
                        expected_version=dependency_spec.expected_version))
        except FileNotFoundError:
            handle_error(logger,
                         Texts.DEPENDENCY_NOT_INSTALLED_ERROR_MSG.format(
                             dependency_name=dependency_name),
                         Texts.DEPENDENCY_NOT_INSTALLED_ERROR_MSG.format(
                             dependency_name=dependency_name),
                         add_verbosity_msg="client" not in dependency_name)
            exit(1)
        except (RuntimeError, ValueError, TypeError):
            handle_error(logger,
                         Texts.DEPENDENCY_VERSION_CHECK_ERROR_MSG.format(
                             dependency_name=dependency_name),
                         Texts.DEPENDENCY_VERSION_CHECK_ERROR_MSG.format(
                             dependency_name=dependency_name),
                         add_verbosity_msg=state.verbosity == 0)
            exit(1)
        except Exception:
            handle_error(logger,
                         Texts.DEPENDENCY_VERIFICATION_OTHER_ERROR_MSG.format(
                             dependency_name=dependency_name),
                         Texts.DEPENDENCY_VERIFICATION_OTHER_ERROR_MSG.format(
                             dependency_name=dependency_name),
                         add_verbosity_msg=state.verbosity == 0)
            exit(1)

    # Reached only when all dependencies were validated successfully - every
    # failure path above exits the process. (The previous for/else clause was
    # misleading: with no break in the loop, the else block always ran.)
    # Save dependency versions in a file
    save_dependency_versions(dependency_versions)
Пример #25
0
def submit_experiment(
    template: str,
    name: str = None,
    run_kind: RunKinds = RunKinds.TRAINING,
    script_location: str = None,
    script_parameters: Tuple[str, ...] = None,
    pack_params: List[Tuple[str, str]] = None,
    parameter_range: List[Tuple[str, str]] = None,
    parameter_set: Tuple[str, ...] = None,
    script_folder_location: str = None,
    env_variables: List[str] = None,
    requirements_file: str = None
) -> Tuple[List[Run], Dict[str, str], Optional[str]]:

    script_parameters: Union[Tuple[str, ...], Tuple[(
    )]] = script_parameters if script_parameters else ()
    parameter_set: Union[Tuple[str, ...],
                         Tuple[()]] = parameter_set if parameter_set else ()
    parameter_range = parameter_range if parameter_range else []
    pack_params = pack_params if pack_params else []

    log.debug("Submit experiment - start")
    try:
        namespace = get_kubectl_current_context_namespace()
        global submitted_namespace
        submitted_namespace = namespace
    except Exception:
        message = Texts.GET_NAMESPACE_ERROR_MSG
        log.exception(message)
        raise SubmitExperimentError(message)

    try:
        with spinner(text=Texts.PREPARING_RESOURCE_DEFINITIONS_MSG):
            experiment_name, labels = generate_exp_name_and_labels(
                script_name=script_location,
                namespace=namespace,
                name=name,
                run_kind=run_kind)
            runs_list = prepare_list_of_runs(experiment_name=experiment_name,
                                             parameter_range=parameter_range,
                                             parameter_set=parameter_set,
                                             template_name=template)
    except SubmitExperimentError as exe:
        log.exception(str(exe))
        raise exe
    except Exception:
        message = Texts.SUBMIT_PREPARATION_ERROR_MSG
        log.exception(message)
        raise SubmitExperimentError(message)

    global submitted_experiment
    submitted_experiment = experiment_name

    # Ctrl-C handling
    signal.signal(signal.SIGINT, ctrl_c_handler_for_submit)
    signal.signal(signal.SIGTERM, ctrl_c_handler_for_submit)

    try:
        experiment_run_folders = [
        ]  # List of local directories used by experiment's runs
        try:
            cluster_registry_port = get_app_service_node_port(
                nauta_app_name=NAUTAAppNames.DOCKER_REGISTRY)
            # prepare environments for all experiment's runs
            for experiment_run in runs_list:
                if script_parameters and experiment_run.parameters:
                    current_script_parameters = script_parameters + experiment_run.parameters
                elif script_parameters:
                    current_script_parameters = script_parameters
                elif experiment_run.parameters:
                    current_script_parameters = experiment_run.parameters
                else:
                    current_script_parameters = None
                run_folder, script_location, pod_count = \
                    prepare_experiment_environment(experiment_name=experiment_name,
                                                   run_name=experiment_run.name,
                                                   local_script_location=script_location,
                                                   script_folder_location=script_folder_location,  # noqa: E501
                                                   script_parameters=current_script_parameters,
                                                   pack_type=template, pack_params=pack_params,
                                                   cluster_registry_port=cluster_registry_port,
                                                   env_variables=env_variables,
                                                   requirements_file=requirements_file,
                                                   username=namespace,
                                                   run_kind=run_kind)
                # Set correct pod count
                if not pod_count or pod_count < 1:
                    raise SubmitExperimentError(
                        'Unable to determine pod count: make sure that values.yaml '
                        'file in your pack has podCount field with positive integer value.'
                    )

                experiment_run.pod_count = pod_count
                experiment_run_folders.append(run_folder)
                script_name = None
                if script_location is not None:
                    script_name = os.path.basename(script_location)
                # Prepend script_name parameter to run description only for display purposes.
                experiment_run.parameters = script_parameters if not experiment_run.parameters \
                    else experiment_run.parameters + script_parameters
                if experiment_run.parameters and script_name:
                    experiment_run.parameters = (
                        script_name, ) + experiment_run.parameters
                elif script_name:
                    experiment_run.parameters = (script_name, )
        except SubmitExperimentError as e:
            log.exception(Texts.ENV_CREATION_ERROR_MSG)
            e.message += f' {Texts.ENV_CREATION_ERROR_MSG}'
            raise
        except Exception:
            # any error in this step breaks execution of this command
            message = Texts.ENV_CREATION_ERROR_MSG
            log.exception(message)
            # just in case - remove folders that were created with a success
            for experiment_run_folder in experiment_run_folders:
                delete_environment(experiment_run_folder)
        # if ps or pr option is used - first ask whether experiment(s) should be submitted
        if parameter_range or parameter_set:
            click.echo(Texts.CONFIRM_SUBMIT_MSG)
            click.echo(
                tabulate(
                    {
                        RUN_NAME: [run.name for run in runs_list],
                        RUN_PARAMETERS: [
                            "\n".join(run.parameters) if run.parameters else ""
                            for run in runs_list
                        ]
                    },
                    headers=[RUN_NAME, RUN_PARAMETERS],
                    tablefmt=TBLT_TABLE_FORMAT))
            if not click.confirm(Texts.CONFIRM_SUBMIT_QUESTION_MSG,
                                 default=True):
                for experiment_run_folder in experiment_run_folders:
                    delete_environment(experiment_run_folder)
                exit()
        # create Experiment model
        # TODO template_name & template_namespace should be filled after Template implementation
        parameter_range_spec = [
            f'-pr {param_name} {param_value}'
            for param_name, param_value in parameter_range
        ]
        parameter_set_spec = [f'-ps {ps_spec}' for ps_spec in parameter_set]
        experiment_parameters_spec = list(
            script_parameters) + parameter_range_spec + parameter_set_spec
        template_version = get_template_version(template)
        experiment = experiments_model.Experiment(
            name=experiment_name,
            template_name=template,
            parameters_spec=experiment_parameters_spec,
            template_namespace="template-namespace",
            template_version=template_version)
        experiment.create(namespace=namespace, labels=labels)

        with spinner('Uploading experiment...'):
            try:
                upload_experiment_to_git_repo_manager(
                    experiments_workdir=get_run_environment_path(''),
                    experiment_name=experiment_name,
                    run_name=runs_list[0].name,
                    username=namespace)
            except Exception:
                log.exception('Failed to upload experiment.')
                try:
                    experiment.state = experiments_model.ExperimentStatus.FAILED
                    experiment.update()
                except Exception:
                    log.exception(
                        f'Failed to set state of {experiment.name} experiment '
                        f'to {experiments_model.ExperimentStatus.FAILED}')
                raise SubmitExperimentError('Failed to upload experiment.')

        with spinner('Building experiment image...'):
            try:
                image_build_workflow: ExperimentImageBuildWorkflow = ExperimentImageBuildWorkflow.from_yaml(
                    yaml_template_path=
                    f'{Config().config_path}/workflows/{EXP_IMAGE_BUILD_WORKFLOW_SPEC}',
                    username=namespace,
                    experiment_name=experiment_name)
                image_build_workflow.create(namespace=namespace)
                image_build_workflow.wait_for_completion()
            except Exception:
                error_msg = 'Failed to build experiment image.'
                log.exception(error_msg)
                # Try to get workflow logs
                _debug_workflow_logs(workflow=image_build_workflow,
                                     namespace=namespace)

                if image_build_workflow.name:
                    error_msg += f' Run nctl workflow logs {image_build_workflow.name} command for more details.'

                try:
                    experiment.state = experiments_model.ExperimentStatus.FAILED
                    experiment.update()
                except Exception:
                    log.exception(
                        f'Failed to set state of {experiment.name} experiment '
                        f'to {experiments_model.ExperimentStatus.FAILED}')
                raise SubmitExperimentError(error_msg)
        # submit runs
        run_errors: Dict[str, str] = {}
        for run, run_folder in zip(runs_list, experiment_run_folders):
            try:
                run.state = RunStatus.QUEUED
                with spinner(text=Texts.CREATING_RESOURCES_MSG.format(
                        run_name=run.name)):
                    # Add Run object with runKind label and pack params as annotations
                    run.create(namespace=namespace,
                               labels={'runKind': run_kind.value},
                               annotations={
                                   pack_param_name: pack_param_value
                                   for pack_param_name, pack_param_value in
                                   pack_params
                               })
                    submitted_runs.append(run)
                    submit_draft_pack(run_name=run.name,
                                      run_folder=run_folder,
                                      namespace=namespace)
            except Exception as exe:
                delete_environment(run_folder)
                try:
                    run.state = RunStatus.FAILED
                    run_errors[run.name] = str(exe)
                    run.update()
                except Exception as rexe:
                    # update of non-existing run may fail
                    log.debug(Texts.ERROR_DURING_PATCHING_RUN.format(
                        str(rexe)))
        # Delete experiment if no Runs were submitted
        if not submitted_runs:
            click.echo(Texts.SUBMISSION_FAIL_ERROR_MSG)
            delete_k8s_object("experiment", experiment_name)
        # Change experiment status to submitted
        experiment.state = experiments_model.ExperimentStatus.SUBMITTED
        experiment.update()
    except LocalPortOccupiedError as exe:
        click.echo(exe.message)
        raise SubmitExperimentError(exe.message)
    except K8sProxyCloseError:
        log.exception('Error during closing of a proxy for a {}'.format(
            NAUTAAppNames.DOCKER_REGISTRY))
        raise K8sProxyCloseError(Texts.PROXY_CLOSE_ERROR_MSG)
    except K8sProxyOpenError:
        error_msg = Texts.PROXY_OPEN_ERROR_MSG
        log.exception(error_msg)
        raise SubmitExperimentError(error_msg)
    except SubmitExperimentError:
        raise
    except Exception as exe:
        error_msg = Texts.SUBMIT_OTHER_ERROR_MSG
        log.exception(error_msg)
        raise SubmitExperimentError(error_msg) from exe
    finally:
        # remove semaphores from all exp folders
        remove_sempahore(experiment_name)

    log.debug("Submit - finish")
    return runs_list, run_errors, script_location
Пример #26
0
def purge_experiment(exp_name: str, runs_to_purge: List[Run],
                     k8s_es_client: K8sElasticSearchClient,
                     namespace: str) -> Tuple[List[Run], List[Run]]:
    """
       Purge experiment with a given name by cancelling runs given as a parameter. If given experiment
       contains more runs than is in the list of runs - experiment's state remains intact.

       :param exp_name: name of an experiment to which belong runs passed in runs_to_purge parameter
       :param runs_to_purge: list of runs that should be purged, they have to belong to exp_name experiment
       :param k8s_es_client: Kubernetes ElasticSearch client, used to remove logs of purged runs
       :param namespace: namespace where experiment is located
       :return: two lists - first contains runs that were purged successfully, second - those which weren't
       """
    logger.debug(f"Purging {exp_name} experiment ...")

    purged_runs: List[Run] = []
    not_purged_runs: List[Run] = []

    experiment = Experiment.get(name=exp_name, namespace=namespace)
    if not experiment:
        raise RuntimeError(Texts.GET_EXPERIMENT_ERROR_MSG)

    experiment_runs = Run.list(namespace=namespace, exp_name_filter=[exp_name])
    # the whole experiment is cancelled only when every one of its runs is to be purged
    cancel_whole_experiment = (len(experiment_runs) == len(runs_to_purge))
    if cancel_whole_experiment:
        experiment.state = ExperimentStatus.CANCELLING
        experiment.update()

    try:
        cancelled_runs, not_cancelled_runs = cancel_experiment_runs(
            runs_to_cancel=runs_to_purge, namespace=namespace)
        not_purged_runs = not_cancelled_runs

        if cancel_whole_experiment:
            # Delete associated workflows
            experiment_associated_workflows = [
                wf for wf in ArgoWorkflow.list(namespace=namespace)
                if wf.labels.get('experimentName') == experiment.name
            ]
            for wf in experiment_associated_workflows:
                wf.delete()

            # Remove tags from git repo manager
            try:
                delete_exp_tag_from_git_repo_manager(
                    experiment_name=experiment.name,
                    username=namespace,
                    experiments_workdir=get_run_environment_path(''))
            except Exception:
                handle_error(logger, Texts.GIT_REPO_MANAGER_ERROR_MSG,
                             Texts.GIT_REPO_MANAGER_ERROR_MSG)
                raise

        for run in cancelled_runs:
            logger.debug(f"Purging {run.name} run ...")
            click.echo(Texts.PURGING_START_MSG.format(run_name=run.name))
            try:
                with spinner(text=Texts.PURGING_PROGRESS_MSG.format(
                        run_name=run.name)):
                    # purge helm release
                    delete_helm_release(run.name,
                                        namespace=namespace,
                                        purge=True)
                    # delete run
                    kubectl.delete_k8s_object("run", run.name)
                    purged_runs.append(run)
            except Exception as exe:
                not_purged_runs.append(run)
                logger.exception("Error during purging runs.")
                # occurrence of NotFound error may mean, that run has been removed earlier
                if "NotFound" not in str(exe):
                    # bugfix: use the exp_name parameter here - the previous
                    # "experiment_name" was an undefined name and raised a
                    # NameError instead of printing the intended message
                    click.echo(
                        Texts.INCOMPLETE_PURGE_ERROR_MSG.format(
                            experiment_name=exp_name))
                    raise exe
            try:
                # clear run logs; only administrators are allowed to remove them
                if is_current_user_administrator():
                    logger.debug(f"Clearing logs for {run.name} run.")
                    with spinner(text=Texts.PURGING_LOGS_PROGRESS_MSG.format(
                            run_name=run.name)):
                        k8s_es_client.delete_logs_for_run(run=run.name,
                                                          namespace=namespace)
            except Exception:
                # log-clearing failure is best-effort and does not fail the purge
                logger.exception("Error during clearing run logs.")

            # CAN-1099 - docker garbage collector has errors that prevent from correct removal of images
            # try:
            # try to remove images from docker registry
            #    delete_images_for_experiment(exp_name=run.name)
            # except Exception:
            #    logger.exception("Error during removing images.")

        if cancel_whole_experiment and not not_purged_runs:
            try:
                kubectl.delete_k8s_object("experiment", exp_name)
            except Exception:
                # problems during deleting experiments are hidden as if runs were
                # cancelled user doesn't have a possibility to remove them
                logger.exception("Error during purging experiment.")

    except Exception:
        # swallow and report partial progress - the caller decides how to react
        logger.exception("Error during purging experiment.")
        return purged_runs, not_purged_runs

    return purged_runs, not_purged_runs
Пример #27
0
def create(state: State, username: str, list_only: bool, filename: str):
    """
    Adds a new user with a name given as a parameter and generates the user's
    kubeconfig file.

    :param state: CLI state object holding e.g. the verbosity level
    :param username: name of a new user
    :param list_only: if True - generated kubeconfig is only printed, not saved
    :param filename: name of a file the kubeconfig is written to; mutually
                     exclusive with list_only
    """

    if list_only and filename:
        handle_error(user_msg=Texts.F_L_OPTIONS_EXCLUSION_ERROR_MSG)
        sys.exit(1)

    try:
        try:
            validate_user_name(username)
        except ValueError as exe:
            handle_error(
                logger,
                Texts.NAME_VALIDATION_ERROR_MSG.format(username=username),
                str(exe),
                add_verbosity_msg=state.verbosity == 0)
            sys.exit(1)

        # only administrators are allowed to create users
        if not is_current_user_administrator():
            handle_error(logger, Texts.USER_NOT_ADMIN_ERROR_MSG,
                         Texts.USER_NOT_ADMIN_ERROR_MSG)
            sys.exit(1)

        user_state = check_users_presence(username)

        if user_state == UserState.ACTIVE:
            handle_error(
                logger,
                Texts.USER_ALREADY_EXISTS_ERROR_MSG.format(username=username),
                Texts.USER_ALREADY_EXISTS_ERROR_MSG.format(username=username))
            sys.exit(1)

        if user_state == UserState.TERMINATING:
            handle_error(
                logger,
                Texts.USER_BEING_REMOVED_ERROR_MSG.format(username=username),
                Texts.USER_BEING_REMOVED_ERROR_MSG.format(username=username))
            sys.exit(1)

    except Exception:
        handle_error(
            logger,
            Texts.USER_VERIFICATION_ERROR_MSG.format(username=username),
            Texts.USER_VERIFICATION_ERROR_MSG.format(username=username),
            add_verbosity_msg=state.verbosity == 0)
        sys.exit(1)

    try:
        with spinner(text=Texts.CREATING_USER_PROGRESS_MSG.format(
                username=username)):
            chart_location = os.path.join(Config().config_path,
                                          ADD_USER_CHART_NAME)

            nauta_config_map = NAUTAConfigMap()

            tiller_location = nauta_config_map.image_tiller
            tensorboard_service_location = nauta_config_map.image_tensorboard_service

            add_user_command = [
                "helm", "install", "--wait", "--namespace", username, "--name",
                username, chart_location, "--set", "global.nauta=nauta",
                "--set", f"username={username}", "--set",
                "TillerImage={}".format(tiller_location), "--set",
                f"TensorboardServiceImage={tensorboard_service_location}"
            ]
            env = os.environ.copy()
            env['PATH'] = Config().config_path + os.pathsep + env['PATH']
            # NOTE(review): shell=True with a joined string would be unsafe for
            # arbitrary input; username has been validated by validate_user_name
            # above - confirm that validation rejects shell metacharacters
            _, err_code, log_output = execute_system_command(
                ' '.join(add_user_command), env=env, shell=True)

        if err_code:
            handle_error(logger,
                         log_output,
                         Texts.USER_ADD_ERROR_MSG,
                         add_verbosity_msg=state.verbosity == 0)

            # roll back the partially-created user before exiting
            if not delete_user(username):
                handle_error(user_msg=Texts.REMOVE_USER_ERROR_MSG.format(
                    username=username))
            sys.exit(1)

        try:
            users_password = get_users_token(username)
        except Exception:
            handle_error(logger,
                         Texts.PASSWORD_GATHER_ERROR_MSG,
                         Texts.PASSWORD_GATHER_ERROR_MSG,
                         add_verbosity_msg=state.verbosity == 0)
            users_password = ""

        try:
            cert = get_certificate(username)
        except Exception:
            handle_error(logger,
                         Texts.CERT_GATHER_ERROR_MSG,
                         Texts.CERT_GATHER_ERROR_MSG,
                         add_verbosity_msg=state.verbosity == 0)
            cert = ""

    except Exception:
        handle_error(logger,
                     Texts.USER_ADD_ERROR_MSG.format(username=username),
                     Texts.USER_ADD_ERROR_MSG.format(username=username),
                     add_verbosity_msg=state.verbosity == 0)
        if not delete_user(username):
            handle_error(user_msg=Texts.REMOVE_USER_ERROR_MSG.format(
                username=username))
        sys.exit(1)

    if is_user_created(username, 90):
        click.echo(Texts.USER_CREATION_SUCCESS_MSG.format(username=username))
    else:
        # if during 90 seconds a user hasn't been created - app displays information about it
        # but don't step processing the command - config file generated here my be useful later
        # when user has been created
        click.echo(Texts.USER_NOT_READY_ERROR_MSG.format(username=username))

    try:
        kubeconfig = generate_kubeconfig(username, username,
                                         get_kubectl_host(), users_password,
                                         cert)
    except Exception:
        handle_error(logger,
                     Texts.CONFIG_CREATION_ERROR_MSG,
                     Texts.CONFIG_CREATION_ERROR_MSG,
                     add_verbosity_msg=state.verbosity == 0)
        sys.exit(1)

    if list_only:
        click.echo(Texts.LIST_ONLY_HEADER)
        click.echo(kubeconfig)
    else:
        if not filename:
            filename = DEFAULT_FILENAME.format(username)
        try:
            with open(filename, "w") as file:
                file.write(kubeconfig)

            click.echo(Texts.CONFIG_SAVE_SUCCESS_MSG.format(filename=filename))
        except Exception:
            handle_error(logger,
                         Texts.CONFIG_SAVE_FAIL_MSG,
                         Texts.CONFIG_SAVE_FAIL_MSG,
                         add_verbosity_msg=state.verbosity == 0)
            # file write failed - still show the config so the user can save it manually
            click.echo(Texts.CONFIG_SAVE_FAIL_INSTRUCTIONS_MSG)
            click.echo(kubeconfig)
            sys.exit(1)
Пример #28
0
def config(state: State, cpu: str, memory: str):
    """
    Updates CPU number and memory amount used by experiment packs and stores
    the new values in the node configuration file.

    :param state: CLI state object holding e.g. the verbosity level
    :param cpu: new number of CPUs to be used by packs
    :param memory: new amount of memory to be used by packs
    """
    if not cpu or not memory:
        handle_error(logger, Texts.MISSING_ARGUMENTS, Texts.MISSING_ARGUMENTS)
        sys.exit(1)

    if not validate_cpu_settings(cpu):
        handle_error(logger, Texts.CPU_WRONG_FORMAT, Texts.CPU_WRONG_FORMAT)
        sys.exit(1)

    if not validate_memory_settings(memory):
        handle_error(logger, Texts.MEMORY_WRONG_FORMAT,
                     Texts.MEMORY_WRONG_FORMAT)
        sys.exit(1)

    nauta_config = NAUTAConfigMap()

    # requested values must not fall below the cluster's minimal node resources
    minimal_memory = nauta_config.minimal_node_memory_amount
    if minimal_memory and \
       convert_k8s_memory_resource(minimal_memory) > convert_k8s_memory_resource(memory):
        message = Texts.MEMORY_SETTINGS_TOO_LOW.format(
            memory_value=minimal_memory)
        handle_error(logger, message, message)
        sys.exit(1)

    minimal_cpu = nauta_config.minimal_node_cpu_number
    if minimal_cpu and \
       convert_k8s_cpu_resource(minimal_cpu) > convert_k8s_cpu_resource(cpu):
        message = Texts.CPU_SETTINGS_TOO_LOW.format(cpu_value=minimal_cpu)
        handle_error(logger, message, message)
        sys.exit(1)

    node_config_path = os.path.join(Config().config_path,
                                    NODE_CONFIG_FILENAME)

    if not os.path.isfile(node_config_path):
        handle_error(logger, Texts.MISSING_CONFIG_FILE,
                     Texts.MISSING_CONFIG_FILE)
        sys.exit(1)

    with open(node_config_path, 'r+', encoding='utf-8') as node_config_file, \
            spinner(text=Texts.CONFIG_UPDATE):
        node_config = yaml.safe_load(node_config_file)
        current_cpu = str(node_config.get(CPU_NUMBER_FIELDNAME))
        current_memory = str(node_config.get(MEMORY_AMOUNT_FIELDNAME))
        cpu_required_min = str(
            node_config.get(CPU_SYSTEM_REQUIRED_MIN_FIELDNAME))
        cpu_required_percent = str(
            node_config.get(CPU_SYSTEM_REQUIRED_PERCENT_FIELDNAME))
        memory_required_min = str(
            node_config.get(MEMORY_SYSTEM_REQUIRED_MIN_FIELDNAME))
        memory_required_percent = str(
            node_config.get(MEMORY_SYSTEM_REQUIRED_PERCENT_FIELDNAME))

        # str() renders a missing field as the literal "None"
        if not current_cpu or current_cpu == "None" \
                or not current_memory or current_memory == "None":
            handle_error(logger, Texts.CONFIG_FILE_INCORRECT,
                         Texts.CONFIG_FILE_INCORRECT)
            sys.exit(1)

        try:
            override_values_in_packs(
                new_cpu_number=cpu,
                new_memory_amount=memory,
                current_cpu_number=current_cpu,
                current_mem_amount=current_memory,
                cpu_system_required_min=cpu_required_min,
                cpu_system_required_percent=cpu_required_percent,
                mem_system_required_min=memory_required_min,
                mem_system_required_percent=memory_required_percent)
        except Exception:
            logger.exception(Texts.ERROR_DURING_UPDATE)
            handle_error(logger, Texts.ERROR_DURING_UPDATE,
                         Texts.ERROR_DURING_UPDATE)
            sys.exit(1)

        # rewrite the config file in place with the updated values
        node_config_file.seek(0)
        node_config_file.truncate()
        node_config[CPU_NUMBER_FIELDNAME] = cpu
        node_config[MEMORY_AMOUNT_FIELDNAME] = memory
        yaml.dump(node_config,
                  node_config_file,
                  default_flow_style=False,
                  explicit_start=True)

    click.echo(Texts.SUCCESS_MESSAGE)