Пример #1
0
def check_nauta_pods():
    """
    Check if there are failed pods in the 'nauta' namespace. If there are any, display a
    table of their names and save their logs in the logs directory of the config folder.

    Logs are requested with ``tail=1000`` and written one file per returned log entry, named
    ``<pod name>_<container name>.log``. Nothing is printed or written when no pod is in
    the 'Failed' phase.
    """
    import os  # local import so the block does not depend on the file's import section

    pods = get_namespaced_pods(label_selector=None, namespace='nauta')
    failed_pods = [pod for pod in pods if pod.status.phase == 'Failed']
    if not failed_pods:
        return

    click.echo("Following nauta components have failed:")
    # tabulate() only builds the table text, so it has to be echoed explicitly. Every row
    # must be a sequence - a bare string would be split into one column per character.
    click.echo(tabulate([[pod.metadata.name] for pod in failed_pods], headers=["Pod name"]))

    conf_path = Config().config_path
    logs_dir = f'{conf_path}/logs'
    # The logs directory may not exist yet; open() would fail without it.
    os.makedirs(logs_dir, exist_ok=True)

    for pod in failed_pods:
        pod_name = pod.metadata.name
        logs = get_pod_logs(pod=pod, namespace='nauta', tail=1000)
        # container_statuses may be missing on a failed pod; fall back to an index-based
        # name instead of crashing in the middle of saving diagnostics.
        container_statuses = pod.status.container_statuses or []
        for i, log in enumerate(logs):
            if i < len(container_statuses):
                container_name = container_statuses[i].name
            else:
                container_name = f'container_{i}'
            with open(f'{logs_dir}/{pod_name}_{container_name}.log', mode='w') as log_file:
                log_file.writelines(log)

    click.echo('Contact Nauta administrator.')
    click.echo(f'Check logs folder in your config directory({conf_path}) to get more information.')
Пример #2
0
def test_get_namespaced_pods_error(mocker, mocked_k8s_CoreV1Api,
                                   mocked_kubeconfig):
    """An ApiException with status 500 from list_namespaced_pod must reach the caller."""
    server_error = ApiException(status=500)
    mocked_k8s_CoreV1Api.list_namespaced_pod.side_effect = server_error

    with pytest.raises(ApiException):
        get_namespaced_pods(namespace=test_namespace, label_selector='')
Пример #3
0
def test_get_namespaced_pods_not_found(mocker, mocked_k8s_CoreV1Api,
                                       mocked_kubeconfig):
    """An ApiException with status 404 from list_namespaced_pod yields an empty list."""
    not_found = ApiException(status=404)
    mocked_k8s_CoreV1Api.list_namespaced_pod.side_effect = not_found

    result = get_namespaced_pods(namespace=test_namespace, label_selector='')

    assert result == []
Пример #4
0
def test_get_namespaced_pods(mocker, mocked_k8s_CoreV1Api, mocked_kubeconfig):
    """With the mocked API left at its defaults, the call returns a truthy result."""
    result = get_namespaced_pods(namespace=test_namespace, label_selector='')

    assert result
Пример #5
0
def view(context, state: State, experiment_name: str, tensorboard: bool,
         username: str):
    """
    Displays details of an experiment.

    Prints, in order: the run summary table, a table of pods labelled with
    ``runName=<experiment_name>`` (name, uid, conditions or events, containers), the summed
    CPU/memory requests and limits of all their containers, and - when some pods are in the
    pending phase - a hint about insufficient resources with the top consumers.

    :param context: click context, used to invoke the tensorboard command
    :param state: not used inside this function
    :param experiment_name: name of the run to display
    :param tensorboard: when truthy, the tensorboard command is invoked for the experiment
    :param username: namespace to look in; when empty, the namespace of the current kubectl
        context is used

    Exits with code 2 when the run is not found and with code 1 when the resource quantities
    cannot be parsed or any other exception occurs.
    """
    try:
        namespace = username if username else get_kubectl_current_context_namespace()

        run = Run.get(name=experiment_name, namespace=namespace)
        if not run:
            handle_error(user_msg=Texts.EXPERIMENT_NOT_FOUND_ERROR_MSG.format(
                experiment_name=experiment_name))
            exit(2)

        click.echo(
            tabulate([run.cli_representation],
                     headers=EXPERIMENTS_LIST_HEADERS,
                     tablefmt="orgtbl"))

        click.echo(Texts.PODS_PARTICIPATING_LIST_HEADER)

        pods = get_namespaced_pods(label_selector="runName=" + experiment_name,
                                   namespace=namespace)

        tabular_output = []
        containers_resources = []
        pending_pods = []

        for pod in pods:
            status_string = _view_pod_status_string(pod, namespace)

            if pod.status.phase.upper() == PodStatus.PENDING.value:
                pending_pods.append(pod.metadata.name)

            # Containers without a reported status are described with a None state.
            container_states = {
                container_status.name: container_status.state
                for container_status in pod.status.container_statuses or []
            }

            container_details = []
            for container in pod.spec.containers:
                container_details.append(
                    Texts.CONTAINER_DETAILS_MSG.format(
                        name=container.name,
                        status=container_status_to_msg(
                            container_states.get(container.name)),
                        volumes=container_volume_mounts_to_msg(
                            container.volume_mounts, spaces=2),
                        resources=container_resources_to_msg(
                            container.resources, spaces=4)))
                containers_resources.append(container.resources)

            tabular_output.append([
                pod.metadata.name,
                wrap_text(pod.metadata.uid, width=UID_MAX_WIDTH, spaces=0),
                status_string, ''.join(container_details)
            ])

        click.echo(
            tabulate(tabular_output,
                     Texts.PODS_TABLE_HEADERS,
                     tablefmt="orgtbl"))

        try:
            cpu_requests_sum = sum_cpu_resources(
                _view_resource_values(containers_resources, "requests", "cpu"))
            mem_requests_sum = sum_mem_resources(
                _view_resource_values(containers_resources, "requests", "memory"))
            cpu_limits_sum = sum_cpu_resources(
                _view_resource_values(containers_resources, "limits", "cpu"))
            mem_limits_sum = sum_mem_resources(
                _view_resource_values(containers_resources, "limits", "memory"))
        except ValueError as exception:
            error_msg = Texts.RESOURCES_SUM_PARSING_ERROR_MSG.format(
                error_msg=str(exception))
            handle_error(logger, error_msg, error_msg)
            # The sums are undefined at this point; continuing would raise NameError below
            # and bury the specific message under the generic "other error" one.
            exit(1)

        click.echo(Texts.RESOURCES_SUM_LIST_HEADER)
        click.echo(
            tabulate(list(
                zip(Texts.RESOURCES_SUM_TABLE_ROWS_HEADERS, [
                    cpu_requests_sum, mem_requests_sum, cpu_limits_sum,
                    mem_limits_sum
                ])),
                     Texts.RESOURCES_SUM_TABLE_HEADERS,
                     tablefmt="orgtbl"))

        if tensorboard:
            click.echo()
            context.invoke(tensorboard_command,
                           experiment_name=[experiment_name])

        if pending_pods:
            click.echo()
            _view_report_insufficient_resources(namespace, pending_pods)

    except Exception:
        handle_error(logger, Texts.VIEW_OTHER_ERROR_MSG,
                     Texts.VIEW_OTHER_ERROR_MSG)
        exit(1)


def _view_reason_and_message(item):
    """Format the optional reason/message of a pod condition or pod event."""
    msg = "\n"
    if item.reason:
        msg = "\n reason: " + wrap_text(item.reason,
                                        width=POD_CONDITIONS_MAX_WIDTH)
    if item.message:
        msg += ", \n message: " + wrap_text(item.message,
                                            width=POD_CONDITIONS_MAX_WIDTH)
    return msg


def _view_pod_status_string(pod, namespace):
    """Describe a pod by its conditions, or by its events when it has no conditions."""
    if pod.status.conditions:
        return "".join(
            wrap_text(cond.type + ": " + cond.status,
                      width=POD_CONDITIONS_MAX_WIDTH)
            + _view_reason_and_message(cond) + "\n"
            for cond in pod.status.conditions)

    pod_events = get_pod_events(namespace=namespace, name=pod.metadata.name)
    return "".join(
        _view_reason_and_message(event) + "\n" for event in pod_events)


def _view_resource_values(containers_resources, kind, key):
    """Collect the non-empty ``key`` entries of the ``kind`` ('requests'/'limits') mappings."""
    values = []
    for container_resource in containers_resources:
        mapping = getattr(container_resource, kind)
        if mapping and mapping.get(key):
            values.append(mapping[key])
    return values


def _view_report_insufficient_resources(namespace, pending_pods):
    """Tell the user which resources pending pods lack and who consumes the most of them."""
    try:
        cpu = False
        memory = False
        for pod_name in pending_pods:
            for event in get_pod_events(namespace=namespace, name=pod_name):
                # An event may carry no message, and a single message may mention both
                # resources, so the two checks must be independent.
                message = (event.message or "").lower()
                if "insufficient cpu" in message:
                    cpu = True
                if "insufficient memory" in message:
                    memory = True
                if cpu and memory:
                    break
            if cpu and memory:
                break

        if not cpu and not memory:
            exit(0)

        if cpu and memory:
            resources = "number of cpus and amount of memory"
        elif cpu:
            resources = "number of cpus"
        else:
            resources = "amount of memory"

        click.echo(
            Texts.INSUFFICIENT_RESOURCES_MESSAGE.format(resources=resources))
        click.echo()
        top_cpu_users, top_mem_users = get_highest_usage()
        # Slicing already copes with lists shorter than three elements.
        click.echo(
            Texts.TOP_CPU_CONSUMERS.format(consumers=", ".join(
                [res.user_name for res in top_cpu_users[:3]])))
        click.echo(
            Texts.TOP_MEMORY_CONSUMERS.format(consumers=", ".join(
                [res.user_name for res in top_mem_users[:3]])))
    except Exception:
        click.echo(Texts.PROBLEMS_WHILE_GATHERING_USAGE_DATA)
        logger.exception(Texts.PROBLEMS_WHILE_GATHERING_USAGE_DATA_LOGS)
Пример #6
0
def launch(ctx: click.Context, name: str, model_location: str,
           local_model_location: str, model_name: str,
           pack_param: List[Tuple[str, str]], requirements: str,
           runtime: InferenceRuntime):
    """
    Start a new prediction instance serving a trained model (prediction, classification and
    regression tasks).

    The function submits the instance, prints a one-row summary table and the instance
    URL with its authorization header, then polls the instance pods up to 40 times, one
    second apart, until all of them are running with ready containers.

    Exits with code 1 when no model location is given, when submission fails, when the
    URL/authorization data cannot be obtained or when a pod enters the 'Failed' phase.
    Exits with code 0 after reporting that the instance did not become ready in time.
    """

    def _report_and_exit(message, exit_code=1):
        # Shared shape of every logged failure in this command.
        handle_error(logger,
                     message,
                     message,
                     add_verbosity_msg=ctx.obj.verbosity == 0)
        exit(exit_code)

    if not (model_location or local_model_location):
        handle_error(user_msg=Texts.MISSING_MODEL_LOCATION_ERROR_MSG.format(
            local_model_location=local_model_location))
        exit(1)

    if local_model_location:
        validate_local_model_location(local_model_location)

    click.echo('Submitting prediction instance.')
    try:
        if InferenceRuntime(runtime) == InferenceRuntime.OVMS:
            template = INFERENCE_TEMPLATE_OVMS
        else:
            template = INFERENCE_TEMPLATE_TFSERVING

        # The remote location takes precedence over the local one.
        model_path = (model_location or local_model_location).rstrip('/')
        model_name = model_name or os.path.basename(model_path)
        name = name or generate_name(name=model_name,
                                     prefix=INFERENCE_INSTANCE_PREFIX)

        inference_instance = start_inference_instance(
            name=name,
            model_location=model_location,
            model_name=model_name,
            local_model_location=local_model_location,
            template=template,
            requirements=requirements,
            pack_params=pack_param)
        if inference_instance.state == RunStatus.FAILED:
            raise RuntimeError('Inference instance submission failed.')
    except Exception:
        _report_and_exit(Texts.INSTANCE_START_ERROR_MSG)

    summary_row = [
        inference_instance.cli_representation.name,
        model_location,
        inference_instance.cli_representation.status,
    ]
    click.echo(
        tabulate([summary_row],
                 headers=Texts.TABLE_HEADERS,
                 tablefmt=TBLT_TABLE_FORMAT))

    try:
        namespace = get_kubectl_current_context_namespace()
        authorization_header = get_authorization_header(
            service_account_name=name, namespace=namespace)
        inference_instance_url = get_inference_instance_url(
            inference_instance=inference_instance, model_name=model_name)
        instance_info = Texts.INSTANCE_INFO_MSG.format(
            inference_instance_url=inference_instance_url,
            authorization_header=authorization_header)
        click.echo(instance_info)
    except Exception:
        _report_and_exit(Texts.INSTANCE_URL_ERROR_MSG)

    # Poll the instance pods: 40 attempts with a one-second pause between them.
    for _ in range(40):
        pods = get_namespaced_pods(label_selector=f'runName={name}',
                                   namespace=namespace)
        if pods:
            all_running = all(pod.status.phase == 'Running' for pod in pods)
            # Container readiness is only inspected once every pod is running.
            if all_running and all(
                    container.ready
                    for pod in pods
                    for container in pod.status.container_statuses):
                break
            if any(pod.status.phase == 'Failed' for pod in pods):
                _report_and_exit(Texts.INSTANCE_START_ERROR_MSG)

        time.sleep(1)
    else:
        # The loop ran out of attempts without the instance becoming ready.
        _report_and_exit(Texts.PREDICTION_INSTANCE_NOT_READY.format(name=name),
                         exit_code=0)