def check_nauta_pods():
    """
    Check the 'nauta' namespace for failed pods.

    If any pod is in the 'Failed' phase, print a table of their names,
    dump the last 1000 lines of each container's logs into the
    `logs` subdirectory of the user's config directory, and tell the
    user to contact the administrator. Does nothing when no pod failed.
    """
    pods = get_namespaced_pods(label_selector=None, namespace='nauta')
    failed_pods = [pod for pod in pods if pod.status.phase == 'Failed']
    if failed_pods:
        click.echo("Following nauta components have failed:")
        # BUGFIX: the rendered table was previously discarded (tabulate's
        # return value was not echoed) and each row was a bare string,
        # which tabulate splits into one column per character. Wrap each
        # pod name in a single-element row and echo the result.
        click.echo(tabulate([[pod.metadata.name] for pod in failed_pods],
                            headers=["Pod name"]))
        conf_path = Config().config_path
        for pod in failed_pods:
            logs = get_pod_logs(pod=pod, namespace='nauta', tail=1000)
            # One log entry per container; pair each with its container
            # status by index to build a distinct log file name.
            for i, log in enumerate(logs):
                pod_name = pod.metadata.name
                container_name = pod.status.container_statuses[i].name
                with open(f'{conf_path}/logs/{pod_name}_{container_name}.log',
                          mode='w') as log_file:
                    log_file.writelines(log)
        click.echo('Contact Nauta administrator.')
        click.echo(f'Check logs folder in your config directory({conf_path}) to get more information.')
def test_get_namespaced_pods_error(mocker, mocked_k8s_CoreV1Api, mocked_kubeconfig):
    # A server-side error (HTTP 500) must propagate to the caller.
    server_error = ApiException(status=500)
    mocked_k8s_CoreV1Api.list_namespaced_pod.side_effect = server_error
    with pytest.raises(ApiException):
        get_namespaced_pods(label_selector='', namespace=test_namespace)
def test_get_namespaced_pods_not_found(mocker, mocked_k8s_CoreV1Api, mocked_kubeconfig):
    # HTTP 404 is not an error for this helper: it yields an empty list.
    mocked_k8s_CoreV1Api.list_namespaced_pod.side_effect = ApiException(status=404)
    result = get_namespaced_pods(label_selector='', namespace=test_namespace)
    assert result == []
def test_get_namespaced_pods(mocker, mocked_k8s_CoreV1Api, mocked_kubeconfig):
    # Happy path: the mocked API returns pods, so the result is truthy.
    result = get_namespaced_pods(label_selector='', namespace=test_namespace)
    assert result
def view(context, state: State, experiment_name: str, tensorboard: bool, username: str):
    """
    Display details of an experiment: its run summary, the pods taking part
    in it (with per-container status, volumes and resources), the summed
    CPU/memory requests and limits, and — for pending pods — a diagnosis of
    insufficient-resource events plus the current top resource consumers.

    :param context: click context, used to invoke the tensorboard command
    :param state: CLI state object (unused directly in this body)
    :param experiment_name: name of the experiment (Run) to display
    :param tensorboard: when True, also launch tensorboard for the experiment
    :param username: if given, acts as the namespace to look in; otherwise
                     the current kubectl context namespace is used
    Exits with code 2 when the experiment is not found, 1 on unexpected error.
    """
    try:
        # The namespace is either the explicit username or the current
        # kubectl context namespace.
        if username:
            namespace = username
        else:
            namespace = get_kubectl_current_context_namespace()

        run = Run.get(name=experiment_name, namespace=namespace)
        if not run:
            handle_error(user_msg=Texts.EXPERIMENT_NOT_FOUND_ERROR_MSG.format(
                experiment_name=experiment_name))
            exit(2)

        # Experiment summary table.
        click.echo(
            tabulate([run.cli_representation],
                     headers=EXPERIMENTS_LIST_HEADERS,
                     tablefmt="orgtbl"))

        click.echo(Texts.PODS_PARTICIPATING_LIST_HEADER)

        # Pods belonging to this experiment are selected by the runName label.
        pods = get_namespaced_pods(label_selector="runName=" + experiment_name,
                                   namespace=namespace)

        tabular_output = []          # one row per pod for the pods table
        containers_resources = []    # resources of every container, for the sums below
        pending_pods = []            # names of pods stuck in PENDING phase
        for pod in pods:
            status_string = ""
            if pod.status.conditions:
                # Render each pod condition as "type: status" plus optional
                # reason/message, wrapped to POD_CONDITIONS_MAX_WIDTH.
                for cond in pod.status.conditions:
                    msg = "\n" if not cond.reason else "\n reason: " + \
                        wrap_text(cond.reason, width=POD_CONDITIONS_MAX_WIDTH)
                    msg = msg + ", \n message: " + wrap_text(cond.message,
                                                             width=POD_CONDITIONS_MAX_WIDTH) \
                        if cond.message else msg
                    status_string += wrap_text(
                        cond.type + ": " + cond.status,
                        width=POD_CONDITIONS_MAX_WIDTH) + msg + "\n"
            else:
                # No conditions yet — fall back to pod events for status info.
                pod_events = get_pod_events(namespace=namespace,
                                            name=pod.metadata.name)
                for event in pod_events:
                    msg = "\n" if not event.reason else "\n reason: " + \
                        wrap_text(event.reason, width=POD_CONDITIONS_MAX_WIDTH)
                    msg = msg + ", \n message: " + wrap_text(event.message,
                                                             width=POD_CONDITIONS_MAX_WIDTH) \
                        if event.message else msg
                    status_string += msg + "\n"

            if pod.status.phase.upper() == PodStatus.PENDING.value:
                pending_pods.append(pod.metadata.name)

            # Map container name -> state; defaultdict returns None for
            # containers that have no status yet.
            container_statuses = defaultdict(lambda: None)
            if pod.status.container_statuses:
                for container_status in pod.status.container_statuses:
                    container_statuses[
                        container_status.name] = container_status.state

            container_details = []
            for container in pod.spec.containers:
                container_description = Texts.CONTAINER_DETAILS_MSG.format(
                    name=container.name,
                    status=container_status_to_msg(
                        container_statuses[container.name]),
                    volumes=container_volume_mounts_to_msg(
                        container.volume_mounts, spaces=2),
                    resources=container_resources_to_msg(container.resources,
                                                         spaces=4))
                container_details.append(container_description)
                containers_resources.append(container.resources)

            container_details = ''.join(container_details)

            tabular_output.append([
                pod.metadata.name,
                wrap_text(pod.metadata.uid, width=UID_MAX_WIDTH, spaces=0),
                status_string, container_details
            ])
        click.echo(
            tabulate(tabular_output,
                     Texts.PODS_TABLE_HEADERS,
                     tablefmt="orgtbl"))

        # Sum requests/limits over all containers; entries without the given
        # resource key are skipped.
        try:
            cpu_requests_sum = sum_cpu_resources([
                container_resource.requests["cpu"]
                for container_resource in containers_resources
                if container_resource.requests
                and container_resource.requests.get("cpu")
            ])
            mem_requests_sum = sum_mem_resources([
                container_resource.requests["memory"]
                for container_resource in containers_resources
                if container_resource.requests
                and container_resource.requests.get("memory")
            ])
            cpu_limits_sum = sum_cpu_resources([
                container_resource.limits["cpu"]
                for container_resource in containers_resources
                if container_resource.limits
                and container_resource.limits.get("cpu")
            ])
            mem_limits_sum = sum_mem_resources([
                container_resource.limits["memory"]
                for container_resource in containers_resources
                if container_resource.limits
                and container_resource.limits.get("memory")
            ])
        except ValueError as exception:
            # NOTE(review): handle_error does not exit here, so on a parsing
            # failure the *_sum variables stay undefined and the tabulate call
            # below raises NameError, which is swallowed by the outer
            # except-Exception handler — confirm this is intended.
            handle_error(
                logger,
                Texts.RESOURCES_SUM_PARSING_ERROR_MSG.format(
                    error_msg=str(exception)),
                Texts.RESOURCES_SUM_PARSING_ERROR_MSG.format(
                    error_msg=str(exception)))

        click.echo(Texts.RESOURCES_SUM_LIST_HEADER)
        click.echo(
            tabulate(list(
                zip(Texts.RESOURCES_SUM_TABLE_ROWS_HEADERS, [
                    cpu_requests_sum, mem_requests_sum, cpu_limits_sum,
                    mem_limits_sum
                ])),
                Texts.RESOURCES_SUM_TABLE_HEADERS,
                tablefmt="orgtbl"))

        if tensorboard:
            click.echo()
            context.invoke(tensorboard_command, experiment_name=[experiment_name])

        if pending_pods:
            click.echo()
            try:
                # Scan pending pods' events for insufficient cpu/memory
                # messages; stop early once both have been seen.
                cpu = False
                memory = False
                for pod in pending_pods:
                    events_list = get_pod_events(namespace=namespace, name=pod)
                    for event in events_list:
                        if "insufficient cpu" in event.message.lower():
                            cpu = True
                        elif "insufficient memory" in event.message.lower():
                            memory = True
                        if cpu and memory:
                            break
                    if cpu and memory:
                        break

                # Pods pending for some other reason — nothing to report.
                if not cpu and not memory:
                    exit(0)

                if cpu and memory:
                    resources = "number of cpus and amount of memory"
                elif cpu:
                    resources = "number of cpus"
                else:
                    resources = "amount of memory"

                click.echo(
                    Texts.INSUFFICIENT_RESOURCES_MESSAGE.format(
                        resources=resources))
                click.echo()
                # Show up to the top 3 CPU and memory consumers.
                top_cpu_users, top_mem_users = get_highest_usage()
                click.echo(
                    Texts.TOP_CPU_CONSUMERS.format(consumers=", ".join([
                        res.user_name
                        for res in top_cpu_users[0:3 if len(top_cpu_users
                                                            ) > 2 else len(top_cpu_users)]
                    ])))
                click.echo(
                    Texts.TOP_MEMORY_CONSUMERS.format(consumers=", ".join([
                        res.user_name
                        for res in top_mem_users[0:3 if len(top_mem_users
                                                            ) > 2 else len(top_mem_users)]
                    ])))
            except Exception:
                # Usage diagnosis is best-effort; report and carry on.
                click.echo(Texts.PROBLEMS_WHILE_GATHERING_USAGE_DATA)
                logger.exception(
                    Texts.PROBLEMS_WHILE_GATHERING_USAGE_DATA_LOGS)
    except Exception:
        handle_error(logger, Texts.VIEW_OTHER_ERROR_MSG,
                     Texts.VIEW_OTHER_ERROR_MSG)
        exit(1)
def launch(ctx: click.Context, name: str, model_location: str,
           local_model_location: str, model_name: str,
           pack_param: List[Tuple[str, str]], requirements: str,
           runtime: InferenceRuntime):
    """
    Starts a new prediction instance that can be used for performing
    prediction, classification and regression tasks on trained model.

    :param ctx: click context (verbosity read from ctx.obj.verbosity)
    :param name: instance name; generated from the model name when empty
    :param model_location: remote model location (takes precedence in output)
    :param local_model_location: local model path; validated when given
    :param model_name: model name; defaults to the basename of the model path
    :param pack_param: extra (key, value) pack parameters
    :param requirements: requirements specification passed to the instance
    :param runtime: inference runtime selecting the OVMS or TF-Serving template
    Exits 1 on submission/URL/pod failure, 0 on pod-readiness timeout.
    """
    # At least one of the two model locations must be provided.
    if not model_location and not local_model_location:
        handle_error(user_msg=Texts.MISSING_MODEL_LOCATION_ERROR_MSG.format(
            local_model_location=local_model_location))
        exit(1)

    if local_model_location:
        validate_local_model_location(local_model_location)

    click.echo('Submitting prediction instance.')

    try:
        # Pick the serving template matching the requested runtime.
        template = INFERENCE_TEMPLATE_OVMS if InferenceRuntime(runtime) == InferenceRuntime.OVMS else \
            INFERENCE_TEMPLATE_TFSERVING
        # Remote location wins when both are given; trailing slashes stripped
        # so basename() yields the model directory name.
        model_path = model_location.rstrip(
            '/') if model_location else local_model_location.rstrip('/')
        model_name = model_name if model_name else os.path.basename(model_path)
        name = name if name else generate_name(
            name=model_name, prefix=INFERENCE_INSTANCE_PREFIX)
        inference_instance = start_inference_instance(
            name=name,
            model_location=model_location,
            model_name=model_name,
            local_model_location=local_model_location,
            template=template,
            requirements=requirements,
            pack_params=pack_param)
        # Treat an immediately-FAILED run like any other submission error.
        if inference_instance.state == RunStatus.FAILED:
            raise RuntimeError('Inference instance submission failed.')
    except Exception:
        handle_error(logger,
                     Texts.INSTANCE_START_ERROR_MSG,
                     Texts.INSTANCE_START_ERROR_MSG,
                     add_verbosity_msg=ctx.obj.verbosity == 0)
        exit(1)

    # Show the submitted instance's name, location and status.
    click.echo(
        tabulate([[
            inference_instance.cli_representation.name, model_location,
            inference_instance.cli_representation.status
        ]],
                 headers=Texts.TABLE_HEADERS,
                 tablefmt=TBLT_TABLE_FORMAT))

    try:
        namespace = get_kubectl_current_context_namespace()
        # Auth header is bound to a service account named after the instance.
        authorization_header = get_authorization_header(
            service_account_name=name, namespace=namespace)
        inference_instance_url = get_inference_instance_url(
            inference_instance=inference_instance, model_name=model_name)
        click.echo(
            Texts.INSTANCE_INFO_MSG.format(
                inference_instance_url=inference_instance_url,
                authorization_header=authorization_header))
    except Exception:
        handle_error(logger,
                     Texts.INSTANCE_URL_ERROR_MSG,
                     Texts.INSTANCE_URL_ERROR_MSG,
                     add_verbosity_msg=ctx.obj.verbosity == 0)
        exit(1)

    # wait till pod is ready - no more than 40 seconds
    # Polls once per second; the for/else 'else' branch fires only when the
    # loop exhausts all 40 attempts without a break (timeout).
    for _ in range(40):
        pods = get_namespaced_pods(label_selector=f'runName={name}',
                                   namespace=namespace)
        # Ready: every pod Running and every container reporting ready.
        if pods and all(pod.status.phase == 'Running' for pod in pods) \
                and all(container.ready for pod in pods
                        for container in pod.status.container_statuses):
            break
        if pods and any(pod.status.phase == 'Failed' for pod in pods):
            handle_error(logger,
                         Texts.INSTANCE_START_ERROR_MSG,
                         Texts.INSTANCE_START_ERROR_MSG,
                         add_verbosity_msg=ctx.obj.verbosity == 0)
            exit(1)
        time.sleep(1)
    else:
        # NOTE(review): timeout path exits with code 0 even though it reports
        # the instance as not ready — confirm this is intentional.
        handle_error(logger,
                     Texts.PREDICTION_INSTANCE_NOT_READY.format(name=name),
                     Texts.PREDICTION_INSTANCE_NOT_READY.format(name=name),
                     add_verbosity_msg=ctx.obj.verbosity == 0)
        exit(0)