def get_resource_usages(self) -> List[ResourceUsage]:
    usages_copy = self.__get_usages_copy()
    if usages_copy is None:
        log.warning("No usage snapshot")
        return []
    else:
        return usages_copy
Example #2
def report_cpu_event(request: AllocateRequest, response: AllocateResponse):
    event_log_manager = get_event_log_manager()
    if event_log_manager is None:
        log.warning("Event log manager is not set.")
        return

    event_log_manager.report_event(get_cpu_event(request, response))
Example #3
    def _handle(self, event):
        try:
            if not self.__relevant(event):
                return

            if not managers_are_initialized():
                log.warning("Managers are not yet initialized")
                return None

            self.handling_event(event, 'oversubscribing workloads')

            with self.__window_lock:
                if datetime.utcnow() < self.__window_end_time:
                    self.__skip_count += 1
                    self.handled_event(
                        event,
                        'skipping oversubscribe - a window is currently active'
                    )
                    return

                self.__publish_window(event)

        except Exception:
            self.__fail_count += 1
            log.error(
                "Event handler: '{}' failed to handle event: '{}'".format(
                    self.__class__.__name__, event))
Example #4
    def report_metrics(self, tags):
        if self.__registry is None:
            log.debug(
                "Not reporting metrics because there's no registry available yet."
            )
            return

        wm = get_workload_manager()
        if wm is None:
            log.debug(
                "Not reporting metrics because there's no workload manager available yet."
            )
            return

        pcp_usage = self.get_pcp_usage()
        if CPU_USAGE not in pcp_usage.keys():
            log.warning("No CPU usage in PCP usage.")
            return

        usage = pcp_usage[CPU_USAGE]
        static_pool_cpu_usage = self.__get_pool_usage(STATIC, usage)
        burst_pool_cpu_usage = self.__get_pool_usage(BURST, usage)

        self.__registry.gauge(STATIC_POOL_USAGE_KEY,
                              tags).set(static_pool_cpu_usage)
        self.__registry.gauge(BURST_POOL_USAGE_KEY,
                              tags).set(burst_pool_cpu_usage)
Example #5
    def get_cpu_predictions(
            self, workloads: List[Workload],
            resource_usage: GlobalResourceUsage) -> Optional[Dict[str, float]]:
        pod_manager = get_pod_manager()
        if pod_manager is None:
            return None

        pods = []
        for w in workloads:
            pod = pod_manager.get_pod(w.get_id())
            if pod is None:
                log.warning("Failed to get pod for workload: %s", w.get_id())
            else:
                pods.append(pod)

        resource_usage_predictions = self.get_predictions(pods, resource_usage)

        predictions = {}
        if resource_usage_predictions is None:
            log.error("Got no resource usage predictions")
            return predictions
        else:
            log.info("Got resource usage predictions: %s",
                     json.dumps(resource_usage_predictions.raw))

        for w_id, prediction in resource_usage_predictions.predictions.items():
            predictions[w_id] = get_first_window_cpu_prediction(prediction)

        return predictions
Example #6
    def get_predictions(
        self, running_pods: List[V1Pod], resource_usage: GlobalResourceUsage
    ) -> Optional[ResourceUsagePredictions]:
        config_manager = get_config_manager()
        if config_manager is None:
            log.warning("Config manager not yet set.")
            return None

        client_crt = get_client_cert_path(config_manager)
        client_key = get_client_key_path(config_manager)
        if client_crt is None or client_key is None:
            log.error("Failed to generate credential paths")
            return None

        url = get_url(config_manager)
        if url is None:
            log.error("Unable to generate prediction service url")
            return None

        body = self.__get_body(running_pods, resource_usage)
        if body is None:
            log.error("Unable to generate a prediction request body")
            return None

        predictions = get_predictions(client_crt, client_key, url, body)
        if predictions is None:
            log.error("Failed to get predictions")
            return None

        return ResourceUsagePredictions(predictions)
Example #7
    def report_metrics(self, tags):
        if self.__registry is None:
            log.debug("Not reporting metrics because there's no registry available yet.")
            return

        wm = get_workload_manager()
        if wm is None:
            log.debug("Not reporting metrics because there's no workload manager available yet.")
            return

        workload_ids = wm.get_workload_map_copy().keys()
        usage_dict = self.__get_usage_dict(workload_ids)
        if CPU_USAGE not in usage_dict.keys():
            log.warning("No CPU usage in usage: %s", usage_dict)
            return

        usage = usage_dict[CPU_USAGE]
        static_pool_cpu_usage = self.__get_pool_usage(STATIC, usage)
        burst_pool_cpu_usage = self.__get_pool_usage(BURST, usage)

        self.__registry.gauge(STATIC_POOL_USAGE_KEY, tags).set(static_pool_cpu_usage)
        self.__registry.gauge(BURST_POOL_USAGE_KEY, tags).set(burst_pool_cpu_usage)

        with self.__metric_lock:
            self.__registry.counter(GET_RESOURCE_USAGE_FAILURE, tags).increment(self.__get_resource_usage_failure_count)
            self.__get_resource_usage_failure_count = 0
Example #8
    def __get_simple_cpu_predictions(self) -> Dict[str, float]:
        cpu_predictor = self.__cpu_usage_predictor_manager.get_cpu_predictor()
        if cpu_predictor is None:
            log.error("Failed to get cpu predictor")
            return {}

        workloads = self.__workload_manager.get_workloads()
        if len(workloads) == 0:
            log.warning("No workloads, skipping cpu usage prediction")
            return {}

        workload_ids = [w.get_id() for w in workloads]
        resource_usage = self.__workload_monitor_manager.get_resource_usage(
            workload_ids)

        log.info("Getting simple cpu predictions...")
        cpu_predictions = cpu_predictor.get_cpu_predictions(
            workloads, resource_usage)
        if cpu_predictions is None:
            log.error("Failed to get cpu predictions")
            return {}
        else:
            log.info("Got simple cpu predictions: %s",
                     json.dumps(cpu_predictions))
            return cpu_predictions
Example #9
    def get_cpu_predictions(
            self, workloads: List[Workload],
            resource_usage: GlobalResourceUsage) -> Optional[Dict[str, float]]:
        pods = []
        for w in workloads:
            if w.get_object_type() is not KubernetesWorkload:
                log.warning(
                    "Cannot predict non Kubernetes workload %s: %s is not %s",
                    w.get_id(), w.get_object_type(), KubernetesWorkload)
                continue

            pods.append(w.get_pod())

        resource_usage_predictions = self.get_predictions(pods, resource_usage)

        predictions = {}
        if resource_usage_predictions is None:
            log.error("Got no resource usage predictions")
            return predictions
        else:
            log.info("Got resource usage predictions: %s",
                     json.dumps(resource_usage_predictions.raw))

        for w_id, prediction in resource_usage_predictions.predictions.items():
            predictions[w_id] = get_first_window_cpu_prediction(prediction)

        return predictions
Example #10
def __has_required_labels(container):
    for l in REQUIRED_LABELS:
        if l not in container.labels:
            log.warning(
                "Found running workload: '{}' without expected label: '{}'".
                format(container.name, l))
            return False

    return True
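
The label check above only needs an object exposing name and labels, so it can be exercised without a real Docker daemon. A minimal sketch, assuming the function above (and its log object) are in scope in the same module; the REQUIRED_LABELS values and container objects below are hypothetical stand-ins:

from collections import namedtuple

# Hypothetical stand-ins for illustration only; the real REQUIRED_LABELS
# constant and container objects come from titus-isolate and the Docker SDK.
REQUIRED_LABELS = ["cpu", "workload_type"]
FakeContainer = namedtuple("FakeContainer", ["name", "labels"])

good = FakeContainer(name="w1", labels={"cpu": "2", "workload_type": "static"})
bad = FakeContainer(name="w2", labels={"cpu": "2"})

assert __has_required_labels(good) is True    # all required labels present
assert __has_required_labels(bad) is False    # missing 'workload_type', logs a warning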
Example #11
    def __get_usage_path(self):
        if self.__usage_path is not None:
            return self.__usage_path

        try:
            self.__usage_path = get_usage_all_path(self.__workload.get_id())
        except FileNotFoundError:
            log.warning("No cpu usage path for workload: '{}'".format(
                self.__workload.get_id()))

        return self.__usage_path
Example #12
    def __get_address(self) -> Optional[str]:
        config_manager = get_config_manager()
        region = config_manager.get_region()
        env = config_manager.get_environment()
        format_str = config_manager.get_str(EVENT_LOG_FORMAT_STR)
        if format_str is None:
            log.warning("Keystone is not enabled in this region env: %s %s", region, env)
            return None

        stream = 'titus_isolate'
        return format_str.format(region, env, stream)
Example #13
def get_grpc_cell_name(config_manager):
    endpoint = config_manager.get_cached_str(GRPC_REMOTE_ALLOC_ENDPOINT, None)
    if endpoint is None:
        log.warning("Could not get grpc remote allocator endpoint address.")
        return UNKNOWN_CELL
    stub = IsolationServiceStub(grpc.insecure_channel(endpoint))
    res = stub.GetCurrentCell(CurrentCellRequest(), timeout=5.0)
    if res.cell_id == "":
        log.warning("Service returned empty grpc cell header")
        return UNKNOWN_CELL
    return res.cell_id
Example #14
def get_http_cell_name(config_manager):
    url = config_manager.get_cached_str(REMOTE_ALLOCATOR_URL)
    if url is None:
        log.warning("No remote solver URL specified.")
        return UNKNOWN_CELL

    timeout = config_manager.get_cached_int(MAX_SOLVER_RUNTIME,
                                            DEFAULT_MAX_SOLVER_RUNTIME)

    response = requests.get(url, timeout=timeout)
    cell_name = response.headers.get(TITUS_ISOLATE_CELL_HEADER, None)
    if cell_name is None:
        log.warning("Titus isolation cell header is not set.")
        return UNKNOWN_CELL
    return cell_name
Example #15
    def get_cpu_usage(self):
        usage_path = self.__get_usage_path()
        if usage_path is None:
            return None

        if not os.path.isfile(usage_path):
            log.warning("cpu usage path does not exist: {}".format(usage_path))
            return

        with open(usage_path, 'r') as f:
            timestamp = datetime.datetime.utcnow()
            content = f.read()

        cpu_usage_rows = parse_cpuacct_usage_all(content)
        return CpuUsageSnapshot(timestamp, cpu_usage_rows)
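
get_cpu_usage reads the cgroup v1 cpuacct.usage_all file, which is a header line followed by one row of per-CPU user/system counters in nanoseconds. The project's parse_cpuacct_usage_all is not shown here; below is a minimal sketch of parsing that format, with a hypothetical row type standing in for the project's own:

from collections import namedtuple

CpuUsageRow = namedtuple("CpuUsageRow", ["cpu_id", "user", "system"])  # hypothetical row type

def parse_usage_all_sketch(content: str):
    """Parse cpuacct.usage_all: a 'cpu user system' header, then one row per CPU."""
    rows = []
    for line in content.strip().splitlines()[1:]:   # skip the header line
        cpu_id, user, system = line.split()
        rows.append(CpuUsageRow(int(cpu_id), int(user), int(system)))
    return rows

sample = "cpu user system\n0 1111 2222\n1 3333 4444"
assert parse_usage_all_sketch(sample)[1].user == 3333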
Example #16
    def __init__(self, raw: dict):
        self.raw = raw
        self.model_version = raw.get(MODEL_VERSION, "UNKNOWN_MODEL_VERSION")
        self.model_instance_id = raw.get(MODEL_INSTANCE_ID,
                                         "UNKNOWN_MODEL_INSTANCE_ID")
        self.prediction_ts_ms = int(raw.get(PREDICTION_TS_MS, '0'))
        self.metadata = raw.get(META_DATA, {})

        self.predictions = {}
        raw_predictions = raw.get(PREDICTIONS)
        if raw_predictions is not None:
            for p in raw_predictions:
                job_id = p.get(JOB_ID, "UNKNOWN_JOB_ID")
                self.predictions[job_id] = ResourceUsagePrediction(p)
        else:
            log.warning("No predictions present")
Example #17
    def get_free_threads(self,
                         cpu: Cpu,
                         workload_map: Dict[str, Workload],
                         cpu_usage: Dict[str, float] = None) -> List[Thread]:

        if cpu_usage is None:
            log.warning(
                "CPU usage is required, defaulting to EMPTY threads being free."
            )
            return cpu.get_empty_threads()

        free_threads = []
        for c in get_free_cores(self.__threshold, cpu, workload_map,
                                cpu_usage):
            free_threads += c.get_threads()

        return free_threads
Example #18
    def __validate_prom_response(resp) -> bool:
        if "data" not in resp:
            log.error(
                "Unexpected Prometheus response.  No 'data' field in response")
            return False

        data = resp["data"]
        if "result" not in data:
            log.error(
                "Unexpected Prometheus response.  No 'result' field in data")
            return False

        result = data["result"]
        if len(result) == 0:
            log.warning("Empty result returned by Prometheus")
            return False

        return True
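
The validator above only inspects the dictionary shape of a Prometheus HTTP API reply ({"status": ..., "data": {"result": [...]}}). A minimal sketch exercising it with hand-built responses, assuming the method is reachable as a plain helper and a log object is configured:

ok = {"status": "success",
      "data": {"resultType": "vector",
               "result": [{"metric": {"instance": "i-123"}, "value": [1600000000, "0.5"]}]}}
missing_result = {"status": "success", "data": {"resultType": "vector"}}
empty = {"status": "success", "data": {"result": []}}

assert __validate_prom_response(ok) is True
assert __validate_prom_response(missing_result) is False   # no 'result' field in data
assert __validate_prom_response(empty) is False            # empty result set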
Example #19
    def rebalance(self, request: AllocateRequest) -> AllocateResponse:
        self.__call_meta = {}
        cpu = request.get_cpu()
        cpu_usage = request.get_cpu_usage()
        workloads = request.get_workloads()
        self.__cnt_rebalance_calls += 1

        if len(workloads) == 0:
            log.warning("Ignoring rebalance of empty CPU.")
            self.__call_meta['rebalance_empty'] = 1
            return AllocateResponse(cpu, self.get_name(), self.__call_meta)

        log.info("Rebalancing with predictions...")
        curr_ids_per_workload = cpu.get_workload_ids_to_thread_ids()

        return AllocateResponse(
            self.__compute_allocation(cpu, None, workloads, curr_ids_per_workload, cpu_usage, None),
            self.get_name(),
            self.__call_meta)
Example #20
def get_current_workloads(docker_client):
    workloads = []
    for container in docker_client.containers.list():
        workload_id = container.name
        if __has_required_labels(container):
            try:
                labels = container.labels
                cpu = int(__get_value(labels, CPU_LABEL_KEY, -1))
                mem = int(__get_value(labels, MEM_LABEL_KEY, -1))
                disk = int(__get_value(labels, DISK_LABEL_KEY, -1))
                network = int(__get_value(labels, NETWORK_LABEL_KEY, -1))
                app_name = __get_value(labels, APP_NAME_LABEL_KEY)
                owner_email = __get_value(labels, OWNER_EMAIL_LABEL_KEY)
                command = __get_value(labels, COMMAND_LABEL_KEY)
                entrypoint = __get_value(labels, ENTRYPOINT_LABEL_KEY)
                job_type = __get_value(labels, JOB_TYPE_LABEL_KEY)
                workload_type = __get_value(labels, WORKLOAD_TYPE_LABEL_KEY)
                image = __get_image(container)

                workloads.append(
                    Workload(identifier=workload_id,
                             thread_count=cpu,
                             mem=mem,
                             disk=disk,
                             network=network,
                             app_name=app_name,
                             owner_email=owner_email,
                             image=image,
                             command=command,
                             entrypoint=entrypoint,
                             job_type=job_type,
                             workload_type=workload_type))
                log.info("Found running workload: '{}'".format(workload_id))
            except:
                log.exception(
                    "Failed to parse labels for container: '{}'".format(
                        container.name))
        else:
            log.warning(
                "Found running workload: '{}' without expected labels".format(
                    workload_id))

    return workloads
Example #21
def get_current_workloads(docker_client):
    workloads = []
    for container in docker_client.containers.list():
        workload_id = container.name
        if __has_required_labels(container):
            try:
                cpu = int(container.labels[CPU_LABEL_KEY])
                workload_type = container.labels[WORKLOAD_TYPE_LABEL_KEY]
                workloads.append(Workload(workload_id, cpu, workload_type))
                log.info("Found running workload: '{}'".format(workload_id))
            except:
                log.exception(
                    "Failed to parse labels for container: '{}'".format(
                        container.name))
        else:
            log.warning(
                "Found running workload: '{}' without expected label: '{}'".
                format(workload_id, CPU_LABEL_KEY))

    return workloads
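
A short usage sketch for the function above, assuming the standard Docker SDK for Python; whatever containers (and labels) are running locally determine the result:

import docker  # pip install docker

client = docker.from_env()  # connect to the local Docker daemon
for workload in get_current_workloads(client):
    # get_id()/get_thread_count() match how Workload objects are used in the other examples here
    print(workload.get_id(), workload.get_thread_count())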
Example #22
    def __process_events(self):
        while True:
            try:
                msg = self.__q.get()
                if not self.__enabled:
                    log.warning("Dropping keystone event because keystone is disabled")
                    continue

                log.debug("Sending event log message: {}".format(msg))
                response = send_event_msg(msg, self.__address)

                if response.status_code != 200:
                    log.error("Re-enqueuing failed event log message: {}".format(response.content))
                    self.__retry_msg_count += 1
                    self.__q.put_nowait(msg)
                else:
                    self.__succeeded_msg_count += 1
            except Exception:
                self.__failed_msg_count += 1
                log.error("Failed to process event log message.")
Example #23
def get_cell_name():
    config_manager = get_config_manager()
    if config_manager is None:
        log.warning("Config manager is not yet set.")
        return UNKNOWN_CELL

    if not is_primary_allocator_grpc(config_manager):
        log.info("Fetching cell for an http allocator")
        fetch_fun = get_http_cell_name
    else:
        log.info("Fetching cell for the grpc allocator")
        fetch_fun = get_grpc_cell_name

    try:
        cell_name = fetch_fun(config_manager)
        log.info("Cell: %s", cell_name)
        return cell_name
    except Exception:
        log.error("Failed to determine isolation cell.")
        return UNKNOWN_CELL
Example #24
    def get_cpu_predictions(self, workloads: List[Workload], resource_usage: GlobalResourceUsage) \
            -> Optional[Dict[str, float]]:

        config_manager = get_config_manager()
        if config_manager is None:
            log.warning("Config manager is not yet set")
            return {}

        cpu_usage = resource_usage.get_cpu_usage()
        if cpu_usage is None:
            log.warning("No cpu usage")
            return {}
        pred_env = PredEnvironment(config_manager.get_region(),
                                   config_manager.get_environment(),
                                   datetime.utcnow().hour)

        predictions = {}
        for workload in workloads:
            workload_cpu_usage = cpu_usage.get(workload.get_id(), None)
            if workload_cpu_usage is None:
                log.warning("No CPU usage for workload: %s", workload.get_id())
                continue

            workload_cpu_usage = [float(u) for u in workload_cpu_usage]
            pred_cpus = self.predict(workload, workload_cpu_usage, pred_env)
            predictions[workload.get_id()] = pred_cpus

        return predictions
Example #25
def get_cell_name():
    config_manager = get_config_manager()
    if config_manager is None:
        log.warning("Config manager is not yet set.")
        return UNKNOWN_CELL

    url = config_manager.get_str(REMOTE_ALLOCATOR_URL)
    if url is None:
        log.warning("No remote solver URL specified.")
        return UNKNOWN_CELL

    timeout = config_manager.get_int(MAX_SOLVER_RUNTIME,
                                     DEFAULT_MAX_SOLVER_RUNTIME)

    try:
        response = requests.get(url, timeout=timeout)
        cell_name = response.headers.get(TITUS_ISOLATE_CELL_HEADER, None)
        if cell_name is None:
            log.warning("Titus isolation cell header is not set.")
            return UNKNOWN_CELL
        else:
            return cell_name
    except:
        log.exception("Failed to determine isolation cell.")
        return UNKNOWN_CELL
Example #26
    def __watch(self):
        label_selector = "{}={}".format(
            OPPORTUNISTIC_RESOURCE_NODE_NAME_LABEL_KEY, self.__node_name)
        log.info("Starting opportunistic resource watch...")
        stream = None
        try:
            stream = watch.Watch().stream(
                self.__custom_api.list_cluster_custom_object,
                group="titus.netflix.com",
                version="v1",
                plural="opportunistic-resources",
                label_selector=label_selector)

            for event in stream:
                log.info("Event: %s", event)
                if self.__is_expired_error(event):
                    raise Exception("Opportunistic resource expired")

                event_type = event['type']
                if event_type not in HANDLED_EVENTS:
                    log.warning("Ignoring unhandled event: %s", event)
                    continue

                event_metadata_name = event['object']['metadata']['name']
                with self.__lock:
                    if event_type == ADDED:
                        self.__opportunistic_resources[
                            event_metadata_name] = event
                    elif event_type == DELETED:
                        self.__opportunistic_resources.pop(
                            event_metadata_name, None)

        except Exception:
            if stream is not None:
                stream.close()
            log.exception("Watch of opportunistic resources failed")
            self.__exit_handler.exit(OPPORTUNISTIC_WATCH_FAILURE)
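
Outside of titus-isolate, the same custom-resource watch can be reproduced with the official Kubernetes Python client. A minimal sketch, assuming a reachable cluster and that the CRD group/version/plural match the values used above; the label selector key/value is hypothetical:

from kubernetes import client, config, watch

config.load_kube_config()                # or config.load_incluster_config() inside a pod
api = client.CustomObjectsApi()

stream = watch.Watch().stream(
    api.list_cluster_custom_object,
    group="titus.netflix.com",
    version="v1",
    plural="opportunistic-resources",
    label_selector="node_name=example-node")   # hypothetical label key/value

for event in stream:
    # Each event is a dict: {'type': 'ADDED'|'MODIFIED'|'DELETED', 'object': {...}}
    print(event['type'], event['object']['metadata']['name'])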
Example #27
    def get_predictions(
        self, pods: List[V1Pod], resource_usage: GlobalResourceUsage
    ) -> Optional[ResourceUsagePredictions]:
        config_manager = get_config_manager()
        if config_manager is None:
            log.warning("Config manager not yet set.")
            return None

        running_pods = []
        for p in pods:
            if p.metadata.name in resource_usage.get_workload_ids():
                running_pods.append(p)
            else:
                log.info("Pod is not yet running: %s", p.metadata.name)

        client_crt = get_client_cert_path(config_manager)
        client_key = get_client_key_path(config_manager)
        if client_crt is None or client_key is None:
            log.error("Failed to generate credential paths")
            return None

        url = get_url(config_manager)
        if url is None:
            log.error("Unable to generate prediction service url")
            return None

        body = self.__get_body(running_pods, resource_usage)
        if body is None:
            log.error("Unable to generate a prediction request body")
            return None

        predictions = get_predictions(client_crt, client_key, url, body)
        if predictions is None:
            log.error("Failed to get predictions")
            return None

        return ResourceUsagePredictions(predictions)
Example #28
    def reconcile(self, cpu: Cpu):
        if self.__cgroup_manager.has_pending_work():
            log.warning(
                "Skipping reconciliation as some isolation work is still pending."
            )
            self.__skip_count += 1
            return

        workloads = self.get_workloads(cpu)
        for w_id, t_ids in workloads.items():
            cpuset = sorted(self.__cgroup_manager.get_cpuset(w_id))
            t_ids = sorted(t_ids)

            if cpuset != t_ids:
                log.error(
                    "Reconciliation has failed for workload: '{}', cpuset: {} != t_ids: {}"
                    .format(w_id, cpuset, t_ids))
                self.__exit_handler.exit(RECONCILIATION_FAILURE_EXIT)
            else:
                log.info(
                    "Reconciliation has succeeded for workload: '{}', cpuset: {} == t_ids: {}"
                    .format(w_id, cpuset, t_ids))

        self.__success_count += 1
    def __publish_window(self, event):
        # we calculate the window before we send the request to ensure we're not going over our 10 minute mark
        start = datetime.utcnow()
        end = start + timedelta(minutes=self.__config_manager.get_int(
            OVERSUBSCRIBE_WINDOW_SIZE_MINUTES_KEY,
            DEFAULT_OVERSUBSCRIBE_WINDOW_SIZE_MINUTES))

        simple_cpu_usage_predictions = self.__get_simple_cpu_predictions()

        workload_count = 0
        underutilized_cpu_count = 0

        for workload in self.__workload_manager.get_workloads():
            log.info('workload:%s job_type:%s cpu:%d', workload.get_app_name(),
                     workload.get_job_type(), workload.get_thread_count())

            if not self.__is_long_enough(workload):
                continue

            simple_cpu_prediction = simple_cpu_usage_predictions.get(
                workload.get_id(), None)
            if simple_cpu_prediction is None:
                log.warning("No CPU prediction for workload: %s",
                            workload.get_id())
                continue

            # Process prediction
            pred_usage = simple_cpu_prediction / workload.get_thread_count()
            threshold = self.__config_manager.get_float(
                TOTAL_THRESHOLD, DEFAULT_TOTAL_THRESHOLD)

            log.info(
                "Testing oversubscribability of workload: {}, threshold: {}, prediction: {}"
                .format(workload.get_id(), threshold, pred_usage))

            if pred_usage > threshold:
                log.info("Workload: %s is NOT oversubscribable: %s",
                         workload.get_id(), pred_usage)
                continue

            log.info("Workload: %s is oversubscribable: %s", workload.get_id(),
                     pred_usage)

            if workload.is_opportunistic():
                # only add the number of "real" threads (non-opportunistic)
                free = workload.get_thread_count(
                ) - workload.get_opportunistic_thread_count()
                if free <= 0:
                    continue
                underutilized_cpu_count += free
            else:
                underutilized_cpu_count += workload.get_thread_count()
            workload_count += 1

        free_cpu_count = underutilized_cpu_count
        if free_cpu_count > 0:
            self.__window_publisher.add_window(start, end, free_cpu_count)
            self.__window_end_time = end

        self.__success_count += 1
        self.handled_event(
            event,
            'oversubscribed {} cpus from {} workloads, {} total cpus are oversubscribed'
            .format(free_cpu_count, workload_count, underutilized_cpu_count))
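
The oversubscribe decision above boils down to comparing predicted per-thread utilization against a configured threshold. A small worked example with made-up numbers (the real threshold comes from config, defaulting to DEFAULT_TOTAL_THRESHOLD):

# Hypothetical numbers for illustration only.
thread_count = 8                 # workload.get_thread_count()
predicted_cpus = 0.8             # simple CPU prediction for the whole workload
threshold = 0.9                  # config_manager.get_float(TOTAL_THRESHOLD, ...)

pred_usage = predicted_cpus / thread_count   # 0.1 predicted utilization per thread
if pred_usage <= threshold:
    # the workload is considered oversubscribable, so all 8 of its
    # (non-opportunistic) threads count toward the advertised free CPUs
    underutilized_cpu_count = thread_count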
    def publish(self):
        log.info("Predicting resource usage")

        allocated_resources = Resources()
        num_batch_containers = 0
        num_service_containers = 0
        if len(self.__pod_manager.get_pods()) == 0:
            log.warning("No pods, skipping resource usage prediction")
            predictions = ResourceUsagePredictions({})
            predictions.set_prediction_ts_ms(
                1000 * int(time.mktime(dt.utcnow().timetuple())))
        else:
            running_pods = [
                p for p in self.__pod_manager.get_pods()
                if self.__resource_usage_predictor.is_running(p)
            ]
            try:
                allocated_resources = self.__compute_allocated_resources(
                    running_pods)
            except Exception as e:
                self.__parse_pod_req_resources_fail_count += 1
                log.error(
                    "Failed to parse pod requested resources. Aborting: %s", e)
                raise e
            workload_ids = [p.metadata.name for p in running_pods]
            predictions = self.__resource_usage_predictor.get_predictions(
                running_pods, self.__wmm.get_resource_usage(workload_ids))
            num_batch_containers, num_service_containers = self.__compute_num_containers(
                running_pods)

        node = get_node()
        log.debug('owner_kind:%s owner_name:%s owner_uid:%s', node.kind,
                  node.metadata.name, node.metadata.uid)
        instance_type = get_instance_type(node)

        condensed_predictions = CondensedResourceUsagePrediction(
            predictions, allocated_resources, instance_type,
            num_batch_containers, num_service_containers,
            self.__resources_capacity)

        object_name = "{}".format(node.metadata.name)
        metadata = V1ObjectMeta(
            namespace=PREDICTED_RESOURCE_USAGE_NAMESPACE,
            name=object_name,
            labels={
                PREDICTED_RESOURCE_USAGE_NODE_NAME_LABEL_KEY:
                node.metadata.name,
                PREDICTED_RESOURCE_USAGE_NODE_UID_LABEL_KEY: node.metadata.uid
            },
            owner_references=[
                V1OwnerReference(api_version=node.api_version,
                                 kind=node.kind,
                                 name=node.metadata.name,
                                 uid=node.metadata.uid)
            ])
        body = ResourceUsagePredictionsResource(metadata=metadata,
                                                spec=condensed_predictions)

        obj = "UNINITIALIZED_RESOURCE_PREDICTION_OBJECT"
        try:
            obj = self.__custom_api.patch_namespaced_custom_object(
                version=PREDICTED_USAGE_RESOURCE_VERSION,
                group=CUSTOM_RESOURCE_GROUP,
                plural=PREDICTED_RESOURCE_USAGE_PLURAL,
                namespace=PREDICTED_RESOURCE_USAGE_NAMESPACE,
                name=object_name,
                body=body)
        except ApiException as e:
            log.info("ApiException status: %s", e.status)
            if e.status == 404:
                obj = self.__custom_api.create_namespaced_custom_object(
                    version=PREDICTED_USAGE_RESOURCE_VERSION,
                    group=CUSTOM_RESOURCE_GROUP,
                    plural=PREDICTED_RESOURCE_USAGE_PLURAL,
                    namespace=PREDICTED_RESOURCE_USAGE_NAMESPACE,
                    body=body)
            else:
                log.error("Encountered unexpected API exception reason")

        log.info('predicted resource usage: %s', json.dumps(obj))