Example #1
    def __remove_workload(self, workload_id):
        log.info("Removing workload: {}".format(workload_id))
        if workload_id not in self.__workloads:
            log.error("Attempted to remove unknown workload: '{}'".format(
                workload_id))
            return

        workload_map = self.get_workload_map_copy()

        request = self.__get_threads_request(workload_id, workload_map, "free")
        response = self.__cpu_allocator.free_threads(request)

        workload_map.pop(workload_id)
        self.__update_state(response, workload_map)
        report_cpu_event(request, response)
Example #2
    def __init__(self):
        self.__address = self.__get_address()
        log.info("Set keystone address to: {}".format(self.__address))

        self.__enabled = self.__address is not None

        self.__q = Queue()

        self.__reg = None
        self.__succeeded_msg_count = 0
        self.__retry_msg_count = 0
        self.__failed_msg_count = 0

        self.__processing_thread = Thread(target=self.__process_events)
        self.__processing_thread.start()
Example #3
    def __init__(self, free_thread_provider):
        config_manager = get_config_manager()

        self.__url = config_manager.get_str(REMOTE_ALLOCATOR_URL,
                                            "http://localhost:7501")
        solver_max_runtime_secs = config_manager.get_float(
            MAX_SOLVER_RUNTIME, DEFAULT_MAX_SOLVER_RUNTIME)
        solver_max_connect_secs = config_manager.get_float(
            MAX_SOLVER_CONNECT_SEC, DEFAULT_MAX_SOLVER_CONNECT_SEC)
        self.__timeout = (solver_max_connect_secs, solver_max_runtime_secs)
        self.__headers = {'Content-Type': "application/json"}
        self.__reg = None

        log.info("remote allocator max_connect_secs: %d, max_runtime_secs: %d",
                 solver_max_connect_secs, solver_max_runtime_secs)
Example #4
def free_threads():
    try:
        body = request.get_json()
        log.info("Processing free threads request: {}".format(body))
        threads_request = get_threads_request(body)
        response = get_free_cpu_allocator().free_threads(threads_request)

        global free_threads_success_count
        free_threads_success_count += 1

        return jsonify(response.to_dict())
    except:
        log.exception("Failed to free threads")
        global free_threads_failure_count
        free_threads_failure_count += 1
        return "Failed to free threads", 500
Example #5
    def test_assign_one_workload_empty_cpu(self):
        cpu = get_cpu()
        self.assertEqual(DEFAULT_TOTAL_THREAD_COUNT,
                         len(cpu.get_empty_threads()))

        w = get_test_workload(uuid.uuid4(), 1, STATIC)
        request = get_no_usage_threads_request(cpu, [w])
        cpu = noop_reset_allocator.assign_threads(request).get_cpu()
        log.info(cpu)
        self.assertEqual(0, len(cpu.get_empty_threads()))
        self.assertEqual(DEFAULT_TOTAL_THREAD_COUNT,
                         len(cpu.get_claimed_threads()))

        for t in cpu.get_threads():
            self.assertEqual(1, len(t.get_workload_ids()))
            self.assertEqual(w.get_id(), t.get_workload_ids()[0])
Example #6
    def __watch(self):
        while True:
            try:
                instance_id = get_config_manager().get_str("EC2_INSTANCE_ID")
                field_selector = "spec.nodeName={}".format(instance_id)
                log.info("Watching pods with field selector: %s",
                         field_selector)

                v1 = client.CoreV1Api()
                w = watch.Watch()

                for event in w.stream(v1.list_pod_for_all_namespaces,
                                      field_selector=field_selector):
                    self.__handle_event(event)
            except:
                log.exception("pod watch thread failed")
Example #7
def rebalance():
    try:
        body = request.get_json()
        log.info("Processing rebalance threads request: {}".format(body))
        rebalance_request = get_rebalance_request(body)
        response = get_rebalance_cpu_allocator().rebalance(rebalance_request)

        global rebalance_success_count
        rebalance_success_count += 1

        return jsonify(response.to_dict())
    except:
        log.exception("Failed to rebalance")
        global rebalance_failure_count
        rebalance_failure_count += 1
        return "Failed to rebalance", 500
Example #8
    def test_assign_more_than_available_threads_with_one_workload(self):
        for allocator in OVER_ALLOCATORS:
            cpu = get_cpu()
            w_jumbo = get_test_workload("jumbo",
                                        DEFAULT_TOTAL_THREAD_COUNT * 1.5,
                                        STATIC)

            request = AllocateThreadsRequest(cpu, w_jumbo.get_id(),
                                             {w_jumbo.get_id(): w_jumbo}, {},
                                             DEFAULT_TEST_REQUEST_METADATA)
            cpu = allocator.assign_threads(request).get_cpu()
            log.info(cpu)

            self.assertEqual(DEFAULT_TOTAL_THREAD_COUNT,
                             len(cpu.get_claimed_threads()))
            self.assertEqual([w_jumbo.get_id()],
                             list(cpu.get_workload_ids_to_thread_ids().keys()))
Example #9
    def __is_long_enough(self, workload) -> bool:
        min_duration_sec = 60 * self.__config_manager.get_int(
            OVERSUBSCRIBE_WINDOW_SIZE_MINUTES_KEY,
            DEFAULT_OVERSUBSCRIBE_WINDOW_SIZE_MINUTES)
        workload_duration_sec = self.__get_workload_duration(
            workload, min_duration_sec)
        if workload_duration_sec < min_duration_sec:
            log.info(
                "Workload: {} is too short. workload_duration_sec: {} < min_duration_sec: {}"
                .format(workload.get_id(), workload_duration_sec,
                        min_duration_sec))
            return False

        log.info(
            "Workload: {} is long enough. workload_duration_sec: {} >= min_duration_sec: {}"
            .format(workload.get_id(), workload_duration_sec,
                    min_duration_sec))
        return True
Example #10
    def __init__(self,
                 relative_start_sec: int,
                 interval_sec: int,
                 sample_interval_sec: int = DEFAULT_SAMPLE_FREQUENCY_SEC,
                 query_timeout_sec: int = DEFAULT_METRICS_QUERY_TIMEOUT_SEC):

        self.__relative_start_sec = relative_start_sec
        self.__interval_sec = interval_sec
        self.__query_timeout_sec = query_timeout_sec
        self.__interval_count = int(relative_start_sec / interval_sec)
        self.__usages = None
        self.__lock = Lock()
        self.__snapshot_usage_raw()

        log.info("Scheduling pcp metrics collecting every {} seconds".format(
            sample_interval_sec))
        schedule.every(sample_interval_sec).seconds.do(
            self.__snapshot_usage_raw)
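The schedule.every(...).seconds.do(...) call above (and the one in Example #11 below) only registers a job with the schedule library; nothing fires until something in the process repeatedly calls schedule.run_pending(). A minimal sketch of such a pump, assuming it runs on a background daemon thread; the wiring titus-isolate actually uses is not shown in these snippets:

import time
from threading import Thread

import schedule

def run_pending_jobs():
    # Poll the schedule library's job registry so registered jobs
    # (e.g. __snapshot_usage_raw, detect_changes) fire on their intervals.
    while True:
        schedule.run_pending()
        time.sleep(1)

# Assumed wiring: start the pump once at startup on a daemon thread.
Thread(target=run_pending_jobs, daemon=True).start()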
Example #11
    def __init__(
            self,
            config_manager: ConfigManager,
            exit_handler: ExitHandler,
            properties: List[str],
            detection_interval: int = PROPERTY_CHANGE_DETECTION_INTERVAL_SEC):

        self.__config_manager = config_manager
        self.__exit_handler = exit_handler
        self.__properties = properties

        log.info("Starting watching for changes to properties: {}".format(
            properties))
        for p in properties:
            v = config_manager.get_cached_str(p)
            log.info("{}: {}".format(p, v))

        schedule.every(detection_interval).seconds.do(self.detect_changes)
Example #12
    def rebalance(self, request: AllocateRequest) -> AllocateResponse:
        self.__call_meta = {}
        cpu = request.get_cpu()
        cpu_usage = request.get_cpu_usage()
        workloads = request.get_workloads()
        self.__cnt_rebalance_calls += 1

        if len(workloads) == 0:
            log.warning("Ignoring rebalance of empty CPU.")
            self.__call_meta['rebalance_empty'] = 1
            return AllocateResponse(cpu, self.get_name(), self.__call_meta)

        log.info("Rebalancing with predictions...")
        curr_ids_per_workload = cpu.get_workload_ids_to_thread_ids()

        return AllocateResponse(
            self.__compute_allocation(cpu, None, workloads, curr_ids_per_workload, cpu_usage, None),
            self.get_name(),
            self.__call_meta)
Example #13
def rebalance():
    try:
        request_ip = request.headers.get(FORWARDED_FOR_HEADER)
        log.info("Processing rebalance threads request (from, proxy): {}".format(request_ip))

        body = request.get_json()
        rebalance_request = get_rebalance_request(body)
        response = get_rebalance_cpu_allocator().rebalance(rebalance_request)

        global rebalance_success_count
        rebalance_success_count += 1

        log.info("Processed rebalance threads request (from, proxy): {}".format(request_ip))
        return jsonify(response.to_dict())
    except:
        log.exception("Failed to rebalance")
        global rebalance_failure_count
        rebalance_failure_count += 1
        return "Failed to rebalance", 500
Example #14
def get_current_workloads(docker_client):
    workloads = []
    for container in docker_client.containers.list():
        workload_id = container.name
        if __has_required_labels(container):
            try:
                labels = container.labels
                cpu = int(__get_value(labels, CPU_LABEL_KEY, -1))
                mem = int(__get_value(labels, MEM_LABEL_KEY, -1))
                disk = int(__get_value(labels, DISK_LABEL_KEY, -1))
                network = int(__get_value(labels, NETWORK_LABEL_KEY, -1))
                app_name = __get_value(labels, APP_NAME_LABEL_KEY)
                owner_email = __get_value(labels, OWNER_EMAIL_LABEL_KEY)
                command = __get_value(labels, COMMAND_LABEL_KEY)
                entrypoint = __get_value(labels, ENTRYPOINT_LABEL_KEY)
                job_type = __get_value(labels, JOB_TYPE_LABEL_KEY)
                workload_type = __get_value(labels, WORKLOAD_TYPE_LABEL_KEY)
                image = __get_image(container)

                workloads.append(
                    Workload(identifier=workload_id,
                             thread_count=cpu,
                             mem=mem,
                             disk=disk,
                             network=network,
                             app_name=app_name,
                             owner_email=owner_email,
                             image=image,
                             command=command,
                             entrypoint=entrypoint,
                             job_type=job_type,
                             workload_type=workload_type))
                log.info("Found running workload: '{}'".format(workload_id))
            except:
                log.exception(
                    "Failed to parse labels for container: '{}'".format(
                        container.name))
        else:
            log.warning(
                "Found running workload: '{}' without expected labels".format(
                    workload_id))

    return workloads
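Example #14 relies on a __get_value(labels, key, default) helper that is not included in the snippet. A minimal sketch of what it could look like, inferred only from the call sites above (numeric labels pass -1 as the default, string labels omit it); the real titus-isolate implementation may differ:

def __get_value(labels, key, default=''):
    # Return the container label value if present, otherwise the default.
    if key in labels:
        return labels[key]
    return default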
Example #15
def get_current_workloads(docker_client):
    workloads = []
    for container in docker_client.containers.list():
        workload_id = container.name
        if __has_required_labels(container):
            try:
                cpu = int(container.labels[CPU_LABEL_KEY])
                workload_type = container.labels[WORKLOAD_TYPE_LABEL_KEY]
                workloads.append(Workload(workload_id, cpu, workload_type))
                log.info("Found running workload: '{}'".format(workload_id))
            except:
                log.exception(
                    "Failed to parse labels for container: '{}'".format(
                        container.name))
        else:
            log.warning(
                "Found running workload: '{}' without expected label: '{}'".
                format(workload_id, CPU_LABEL_KEY))

    return workloads
Example #16
    def __get_simple_cpu_predictions(self) -> Dict[str, float]:
        cpu_predictor = self.__cpu_usage_predictor_manager.get_cpu_predictor()
        if cpu_predictor is None:
            log.error("Failed to get cpu predictor")
            return {}

        workloads = self.workload_manager.get_workloads()
        resource_usage = GlobalResourceUsage(
            self.__workload_monitor_manager.get_pcp_usage())

        log.info("Getting simple cpu predictions...")
        cpu_predictions = cpu_predictor.get_cpu_predictions(
            workloads, resource_usage)
        if cpu_predictions is None:
            log.error("Failed to get cpu predictions")
            return {}
        else:
            log.info("Got simple cpu predictions: %s",
                     json.dumps(cpu_predictions))
            return cpu_predictions
Example #17
    def test_assign_one_thread_empty_cpu(self):
        """
        Workload 0: 1 thread --> (p:0 c:0 t:0)
        """
        for allocator in ALLOCATORS:
            cpu = get_cpu()
            self.assertEqual(DEFAULT_TOTAL_THREAD_COUNT,
                             len(cpu.get_empty_threads()))

            w = get_test_workload(uuid.uuid4(), 1, STATIC)

            request = get_no_usage_threads_request(cpu, [w])
            cpu = allocator.assign_threads(request).get_cpu()
            log.info(cpu)
            self.assertEqual(DEFAULT_TOTAL_THREAD_COUNT - 1,
                             len(cpu.get_empty_threads()))
            self.assertEqual(1, len(cpu.get_claimed_threads()))
            self.assertEqual(
                w.get_id(),
                cpu.get_claimed_threads()[0].get_workload_ids()[0])
Example #18
    def __init__(self, cpu: Cpu, cgroup_manager: CgroupManager, cpu_allocator: CpuAllocator):

        self.__reg = None
        self.__lock = Lock()
        self.__instance_id = get_config_manager().get_str(EC2_INSTANCE_ID)

        self.__cpu_allocator = cpu_allocator

        self.__error_count = 0
        self.__added_count = 0
        self.__removed_count = 0
        self.__rebalanced_count = 0
        self.__added_to_full_cpu_count = 0
        self.__allocator_call_duration_sum_secs = 0

        self.__cpu = cpu
        self.__cgroup_manager = cgroup_manager
        self.__wmm = get_workload_monitor_manager()
        self.__workloads = {}

        log.info("Created workload manager")
Example #19
def get_workload_from_kubernetes(identifier) -> Optional[KubernetesWorkload]:
    if not managers_are_initialized():
        log.error(
            "Cannot get workload from kubernetes because managers aren't initialized"
        )
        return None

    retry_count = get_config_manager().get_int(
        GET_WORKLOAD_RETRY_COUNT, DEFAULT_GET_WORKLOAD_RETRY_COUNT)
    retry_interval = get_config_manager().get_float(
        GET_WORKLOAD_RETRY_INTERVAL_SEC,
        DEFAULT_GET_WORKLOAD_RETRY_INTERVAL_SEC)

    pod_manager = get_pod_manager()
    for i in range(retry_count):
        log.info("Getting pod from kubernetes: %s", identifier)
        pod = pod_manager.get_pod(identifier)
        if pod is not None:
            log.info("Got pod from kubernetes: %s", identifier)
            return KubernetesWorkload(pod)

        log.info("Retrying getting pod from kubernetes in %s seconds",
                 retry_interval)
        time.sleep(retry_interval)

    log.error("Failed to get pod from kubernetes: %s", identifier)
    return None
Example #20
    def test_external_cpu_manipulation(self):
        cpu = get_cpu()
        violations = get_shared_core_violations(cpu)
        log.info("shared core violations: {}".format(violations))
        self.assertEqual(0, len(violations))

        # Claim 1 thread on every core
        dummy_workload_id = uuid.uuid4()
        for p in cpu.get_packages():
            for c in p.get_cores():
                c.get_threads()[0].claim(dummy_workload_id)

        violations = get_shared_core_violations(cpu)
        log.info("shared core violations: {}".format(violations))
        self.assertEqual(0, len(violations))

        # Assign another workload which will force core sharing
        allocator = GreedyCpuAllocator()
        w = get_test_workload(uuid.uuid4(), 2, STATIC)
        workloads = {w.get_id(): w}
        request = AllocateThreadsRequest(cpu, w.get_id(), workloads, {},
                                         DEFAULT_TEST_REQUEST_METADATA)
        cpu = allocator.assign_threads(request).get_cpu()
        violations = get_shared_core_violations(cpu)
        log.info("shared core violations: {}".format(violations))
        self.assertEqual(2, len(violations))
Example #21
    def __apply_isolation(self, response: AllocateResponse):
        last_w_responses = self.__get_workload_allocation_dict(self.__last_response)

        for w_alloc in response.get_workload_allocations():
            last_w_alloc = last_w_responses.get(w_alloc.get_workload_id(), None)
            if w_alloc == last_w_alloc:
                log.info("Skipping update of workload: {}".format(w_alloc.get_workload_id()))
                continue

            workload_id = w_alloc.get_workload_id()
            thread_ids = w_alloc.get_thread_ids()
            quota = w_alloc.get_cpu_quota()
            shares = w_alloc.get_cpu_shares()

            log.info("updating workload: '{}' cpuset: '{}', quota: '{}', shares: '{}'".format(
                workload_id, thread_ids, quota, shares))

            # This ordering is important for reporting whether a workload is isolated.
            # We must always set the "cpuset" first.
            self.__cgroup_manager.set_cpuset(workload_id, thread_ids)
            self.__cgroup_manager.set_quota(workload_id, quota)
            self.__cgroup_manager.set_shares(workload_id, shares)
Example #22
    def __apply_isolation(self, response: AllocateResponse):
        last_w_responses = self.__get_workload_allocation_dict(
            self.__last_response)

        for w_alloc in response.get_workload_allocations():
            last_w_alloc = last_w_responses.get(w_alloc.get_workload_id(),
                                                None)
            if w_alloc == last_w_alloc:
                log.info("Skipping update of workload: {}".format(
                    w_alloc.get_workload_id()))
                continue

            workload_id = w_alloc.get_workload_id()
            thread_ids = w_alloc.get_thread_ids()
            quota = w_alloc.get_cpu_quota()
            shares = w_alloc.get_cpu_shares()
            memory_migrate = w_alloc.get_memory_migrate()
            memory_spread_page = w_alloc.get_memory_spread_page()
            memory_spread_slab = w_alloc.get_memory_spread_slab()

            log.info(f'updating workload: {workload_id} '
                     f'cpuset: {thread_ids}, '
                     f'quota: {quota}, '
                     f'shares: {shares}, '
                     f'memory_migrate: {memory_migrate}, '
                     f'memory_spread_page: {memory_spread_page}, '
                     f'memory_spread_slab: {memory_spread_slab}')

            # This ordering is important for reporting whether a workload is isolated.
            # We must always set the "cpuset" first.
            self.__cgroup_manager.set_cpuset(workload_id, thread_ids)
            self.__cgroup_manager.set_quota(workload_id, quota)
            self.__cgroup_manager.set_shares(workload_id, shares)
            self.__cgroup_manager.set_memory_migrate(workload_id,
                                                     memory_migrate)
            self.__cgroup_manager.set_memory_spread_page(
                workload_id, memory_spread_page)
            self.__cgroup_manager.set_memory_spread_slab(
                workload_id, memory_spread_slab)
Example #23
    def __predict_usage(self, workloads, cpu_usage):
        res = {}
        cpu_usage_predictor = self.__get_cpu_usage_predictor()

        cm = self.__config_manager
        pred_env = PredEnvironment(cm.get_region(), cm.get_environment(), dt.utcnow().hour)

        start_time = time.time()
        for w in workloads.values():  # TODO: batch the call
            pred = cpu_usage_predictor.predict(w, cpu_usage.get(w.get_id(), None), pred_env)
            res[w.get_id()] = pred
        stop_time = time.time()
        self.__call_meta['pred_cpu_usage_dur_secs'] = stop_time - start_time
        try:
            self.__call_meta['pred_cpu_usage_model_id'] = cpu_usage_predictor.get_model().meta_data['model_training_titus_task_id']
        except:
            self.__call_meta['pred_cpu_usage_model_id'] = 'unknown'

        log.info("Usage prediction per workload: " + str(res))
        if len(res) > 0:
            self.__call_meta['pred_cpu_usage'] = dict(res)
        return res
Example #24
    def test_crash_ip_allocator_metrics(self):

        cpu = get_cpu(2, 16, 2)
        test_context = TestContext(cpu=cpu)

        # now override the cpu seen by the allocator to crash it
        test_context.get_workload_manager().get_allocator().set_cpu(
            get_cpu(2, 2, 2))

        events = [get_container_create_event(10, name="foo", id="bar")]
        event_count = len(events)
        event_manager = EventManager(MockEventProvider(events),
                                     test_context.get_event_handlers(),
                                     get_mock_file_manager(), 5.0)

        wait_until(lambda: event_count == event_manager.get_processed_count())

        log.info("Event manager has processed {} events.".format(
            event_manager.get_processed_count()))

        workload_manager = test_context.get_workload_manager()
        registry = Registry()
        reporter = InternalMetricsReporter(workload_manager, event_manager)
        reporter.set_registry(registry)
        reporter.report_metrics({})

        wait_until(lambda: self.__gauge_value_equals(registry, RUNNING, 1))
        wait_until(lambda: self.__gauge_value_equals(registry, ADDED_KEY, 1))
        wait_until(lambda: self.__gauge_value_equals(registry, REMOVED_KEY, 0))
        wait_until(
            lambda: self.__gauge_value_equals(registry, SUCCEEDED_KEY, 1))
        wait_until(lambda: self.__gauge_value_equals(registry, FAILED_KEY, 0))
        wait_until(
            lambda: self.__gauge_value_equals(registry, WORKLOAD_COUNT_KEY, 1))
        wait_until(lambda: self.__gauge_value_equals(
            registry, FALLBACK_ALLOCATOR_COUNT, 1))

        event_manager.stop_processing_events()
Example #25
    def __init__(self, exit_handler: ExitHandler):
        self.__exit_handler = exit_handler
        self.__config_manager = get_config_manager()
        self.__node_name = self.__config_manager.get_str(EC2_INSTANCE_ID)

        kubeconfig = self.get_kubeconfig_path()
        self.__core_api = kubernetes.client.CoreV1Api(
            kubernetes.config.new_client_from_config(config_file=kubeconfig))
        self.__custom_api = kubernetes.client.CustomObjectsApi(
            kubernetes.config.new_client_from_config(config_file=kubeconfig))

        self.__lock = Lock()
        self.__opportunistic_resources = {}

        oversubscribe_frequency = self.__config_manager.get_float(
            OVERSUBSCRIBE_FREQUENCY_KEY, DEFAULT_OVERSUBSCRIBE_FREQUENCY)
        if oversubscribe_frequency > 0:
            watch_thread = Thread(target=self.__watch)
            watch_thread.start()
        else:
            log.info(
                "Skipping opportunistic resource watch, as opportunistic publishing is not configured."
            )
Example #26
    def __init__(self,
                 config_manager,
                 exit_handler,
                 properties,
                 detection_interval=PROPERTY_CHANGE_DETECTION_INTERVAL_SEC):

        self.__config_manager = config_manager
        self.__exit_handler = exit_handler
        self.__properties = properties

        self.__original_properties = {}
        for p in properties:
            self.__original_properties[p] = config_manager.get_str(p)

        self.__original_primary_allocator_name =\
            get_fallback_allocator(config_manager).get_primary_allocator().__class__.__name__

        log.info("Starting watching for changes to properties: {}".format(
            properties))
        for k, v in self.__original_properties.items():
            log.info("{}: {}".format(k, v))

        schedule.every(detection_interval).seconds.do(self.detect_changes)
Example #27
    def __init__(self, cpu, cgroup_manager,
            allocator_class=IntegerProgramCpuAllocator,
            fallback_allocator_class=GreedyCpuAllocator):
        self.__lock = Lock()

        self.__error_count = 0
        self.__added_count = 0
        self.__removed_count = 0
        self.__allocator_call_duration_sum_secs = 0
        self.__fallback_allocator_calls_count = 0
        self.__time_bound_ip_allocator_solution_count = 0

        self.__cpu = cpu
        self.__cgroup_manager = cgroup_manager
        self.__workloads = {}
        self.__cpu_allocator = allocator_class(cpu)
        self.__is_ip_allocator_used = False
        self.__fallback_cpu_allocator = None
        if isinstance(self.__cpu_allocator, IntegerProgramCpuAllocator):
            self.__is_ip_allocator_used = True
        if fallback_allocator_class is not None:
            self.__fallback_cpu_allocator = fallback_allocator_class(cpu)
        log.info("Created workload manager with allocator: '{}'".format(self.__cpu_allocator.__class__.__name__))
Example #28
    def __init__(self, cpu: Cpu, cgroup_manager: CgroupManager, cpu_allocator: CpuAllocator):

        self.__reg = None
        self.__tags = None
        self.__lock = Lock()
        self.__instance_id = get_config_manager().get_str(EC2_INSTANCE_ID)

        self.__cpu_allocator = cpu_allocator

        self.__error_count = 0
        self.__added_count = 0
        self.__removed_count = 0
        self.__rebalanced_count = 0
        self.__workload_processing_duration_sec = 0
        self.__update_state_duration_sec = 0

        self.__cpu = cpu
        self.__cgroup_manager = cgroup_manager
        self.__wmm = get_workload_monitor_manager()
        self.__workloads = {}
        self.__last_response = None

        log.info("Created workload manager")
Example #29
    def get_predictions(
        self, pods: List[V1Pod], resource_usage: GlobalResourceUsage
    ) -> Optional[ResourceUsagePredictions]:
        config_manager = get_config_manager()
        if config_manager is None:
            log.warning("Config manager not yet set.")
            return None

        running_pods = []
        for p in pods:
            if self.is_running(p):
                running_pods.append(p)
            else:
                log.info("Pod is not yet running: %s", p.metadata.name)

        client_crt = get_client_cert_path(config_manager)
        client_key = get_client_key_path(config_manager)
        if client_crt is None or client_key is None:
            log.error("Failed to generate credential paths")
            return None

        url = get_url(config_manager)
        if url is None:
            log.error("Unable to generate prediction service url")
            return None

        body = self.__get_body(running_pods, resource_usage)
        if body is None:
            log.error("Unable to generate a prediction request body")
            return None

        predictions = get_predictions(client_crt, client_key, url, body)
        if predictions is None:
            log.error("Failed to get predictions")
            return None

        return ResourceUsagePredictions(predictions)
    def __watch(self):
        label_selector = "{}={}".format(
            OPPORTUNISTIC_RESOURCE_NODE_NAME_LABEL_KEY, self.__node_name)
        log.info("Starting opportunistic resource watch...")
        stream = None
        try:
            stream = watch.Watch().stream(
                self.__custom_api.list_cluster_custom_object,
                group="titus.netflix.com",
                version="v1",
                plural="opportunistic-resources",
                label_selector=label_selector)

            for event in stream:
                log.info("Event: %s", event)
                if self.__is_expired_error(event):
                    raise Exception("Opportunistic resource expired")

                event_type = event['type']
                if event_type not in HANDLED_EVENTS:
                    log.warning("Ignoring unhandled event: %s", event)
                    continue

                event_metadata_name = event['object']['metadata']['name']
                with self.__lock:
                    if event_type == ADDED:
                        self.__opportunistic_resources[
                            event_metadata_name] = event
                    elif event_type == DELETED:
                        self.__opportunistic_resources.pop(
                            event_metadata_name, None)

        except Exception:
            if stream is not None:
                stream.close()
            log.exception("Watch of opportunistic resources failed")
            self.__exit_handler.exit(OPPORTUNISTIC_WATCH_FAILURE)