def test_simple_usage(self):
    """Round-trip a fully populated GlobalResourceUsage and verify every getter.

    Two workloads each get a distinct usage series under all five resource
    types; assertions are repeated after a serialize/deserialize cycle.
    """
    w_id0 = str(uuid.uuid4())
    w_id1 = str(uuid.uuid4())
    expected_usage0 = [0.0, 1.0, 2.0]
    expected_usage1 = [3.0, 4.0, 5.0]

    # Every resource type maps both workloads to their expected series.
    raw_usage = {
        resource: {w_id0: expected_usage0, w_id1: expected_usage1}
        for resource in (CPU_USAGE, MEM_USAGE, NET_RECV_USAGE,
                         NET_TRANS_USAGE, DISK_USAGE)
    }
    ru = GlobalResourceUsage(raw_usage)

    def __assert_expected(expected_usage, w_id):
        # Per-resource getters expose the per-workload series directly.
        self.assertEqual(expected_usage, ru.get_cpu_usage()[w_id])
        self.assertEqual(expected_usage, ru.get_mem_usage()[w_id])
        self.assertEqual(expected_usage, ru.get_net_trans_usage()[w_id])
        self.assertEqual(expected_usage, ru.get_net_recv_usage()[w_id])
        self.assertEqual(expected_usage, ru.get_disk_usage()[w_id])

    def __assert_all_expected(expected_usage, all_usage):
        # The aggregate view is keyed by resource-type constant.
        self.assertEqual(expected_usage, all_usage[CPU_USAGE])
        self.assertEqual(expected_usage, all_usage[MEM_USAGE])
        self.assertEqual(expected_usage, all_usage[NET_RECV_USAGE])
        self.assertEqual(expected_usage, all_usage[NET_TRANS_USAGE])
        self.assertEqual(expected_usage, all_usage[DISK_USAGE])

    __assert_expected(expected_usage0, w_id0)
    __assert_expected(expected_usage1, w_id1)
    __assert_all_expected(expected_usage0, ru.get_all_usage_for_workload(w_id0))
    __assert_all_expected(expected_usage1, ru.get_all_usage_for_workload(w_id1))

    # serialize/deserialize and assert again
    serial_ru = ru.serialize()
    ru = deserialize_global_resource_usage(serial_ru)
    __assert_expected(expected_usage0, w_id0)
    __assert_expected(expected_usage1, w_id1)
    __assert_all_expected(expected_usage0, ru.get_all_usage_for_workload(w_id0))
    __assert_all_expected(expected_usage1, ru.get_all_usage_for_workload(w_id1))
def get_resource_usage(self, workload_ids: List[str]) -> GlobalResourceUsage:
    """Collect current resource usage for the given workload ids.

    On any failure the method stays best-effort: it bumps the failure
    counter (under the metric lock) and returns an empty usage object
    rather than raising.

    :param workload_ids: ids of the workloads to gather usage for
    :return: populated GlobalResourceUsage, or an empty one on failure
    """
    try:
        global_usage = GlobalResourceUsage(self.__get_usage_dict(workload_ids))
        log.debug("Got resource usage: %s",
                  json.dumps(global_usage.serialize(),
                             sort_keys=True, separators=(',', ':')))
        return global_usage
    except Exception:
        # log.exception records the traceback; the previous log.error
        # silently discarded the failure's cause.
        log.exception("failed to get resource usage, returning empty usage")
        with self.__metric_lock:
            self.__get_resource_usage_failure_count += 1
        return GlobalResourceUsage({})
def get_cpu_predictions(self, workloads: List[Workload],
                        resource_usage: GlobalResourceUsage) \
        -> Optional[Dict[str, float]]:
    """Predict CPU needs for every workload that has recorded CPU usage.

    Returns {} when the config manager or CPU usage is unavailable;
    workloads without usage data are skipped with a warning.
    """
    config_manager = get_config_manager()
    if config_manager is None:
        log.warning("Config manager is not yet set")
        return {}

    cpu_usage = resource_usage.get_cpu_usage()
    if cpu_usage is None:
        log.warning("No cpu usage")
        return {}

    pred_env = PredEnvironment(config_manager.get_region(),
                               config_manager.get_environment(),
                               datetime.utcnow().hour)

    predictions = {}
    for workload in workloads:
        w_id = workload.get_id()
        raw_samples = cpu_usage.get(w_id)
        if raw_samples is None:
            log.warning("No CPU usage for workload: %s", w_id)
            continue
        samples = [float(sample) for sample in raw_samples]
        predictions[w_id] = self.predict(workload, samples, pred_env)

    return predictions
def get_no_usage_rebalance_request(cpu: Cpu, workloads: List[Workload]):
    """Build a rebalance AllocateRequest carrying no usage measurements.

    Each usage kwarg gets its own empty dict so callers never share
    mutable state between fields.
    """
    return AllocateRequest(
        cpu=cpu,
        workloads=__workloads_list_to_map(workloads),
        resource_usage=GlobalResourceUsage({}),
        cpu_usage={},
        mem_usage={},
        net_recv_usage={},
        net_trans_usage={},
        disk_usage={},
        metadata=DEFAULT_TEST_REQUEST_METADATA)
def test_empty_usage(self):
    """An empty GlobalResourceUsage reports nothing, before and after a round trip."""
    ru = GlobalResourceUsage({})

    def __assert_empty():
        self.assertIsNotNone(ru)
        self.assertEqual({}, ru.serialize())
        self.assertEqual({}, ru.get_all_usage_for_workload("foo"))
        # Every per-resource getter reports None when nothing was recorded.
        for getter in (ru.get_cpu_usage,
                       ru.get_mem_usage,
                       ru.get_net_recv_usage,
                       ru.get_net_trans_usage,
                       ru.get_disk_usage):
            self.assertIsNone(getter())

    __assert_empty()

    # serialize/deserialize and assert again
    serial_du = ru.serialize()
    ru = deserialize_global_resource_usage(serial_du)
    __assert_empty()
def __get_job_body(self, pod: V1Pod, resource_usage: GlobalResourceUsage):
    """Assemble the per-job request body for one pod.

    The pod name doubles as the workload id used to look up past usage.
    """
    pod_name = pod.metadata.name
    past_usage = self.__translate_usage(
        resource_usage.get_all_usage_for_workload(pod_name))
    return {
        "job_id": pod_name,
        "job_descriptor": get_job_descriptor(pod),
        "task_data": {
            "started_ts_ms": str(get_start_time(pod)),
            "past_usage": past_usage
        }
    }
def __get_rebalance_request(self):
    """Snapshot current CPU, workloads and PCP usage into a rebalance request."""
    usage = self.__wmm.get_pcp_usage()
    # The raw usage dict is both wrapped (resource_usage) and split out
    # per resource type, defaulting to {} for any missing type.
    return AllocateRequest(
        cpu=self.get_cpu_copy(),
        workloads=self.get_workload_map_copy(),
        resource_usage=GlobalResourceUsage(usage),
        cpu_usage=usage.get(CPU_USAGE, {}),
        mem_usage=usage.get(MEM_USAGE, {}),
        net_recv_usage=usage.get(NET_RECV_USAGE, {}),
        net_trans_usage=usage.get(NET_TRANS_USAGE, {}),
        disk_usage=usage.get(DISK_USAGE, {}),
        metadata=self.__get_request_metadata("rebalance"))
def __get_threads_request(self, workload_id, workload_map, request_type):
    """Build an AllocateThreadsRequest for one workload with current PCP usage.

    :param workload_id: workload the thread allocation targets
    :param workload_map: map of all workloads to include in the request
    :param request_type: label recorded in the request metadata
    """
    usage = self.__wmm.get_pcp_usage()
    # Usage is passed both wrapped and split per resource type ({} default).
    return AllocateThreadsRequest(
        cpu=self.get_cpu_copy(),
        workload_id=workload_id,
        workloads=workload_map,
        resource_usage=GlobalResourceUsage(usage),
        cpu_usage=usage.get(CPU_USAGE, {}),
        mem_usage=usage.get(MEM_USAGE, {}),
        net_recv_usage=usage.get(NET_RECV_USAGE, {}),
        net_trans_usage=usage.get(NET_TRANS_USAGE, {}),
        disk_usage=usage.get(DISK_USAGE, {}),
        metadata=self.__get_request_metadata(request_type))
def __get_job_body(self, pod: V1Pod, resource_usage: GlobalResourceUsage):
    """Assemble the per-job request body for one pod.

    The pod name doubles as the workload id used to look up past usage.
    """
    pod_name = pod.metadata.name
    past_usage = self.__translate_usage(
        resource_usage.get_all_usage_for_workload(pod_name))
    return {
        "job_id": pod_name,
        # Note that on v1 pods job_descriptor is None.
        # That is OK though, because the resource prediction service will fetch the job
        # descriptor itself in that case.
        "job_descriptor": get_job_descriptor(pod),
        "task_data": {
            "started_ts_ms": str(get_start_time(pod)),
            "past_usage": past_usage
        }
    }
def __get_simple_cpu_predictions(self) -> Dict[str, float]:
    """Fetch simple CPU predictions for all current workloads.

    Returns {} when no predictor is available or prediction fails.
    """
    predictor = self.__cpu_usage_predictor_manager.get_cpu_predictor()
    if predictor is None:
        log.error("Failed to get cpu predictor")
        return {}

    workloads = self.workload_manager.get_workloads()
    usage = GlobalResourceUsage(
        self.__workload_monitor_manager.get_pcp_usage())

    log.info("Getting simple cpu predictions...")
    cpu_predictions = predictor.get_cpu_predictions(workloads, usage)
    if cpu_predictions is None:
        log.error("Failed to get cpu predictions")
        return {}

    log.info("Got simple cpu predictions: %s", json.dumps(cpu_predictions))
    return cpu_predictions
def get_predictions(
        self,
        pods: List[V1Pod],
        resource_usage: GlobalResourceUsage
) -> Optional[ResourceUsagePredictions]:
    """Request usage predictions for the pods currently known to have usage.

    Pods absent from the resource usage are treated as not yet running and
    excluded from the request. Returns None if any prerequisite (config,
    credentials, url, body, or the remote call) fails.
    """
    config_manager = get_config_manager()
    if config_manager is None:
        log.warning("Config manager not yet set.")
        return None

    # Hoisted loop invariant: get_workload_ids() was previously re-evaluated
    # for every pod. A set also makes each membership test O(1).
    known_workload_ids = set(resource_usage.get_workload_ids())
    running_pods = []
    for p in pods:
        if p.metadata.name in known_workload_ids:
            running_pods.append(p)
        else:
            log.info("Pod is not yet running: %s", p.metadata.name)

    client_crt = get_client_cert_path(config_manager)
    client_key = get_client_key_path(config_manager)
    if client_crt is None or client_key is None:
        log.error("Failed to generate credential paths")
        return None

    url = get_url(config_manager)
    if url is None:
        log.error("Unable to generate prediction service url")
        return None

    body = self.__get_body(running_pods, resource_usage)
    if body is None:
        log.error("Unable to generate a prediction request body")
        return None

    predictions = get_predictions(client_crt, client_key, url, body)
    if predictions is None:
        log.error("Failed to get predictions")
        return None

    return ResourceUsagePredictions(predictions)
def get_resource_usage(self, workload_ids: List[str]) -> GlobalResourceUsage:
    """No-op implementation: always reports empty usage for any workloads."""
    return GlobalResourceUsage({})