def get_resource_usages(self) -> List[ResourceUsage]:
    """Return the current usage snapshot, or an empty list when there is none.

    The snapshot comes from ``__get_usages_copy``; a ``None`` result is
    logged as a warning and replaced by ``[]``.
    """
    snapshot = self.__get_usages_copy()
    if snapshot is not None:
        return snapshot

    log.warning("No usage snapshot")
    return []
def report_cpu_event(request: AllocateRequest, response: AllocateResponse):
    """Report the CPU event derived from an allocation request/response pair.

    When no event log manager is available a warning is logged and nothing
    is reported.
    """
    manager = get_event_log_manager()
    if manager is not None:
        manager.report_event(get_cpu_event(request, response))
        return

    log.warning("Event log manager is not set.")
def _handle(self, event):
    """Handle an event by publishing an oversubscription window if possible.

    Irrelevant events are ignored, as are events arriving before the
    managers are initialized.  If a window is still active (the current UTC
    time is before the stored window end time) the event is counted as
    skipped.  Any exception is counted as a failure and logged; it is not
    propagated to the caller.

    :param event: the event to handle
    :return: always ``None``
    """
    try:
        if not self.__relevant(event):
            return None

        if not managers_are_initialized():
            log.warning("Managers are not yet initialized")
            return None

        self.handling_event(event, 'oversubscribing workloads')

        # NOTE(review): the publish call is kept under the window lock because
        # it updates the window end time checked here -- confirm nesting.
        with self.__window_lock:
            if datetime.utcnow() < self.__window_end_time:
                self.__skip_count += 1
                self.handled_event(
                    event,
                    'skipping oversubscribe - a window is currently active')
                return None

            self.__publish_window(event)
    except Exception:
        self.__fail_count += 1
        # log.exception keeps the traceback, which log.error silently dropped.
        log.exception(
            "Event handler: '{}' failed to handle event: '{}'".format(
                self.__class__.__name__, event))
    return None
def report_metrics(self, tags):
    """Set the static and burst pool CPU usage gauges from PCP usage.

    Reporting is skipped (with a log line) when the registry or the workload
    manager is not available yet, or when the PCP usage has no CPU entry.

    :param tags: tags passed to every gauge lookup on the registry
    """
    registry = self.__registry
    if registry is None:
        log.debug(
            "Not reporting metrics because there's no registry available yet."
        )
        return

    # Only availability of the workload manager matters here.
    if get_workload_manager() is None:
        log.debug(
            "Not reporting metrics because there's no workload manager available yet."
        )
        return

    pcp_usage = self.get_pcp_usage()
    if CPU_USAGE not in pcp_usage.keys():
        log.warning("No CPU usage in PCP usage.")
        return

    cpu_usage = pcp_usage[CPU_USAGE]

    # Compute both pool usages before touching any gauge.
    static_usage = self.__get_pool_usage(STATIC, cpu_usage)
    burst_usage = self.__get_pool_usage(BURST, cpu_usage)

    registry.gauge(STATIC_POOL_USAGE_KEY, tags).set(static_usage)
    registry.gauge(BURST_POOL_USAGE_KEY, tags).set(burst_usage)
def get_cpu_predictions(
        self, workloads: List[Workload],
        resource_usage: GlobalResourceUsage) -> Optional[Dict[str, float]]:
    """Map workload ids to their first-window CPU prediction.

    Pods are looked up through the pod manager; workloads without a pod are
    logged and left out of the prediction request.

    :param workloads: workloads to predict CPU usage for
    :param resource_usage: usage data forwarded to ``get_predictions``
    :return: ``None`` when there is no pod manager, an empty dict when no
        predictions were obtained, otherwise the per-workload predictions
    """
    pod_manager = get_pod_manager()
    if pod_manager is None:
        return None

    pods = []
    for workload in workloads:
        pod = pod_manager.get_pod(workload.get_id())
        if pod is not None:
            pods.append(pod)
            continue
        log.warning("Failed to get pod for workload: %s", workload.get_id())

    usage_predictions = self.get_predictions(pods, resource_usage)
    if usage_predictions is None:
        log.error("Got no resource usage predictions")
        return {}

    log.info("Got resource usage predictions: %s",
             json.dumps(usage_predictions.raw))
    return {
        w_id: get_first_window_cpu_prediction(prediction)
        for w_id, prediction in usage_predictions.predictions.items()
    }
def get_predictions(
        self, running_pods: List[V1Pod], resource_usage: GlobalResourceUsage
) -> Optional[ResourceUsagePredictions]:
    """Request resource usage predictions for the given running pods.

    Every prerequisite (config manager, client credentials, service url,
    request body) is checked in turn; the first missing one is logged and
    ends the call with ``None``.

    :param running_pods: pods to include in the request body
    :param resource_usage: usage data to include in the request body
    :return: the wrapped predictions, or ``None`` on any failure
    """
    config_manager = get_config_manager()
    if config_manager is None:
        log.warning("Config manager not yet set.")
        return None

    cert_path = get_client_cert_path(config_manager)
    key_path = get_client_key_path(config_manager)
    if cert_path is None or key_path is None:
        log.error("Failed to generate credential paths")
        return None

    service_url = get_url(config_manager)
    if service_url is None:
        log.error("Unable to generate prediction service url")
        return None

    request_body = self.__get_body(running_pods, resource_usage)
    if request_body is None:
        log.error("Unable to generate a prediction request body")
        return None

    raw_predictions = get_predictions(cert_path, key_path, service_url,
                                      request_body)
    if raw_predictions is None:
        log.error("Failed to get predictions")
        return None

    return ResourceUsagePredictions(raw_predictions)
def report_metrics(self, tags):
    """Report pool CPU usage gauges and the resource usage failure counter.

    Reporting is skipped (with a log line) when the registry or the workload
    manager is not available yet, or when the usage dict has no CPU entry.
    After the gauges are set, the accumulated failure count is added to its
    counter and reset to zero under the metric lock.

    :param tags: tags passed to every metric lookup on the registry
    """
    registry = self.__registry
    if registry is None:
        log.debug("Not reporting metrics because there's no registry available yet.")
        return

    workload_manager = get_workload_manager()
    if workload_manager is None:
        log.debug("Not reporting metrics because there's no workload manager available yet.")
        return

    usage_dict = self.__get_usage_dict(
        workload_manager.get_workload_map_copy().keys())
    if CPU_USAGE not in usage_dict.keys():
        log.warning("No CPU usage in usage: %s", usage_dict)
        return

    cpu_usage = usage_dict[CPU_USAGE]

    # Compute both pool usages before touching any gauge.
    static_usage = self.__get_pool_usage(STATIC, cpu_usage)
    burst_usage = self.__get_pool_usage(BURST, cpu_usage)

    registry.gauge(STATIC_POOL_USAGE_KEY, tags).set(static_usage)
    registry.gauge(BURST_POOL_USAGE_KEY, tags).set(burst_usage)

    with self.__metric_lock:
        failure_counter = registry.counter(GET_RESOURCE_USAGE_FAILURE, tags)
        failure_counter.increment(self.__get_resource_usage_failure_count)
        self.__get_resource_usage_failure_count = 0
def __get_simple_cpu_predictions(self) -> Dict[str, float]:
    """Return CPU predictions for the current workloads.

    An empty dict is returned when there is no predictor, there are no
    workloads, or the predictor yields ``None``.
    """
    predictor = self.__cpu_usage_predictor_manager.get_cpu_predictor()
    if predictor is None:
        log.error("Failed to get cpu predictor")
        return {}

    workloads = self.__workload_manager.get_workloads()
    if len(workloads) == 0:
        log.warning("No workloads, skipping cpu usage prediction")
        return {}

    resource_usage = self.__workload_monitor_manager.get_resource_usage(
        [workload.get_id() for workload in workloads])

    log.info("Getting simple cpu predictions...")
    predictions = predictor.get_cpu_predictions(workloads, resource_usage)
    if predictions is not None:
        log.info("Got simple cpu predictions: %s", json.dumps(predictions))
        return predictions

    log.error("Failed to get cpu predictions")
    return {}
def get_cpu_predictions(
        self, workloads: List[Workload],
        resource_usage: GlobalResourceUsage) -> Optional[Dict[str, float]]:
    """Map workload ids to their first-window CPU prediction.

    Only workloads whose object type is ``KubernetesWorkload`` contribute a
    pod to the prediction request; all others are logged and skipped.

    :param workloads: workloads to predict CPU usage for
    :param resource_usage: usage data forwarded to ``get_predictions``
    :return: an empty dict when no predictions were obtained, otherwise the
        per-workload predictions
    """
    pods = []
    for workload in workloads:
        if workload.get_object_type() is KubernetesWorkload:
            pods.append(workload.get_pod())
            continue

        log.warning(
            "Cannot predict non Kubernetes workload %s: %s is not %s",
            workload.get_id(), workload.get_object_type(),
            KubernetesWorkload)

    usage_predictions = self.get_predictions(pods, resource_usage)
    if usage_predictions is None:
        log.error("Got no resource usage predictions")
        return {}

    log.info("Got resource usage predictions: %s",
             json.dumps(usage_predictions.raw))
    return {
        w_id: get_first_window_cpu_prediction(prediction)
        for w_id, prediction in usage_predictions.predictions.items()
    }
def __has_required_labels(container):
    """Return True when the container carries every label in REQUIRED_LABELS.

    The first missing label is logged as a warning and ends the check.
    """
    for required_label in REQUIRED_LABELS:
        if required_label in container.labels:
            continue

        log.warning(
            "Found running workload: '{}' without expected label: '{}'".
            format(container.name, required_label))
        return False

    return True
def __get_usage_path(self):
    """Return the cached usage path, resolving it on first use.

    When resolution raises ``FileNotFoundError`` a warning is logged and the
    cached value (still ``None``) is returned, so a later call retries.
    """
    if self.__usage_path is None:
        try:
            self.__usage_path = get_usage_all_path(self.__workload.get_id())
        except FileNotFoundError:
            log.warning("No cpu usage path for workload: '{}'".format(
                self.__workload.get_id()))

    return self.__usage_path
def __get_address(self) -> Optional[str]:
    """Build the event log address from the configured format string.

    The format string is filled with region, environment and the fixed
    stream name ``titus_isolate``.

    :return: the formatted address, or ``None`` when the config manager is
        not set or no format string is configured
    """
    config_manager = get_config_manager()
    if config_manager is None:
        # Guard against an unset config manager instead of failing with an
        # AttributeError on the first accessor call.
        log.warning("Config manager is not yet set.")
        return None

    region = config_manager.get_region()
    env = config_manager.get_environment()
    format_str = config_manager.get_str(EVENT_LOG_FORMAT_STR)
    if format_str is None:
        log.warning("Keystone is not enabled in this region env: %s %s",
                    region, env)
        return None

    stream = 'titus_isolate'
    return format_str.format(region, env, stream)
def get_grpc_cell_name(config_manager):
    """Ask the gRPC remote allocator for the current cell id.

    :param config_manager: source of the cached allocator endpoint setting
    :return: the cell id reported by the service, or ``UNKNOWN_CELL`` when no
        endpoint is configured or the service returns an empty id
    :raises Exception: whatever the gRPC call raises (e.g. on timeout); the
        call is made with a 5 second timeout
    """
    endpoint = config_manager.get_cached_str(GRPC_REMOTE_ALLOC_ENDPOINT, None)
    if endpoint is None:
        log.warning("Could not get grpc remote allocator endpoint address.")
        return UNKNOWN_CELL

    # Use the channel as a context manager so it is closed after the single
    # request; previously it was never closed.
    with grpc.insecure_channel(endpoint) as channel:
        stub = IsolationServiceStub(channel)
        res = stub.GetCurrentCell(CurrentCellRequest(), timeout=5.0)

    if res.cell_id == "":
        log.warning("Service returned empty grpc cell header")
        return UNKNOWN_CELL

    return res.cell_id
def get_http_cell_name(config_manager):
    """Read the cell name from the remote allocator's HTTP response headers.

    :param config_manager: source of the cached allocator url and timeout
    :return: the value of the cell header, or ``UNKNOWN_CELL`` when no url is
        configured or the header is absent
    """
    allocator_url = config_manager.get_cached_str(REMOTE_ALLOCATOR_URL)
    if allocator_url is None:
        log.warning("No remote solver URL specified.")
        return UNKNOWN_CELL

    response = requests.get(
        allocator_url,
        timeout=config_manager.get_cached_int(MAX_SOLVER_RUNTIME,
                                              DEFAULT_MAX_SOLVER_RUNTIME))

    cell_header = response.headers.get(TITUS_ISOLATE_CELL_HEADER, None)
    if cell_header is not None:
        return cell_header

    log.warning("Titus isolation cell header is not set.")
    return UNKNOWN_CELL
def get_cpu_usage(self):
    """Read and parse the workload's cpu usage file into a snapshot.

    :return: a ``CpuUsageSnapshot`` stamped with the UTC time taken just
        before the file is read, or ``None`` when no usage path is known or
        the path is not an existing file
    """
    path = self.__get_usage_path()
    if path is None:
        return None

    if os.path.isfile(path):
        with open(path, 'r') as usage_file:
            # Timestamp is taken before the read.
            read_at = datetime.datetime.utcnow()
            raw_content = usage_file.read()
        return CpuUsageSnapshot(read_at, parse_cpuacct_usage_all(raw_content))

    log.warning("cpu usage path does not exist: {}".format(path))
    return None
def __init__(self, raw: dict):
    """Populate the instance from a raw predictions dict.

    Missing model version / instance id fall back to ``UNKNOWN_*``
    placeholders, a missing timestamp to ``0`` and missing metadata to an
    empty dict.  Each raw prediction is wrapped in ``ResourceUsagePrediction``
    and keyed by its job id; a warning is logged when there are none.

    :param raw: the raw dict, also kept as ``self.raw``
    """
    self.raw = raw
    self.model_version = raw.get(MODEL_VERSION, "UNKNOWN_MODEL_VERSION")
    self.model_instance_id = raw.get(MODEL_INSTANCE_ID,
                                     "UNKNOWN_MODEL_INSTANCE_ID")
    self.prediction_ts_ms = int(raw.get(PREDICTION_TS_MS, '0'))
    self.metadata = raw.get(META_DATA, {})
    self.predictions = {}

    entries = raw.get(PREDICTIONS)
    if entries is None:
        log.warning("No predictions present")
        return

    for entry in entries:
        job_id = entry.get(JOB_ID, "UNKNOWN_JOB_ID")
        self.predictions[job_id] = ResourceUsagePrediction(entry)
def get_free_threads(self,
                     cpu: Cpu,
                     workload_map: Dict[str, Workload],
                     cpu_usage: Dict[str, float] = None) -> List[Thread]:
    """Return the threads of every core considered free.

    Free cores are determined by ``get_free_cores`` using this provider's
    threshold.  Without CPU usage data a warning is logged and the CPU's
    empty threads are returned instead.

    :param cpu: the CPU to inspect
    :param workload_map: workloads forwarded to ``get_free_cores``
    :param cpu_usage: CPU usage forwarded to ``get_free_cores``; optional
    :return: list of free threads
    """
    if cpu_usage is None:
        log.warning(
            "CPU usage is required, defaulting to EMPTY threads being free."
        )
        return cpu.get_empty_threads()

    free_cores = get_free_cores(self.__threshold, cpu, workload_map,
                                cpu_usage)
    return [thread for core in free_cores for thread in core.get_threads()]
def __validate_prom_response(resp) -> bool:
    """Check that a Prometheus response carries a non-empty result.

    :param resp: response mapping expected to hold ``resp["data"]["result"]``
    :return: ``True`` when the result is present and non-empty; ``False``
        (after logging the reason) otherwise
    """
    if "data" not in resp:
        log.error(
            "Unexpected Prometheus response. No 'data' field in response")
        return False

    payload = resp["data"]
    if "result" not in payload:
        log.error(
            "Unexpected Prometheus response. No 'result' field in data")
        return False

    if len(payload["result"]) == 0:
        log.warning("Empty result returned by Prometheus")
        return False

    return True
def rebalance(self, request: AllocateRequest) -> AllocateResponse:
    """Rebalance the CPU in the request across its workloads.

    The per-call metadata is reset and the rebalance call counter bumped on
    every call.  With no workloads the CPU is returned unchanged and the
    metadata records ``rebalance_empty``.

    :param request: carries the CPU, its usage and the workloads
    :return: response holding the resulting CPU, this allocator's name and
        the call metadata
    """
    self.__call_meta = {}
    cpu = request.get_cpu()
    cpu_usage = request.get_cpu_usage()
    workloads = request.get_workloads()
    self.__cnt_rebalance_calls += 1

    if len(workloads) == 0:
        log.warning("Ignoring rebalance of empty CPU.")
        self.__call_meta['rebalance_empty'] = 1
        return AllocateResponse(cpu, self.get_name(), self.__call_meta)

    log.info("Rebalancing with predictions...")
    current_thread_ids = cpu.get_workload_ids_to_thread_ids()
    rebalanced_cpu = self.__compute_allocation(cpu, None, workloads,
                                               current_thread_ids, cpu_usage,
                                               None)
    return AllocateResponse(rebalanced_cpu, self.get_name(), self.__call_meta)
def get_current_workloads(docker_client):
    """Build a Workload for every running container with the required labels.

    Containers lacking a required label are logged and skipped.  A container
    whose labels cannot be parsed is logged with its traceback and skipped,
    so one bad container does not prevent the others from being returned.

    :param docker_client: client whose ``containers.list()`` is enumerated
    :return: list of ``Workload`` objects for the parseable containers
    """
    workloads = []
    for container in docker_client.containers.list():
        workload_id = container.name

        if not __has_required_labels(container):
            # The placeholder was previously never filled in.
            log.warning(
                "Found running workload: '{}' without expected labels".format(
                    workload_id))
            continue

        try:
            labels = container.labels
            cpu = int(__get_value(labels, CPU_LABEL_KEY, -1))
            mem = int(__get_value(labels, MEM_LABEL_KEY, -1))
            disk = int(__get_value(labels, DISK_LABEL_KEY, -1))
            # NOTE(review): network is read from DISK_LABEL_KEY, which looks
            # like a copy/paste slip -- confirm whether a network label key
            # exists and should be used here.
            network = int(__get_value(labels, DISK_LABEL_KEY, -1))
            app_name = __get_value(labels, APP_NAME_LABEL_KEY)
            owner_email = __get_value(labels, OWNER_EMAIL_LABEL_KEY)
            command = __get_value(labels, COMMAND_LABEL_KEY)
            entrypoint = __get_value(labels, ENTRYPOINT_LABEL_KEY)
            job_type = __get_value(labels, JOB_TYPE_LABEL_KEY)
            workload_type = __get_value(labels, WORKLOAD_TYPE_LABEL_KEY)
            image = __get_image(container)

            workloads.append(
                Workload(identifier=workload_id,
                         thread_count=cpu,
                         mem=mem,
                         disk=disk,
                         network=network,
                         app_name=app_name,
                         owner_email=owner_email,
                         image=image,
                         command=command,
                         entrypoint=entrypoint,
                         job_type=job_type,
                         workload_type=workload_type))
            log.info("Found running workload: '{}'".format(workload_id))
        except Exception:
            # Narrowed from a bare except so SystemExit/KeyboardInterrupt
            # are no longer swallowed.
            log.exception(
                "Failed to parse labels for container: '{}'".format(
                    container.name))

    return workloads
def get_current_workloads(docker_client):
    """Build a Workload for every running container with the required labels.

    Containers failing the required-label check are logged and skipped.  A
    container whose labels cannot be parsed is logged with its traceback and
    skipped.

    :param docker_client: client whose ``containers.list()`` is enumerated
    :return: list of ``Workload`` objects for the parseable containers
    """
    workloads = []
    for container in docker_client.containers.list():
        workload_id = container.name

        if not __has_required_labels(container):
            log.warning(
                "Found running workload: '{}' without expected label: '{}'".
                format(workload_id, CPU_LABEL_KEY))
            continue

        try:
            cpu = int(container.labels[CPU_LABEL_KEY])
            workload_type = container.labels[WORKLOAD_TYPE_LABEL_KEY]
            workloads.append(Workload(workload_id, cpu, workload_type))
            log.info("Found running workload: '{}'".format(workload_id))
        except Exception:
            # Narrowed from a bare except so SystemExit/KeyboardInterrupt
            # are no longer swallowed.
            log.exception(
                "Failed to parse labels for container: '{}'".format(
                    container.name))

    return workloads
def __process_events(self):
    """Drain the event queue forever, sending each message to the event log.

    Messages are dropped (with a warning) while sending is disabled.  A
    response with a status other than 200 is counted as a retry and the
    message is put back on the queue; a 200 is counted as a success.  Any
    exception is counted as a failure and logged, and the loop continues.
    This method never returns.
    """
    while True:
        try:
            msg = self.__q.get()
            if not self.__enabled:
                log.warning(
                    "Dropping keystone event because keystone is disabled")
                continue

            log.debug("Sending event log message: {}".format(msg))
            response = send_event_msg(msg, self.__address)

            if response.status_code != 200:
                log.error("Re-enqueuing failed event log message: {}".format(
                    response.content))
                self.__retry_msg_count += 1
                self.__q.put_nowait(msg)
            else:
                self.__succeeded_msg_count += 1
        except Exception:
            self.__failed_msg_count += 1
            # log.exception keeps the traceback, which log.error dropped.
            log.exception("Failed to process event log message.")
def get_cell_name():
    """Determine the isolation cell name from the primary allocator.

    The gRPC fetcher is used when the primary allocator is gRPC, the HTTP
    fetcher otherwise.

    :return: the fetched cell name, or ``UNKNOWN_CELL`` when the config
        manager is not set or fetching raises
    """
    config_manager = get_config_manager()
    if config_manager is None:
        log.warning("Config manager is not yet set.")
        return UNKNOWN_CELL

    if is_primary_allocator_grpc(config_manager):
        log.info("Fetching cell for the grpc allocator")
        fetch_fun = get_grpc_cell_name
    else:
        log.info("Fetching cell for an http allocator")
        fetch_fun = get_http_cell_name

    try:
        cell_name = fetch_fun(config_manager)
        log.info("Cell: %s", cell_name)
        return cell_name
    except Exception:
        # log.exception records why the lookup failed; log.error gave no
        # cause.
        log.exception("Failed to determine isolation cell.")
        return UNKNOWN_CELL
def get_cpu_predictions(self, workloads: List[Workload], resource_usage: GlobalResourceUsage) \
        -> Optional[Dict[str, float]]:
    """Predict CPU usage for each workload that has CPU usage data.

    The prediction environment is built from the configured region and
    environment plus the current UTC hour.  Workloads without usage data are
    logged and skipped.

    :param workloads: workloads to predict for
    :param resource_usage: provides the per-workload CPU usage
    :return: workload id to prediction; empty when the config manager or the
        CPU usage is unavailable
    """
    config_manager = get_config_manager()
    if config_manager is None:
        log.warning("Config manager is not yet set")
        return {}

    cpu_usage = resource_usage.get_cpu_usage()
    if cpu_usage is None:
        log.warning("No cpu usage")
        return {}

    pred_env = PredEnvironment(config_manager.get_region(),
                               config_manager.get_environment(),
                               datetime.utcnow().hour)

    predictions = {}
    for workload in workloads:
        raw_usage = cpu_usage.get(workload.get_id(), None)
        if raw_usage is None:
            log.warning("No CPU usage for workload: %s", workload.get_id())
            continue

        usage_values = [float(value) for value in raw_usage]
        predictions[workload.get_id()] = self.predict(workload, usage_values,
                                                      pred_env)

    return predictions
def get_cell_name():
    """Read the isolation cell name from the remote allocator's HTTP headers.

    :return: the value of the cell header, or ``UNKNOWN_CELL`` when the
        config manager is not set, no url is configured, the header is
        absent, or the request raises
    """
    config_manager = get_config_manager()
    if config_manager is None:
        log.warning("Config manager is not yet set.")
        return UNKNOWN_CELL

    url = config_manager.get_str(REMOTE_ALLOCATOR_URL)
    if url is None:
        log.warning("No remote solver URL specified.")
        return UNKNOWN_CELL

    timeout = config_manager.get_int(MAX_SOLVER_RUNTIME,
                                     DEFAULT_MAX_SOLVER_RUNTIME)

    try:
        response = requests.get(url, timeout=timeout)
        cell_name = response.headers.get(TITUS_ISOLATE_CELL_HEADER, None)
    except Exception:
        # Narrowed from a bare except so SystemExit/KeyboardInterrupt are no
        # longer swallowed.
        log.exception("Failed to determine isolation cell.")
        return UNKNOWN_CELL

    if cell_name is None:
        log.warning("Titus isolation cell header is not set.")
        return UNKNOWN_CELL

    return cell_name
def __watch(self):
    """Watch opportunistic resources for this node and mirror them locally.

    ``ADDED`` events store the event under the object's metadata name and
    ``DELETED`` events remove it, both under the instance lock; event types
    outside ``HANDLED_EVENTS`` are logged and ignored.  Any exception,
    including the one raised for an expired-resource event, closes the
    stream (if one was opened), is logged, and triggers the exit handler.
    """
    selector = "{}={}".format(OPPORTUNISTIC_RESOURCE_NODE_NAME_LABEL_KEY,
                              self.__node_name)
    log.info("Starting opportunistic resource watch...")

    stream = None
    try:
        stream = watch.Watch().stream(
            self.__custom_api.list_cluster_custom_object,
            group="titus.netflix.com",
            version="v1",
            plural="opportunistic-resources",
            label_selector=selector)

        for event in stream:
            log.info("Event: %s", event)
            if self.__is_expired_error(event):
                raise Exception("Opportunistic resource expired")

            kind = event['type']
            if kind not in HANDLED_EVENTS:
                log.warning("Ignoring unhandled event: %s", event)
                continue

            resource_name = event['object']['metadata']['name']
            with self.__lock:
                if kind == ADDED:
                    self.__opportunistic_resources[resource_name] = event
                elif kind == DELETED:
                    self.__opportunistic_resources.pop(resource_name, None)
    except Exception:
        if stream is not None:
            stream.close()
        log.exception("Watch of opportunistic resources failed")
        self.__exit_handler.exit(OPPORTUNISTIC_WATCH_FAILURE)
def get_predictions(
        self, pods: List[V1Pod], resource_usage: GlobalResourceUsage
) -> Optional[ResourceUsagePredictions]:
    """Request resource usage predictions for the pods that have usage data.

    A pod counts as running when its name is among the workload ids of
    ``resource_usage``; other pods are logged and excluded.  Every
    prerequisite (config manager, client credentials, service url, request
    body) is checked in turn; the first missing one is logged and ends the
    call with ``None``.

    :param pods: candidate pods
    :param resource_usage: usage data; also defines which pods are running
    :return: the wrapped predictions, or ``None`` on any failure
    """
    config_manager = get_config_manager()
    if config_manager is None:
        log.warning("Config manager not yet set.")
        return None

    # Fetch the ids once instead of once per pod; a set makes each
    # membership test O(1).  Assumes workload ids are hashable (strings).
    known_workload_ids = set(resource_usage.get_workload_ids())

    running_pods = []
    for p in pods:
        if p.metadata.name in known_workload_ids:
            running_pods.append(p)
        else:
            log.info("Pod is not yet running: %s", p.metadata.name)

    client_crt = get_client_cert_path(config_manager)
    client_key = get_client_key_path(config_manager)
    if client_crt is None or client_key is None:
        log.error("Failed to generate credential paths")
        return None

    url = get_url(config_manager)
    if url is None:
        log.error("Unable to generate prediction service url")
        return None

    body = self.__get_body(running_pods, resource_usage)
    if body is None:
        log.error("Unable to generate a prediction request body")
        return None

    predictions = get_predictions(client_crt, client_key, url, body)
    if predictions is None:
        log.error("Failed to get predictions")
        return None

    return ResourceUsagePredictions(predictions)
def reconcile(self, cpu: Cpu):
    """Verify each workload's cgroup cpuset matches its assigned thread ids.

    Reconciliation is skipped (and counted as skipped) while the cgroup
    manager has pending work.  A mismatch is logged and reported to the exit
    handler; a match is logged.

    :param cpu: the CPU whose workload-to-thread assignment is checked
    """
    if self.__cgroup_manager.has_pending_work():
        log.warning(
            "Skipping reconciliation as some isolation work is still pending."
        )
        self.__skip_count += 1
        return

    for w_id, t_ids in self.get_workloads(cpu).items():
        cpuset = sorted(self.__cgroup_manager.get_cpuset(w_id))
        t_ids = sorted(t_ids)

        if cpuset == t_ids:
            log.info(
                "Reconciliation has succeeded for workload: '{}', cpuset: {} == t_ids: {}"
                .format(w_id, cpuset, t_ids))
            continue

        log.error(
            "Reconciliation has failed for workload: '{}', cpuset: {} != t_ids: {}"
            .format(w_id, cpuset, t_ids))
        self.__exit_handler.exit(RECONCILIATION_FAILURE_EXIT)

    # NOTE(review): counted once per completed pass, after the loop --
    # confirm against the original indentation.
    self.__success_count += 1
def __publish_window(self, event):
    """Count the CPUs that can be oversubscribed and publish a window.

    For every workload that passes ``__is_long_enough`` and has a CPU
    prediction, the predicted usage per thread is compared against the
    configured total threshold.  Workloads at or below the threshold
    contribute their thread count (minus opportunistic threads for
    opportunistic workloads) to the free CPU count.  A window is published
    only when that count is positive.

    :param event: the event being handled; passed on to ``handled_event``
    """
    # we calculate the window before we send the request to ensure we're not going over our 10 minute mark
    start = datetime.utcnow()
    end = start + timedelta(minutes=self.__config_manager.get_int(
        OVERSUBSCRIBE_WINDOW_SIZE_MINUTES_KEY,
        DEFAULT_OVERSUBSCRIBE_WINDOW_SIZE_MINUTES))

    simple_cpu_usage_predictions = self.__get_simple_cpu_predictions()

    # workload_count: workloads that contributed free CPUs.
    # underutilized_cpu_count: running total of CPUs they contribute.
    workload_count = 0
    underutilized_cpu_count = 0

    for workload in self.__workload_manager.get_workloads():
        log.info('workload:%s job_type:%s cpu:%d', workload.get_app_name(),
                 workload.get_job_type(), workload.get_thread_count())

        if not self.__is_long_enough(workload):
            continue

        simple_cpu_prediction = simple_cpu_usage_predictions.get(
            workload.get_id(), None)
        if simple_cpu_prediction is None:
            log.warning("No CPU prediction for workload: %s",
                        workload.get_id())
            continue

        # Process prediction
        # NOTE(review): divides by the thread count; a workload reporting 0
        # threads would raise ZeroDivisionError here -- confirm it cannot.
        pred_usage = simple_cpu_prediction / workload.get_thread_count()
        threshold = self.__config_manager.get_float(
            TOTAL_THRESHOLD, DEFAULT_TOTAL_THRESHOLD)

        log.info(
            "Testing oversubscribability of workload: {}, threshold: {}, prediction: {}"
            .format(workload.get_id(), threshold, pred_usage))

        if pred_usage > threshold:
            log.info("Workload: %s is NOT oversubscribable: %s",
                     workload.get_id(), pred_usage)
            continue

        log.info("Workload: %s is oversubscribable: %s", workload.get_id(),
                 pred_usage)

        if workload.is_opportunistic():
            # only add the number of "real" threads (non-opportunistic)
            free = workload.get_thread_count(
            ) - workload.get_opportunistic_thread_count()
            if free <= 0:
                continue
            underutilized_cpu_count += free
        else:
            underutilized_cpu_count += workload.get_thread_count()
        workload_count += 1

    free_cpu_count = underutilized_cpu_count
    if free_cpu_count > 0:
        self.__window_publisher.add_window(start, end, free_cpu_count)

    # NOTE(review): the window end time and success count are updated even
    # when nothing was published -- confirm against the original indentation.
    self.__window_end_time = end
    self.__success_count += 1
    self.handled_event(
        event,
        'oversubscribed {} cpus from {} workloads, {} total cpus are oversubscribed'
        .format(free_cpu_count, workload_count,
                underutilized_cpu_count))
def publish(self):
    """Predict resource usage for this node and publish it as a custom object.

    With no pods, an empty prediction set stamped with the current time is
    used.  Otherwise the running pods' allocated resources, predictions and
    batch/service container counts are computed.  The result is wrapped in a
    custom resource named after the node, which is patched; if the patch
    fails with a 404 the resource is created instead.

    :raises Exception: re-raises any failure from computing the allocated
        resources, after counting and logging it
    """
    log.info("Predicting resource usage")
    # Defaults used when there are no pods.
    allocated_resources = Resources()
    num_batch_containers = 0
    num_service_containers = 0
    if len(self.__pod_manager.get_pods()) == 0:
        log.warning("No pods, skipping resource usage prediction")
        predictions = ResourceUsagePredictions({})
        # NOTE(review): time.mktime interprets its argument as local time
        # while the value passed is a UTC time tuple -- confirm the
        # timestamp is as intended on hosts not running in UTC.
        predictions.set_prediction_ts_ms(
            1000 * int(time.mktime(dt.utcnow().timetuple())))
    else:
        running_pods = [
            p for p in self.__pod_manager.get_pods()
            if self.__resource_usage_predictor.is_running(p)
        ]
        try:
            allocated_resources = self.__compute_allocated_resources(
                running_pods)
        except Exception as e:
            self.__parse_pod_req_resources_fail_count += 1
            log.error(
                "Failed to parse pod requested resources. Aborting: %s", e)
            raise e
        workload_ids = [p.metadata.name for p in running_pods]
        # NOTE(review): other blocks show get_predictions returning None on
        # failure; presumably the constructor below tolerates that -- verify.
        predictions = self.__resource_usage_predictor.get_predictions(
            running_pods, self.__wmm.get_resource_usage(workload_ids))
        num_batch_containers, num_service_containers = self.__compute_num_containers(
            running_pods)

    node = get_node()
    log.debug('owner_kind:%s owner_name:%s owner_uid:%s', node.kind,
              node.metadata.name, node.metadata.uid)
    instance_type = get_instance_type(node)
    condensed_predictions = CondensedResourceUsagePrediction(
        predictions, allocated_resources, instance_type,
        num_batch_containers, num_service_containers,
        self.__resources_capacity)

    # The custom object is named after the node, labelled with the node's
    # name and uid, and lists the node as its owner reference.
    object_name = "{}".format(node.metadata.name)
    metadata = V1ObjectMeta(
        namespace=PREDICTED_RESOURCE_USAGE_NAMESPACE,
        name=object_name,
        labels={
            PREDICTED_RESOURCE_USAGE_NODE_NAME_LABEL_KEY:
            node.metadata.name,
            PREDICTED_RESOURCE_USAGE_NODE_UID_LABEL_KEY: node.metadata.uid
        },
        owner_references=[
            V1OwnerReference(api_version=node.api_version,
                             kind=node.kind,
                             name=node.metadata.name,
                             uid=node.metadata.uid)
        ])
    body = ResourceUsagePredictionsResource(metadata=metadata,
                                            spec=condensed_predictions)

    # Placeholder logged below if neither API call assigns a result.
    obj = "UNINITIALIZED_RESOURCE_PREDICTION_OBJECT"
    try:
        obj = self.__custom_api.patch_namespaced_custom_object(
            version=PREDICTED_USAGE_RESOURCE_VERSION,
            group=CUSTOM_RESOURCE_GROUP,
            plural=PREDICTED_RESOURCE_USAGE_PLURAL,
            namespace=PREDICTED_RESOURCE_USAGE_NAMESPACE,
            name=object_name,
            body=body)
    except ApiException as e:
        log.info("ApiException status: %s", e.status)
        if e.status == 404:
            # The object does not exist yet, so create it instead.
            obj = self.__custom_api.create_namespaced_custom_object(
                version=PREDICTED_USAGE_RESOURCE_VERSION,
                group=CUSTOM_RESOURCE_GROUP,
                plural=PREDICTED_RESOURCE_USAGE_PLURAL,
                namespace=PREDICTED_RESOURCE_USAGE_NAMESPACE,
                body=body)
        else:
            log.error("Encountered unexpected API exception reason")
    log.info('predicted resource usage: %s', json.dumps(obj))