def __wait_for_files(container_name):
    cgroup_file_wait_timeout = get_config_manager().get_float(
        WAIT_CGROUP_FILE_KEY,
        DEFAULT_WAIT_CGROUP_FILE_SEC)
    json_file_wait_timeout = get_config_manager().get_float(
        WAIT_JSON_FILE_KEY,
        DEFAULT_WAIT_JSON_FILE_SEC)
    wait_for_files(container_name, cgroup_file_wait_timeout, json_file_wait_timeout)
def get_workload_from_kubernetes(identifier) -> Optional[KubernetesWorkload]:
    if not managers_are_initialized():
        log.error("Cannot get workload from kubernetes because managers aren't initialized")
        return None

    retry_count = get_config_manager().get_int(
        GET_WORKLOAD_RETRY_COUNT,
        DEFAULT_GET_WORKLOAD_RETRY_COUNT)
    retry_interval = get_config_manager().get_float(
        GET_WORKLOAD_RETRY_INTERVAL_SEC,
        DEFAULT_GET_WORKLOAD_RETRY_INTERVAL_SEC)

    pod_manager = get_pod_manager()
    for _ in range(retry_count):
        log.info("Getting pod from kubernetes: %s", identifier)
        pod = pod_manager.get_pod(identifier)
        if pod is not None:
            log.info("Got pod from kubernetes: %s", identifier)
            return KubernetesWorkload(pod)
        log.info("Retrying getting pod from kubernetes in %s seconds", retry_interval)
        time.sleep(retry_interval)

    log.error("Failed to get pod from kubernetes: %s", identifier)
    return None
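# Illustrative sketch of the bounded-retry pattern used above, decoupled from the
# pod manager. The helper name and signature are hypothetical, not part of this module.
import time
from typing import Callable, Optional, TypeVar

T = TypeVar("T")

def _retry_fetch(fetch: Callable[[], Optional[T]], count: int, interval_sec: float) -> Optional[T]:
    # Try up to `count` times, sleeping between misses, mirroring the loop above
    # (which also sleeps after the final failed attempt).
    for _ in range(count):
        result = fetch()
        if result is not None:
            return result
        time.sleep(interval_sec)
    return None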
def get_cpu_model_prefix_name():
    config_manager = get_config_manager()
    prefix = config_manager.get_str(MODEL_BUCKET_PREFIX, DEFAULT_MODEL_BUCKET_PREFIX)
    leaf = config_manager.get_str(MODEL_BUCKET_LEAF, DEFAULT_MODEL_BUCKET_LEAF)
    format_str = config_manager.get_str(MODEL_PREFIX_FORMAT_STR)
    if format_str is None:
        return None
    return format_str.format(prefix, leaf)
def get_cpu_predictions(self, workloads: List[Workload], resource_usage: GlobalResourceUsage) \
        -> Optional[Dict[str, float]]:
    config_manager = get_config_manager()
    if config_manager is None:
        log.warning("Config manager is not yet set")
        return {}

    cpu_usage = resource_usage.get_cpu_usage()
    if cpu_usage is None:
        log.warning("No cpu usage")
        return {}

    pred_env = PredEnvironment(config_manager.get_region(),
                               config_manager.get_environment(),
                               datetime.utcnow().hour)

    predictions = {}
    for workload in workloads:
        workload_cpu_usage = cpu_usage.get(workload.get_id(), None)
        if workload_cpu_usage is None:
            log.warning("No CPU usage for workload: %s", workload.get_id())
            continue
        workload_cpu_usage = [float(u) for u in workload_cpu_usage]
        pred_cpus = self.predict(workload, workload_cpu_usage, pred_env)
        predictions[workload.get_id()] = pred_cpus

    return predictions
def get_workload_response(workload: Workload, cpu: Cpu) -> Optional[WorkloadAllocateResponse]:
    thread_ids = get_threads(cpu, workload.get_id())
    cpu_shares = get_cpu_shares(workload)
    cpu_quota = get_cpu_quota(workload)

    if len(thread_ids) < 1:
        return None

    memory_migrate = DEFAULT_TITUS_ISOLATE_MEMORY_MIGRATE
    memory_spread_page = DEFAULT_TITUS_ISOLATE_MEMORY_SPREAD_PAGE
    memory_spread_slab = DEFAULT_TITUS_ISOLATE_MEMORY_SPREAD_SLAB

    config_manager = get_config_manager()
    if config_manager is not None:
        memory_migrate = config_manager.get_cached_bool(
            TITUS_ISOLATE_MEMORY_MIGRATE, DEFAULT_TITUS_ISOLATE_MEMORY_MIGRATE)
        memory_spread_page = config_manager.get_cached_bool(
            TITUS_ISOLATE_MEMORY_SPREAD_PAGE, DEFAULT_TITUS_ISOLATE_MEMORY_SPREAD_PAGE)
        memory_spread_slab = config_manager.get_cached_bool(
            TITUS_ISOLATE_MEMORY_SPREAD_SLAB, DEFAULT_TITUS_ISOLATE_MEMORY_SPREAD_SLAB)

    return WorkloadAllocateResponse(
        workload_id=workload.get_id(),
        thread_ids=thread_ids,
        cpu_shares=cpu_shares,
        cpu_quota=cpu_quota,
        memory_migrate=memory_migrate,
        memory_spread_page=memory_spread_page,
        memory_spread_slab=memory_spread_slab)
def get_required_property(key):
    value = get_config_manager().get_str(key)
    if value is None:
        log.error("Failed to retrieve property: '{}'".format(key))
        return None
    return value
def __init__(self, primary_cpu_allocator: CpuAllocator, secondary_cpu_allocator: CpuAllocator):
    if primary_cpu_allocator is None:
        raise ValueError("Must be provided a primary cpu allocator.")
    if secondary_cpu_allocator is None:
        raise ValueError("Must be provided a secondary cpu allocator.")

    self.__reg = None
    self.__primary_allocator = primary_cpu_allocator
    self.__secondary_allocator = secondary_cpu_allocator

    self.__primary_assign_threads_call_count = 0
    self.__primary_free_threads_call_count = 0
    self.__primary_rebalance_call_count = 0
    self.__secondary_assign_threads_call_count = 0
    self.__secondary_free_threads_call_count = 0
    self.__secondary_rebalance_call_count = 0
    self.__queue_depth_fallback_count = 0

    cm = get_config_manager()
    self.__fallback_queue_depth = cm.get_cached_int(FALLBACK_QUEUE_DEPTH, DEFAULT_FALLBACK_QUEUE_DEPTH)

    log.info(
        "Created FallbackCpuAllocator with primary cpu allocator: '{}' and "
        "secondary cpu allocator: '{}', fallback queue depth: '{}'".format(
            self.__primary_allocator.__class__.__name__,
            self.__secondary_allocator.__class__.__name__,
            self.__fallback_queue_depth))
def __init__(self, event_iterable, event_handlers, event_timeout=DEFAULT_EVENT_TIMEOUT_SECS):
    self.__reg = None
    self.__stopped = False
    self.__q = Queue()
    self.__events = event_iterable
    self.__event_handlers = event_handlers
    self.__event_timeout = event_timeout

    self.__success_event_count = 0
    self.__error_event_count = 0
    self.__processed_event_count = 0

    self.__started = False
    self.__started_lock = Lock()

    self.__processing_thread = Thread(target=self.__process_events)
    self.__pulling_thread = Thread(target=self.__pull_events)

    config_manager = get_config_manager()

    rebalance_frequency = config_manager.get_float(
        REBALANCE_FREQUENCY_KEY, DEFAULT_REBALANCE_FREQUENCY)
    if rebalance_frequency > 0:
        schedule.every(rebalance_frequency).seconds.do(self.__rebalance)

    reconcile_frequency = config_manager.get_float(
        RECONCILE_FREQUENCY_KEY, DEFAULT_RECONCILE_FREQUENCY)
    if reconcile_frequency > 0:
        schedule.every(reconcile_frequency).seconds.do(self.__reconcile)
def get_cpu_shares(workload: Workload) -> int:
    if workload.is_opportunistic():
        opportunistic_shares_scale = get_config_manager().get_int(
            OPPORTUNISTIC_SHARES_SCALE_KEY, DEFAULT_OPPORTUNISTIC_SHARES_SCALE)
        return workload.get_thread_count() * opportunistic_shares_scale

    return workload.get_thread_count() * DEFAULT_SHARES_SCALE
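# Illustrative arithmetic for the shares computation above. The scale values here
# are made up for the example; the real ones come from constants and configuration.
def _example_shares(thread_count: int, opportunistic: bool) -> int:
    default_scale = 100        # hypothetical stand-in for DEFAULT_SHARES_SCALE
    opportunistic_scale = 1    # hypothetical stand-in for the configured opportunistic scale
    return thread_count * (opportunistic_scale if opportunistic else default_scale)

assert _example_shares(4, opportunistic=False) == 400
assert _example_shares(4, opportunistic=True) == 4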
def get_prom_url() -> str:
    cm = get_config_manager()
    # e.g. titusprometheus.us-east-1.staging01cell001.test.netflix.net
    default_host = f'titusprometheus.{cm.get_region()}.{cm.get_stack()}.{cm.get_environment()}.netflix.net'
    host = cm.get_cached_str(PROMETHEUS_HOST_OVERRIDE, default_host)
    return f'http://{host}/api/v1/query_range'
def get_predictions(
        self,
        running_pods: List[V1Pod],
        resource_usage: GlobalResourceUsage) -> Optional[ResourceUsagePredictions]:
    config_manager = get_config_manager()
    if config_manager is None:
        log.warning("Config manager not yet set.")
        return None

    client_crt = get_client_cert_path(config_manager)
    client_key = get_client_key_path(config_manager)
    if client_crt is None or client_key is None:
        log.error("Failed to generate credential paths")
        return None

    url = get_url(config_manager)
    if url is None:
        log.error("Unable to generate prediction service url")
        return None

    body = self.__get_body(running_pods, resource_usage)
    if body is None:
        log.error("Unable to generate a prediction request body")
        return None

    predictions = get_predictions(client_crt, client_key, url, body)
    if predictions is None:
        log.error("Failed to get predictions")
        return None

    return ResourceUsagePredictions(predictions)
def __get_request_metadata(self, request_type) -> dict:
    config_manager = get_config_manager()
    return {
        "type": request_type,
        "instance_id": self.__instance_id,
        "region": config_manager.get_region(),
        "environment": config_manager.get_environment()
    }
def __update_local_model(self):
    cpu_predictor = get_config_manager().get_str(CPU_PREDICTOR, DEFAULT_CPU_PREDICTOR)
    if cpu_predictor == LEGACY_CPU_PREDICTOR:
        download_latest_cpu_model()
        with self.__lock:
            self.__cpu_usage_predictor = CpuUsagePredictor(get_cpu_model_file_path())
    else:
        log.info("Skipping model update. CPU predictor: %s", cpu_predictor)
# Note: the default provider instance is created once, when the def statement is
# evaluated, and is shared by every construction that omits the argument.
def __init__(self, free_thread_provider=EmptyFreeThreadProvider()):
    self.__reg = None
    self.__cache = {}
    self.__time_bound_call_count = 0
    self.__solver_max_runtime_secs = get_config_manager().get_float(
        MAX_SOLVER_RUNTIME, DEFAULT_MAX_SOLVER_RUNTIME)
    self.__free_thread_provider = free_thread_provider
def __init__(self, free_thread_provider):
    config_manager = get_config_manager()
    self.__url = config_manager.get_str(REMOTE_ALLOCATOR_URL, "http://localhost:7501")
    solver_max_runtime_secs = config_manager.get_float(MAX_SOLVER_RUNTIME, DEFAULT_MAX_SOLVER_RUNTIME)
    solver_max_connect_secs = config_manager.get_float(MAX_SOLVER_CONNECT_SEC, DEFAULT_MAX_SOLVER_CONNECT_SEC)
    # (connect, read) tuple, as interpreted by HTTP clients such as requests
    self.__timeout = (solver_max_connect_secs, solver_max_runtime_secs)
    self.__headers = {'Content-Type': "application/json"}
    self.__reg = None
def __init__(self, exit_handler: ExitHandler):
    self.__exit_handler = exit_handler
    self.__config_manager = get_config_manager()
    self.__registry = None
    self.__oppo = None
    self.__custom_api = kubernetes.client.CustomObjectsApi(
        kubernetes.config.new_client_from_config(config_file=DEFAULT_KUBECONFIG_PATH))
def __set_address(self):
    config_manager = get_config_manager()
    region = config_manager.get_region()
    env = config_manager.get_environment()
    format_str = config_manager.get_str(EVENT_LOG_FORMAT_STR)
    stream = 'titus_isolate'
    self.__address = format_str.format(region, env, stream)
    log.info("Set keystone address to: {}".format(self.__address))
def __init__(self):
    self.__config_manager = get_config_manager()
    self.__node_name = self.__config_manager.get_str(EC2_INSTANCE_ID)
    kubeconfig = self.get_kubeconfig_path()
    self.__core_api = kubernetes.client.CoreV1Api(
        kubernetes.config.new_client_from_config(config_file=kubeconfig))
    # NOTE[jigish]: This API depends on the OpportunisticResource CRD. See the readme for how to create it.
    self.__custom_api = kubernetes.client.CustomObjectsApi(
        kubernetes.config.new_client_from_config(config_file=kubeconfig))
def get_cpu_model_bucket_name():
    format_str = get_required_property(MODEL_BUCKET_FORMAT_STR)
    if format_str is None:
        return None

    config_manager = get_config_manager()
    region = config_manager.get_region()
    env = config_manager.get_environment()
    return format_str.format(region, env)
def __get_tags():
    ec2_instance_id = 'EC2_INSTANCE_ID'

    tags = {}
    if ec2_instance_id in os.environ:
        tags["node"] = os.environ[ec2_instance_id]

    allocator_name = get_allocator_class(get_config_manager()).__name__
    tags["cpu_allocator"] = allocator_name

    return tags
def __get_address(self) -> Optional[str]:
    config_manager = get_config_manager()
    region = config_manager.get_region()
    env = config_manager.get_environment()

    format_str = config_manager.get_str(EVENT_LOG_FORMAT_STR)
    if format_str is None:
        log.warning("Keystone is not enabled in this region env: %s %s", region, env)
        return None

    stream = 'titus_isolate'
    return format_str.format(region, env, stream)
def get_cpu_predictor(self) -> Optional[SimpleCpuPredictor]:
    config_manager = get_config_manager()
    cpu_predictor = config_manager.get_str(CPU_PREDICTOR, DEFAULT_CPU_PREDICTOR)
    log.info("Using cpu predictor: %s", cpu_predictor)

    if cpu_predictor == SERVICE_CPU_PREDICTOR:
        return self.__resource_usage_predictor

    if cpu_predictor == LEGACY_CPU_PREDICTOR:
        with self.__lock:
            return self.__cpu_usage_predictor

    return None
def __watch(self):
    while True:
        try:
            instance_id = get_config_manager().get_str("EC2_INSTANCE_ID")
            field_selector = "spec.nodeName={}".format(instance_id)
            log.info("Watching pods with field selector: %s", field_selector)
            v1 = client.CoreV1Api()
            w = watch.Watch()
            for event in w.stream(v1.list_pod_for_all_namespaces, field_selector=field_selector):
                self.__handle_event(event)
        except Exception:
            log.exception("pod watch thread failed")
def __init__(self, workload_manager: WorkloadManager, window_publisher: OpportunisticWindowPublisher):
    super().__init__(workload_manager)
    self.__window_publisher = window_publisher

    self.__reg = None
    self.__fail_count = 0
    self.__skip_count = 0
    self.__success_count = 0
    self.__reclaimed_cpu_count = None

    self.__config_manager = get_config_manager()
    self.__workload_monitor_manager = get_workload_monitor_manager()
    self.__cpu_usage_predictor_manager = get_cpu_usage_predictor_manager()
    self.__node_name = self.__config_manager.get_str(EC2_INSTANCE_ID)
def update_numa_balancing(workload: Workload, cpu: Cpu):
    try:
        config_manager = get_config_manager()
        dynamic_numa_balancing_enabled = config_manager.get_bool(
            TITUS_ISOLATE_DYNAMIC_NUMA_BALANCING, DEFAULT_TITUS_ISOLATE_DYNAMIC_NUMA_BALANCING)

        if not dynamic_numa_balancing_enabled:
            enable_numa_balancing()
            return

        if _occupies_entire_cpu(workload, cpu):
            disable_numa_balancing()
        else:
            enable_numa_balancing()
    except Exception:
        log.exception("Failed to update NUMA balancing.")
def isolate_workload(workload_id, timeout=None):
    if timeout is None:
        timeout = get_config_manager().get_float(
            TITUS_ISOLATE_BLOCK_SEC, DEFAULT_TITUS_ISOLATE_BLOCK_SEC)

    deadline = time.time() + timeout
    while time.time() < deadline:
        if get_workload_manager().is_isolated(workload_id):
            return json.dumps({'workload_id': workload_id}), 200, {'ContentType': 'application/json'}
        time.sleep(0.1)

    log.error("Failed to isolate workload: '{}'".format(workload_id))
    return json.dumps({'unknown_workload_id': workload_id}), 404, {'ContentType': 'application/json'}
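# Illustrative sketch of the deadline-polling pattern above, separated from the
# HTTP response handling. The helper name and signature are hypothetical.
import time
from typing import Callable

def _poll_until(predicate: Callable[[], bool], timeout_sec: float, interval_sec: float = 0.1) -> bool:
    # Poll until the predicate holds or the deadline passes, as isolate_workload does.
    deadline = time.time() + timeout_sec
    while time.time() < deadline:
        if predicate():
            return True
        time.sleep(interval_sec)
    return False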
def __init__(self, event_iterable, event_handlers, event_timeout=DEFAULT_EVENT_TIMEOUT_SECS):
    self.__reg = None
    self.__tags = None
    self.__stopped = False
    self.__q = Queue()
    self.__events = event_iterable
    self.__event_handlers = event_handlers
    self.__event_timeout = event_timeout
    self.__processed_count = 0
    self.__started = False
    self.__started_lock = Lock()
    self.__processing_thread = Thread(target=self.__process_events)
    self.__pulling_thread = Thread(target=self.__pull_events)
    self.last_successful_event_epoch_s = 0

    config_manager = get_config_manager()

    # If every instance of titus-isolate restarts at once, the periodic tasks (e.g. rebalance)
    # would all fire in lockstep, so each schedule is offset by a per-process random jitter.
    random_jitter = randrange(10)  # 0-9 inclusive

    rebalance_frequency = config_manager.get_float(REBALANCE_FREQUENCY_KEY, DEFAULT_REBALANCE_FREQUENCY)
    if rebalance_frequency > 0:
        schedule.every(rebalance_frequency + random_jitter).seconds.do(self.__rebalance)

    reconcile_frequency = config_manager.get_float(RECONCILE_FREQUENCY_KEY, DEFAULT_RECONCILE_FREQUENCY)
    if reconcile_frequency > 0:
        schedule.every(reconcile_frequency + random_jitter).seconds.do(self.__reconcile)

    oversubscribe_frequency = config_manager.get_float(OVERSUBSCRIBE_FREQUENCY_KEY, DEFAULT_OVERSUBSCRIBE_FREQUENCY)
    if oversubscribe_frequency > 0:
        schedule.every(oversubscribe_frequency + random_jitter).seconds.do(self.__oversubscribe)

    predict_resource_usage_frequency = config_manager.get_float(
        PREDICT_RESOURCE_USAGE_FREQUENCY_KEY, DEFAULT_PREDICT_RESOURCE_USAGE_FREQUENCY)
    if predict_resource_usage_frequency > 0:
        schedule.every(predict_resource_usage_frequency + random_jitter).seconds.do(self.__predict_usage)
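# Illustrative, standalone sketch of the jittered scheduling above, using the same
# `schedule` library. The 60-second base frequency is a made-up example value.
import schedule
from random import randrange

random_jitter = randrange(10)  # fixed per process, 0-9 seconds
schedule.every(60 + random_jitter).seconds.do(lambda: print("rebalance"))
# A fleet restarted at once now spreads its periodic work across a ~10 second window
# instead of firing in lockstep. (Jobs run whenever schedule.run_pending() is invoked.)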
def __init__(self, free_thread_provider):
    config_manager = get_config_manager()
    self.__endpoint = config_manager.get_cached_str(GRPC_REMOTE_ALLOC_ENDPOINT, None)
    if self.__endpoint is None:
        raise Exception("Could not get remote allocator endpoint address.")
    # The config value is in milliseconds; divide to get seconds for the call deadline.
    self.__call_timeout_secs = config_manager.get_cached_int(
        GRPC_REMOTE_ALLOC_CLIENT_CALL_TIMEOUT_MS,
        GRPC_REMOTE_ALLOC_DEFAULT_CLIENT_CALL_TIMEOUT_MS) / 1000.0
    self.__stub = self.__create_stub()
    self.__instance_ctx = self.__pull_context()
    self.__reg = None
    self.__empty_cpu = get_cpu_from_env()
    self.__natural2original_indexing = self.__empty_cpu.get_natural_indexing_2_original_indexing()
    self.__original2natural_indexing = {v: k for k, v in self.__natural2original_indexing.items()}
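# Illustrative: the final dict comprehension above simply inverts a one-to-one index
# mapping. The mapping values here are made up.
natural2original = {0: 0, 1: 2, 2: 1, 3: 3}
original2natural = {v: k for k, v in natural2original.items()}
assert original2natural == {0: 0, 2: 1, 1: 2, 3: 3}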
def __init__(self, cpu: Cpu, cgroup_manager: CgroupManager, cpu_allocator: CpuAllocator):
    self.__reg = None
    self.__lock = Lock()
    self.__instance_id = get_config_manager().get_str(EC2_INSTANCE_ID)
    self.__cpu_allocator = cpu_allocator

    self.__error_count = 0
    self.__added_count = 0
    self.__removed_count = 0
    self.__rebalanced_count = 0
    self.__added_to_full_cpu_count = 0
    self.__allocator_call_duration_sum_secs = 0

    self.__cpu = cpu
    self.__cgroup_manager = cgroup_manager
    self.__wmm = get_workload_monitor_manager()
    self.__workloads = {}

    log.info("Created workload manager")
def __init__(self, exit_handler: ExitHandler):
    self.__exit_handler = exit_handler
    self.__config_manager = get_config_manager()
    self.__node_name = self.__config_manager.get_str(EC2_INSTANCE_ID)

    kubeconfig = self.get_kubeconfig_path()
    self.__core_api = kubernetes.client.CoreV1Api(
        kubernetes.config.new_client_from_config(config_file=kubeconfig))
    self.__custom_api = kubernetes.client.CustomObjectsApi(
        kubernetes.config.new_client_from_config(config_file=kubeconfig))

    self.__lock = Lock()
    self.__opportunistic_resources = {}

    oversubscribe_frequency = self.__config_manager.get_float(
        OVERSUBSCRIBE_FREQUENCY_KEY, DEFAULT_OVERSUBSCRIBE_FREQUENCY)
    if oversubscribe_frequency > 0:
        watch_thread = Thread(target=self.__watch)
        watch_thread.start()
    else:
        log.info("Skipping opportunistic resource watch, as opportunistic publishing is not configured.")