def get_url(config_manager: ConfigManager) -> Optional[str]:
    url_format = config_manager.get_str(PREDICTION_SERVICE_URL_FORMAT_STR)
    if url_format is None:
        return None
    return url_format.format(config_manager.get_region(), config_manager.get_environment())
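# Usage sketch for get_url: the configured format string carries two positional
# slots that are filled with region and environment. The format value and the
# arguments below are hypothetical examples, not values shipped with this repository.
_example_format = "https://prediction.{}.{}.example.net"
assert _example_format.format("us-east-1", "prod") == \
       "https://prediction.us-east-1.prod.example.net"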
def test_something_to_something_update(self):
    property_provider = TestPropertyProvider({CPU_ALLOCATOR: IP})
    config_manager = ConfigManager(property_provider)
    self.assertEqual(IP, config_manager.get_str(CPU_ALLOCATOR))

    property_provider.map[CPU_ALLOCATOR] = GREEDY
    self.assertEqual(GREEDY, config_manager.get_str(CPU_ALLOCATOR))
def test_real_instance_ids(self):
    even_instance_id = 'i-0cfefd19c9a8db976'
    property_provider = TestPropertyProvider({
        ALLOCATOR_KEY: AB_TEST,
        CPU_ALLOCATOR_A: IP,
        CPU_ALLOCATOR_B: GREEDY,
        EC2_INSTANCE_ID: even_instance_id
    })
    config_manager = ConfigManager(property_provider)
    allocator_class = get_allocator_class(config_manager, 12)
    self.assertEqual(IntegerProgramCpuAllocator, allocator_class)

    odd_instance_id = 'i-0cfefd19c9a8db977'
    property_provider = TestPropertyProvider({
        ALLOCATOR_KEY: AB_TEST,
        CPU_ALLOCATOR_A: IP,
        CPU_ALLOCATOR_B: GREEDY,
        EC2_INSTANCE_ID: odd_instance_id
    })
    config_manager = ConfigManager(property_provider)
    allocator_class = get_allocator_class(config_manager, 12)
    self.assertEqual(GreedyCpuAllocator, allocator_class)
def test_nothing_to_no_change_update(self):
    property_provider = TestPropertyProvider({})
    exit_handler = TestExitHandler()
    config_manager = ConfigManager(property_provider)
    self.assertEqual(None, config_manager.get_str(CPU_ALLOCATOR))

    watcher = RestartPropertyWatcher(config_manager, exit_handler, [CPU_ALLOCATOR])
    watcher.detect_changes()
    self.assertEqual(None, exit_handler.last_code)
def test_something_to_no_change_update(self):
    property_provider = TestPropertyProvider({})
    exit_handler = TestExitHandler()
    config_manager = ConfigManager(property_provider, CONFIG_CHANGE_INTERVAL)
    self.assertEqual(None, config_manager.get(ALLOCATOR_KEY))

    watcher = CpuAllocatorWatcher(config_manager, exit_handler, CONFIG_CHANGE_INTERVAL)
    watcher.detect_allocator_change()
    self.assertEqual(None, exit_handler.last_code)
def test_none_to_something_update(self):
    property_provider = TestPropertyProvider({})
    exit_handler = TestExitHandler()
    config_manager = ConfigManager(property_provider)
    self.assertEqual(None, config_manager.get_str(CPU_ALLOCATOR))

    watcher = RestartPropertyWatcher(config_manager, exit_handler, [CPU_ALLOCATOR])
    property_provider.map[CPU_ALLOCATOR] = GREEDY
    watcher.detect_changes()
    self.assertEqual(GENERIC_PROPERTY_CHANGE_EXIT, exit_handler.last_code)
def test_forecast_threshold_no_usage(self):
    allocator = ForecastIPCpuAllocator(
        TestCpuUsagePredictorManager(),
        ConfigManager(TestPropertyProvider({})),
        OversubscribeFreeThreadProvider(0.1))

    thread_count = DEFAULT_TOTAL_THREAD_COUNT / 2
    cpu = get_cpu()

    w0 = get_test_workload(uuid.uuid4(), thread_count, STATIC)
    request = get_no_usage_threads_request(cpu, [w0])
    cpu = allocator.assign_threads(request).get_cpu()
    log.info(cpu)

    # All cores should be occupied, each with exactly one empty thread
    for c in cpu.get_cores():
        self.assertEqual(1, len(c.get_empty_threads()))

    w1 = get_test_workload(uuid.uuid4(), thread_count, BURST)
    request = get_no_usage_threads_request(cpu, [w0, w1])
    cpu = allocator.assign_threads(request).get_cpu()
    log.info(cpu)

    # No core should be shared between different workloads: both threads on
    # each core must belong to the same workload(s)
    for c in cpu.get_cores():
        self.assertEqual(
            c.get_threads()[0].get_workload_ids(),
            c.get_threads()[1].get_workload_ids())
def __get_credential_path(config_manager: ConfigManager, file_name: str) -> Optional[str]:
    credentials_path = config_manager.get_str(CREDENTIALS_PATH)
    if credentials_path is None:
        return None
    return os.path.join(credentials_path, file_name)
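# Usage sketch: the helper joins the configured credentials directory with a
# file name. The directory and file below are hypothetical examples.
import os

assert os.path.join("/run/credentials", "client.key") == "/run/credentials/client.key"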
def test_forecast_ip_burst_pool_with_usage(self):
    class UsagePredictorWithBurst:
        def __init__(self):
            self.__model = TestPredictor()

        def predict(self, workload: Workload, cpu_usage_last_hour: np.array,
                    pred_env: PredEnvironment) -> float:
            if workload.get_id() == 'static_a':
                return workload.get_thread_count() * 0.8
            elif workload.get_id() == 'static_b':
                return workload.get_thread_count() * 0.01
            elif workload.get_id() == 'burst_c':
                return workload.get_thread_count() * 0.9

        def get_model(self):
            return self.__model

    upm = TestCpuUsagePredictorManager(UsagePredictorWithBurst())
    cm = ConfigManager(TestPropertyProvider({BURST_CORE_COLLOC_USAGE_THRESH: 0.9}))
    allocator = ForecastIPCpuAllocator(upm, cm, OversubscribeFreeThreadProvider(0.1))

    cpu = get_cpu(package_count=2, cores_per_package=16)
    w_a = get_test_workload("static_a", 14, STATIC)
    w_b = get_test_workload("static_b", 14, STATIC)
    w_c = get_test_workload("burst_c", 2, BURST)

    request = get_no_usage_threads_request(cpu, [w_a])
    cpu = allocator.assign_threads(request).get_cpu()

    request = get_no_usage_threads_request(cpu, [w_a, w_c])
    cpu = allocator.assign_threads(request).get_cpu()
    # With an aggressive burst pool expansion, burst should be collocated with static on cores:
    self.assertLess(40, len(cpu.get_claimed_threads()))
    num_burst_1 = len(cpu.get_workload_ids_to_thread_ids()[w_c.get_id()])

    request = get_no_usage_threads_request(cpu, [w_a, w_c, w_b])
    cpu = allocator.assign_threads(request).get_cpu()
    # Burst should retract, and prefer collocation with b over a:
    num_burst_2 = len(cpu.get_workload_ids_to_thread_ids()[w_c.get_id()])
    self.assertLessEqual(num_burst_2, num_burst_1)

    colloc_a = 0
    colloc_b = 0
    for p in cpu.get_packages():
        for c in p.get_cores():
            t1 = c.get_threads()[0]
            t2 = c.get_threads()[1]
            if t1.is_claimed() and t2.is_claimed():
                wt1 = t1.get_workload_ids()[0]
                wt2 = t2.get_workload_ids()[0]
                if (wt1 == w_a.get_id() and wt2 == w_c.get_id()) or (
                        wt1 == w_c.get_id() and wt2 == w_a.get_id()):
                    colloc_a += 1
                elif (wt1 == w_b.get_id() and wt2 == w_c.get_id()) or (
                        wt1 == w_c.get_id() and wt2 == w_b.get_id()):
                    colloc_b += 1
    self.assertLessEqual(colloc_a, colloc_b)
def test_publish_window(self):
    set_config_manager(ConfigManager(TestPropertyProvider({})))
    set_workload_monitor_manager(TestWorkloadMonitorManager())

    window_publisher = TestOpportunisticWindowPublisher(
        get_current_end_func=lambda: datetime.utcnow() - timedelta(minutes=1),
        add_window_func=lambda: None)

    w_id = str(uuid.uuid4())
    workload = get_test_workload(w_id, 1, STATIC)

    set_cpu_usage_predictor_manager(
        TestCpuUsagePredictorManager(
            TestSimpleCpuPredictor({w_id: DEFAULT_TOTAL_THRESHOLD - 0.001})))

    oeh = OversubscribeEventHandler(TestWorkloadManager([workload]), window_publisher)
    oeh._handle(json.loads(OVERSUBSCRIBE_EVENT.decode("utf-8")))

    self.assertEqual(0, oeh.get_skip_count())
    self.assertEqual(1, oeh.get_success_count())
    self.assertEqual(1, window_publisher.get_current_end_count)
    self.assertEqual(1, window_publisher.add_window_count)
def test_get_noop_reset_cpu_allocator(self):
    property_provider = TestPropertyProvider({CPU_ALLOCATOR: NOOP_RESET})
    config_manager = ConfigManager(property_provider)
    allocator = get_fallback_allocator(config_manager)
    self.assertEqual(NoopResetCpuAllocator, allocator.get_primary_allocator().__class__)
def get_free_thread_provider(config_manager: ConfigManager) -> FreeThreadProvider:
    free_thread_provider_str = config_manager.get_str(
        FREE_THREAD_PROVIDER, DEFAULT_FREE_THREAD_PROVIDER)
    total_threshold = config_manager.get_float(TOTAL_THRESHOLD, DEFAULT_TOTAL_THRESHOLD)

    free_thread_provider = None
    if free_thread_provider_str == EMPTY:
        free_thread_provider = EmptyFreeThreadProvider()
    elif free_thread_provider_str == OVERSUBSCRIBE:
        free_thread_provider = OversubscribeFreeThreadProvider(total_threshold)

    # An unrecognized provider name falls through: None is returned and the
    # debug line below logs 'NoneType'.
    log.debug("Free thread provider: '{}'".format(free_thread_provider.__class__.__name__))
    return free_thread_provider
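# Usage sketch: selecting the oversubscribe provider through configuration.
# The key/value constants come from the function above; wiring a
# TestPropertyProvider this way mirrors the tests in this file and is
# illustrative rather than production configuration.
cm = ConfigManager(TestPropertyProvider({FREE_THREAD_PROVIDER: OVERSUBSCRIBE}))
provider = get_free_thread_provider(cm)
# provider is an OversubscribeFreeThreadProvider built with the configured
# (or default) TOTAL_THRESHOLD.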
def get_config_manager(property_provider=AgentPropertyProvider()):
    global __config_manager

    with config_manager_lock:
        if __config_manager is None:
            __config_manager = ConfigManager(property_provider)
        return __config_manager
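# Usage sketch: get_config_manager behaves as a lazily-initialized,
# lock-guarded singleton, so the property_provider argument only takes effect
# on the first call; subsequent calls return the cached instance.
cm1 = get_config_manager(TestPropertyProvider({}))
cm2 = get_config_manager()
assert cm1 is cm2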
def test_get_ab_bucket_undefined(self):
    property_provider = TestPropertyProvider({
        ALLOCATOR_KEY: AB_TEST,
        CPU_ALLOCATOR_A: IP,
        CPU_ALLOCATOR_B: GREEDY
    })
    config_manager = ConfigManager(property_provider)
    self.assertEqual("UNDEFINED", get_ab_bucket(config_manager, 12))
def test_ab_allocator_fallback(self):
    property_provider = TestPropertyProvider({ALLOCATOR_KEY: AB_TEST})
    config_manager = ConfigManager(property_provider)

    allocator_class = get_allocator_class(config_manager)
    self.assertEqual(NoopCpuAllocator, allocator_class)

    allocator_class = get_allocator_class(config_manager)
    self.assertEqual(NoopCpuAllocator, allocator_class)
def test_single_workload_memory_settings(self):
    for allocator in ALLOCATORS:
        thread_count = 2
        workload = get_test_workload(uuid.uuid4(), thread_count, STATIC)

        cgroup_manager = MockCgroupManager()
        workload_manager = WorkloadManager(get_cpu(), cgroup_manager, allocator)

        # With an empty configuration we should expect default False behavior
        # for all memory flags
        set_config_manager(ConfigManager(TestPropertyProvider({})))
        workload_manager.add_workload(workload)
        self.assertFalse(cgroup_manager.get_memory_migrate(workload.get_id()))
        self.assertFalse(cgroup_manager.get_memory_spread_page(workload.get_id()))
        self.assertFalse(cgroup_manager.get_memory_spread_slab(workload.get_id()))
        workload_manager.remove_workload(workload.get_id())

        # With all memory configuration options set to True we should expect
        # all memory flags to be set to True
        set_config_manager(
            ConfigManager(
                TestPropertyProvider({
                    TITUS_ISOLATE_MEMORY_MIGRATE: True,
                    TITUS_ISOLATE_MEMORY_SPREAD_PAGE: True,
                    TITUS_ISOLATE_MEMORY_SPREAD_SLAB: True,
                })))
        workload_manager.add_workload(workload)
        self.assertTrue(cgroup_manager.get_memory_migrate(workload.get_id()))
        self.assertTrue(cgroup_manager.get_memory_spread_page(workload.get_id()))
        self.assertTrue(cgroup_manager.get_memory_spread_slab(workload.get_id()))
        workload_manager.remove_workload(workload.get_id())
def test_undefined_instance_id(self):
    property_provider = TestPropertyProvider({
        ALLOCATOR_KEY: AB_TEST,
        CPU_ALLOCATOR_A: IP,
        CPU_ALLOCATOR_B: GREEDY
    })
    config_manager = ConfigManager(property_provider)
    allocator_class = get_allocator_class(config_manager)
    self.assertEqual(NoopCpuAllocator, allocator_class)
def test_get_ab_bucket(self):
    even_instance_id = 'i-0cfefd19c9a8db976'
    property_provider = TestPropertyProvider({
        ALLOCATOR_KEY: AB_TEST,
        CPU_ALLOCATOR_A: IP,
        CPU_ALLOCATOR_B: GREEDY,
        EC2_INSTANCE_ID: even_instance_id
    })
    config_manager = ConfigManager(property_provider)
    self.assertEqual("A", get_ab_bucket(config_manager, 12))

    odd_instance_id = 'i-0cfefd19c9a8db977'
    property_provider = TestPropertyProvider({
        ALLOCATOR_KEY: AB_TEST,
        CPU_ALLOCATOR_A: IP,
        CPU_ALLOCATOR_B: GREEDY,
        EC2_INSTANCE_ID: odd_instance_id
    })
    config_manager = ConfigManager(property_provider)
    self.assertEqual("B", get_ab_bucket(config_manager, 12))

    letter_instance_id = 'i-0cfefd19c9a8db97x'
    property_provider = TestPropertyProvider({
        ALLOCATOR_KEY: AB_TEST,
        CPU_ALLOCATOR_A: IP,
        CPU_ALLOCATOR_B: GREEDY,
        EC2_INSTANCE_ID: letter_instance_id
    })
    config_manager = ConfigManager(property_provider)
    self.assertEqual("A", get_ab_bucket(config_manager, 12))
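# Inferred from the assertions above: bucketing keys off the last character of
# the EC2 instance id ('...976' -> "A", '...977' -> "B", '...97x' -> "A").
# One hypothetical rule consistent with all three cases is parity of the
# character's ordinal; the actual get_ab_bucket implementation may differ.
for last_char, expected in (('6', 'A'), ('7', 'B'), ('x', 'A')):
    bucket = 'A' if ord(last_char) % 2 == 0 else 'B'
    assert bucket == expected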
def test_skip_active_window(self):
    set_config_manager(ConfigManager(TestPropertyProvider({})))

    window_publisher = TestOpportunisticWindowPublisher(
        is_window_active_func=lambda: True,
        add_window_func=lambda: None,
        cleanup_func=lambda: None)

    oeh = OversubscribeEventHandler(TestWorkloadManager([]), window_publisher)
    oeh._handle(json.loads(OVERSUBSCRIBE_EVENT.decode("utf-8")))

    self.assertEqual(1, oeh.get_skip_count())
    self.assertEqual(1, window_publisher.is_window_active_count)
def test_skip_active_window(self):
    set_config_manager(ConfigManager(TestPropertyProvider({})))

    window_publisher = TestOpportunisticWindowPublisher(
        get_current_end_func=lambda: datetime.utcnow() + timedelta(minutes=5),
        add_window_func=lambda: None)

    oeh = OversubscribeEventHandler(TestWorkloadManager([]), window_publisher)
    oeh._handle(json.loads(OVERSUBSCRIBE_EVENT.decode("utf-8")))

    self.assertEqual(1, oeh.get_skip_count())
    self.assertEqual(1, window_publisher.get_current_end_count)
def test_noop_to_ip_update(self):
    property_provider = TestPropertyProvider({CPU_ALLOCATOR: NOOP})
    exit_handler = TestExitHandler()
    config_manager = ConfigManager(property_provider)
    watcher = RestartPropertyWatcher(config_manager, exit_handler, [CPU_ALLOCATOR])

    # No change yet
    watcher.detect_changes()
    self.assertEqual(None, exit_handler.last_code)

    # titus-isolate should exit when the allocator changes
    property_provider.map[CPU_ALLOCATOR] = IP
    watcher.detect_changes()
    self.assertEqual(ALLOCATOR_CONFIG_CHANGE_EXIT, exit_handler.last_code)
def health_check(local_exit_handler: ExitHandler,
                 local_event_manager: EventManager,
                 config_manager: ConfigManager):
    last_processed_event_time_epoch_s = datetime.datetime.utcnow().timestamp()
    while True:
        try:
            threshold = config_manager.get_int(
                MAX_TIME_SINCE_LAST_SUCCESSFUL_EVENT_KEY,
                DEFAULT_MAX_TIME_SINCE_LAST_SUCCESSFUL_EVENT)
            health_check_interval = config_manager.get_int(
                HEALTH_CHECK_FREQUENCY_KEY, DEFAULT_HEALTH_CHECK_FREQUENCY)

            last_event_epoch_s = local_event_manager.last_successful_event_epoch_s
            if last_event_epoch_s > 0:
                last_processed_event_time_epoch_s = last_event_epoch_s

            time_since_last_event = \
                datetime.datetime.utcnow().timestamp() - last_processed_event_time_epoch_s
            log.info("Seconds since last successful event processing: %s, threshold: %s",
                     time_since_last_event, threshold)
            if time_since_last_event > threshold:
                log.info("Event processing is not completing as expected, exiting...")
                local_exit_handler.exit(HEALTH_CHECK_FAILURE_EXIT_CODE)

            log.info("Next health check in %s seconds", health_check_interval)
            time.sleep(health_check_interval)
        except Exception:
            # Catch Exception rather than using a bare `except:` so that a
            # SystemExit raised by the exit handler above (if it uses
            # sys.exit) is not swallowed and re-reported with the wrong code.
            log.exception("Failed to healthcheck")
            local_exit_handler.exit(HEALTH_CHECK_EXCEPTION_EXIT_CODE)
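# Worked example of the watchdog arithmetic above: with a hypothetical
# threshold of 300 seconds, an event last processed 400 seconds ago trips the
# exit path. The concrete values are assumptions for illustration only.
threshold_s = 300
time_since_last_event_s = 400
assert time_since_last_event_s > threshold_s  # -> exit(HEALTH_CHECK_FAILURE_EXIT_CODE)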
def test_noop_to_ip_update(self):
    property_provider = TestPropertyProvider({ALLOCATOR_KEY: NOOP})
    exit_handler = TestExitHandler()
    config_manager = ConfigManager(property_provider, CONFIG_CHANGE_INTERVAL)
    watcher = CpuAllocatorWatcher(config_manager, exit_handler, CONFIG_CHANGE_INTERVAL)

    # No change yet
    watcher.detect_allocator_change()
    self.assertEqual(None, exit_handler.last_code)

    # titus-isolate should exit when the allocator changes
    property_provider.map[ALLOCATOR_KEY] = IP
    watcher.detect_allocator_change()
    self.assertEqual(ALLOCATOR_CONFIG_CHANGE_EXIT, exit_handler.last_code)
def __init__(
        self,
        config_manager: ConfigManager,
        exit_handler: ExitHandler,
        properties: List[str],
        detection_interval: int = PROPERTY_CHANGE_DETECTION_INTERVAL_SEC):
    self.__config_manager = config_manager
    self.__exit_handler = exit_handler
    self.__properties = properties

    log.info("Starting to watch for changes to properties: {}".format(properties))
    for p in properties:
        v = config_manager.get_cached_str(p)
        log.info("{}: {}".format(p, v))

    schedule.every(detection_interval).seconds.do(self.detect_changes)
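# Usage sketch: the constructor above only registers a job with the `schedule`
# library; some long-running loop must pump schedule.run_pending() for
# detect_changes() to actually fire. The loop below is illustrative and is not
# code from this repository.
import time

import schedule

while True:
    schedule.run_pending()
    time.sleep(1)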
def test_forecast_threshold_usage(self):
    allocator = ForecastIPCpuAllocator(
        TestCpuUsagePredictorManager(TestCpuUsagePredictor(10)),
        ConfigManager(TestPropertyProvider({})),
        ThresholdFreeThreadProvider(0.05))

    thread_count = DEFAULT_TOTAL_THREAD_COUNT / 4
    cpu = get_cpu()

    w0 = get_test_workload(uuid.uuid4(), thread_count, STATIC)
    log.info(w0)
    request = AllocateThreadsRequest(cpu, w0.get_id(), {w0.get_id(): w0}, {},
                                     DEFAULT_TEST_REQUEST_METADATA)
    cpu = allocator.assign_threads(request).get_cpu()
    log.info(cpu)

    # No core should have both threads claimed yet
    for c in cpu.get_cores():
        self.assertTrue(
            len(c.get_empty_threads()) == 1 or len(c.get_empty_threads()) == 2)

    w1 = get_test_workload(uuid.uuid4(), thread_count, BURST)
    log.info(w1)
    request = AllocateThreadsRequest(
        cpu, w1.get_id(), {w0.get_id(): w0, w1.get_id(): w1}, {},
        DEFAULT_TEST_REQUEST_METADATA)
    cpu = allocator.assign_threads(request).get_cpu()
    log.info(cpu)

    for c in cpu.get_cores():
        # Static workload should have unshared cores
        if len(c.get_empty_threads()) == 1:
            for t in c.get_threads():
                if t.is_claimed():
                    self.assertEqual([w0.get_id()], t.get_workload_ids())
        # Burst workload should have shared cores only with itself
        if len(c.get_empty_threads()) == 0:
            self.assertEqual(c.get_threads()[0].get_workload_ids(),
                             c.get_threads()[1].get_workload_ids())
            self.assertEqual([w1.get_id()], c.get_threads()[1].get_workload_ids())
def test_get_workloads_endpoint(self):
    override_config_manager(ConfigManager(TestPropertyProvider({})))

    cpu = get_cpu()
    thread_count = 2
    workload_id = str(uuid.uuid4())
    workload = Workload(workload_id, thread_count, STATIC)

    workload_manager = WorkloadManager(cpu, MockCgroupManager())
    set_wm(workload_manager)

    workloads = json.loads(get_workloads())
    self.assertEqual(0, len(workloads))

    workload_manager.add_workload(workload)
    workloads = json.loads(get_workloads())
    self.assertEqual(workload_id, workloads[0]["id"])
    self.assertEqual(STATIC, workloads[0]["type"])
    self.assertEqual(thread_count, workloads[0]["thread_count"])
def test_get_workloads_endpoint(self):
    set_config_manager(ConfigManager(TestPropertyProvider({})))

    thread_count = 2
    workload_id = str(uuid.uuid4())
    workload = get_test_workload(workload_id, thread_count, STATIC)

    workload_manager = self.__get_default_workload_manager()
    set_workload_manager(workload_manager)

    workloads = json.loads(get_workloads())
    self.assertEqual(0, len(workloads))

    workload_manager.add_workload(workload)
    workloads = json.loads(get_workloads())
    self.assertEqual(workload_id, workloads[0]["id"])
    self.assertEqual(STATIC, workloads[0]["type"])
    self.assertEqual(thread_count, workloads[0]["thread_count"])
def test_ab_classification_swap(self):
    even_instance_id = 'i-0cfefd19c9a8db976'
    property_provider = TestPropertyProvider({
        ALLOCATOR_KEY: AB_TEST,
        CPU_ALLOCATOR_A: NOOP,
        CPU_ALLOCATOR_B: IP,
        EC2_INSTANCE_ID: even_instance_id
    })
    exit_handler = TestExitHandler()
    config_manager = ConfigManager(property_provider, CONFIG_CHANGE_INTERVAL)
    watcher = CpuAllocatorWatcher(config_manager, exit_handler, CONFIG_CHANGE_INTERVAL)

    # No change yet
    watcher.detect_allocator_change()
    self.assertEqual(None, exit_handler.last_code)

    # Swap A and B to simulate instance classification change
    # N.B. the ALLOCATOR_KEY and EC2_INSTANCE_ID do NOT change
    property_provider.map[CPU_ALLOCATOR_A] = IP
    property_provider.map[CPU_ALLOCATOR_B] = NOOP
    watcher.detect_allocator_change()
    self.assertEqual(ALLOCATOR_CONFIG_CHANGE_EXIT, exit_handler.last_code)
def __init__(self,
             cpu_usage_predictor_manager: CpuUsagePredictorManager,
             config_manager: ConfigManager,
             free_thread_provider: FreeThreadProvider):
    self.__reg = None
    self.__time_bound_call_count = 0
    self.__rebalance_failure_count = 0

    self.__ip_solver_params = IPSolverParameters(
        alpha_nu=config_manager.get_float(ALPHA_NU, DEFAULT_ALPHA_NU),
        alpha_llc=config_manager.get_float(ALPHA_LLC, DEFAULT_ALPHA_LLC),
        alpha_l12=config_manager.get_float(ALPHA_L12, DEFAULT_ALPHA_L12),
        alpha_order=config_manager.get_float(ALPHA_ORDER, DEFAULT_ALPHA_ORDER),
        alpha_prev=config_manager.get_float(ALPHA_PREV, DEFAULT_ALPHA_PREV))

    self.__solver_max_runtime_secs = config_manager.get_float(
        MAX_SOLVER_RUNTIME, DEFAULT_MAX_SOLVER_RUNTIME)
    self.__solver_name = config_manager.get_str(MIP_SOLVER, DEFAULT_MIP_SOLVER)
    self.__solver_mip_gap = config_manager.get_float(
        RELATIVE_MIP_GAP_STOP, DEFAULT_RELATIVE_MIP_GAP_STOP)

    self.__cpu_usage_predictor_manager = cpu_usage_predictor_manager
    self.__config_manager = config_manager
    self.__free_thread_provider = free_thread_provider
    self.__cnt_rebalance_calls = 0

    self.__call_meta = None  # tracks metadata for each __place_threads call
log.info("Isolating currently running workloads...") for workload in get_current_workloads(docker.from_env()): try: workload_manager.add_workload(workload) except: log.exception( "Failed to add currently running workload: '{}', maybe it exited." .format(workload.get_id())) log.info("Isolated currently running workloads.") # Start processing events after adding running workloads to avoid processing a die event before we add a workload event_manager.start_processing_events() if __name__ != '__main__' and not is_testing(): set_config_manager(ConfigManager(EnvPropertyProvider)) log.info("Configuring logging...") gunicorn_logger = logging.getLogger('gunicorn.error') app.logger.handlers = gunicorn_logger.handlers app.logger.setLevel(gunicorn_logger.level) # Set the schedule library's logging level higher so it doesn't spam messages every time it schedules a task logging.getLogger('schedule').setLevel(logging.WARN) exit_handler = RealExitHandler() if is_kubernetes(): log.info("Setting pod manager...") pod_manager = PodManager() pod_manager.start() set_pod_manager(pod_manager)