def test_no_cross_package_violation(self):
    """A 4-thread static workload must never be split across packages."""
    cpu = get_cpu()
    allocator = IntegerProgramCpuAllocator(cpu)
    workload = Workload(uuid.uuid4(), 4, STATIC)

    # Sanity check: a fresh CPU has no cross-package violations.
    self.assertEqual(0, len(get_cross_package_violations(cpu)))

    allocator.assign_threads(workload)

    # The allocator must keep the workload on a single package.
    self.assertEqual(0, len(get_cross_package_violations(cpu)))
def test_no_cross_package_violation(self):
    """Assigning a 4-thread static workload must not straddle packages."""
    cpu = get_cpu()
    allocator = IntegerProgramCpuAllocator()
    workload = get_test_workload(uuid.uuid4(), 4, STATIC)

    # Sanity check: a fresh CPU has no cross-package violations.
    self.assertEqual(0, len(get_cross_package_violations(cpu)))

    request = get_no_usage_threads_request(cpu, [workload])
    cpu = allocator.assign_threads(request).get_cpu()

    # The allocator must keep the workload on a single package.
    self.assertEqual(0, len(get_cross_package_violations(cpu)))
def test_no_cross_package_violation(self):
    """Assigning a 4-thread static workload must not straddle packages."""
    cpu = get_cpu()
    allocator = IntegerProgramCpuAllocator()
    workload = get_test_workload(uuid.uuid4(), 4, STATIC)

    # Sanity check: a fresh CPU has no cross-package violations.
    self.assertEqual(0, len(get_cross_package_violations(cpu)))

    workload_id = workload.get_id()
    request = AllocateThreadsRequest(
        cpu,
        workload_id,
        {workload_id: workload},
        {},
        DEFAULT_TEST_REQUEST_METADATA)
    cpu = allocator.assign_threads(request).get_cpu()

    # The allocator must keep the workload on a single package.
    self.assertEqual(0, len(get_cross_package_violations(cpu)))
def get_violations():
    """Return a JSON document describing current CPU placement violations.

    The payload has two keys:
      * "cross_package": workloads placed on more than one package
      * "shared_core":   cores shared by multiple workloads

    :return: a JSON-encoded string
    """
    # Fetch the CPU snapshot once (the original fetched it twice, which
    # could in principle observe two different states) and derive both
    # violation lists from the same view.
    cpu = get_workload_manager().get_cpu()
    return json.dumps({
        "cross_package": get_cross_package_violations(cpu),
        "shared_core": get_shared_core_violations(cpu)
    })
def report_metrics(self, tags):
    """Publish all workload-manager metrics as gauges tagged with *tags*."""
    # Snapshot state once so every gauge below reports a consistent view.
    cpu = self.get_cpu_copy()
    workload_map = self.get_workload_map_copy()

    # Liveness and lifecycle totals (reported as gauges of running counts).
    self.__reg.gauge(RUNNING, tags).set(1)
    self.__reg.gauge(ADDED_KEY, tags).set(self.get_added_count())
    self.__reg.gauge(REMOVED_KEY, tags).set(self.get_removed_count())
    self.__reg.gauge(REBALANCED_KEY, tags).set(self.get_rebalanced_count())
    self.__reg.gauge(SUCCEEDED_KEY, tags).set(self.get_success_count())
    self.__reg.gauge(FAILED_KEY, tags).set(self.get_error_count())
    self.__reg.gauge(WORKLOAD_COUNT_KEY, tags).set(len(self.get_workloads()))
    self.__reg.gauge(ADDED_TO_FULL_CPU_ERROR_KEY, tags).set(self.__added_to_full_cpu_count)
    self.__reg.gauge(ALLOCATOR_CALL_DURATION, tags).set(self.get_allocator_call_duration_sum_secs())

    # Placement-quality metrics derived from the CPU snapshot.
    cross_package_violation_count = len(get_cross_package_violations(cpu))
    shared_core_violation_count = len(get_shared_core_violations(cpu))
    self.__reg.gauge(PACKAGE_VIOLATIONS_KEY, tags).set(cross_package_violation_count)
    self.__reg.gauge(CORE_VIOLATIONS_KEY, tags).set(shared_core_violation_count)

    # Allocation / Request sizes
    self.__reg.gauge(ALLOCATED_SIZE_KEY, tags).set(get_allocated_size(cpu))
    self.__reg.gauge(UNALLOCATED_SIZE_KEY, tags).set(get_unallocated_size(cpu))
    self.__reg.gauge(STATIC_ALLOCATED_SIZE_KEY, tags).set(get_static_allocated_size(cpu, workload_map))
    self.__reg.gauge(BURST_ALLOCATED_SIZE_KEY, tags).set(get_burst_allocated_size(cpu, workload_map))
    self.__reg.gauge(BURST_REQUESTED_SIZE_KEY, tags).set(get_burst_request_size(list(workload_map.values())))
    self.__reg.gauge(OVERSUBSCRIBED_THREADS_KEY, tags).set(get_oversubscribed_thread_count(cpu, workload_map))

    # Have the sub-components report metrics
    self.__cpu_allocator.report_metrics(tags)
    self.__cgroup_manager.report_metrics(tags)
def report_metrics(self, tags):
    """Publish workload-manager metrics tagged with *tags*.

    Lifecycle events are reported as counters (delta semantics) and the
    internal tallies are zeroed afterwards so the next call reports only
    new events; everything else is reported as a gauge.
    """
    # Snapshot state once so every metric below reports a consistent view.
    cpu = self.get_cpu_copy()
    workload_map = self.get_workload_map_copy()

    self.__reg.gauge(RUNNING, tags).set(1)
    self.__reg.gauge(WORKLOAD_COUNT_KEY, tags).set(len(self.get_workloads()))

    # Counters carry deltas since the previous report.
    self.__reg.counter(ADDED_KEY, tags).increment(self.get_added_count())
    self.__reg.counter(REMOVED_KEY, tags).increment(self.get_removed_count())
    self.__reg.counter(REBALANCED_KEY, tags).increment(self.get_rebalanced_count())
    self.__reg.counter(SUCCEEDED_KEY, tags).increment(self.get_success_count())
    self.__reg.counter(FAILED_KEY, tags).increment(self.get_error_count())

    # Reset tallies so the next report only covers new events.
    # NOTE(review): the success tally is NOT reset here even though it was
    # just reported via increment — unless it is zeroed elsewhere, the
    # SUCCEEDED counter double-counts on every call. Confirm against the
    # attribute definitions in this class.
    self.__added_count = 0
    self.__removed_count = 0
    self.__rebalanced_count = 0
    self.__error_count = 0

    # Placement-quality metrics derived from the CPU snapshot.
    cross_package_violation_count = len(get_cross_package_violations(cpu))
    shared_core_violation_count = len(get_shared_core_violations(cpu))
    self.__reg.gauge(PACKAGE_VIOLATIONS_KEY, tags).set(cross_package_violation_count)
    self.__reg.gauge(CORE_VIOLATIONS_KEY, tags).set(shared_core_violation_count)

    # Allocation / Request sizes
    self.__reg.gauge(ALLOCATED_SIZE_KEY, tags).set(get_allocated_size(cpu))
    self.__reg.gauge(UNALLOCATED_SIZE_KEY, tags).set(get_unallocated_size(cpu))
    self.__reg.gauge(STATIC_ALLOCATED_SIZE_KEY, tags).set(
        get_static_allocated_size(cpu, workload_map))
    self.__reg.gauge(BURST_ALLOCATED_SIZE_KEY, tags).set(get_burst_allocated_size(cpu, workload_map))
    self.__reg.gauge(BURST_REQUESTED_SIZE_KEY, tags).set(
        get_burst_request_size(list(workload_map.values())))
    self.__reg.gauge(OVERSUBSCRIBED_THREADS_KEY, tags).set(
        get_oversubscribed_thread_count(cpu, workload_map))

    # Free / oversubscribable capacity is computed from the last allocator
    # response rather than the CPU snapshot.
    self.__reg.gauge(BURSTABLE_THREADS_KEY, tags).set(
        self.__get_free_thread_count(self.__last_response))
    self.__reg.gauge(OVERSUBSCRIBABLE_THREADS_KEY, tags).set(
        self.__get_oversubscribable_thread_count(self.__last_response))
    self.__reg.gauge(OVERSUBSCRIBE_CONSUMED_CPU_COUNT, tags).set(
        self.__get_consumed_opportunistic_cpu_count())

    # Have the sub-components report metrics
    self.__cpu_allocator.report_metrics(tags)
    self.__cgroup_manager.report_metrics(tags)
def report_metrics(self, tags):
    """Report workload-manager, allocator, event-manager and CPU metrics.

    Best effort: any failure is logged and swallowed so a metrics problem
    never propagates to the caller.

    :param tags: tag dictionary applied to every reported gauge
    """
    log.debug("Reporting metrics")
    try:
        # Workload manager metrics
        self.__reg.gauge(RUNNING, tags).set(1)
        self.__reg.gauge(ADDED_KEY, tags).set(
            self.__workload_manager.get_added_count())
        self.__reg.gauge(REMOVED_KEY, tags).set(
            self.__workload_manager.get_removed_count())
        self.__reg.gauge(SUCCEEDED_KEY, tags).set(
            self.__workload_manager.get_success_count())
        self.__reg.gauge(FAILED_KEY, tags).set(
            self.__workload_manager.get_error_count())
        self.__reg.gauge(WORKLOAD_COUNT_KEY, tags).set(
            len(self.__workload_manager.get_workloads()))

        # Allocator metrics
        self.__reg.gauge(ALLOCATOR_CALL_DURATION, tags).set(
            self.__workload_manager.get_allocator_call_duration_sum_secs())
        self.__reg.gauge(FALLBACK_ALLOCATOR_COUNT, tags).set(
            self.__workload_manager.get_fallback_allocator_calls_count())
        self.__reg.gauge(IP_ALLOCATOR_TIMEBOUND_COUNT, tags).set(
            self.__workload_manager.get_time_bound_ip_allocator_solution_count())

        # Event manager metrics
        self.__reg.gauge(QUEUE_DEPTH_KEY, tags).set(
            self.__event_manager.get_queue_depth())
        self.__reg.gauge(EVENT_SUCCEEDED_KEY, tags).set(
            self.__event_manager.get_success_count())
        self.__reg.gauge(EVENT_FAILED_KEY, tags).set(
            self.__event_manager.get_error_count())
        self.__reg.gauge(EVENT_PROCESSED_KEY, tags).set(
            self.__event_manager.get_processed_count())

        # CPU metrics: fetch the CPU once and derive both violation counts
        # from the same snapshot (the original fetched it twice).
        cpu = self.__workload_manager.get_cpu()
        cross_package_violation_count = len(get_cross_package_violations(cpu))
        shared_core_violation_count = len(get_shared_core_violations(cpu))
        self.__reg.gauge(PACKAGE_VIOLATIONS_KEY, tags).set(cross_package_violation_count)
        self.__reg.gauge(CORE_VIOLATIONS_KEY, tags).set(shared_core_violation_count)

        log.debug("Reported metrics")
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are not swallowed; genuine failures are still only logged.
        log.exception("Failed to report metric")
def test_no_cross_packages_placement_no_bad_affinity_ip(self):
    """Four static workloads exactly filling a 2-package, 2-core,
    2-thread CPU produce no cross-package placements and no idle threads."""
    workloads = [
        get_test_workload("a", 3, STATIC),
        get_test_workload("b", 2, STATIC),
        get_test_workload("c", 1, STATIC),
        get_test_workload("d", 2, STATIC),
    ]

    cpu = get_cpu(package_count=2, cores_per_package=2, threads_per_core=2)
    workload_manager = WorkloadManager(
        cpu, MockCgroupManager(), IntegerProgramCpuAllocator())

    # Add in the same a, b, c, d order the original test used.
    for workload in workloads:
        workload_manager.add_workload(workload)

    self.assertEqual(
        0, len(get_cross_package_violations(workload_manager.get_cpu())))
    # 3 + 2 + 1 + 2 threads fill the 8-thread CPU completely.
    self.assertEqual(0, len(workload_manager.get_cpu().get_empty_threads()))
def test_no_cross_packages_placement_no_bad_affinity_ip(self):
    """Four static workloads exactly filling a 2-package, 2-core,
    2-thread CPU produce no cross-package placements and no idle threads."""
    workloads = [
        Workload("a", 3, STATIC),
        Workload("b", 2, STATIC),
        Workload("c", 1, STATIC),
        Workload("d", 2, STATIC),
    ]

    cpu = get_cpu(package_count=2, cores_per_package=2, threads_per_core=2)
    workload_manager = WorkloadManager(cpu, MockCgroupManager())

    # Add in the same a, b, c, d order the original test used.
    for workload in workloads:
        workload_manager.add_workload(workload)

    self.assertEqual(
        0, len(get_cross_package_violations(workload_manager.get_cpu())))
    # TODO: re-enable once shared-core accounting is fixed:
    # self.assertEqual(1, len(get_shared_core_violations(workload_manager.get_cpu())))
    # 3 + 2 + 1 + 2 threads fill the 8-thread CPU completely.
    self.assertEqual(0, len(workload_manager.get_cpu().get_empty_threads()))
def has_better_isolation(cur_cpu, new_cpu):
    """Decide whether *new_cpu* isolates workloads better than *cur_cpu*.

    Two violation types are compared, each measured as a count of affected
    workloads:

    1. cross_package: a workload is placed on multiple packages
    2. shared_core:   a core is shared by multiple workloads

    The decision matrix ('-' decrease, '0' no change, '+' increase; T means
    the new placement is better):

                                      shared_core_count
                                      -    0    +
                                 -    T    T    T
        cross_package_count      0    T    F    F
                                 +    F    F    F

    The two off-diagonal corners (one count improves while the other
    worsens) are resolved by preferring to avoid cross-package placement.

    NOTE: burst workload placement is not considered in this comparison.

    :return: True if new_cpu has better placement, False otherwise
    """
    # Positive delta means the new placement added violations of that type.
    cross_package_delta = (
        len(get_cross_package_violations(new_cpu))
        - len(get_cross_package_violations(cur_cpu)))

    # Cross-package dominates: any change in it decides the outcome
    # (top and bottom rows of the matrix).
    if cross_package_delta != 0:
        return cross_package_delta < 0

    # Middle row: cross-package unchanged, so shared-core breaks the tie.
    shared_core_delta = (
        len(get_shared_core_violations(new_cpu))
        - len(get_shared_core_violations(cur_cpu)))
    return shared_core_delta < 0