class AllocationRunner(MeasurementRunner):
    """Runner is responsible for getting information about tasks from node,
    calling allocate() callback on allocator, performing returned allocations
    and storing all allocation related metrics in allocations_storage.

    Because the Allocator interface is also a detector, we store serialized
    detected anomalies in anomalies_storage and all other measurements in
    metrics_storage.

    Arguments:
        node: component used for tasks discovery
        allocator: component that provides allocation logic
        metrics_storage: storage to store platform, internal, resource and task metrics
            (defaults to DEFAULT_STORAGE/LogStorage to output to standard error)
        anomalies_storage: storage to store serialized anomalies and extra metrics
            (defaults to DEFAULT_STORAGE/LogStorage to output to standard error)
        allocations_storage: storage to store serialized resource allocations
            (defaults to DEFAULT_STORAGE/LogStorage to output to standard error)
        action_delay: iteration duration in seconds (None disables wait and iterations)
            (defaults to 1 second)
        rdt_enabled: enables or disables support for RDT monitoring and allocation
            (defaults to None (auto), based on platform capabilities)
        rdt_mb_control_required: indicates that MB control is required;
            if the platform does not support this feature WCA will exit
        rdt_cache_control_required: indicates that L3 control is required;
            if the platform does not support this feature WCA will exit
        extra_labels: additional labels attached to every metric
            (defaults to an empty dict)
        allocation_configuration: allows fine grained control over allocations
            (defaults to an AllocationConfiguration() instance)
        remove_all_resctrl_groups (bool): remove all RDT control groups upon starting
            (defaults to False)
        event_names: perf counters to monitor
            (defaults to instructions, cycles, cache-misses, memstalls)
        enable_derived_metrics: enable derived metrics ips, ipc and cache_hit_ratio
            (based on enabled event_names), defaults to False
        task_label_generators: component to generate additional labels for tasks
    """

    def __init__(
            self,
            node: nodes.Node,
            allocator: Allocator,
            metrics_storage: storage.Storage = DEFAULT_STORAGE,
            anomalies_storage: storage.Storage = DEFAULT_STORAGE,
            allocations_storage: storage.Storage = DEFAULT_STORAGE,
            action_delay: Numeric(0, 60) = 1.,  # [s]
            rdt_enabled: Optional[bool] = None,  # Defaults (None) - auto configuration.
            rdt_mb_control_required: bool = False,
            rdt_cache_control_required: bool = False,
            extra_labels: Dict[Str, Str] = None,
            allocation_configuration: Optional[AllocationConfiguration] = None,
            remove_all_resctrl_groups: bool = False,
            event_names: Optional[List[str]] = DEFAULT_EVENTS,
            enable_derived_metrics: bool = False,
            task_label_generators: Dict[str, TaskLabelGenerator] = None,
    ):
        self._allocation_configuration = allocation_configuration or AllocationConfiguration()

        super().__init__(node, metrics_storage, action_delay, rdt_enabled,
                         extra_labels,
                         _allocation_configuration=self._allocation_configuration,
                         event_names=event_names,
                         enable_derived_metrics=enable_derived_metrics,
                         task_label_generators=task_label_generators)

        # Allocation specific.
        self._allocator = allocator
        self._allocations_storage = allocations_storage
        self._rdt_mb_control_required = rdt_mb_control_required
        # Override False from superclass.
        self._rdt_cache_control_required = rdt_cache_control_required

        # Anomaly.
        self._anomalies_storage = anomalies_storage
        self._anomalies_statistics = AnomalyStatistics()

        # Internal allocation statistics.
        self._allocations_counter = 0
        self._allocations_errors = 0

        self._remove_all_resctrl_groups = remove_all_resctrl_groups

    def _initialize_rdt(self) -> bool:
        platform, _, _ = platforms.collect_platform_information()

        # Cache control check.
        if self._rdt_cache_control_required and \
                not platform.rdt_information.rdt_cache_control_enabled:
            # Required feature is unavailable - halt.
            log.error('RDT cache control enabled but is not supported by platform!')
            return False

        # MB control check.
        if self._rdt_mb_control_required and \
                not platform.rdt_information.rdt_mb_control_enabled:
            # Required feature is unavailable - halt.
            log.error('RDT memory bandwidth enabled but '
                      'allocation is not supported by platform!')
            return False

        # Prepare initial values for L3, MB...
        root_rdt_l3, root_rdt_mb = resctrl.get_max_rdt_values(
            platform.rdt_information.cbm_mask,
            platform.sockets,
            platform.rdt_information.rdt_mb_control_enabled,
            platform.rdt_information.rdt_cache_control_enabled
        )

        # ...override max values with values from allocation configuration.
        if self._allocation_configuration.default_rdt_l3 is not None and \
                platform.rdt_information.rdt_cache_control_enabled:
            root_rdt_l3 = self._allocation_configuration.default_rdt_l3
        if self._allocation_configuration.default_rdt_mb is not None and \
                platform.rdt_information.rdt_mb_control_enabled:
            root_rdt_mb = self._allocation_configuration.default_rdt_mb

        try:
            if root_rdt_l3 is not None:
                validate_l3_string(root_rdt_l3,
                                   platform.sockets,
                                   platform.rdt_information.cbm_mask,
                                   platform.rdt_information.min_cbm_bits)

            if root_rdt_mb is not None:
                normalized_root_rdt_mb = normalize_mb_string(
                    root_rdt_mb,
                    platform.sockets,
                    platform.rdt_information.mb_min_bandwidth,
                    platform.rdt_information.mb_bandwidth_gran)
                resctrl.cleanup_resctrl(
                    root_rdt_l3, normalized_root_rdt_mb, self._remove_all_resctrl_groups)
            else:
                resctrl.cleanup_resctrl(
                    root_rdt_l3, root_rdt_mb, self._remove_all_resctrl_groups)
        except InvalidAllocations as e:
            log.error('Cannot initialize RDT subsystem: %s', e)
            return False

        return True

    def _iterate_body(self, containers, platform, tasks_measurements,
                      tasks_resources, tasks_labels, common_labels):
        """Allocator callback body."""

        current_allocations = _get_tasks_allocations(containers)

        # Allocator callback.
        allocate_start = time.time()
        new_allocations, anomalies, extra_metrics = self._allocator.allocate(
            platform, tasks_measurements, tasks_resources, tasks_labels,
            current_allocations)
        allocate_duration = time.time() - allocate_start

        # Validate callback output.
        _validate_allocate_return_vals(new_allocations, anomalies, extra_metrics)

        log.debug('Anomalies detected: %d', len(anomalies))
        log.debug('Current allocations: %s', current_allocations)

        # Create context aware allocations objects for current allocations.
        current_allocations_values = TasksAllocationsValues.create(
            self._rdt_enabled, current_allocations,
            self._containers_manager.containers, platform)

        # Handle allocations: calculate changeset and target allocations.
        allocations_changeset_values = None
        target_allocations_values = current_allocations_values
        try:
            # Special validation step needed for Kubernetes.
            validate_shares_allocation_for_kubernetes(tasks=containers.keys(),
                                                      allocations=new_allocations)

            # Create and validate context aware allocations objects for new allocations.
            log.debug('New allocations: %s', new_allocations)
            new_allocations_values = TasksAllocationsValues.create(
                self._rdt_enabled, new_allocations,
                self._containers_manager.containers, platform)
            new_allocations_values.validate()

            # Calculate changeset and target_allocations.
            if new_allocations_values is not None:
                target_allocations_values, allocations_changeset_values = \
                    new_allocations_values.calculate_changeset(current_allocations_values)
                target_allocations_values.validate()

            self._allocations_counter += len(new_allocations)

        except InvalidAllocations as e:
            # Handle any allocation validation error.
            # Log errors and restore current to generate proper metrics.
            log.error('Invalid allocations: %s', str(e))
            log.warning('Ignoring all allocations in this iteration due to validation error!')
            self._allocations_errors += 1
            target_allocations_values = current_allocations_values

        # Handle allocations: perform allocations based on changeset.
        if allocations_changeset_values:
            log.debug('Allocations changeset: %s', allocations_changeset_values)
            log.info('Performing allocations on %d tasks.',
                     len(allocations_changeset_values))
            allocations_changeset_values.perform_allocations()

        # Prepare anomaly metrics.
        anomaly_metrics = convert_anomalies_to_metrics(anomalies, tasks_labels)
        update_anomalies_metrics_with_task_information(anomaly_metrics, tasks_labels)

        # Store anomalies information.
        anomalies_package = MetricPackage(self._anomalies_storage)
        anomalies_package.add_metrics(
            anomaly_metrics,
            extra_metrics,
            self._anomalies_statistics.get_metrics(anomalies)
        )
        anomalies_package.send(common_labels)

        # Prepare allocations metrics.
        allocations_metrics = target_allocations_values.generate_metrics()
        allocations_statistic_metrics = _get_allocations_statistics_metrics(
            self._allocations_counter, self._allocations_errors, allocate_duration)

        # Store allocations metrics.
        allocations_package = MetricPackage(self._allocations_storage)
        allocations_package.add_metrics(
            allocations_metrics,
            extra_metrics,
            allocations_statistic_metrics,
        )
        allocations_package.send(common_labels)
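# Example (hedged sketch, not part of the runner): constructing the
# inheritance-based AllocationRunner above with a static task list and a
# no-op allocator. `StaticNode` and `NOPAllocator` are assumed to be the
# stock Node/Allocator implementations shipped with WCA (import paths are
# assumptions); any other implementations of those interfaces can be
# substituted.
def _example_allocation_runner_usage(runner_cls=AllocationRunner):
    # The default argument binds the class defined directly above at
    # definition time, so this sketch refers to that version unambiguously.
    from wca.nodes import StaticNode          # assumed import path
    from wca.allocators import NOPAllocator   # assumed import path

    runner = runner_cls(
        node=StaticNode(tasks=[]),   # replace with a real task discovery component
        allocator=NOPAllocator(),    # returns no allocations, anomalies or extra metrics
        action_delay=1.,             # one iteration per second
        rdt_enabled=None,            # auto-detect RDT support
    )
    # run() is inherited from MeasurementRunner and drives _iterate_body().
    return runner.run()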
class AllocationRunner(Runner):
    """rst
    Runner is responsible for getting information about tasks from node,
    calling allocate() callback on allocator, performing returned allocations
    and storing all allocation related metrics in allocations_storage.

    Because the Allocator interface is also a detector, we store serialized
    detected anomalies in anomalies_storage and all other measurements in
    metrics_storage.

    - ``measurement_runner``: **MeasurementRunner**

        Measurement runner object.

    - ``allocator``: **Allocator**

        Component that provides allocation logic.

    - ``anomalies_storage``: **Storage** = `DEFAULT_STORAGE`

        Storage to store serialized anomalies and extra metrics.

    - ``allocations_storage``: **Storage** = `DEFAULT_STORAGE`

        Storage to store serialized resource allocations.

    - ``rdt_mb_control_required``: **bool** = *False*

        Indicates that MB control is required;
        if the platform does not support this feature WCA will exit.

    - ``rdt_cache_control_required``: **bool** = *False*

        Indicates that L3 control is required;
        if the platform does not support this feature WCA will exit.

    - ``remove_all_resctrl_groups``: **bool** = *False*

        Remove all RDT control groups upon starting.
    """

    def __init__(self,
                 measurement_runner: MeasurementRunner,
                 allocator: Allocator,
                 allocations_storage: Storage = DEFAULT_STORAGE,
                 anomalies_storage: Storage = DEFAULT_STORAGE,
                 rdt_mb_control_required: bool = False,
                 rdt_cache_control_required: bool = False,
                 remove_all_resctrl_groups: bool = False):

        if not measurement_runner._allocation_configuration:
            measurement_runner._allocation_configuration = AllocationConfiguration()

        self._measurement_runner = measurement_runner

        # Allocation specific.
        self._allocator = allocator
        self._allocations_storage = allocations_storage
        self._rdt_mb_control_required = rdt_mb_control_required
        self._rdt_cache_control_required = rdt_cache_control_required

        # Anomaly.
        self._anomalies_storage = anomalies_storage
        self._anomalies_statistics = AnomalyStatistics()

        # Internal allocation statistics.
        self._allocations_counter = 0
        self._allocations_errors = 0

        self._remove_all_resctrl_groups = remove_all_resctrl_groups

        self._measurement_runner._set_iterate_body_callback(self._iterate_body)
        self._measurement_runner._set_initialize_rdt_callback(self._initialize_rdt)

    def run(self) -> int:
        return self._measurement_runner.run()

    def _initialize_rdt(self) -> bool:
        platform, _, _ = platforms.collect_platform_information()

        # Cache control check.
        if self._rdt_cache_control_required and \
                not platform.rdt_information.rdt_cache_control_enabled:
            # Required feature is unavailable - halt.
            log.error('RDT cache control enabled but is not supported by platform!')
            return False

        # MB control check.
        if self._rdt_mb_control_required and \
                not platform.rdt_information.rdt_mb_control_enabled:
            # Required feature is unavailable - halt.
            log.error('RDT memory bandwidth enabled but '
                      'allocation is not supported by platform!')
            return False

        # Prepare initial values for L3, MB...
        root_rdt_l3, root_rdt_mb = resctrl.get_max_rdt_values(
            platform.rdt_information.cbm_mask,
            platform.sockets,
            platform.rdt_information.rdt_mb_control_enabled,
            platform.rdt_information.rdt_cache_control_enabled)

        # ...override max values with values from allocation configuration.
        if self._measurement_runner._allocation_configuration.default_rdt_l3 is not None and \
                platform.rdt_information.rdt_cache_control_enabled:
            root_rdt_l3 = self._measurement_runner._allocation_configuration.default_rdt_l3
        if self._measurement_runner._allocation_configuration.default_rdt_mb is not None and \
                platform.rdt_information.rdt_mb_control_enabled:
            root_rdt_mb = self._measurement_runner._allocation_configuration.default_rdt_mb

        try:
            if root_rdt_l3 is not None:
                validate_l3_string(root_rdt_l3,
                                   platform.sockets,
                                   platform.rdt_information.cbm_mask,
                                   platform.rdt_information.min_cbm_bits)

            if root_rdt_mb is not None:
                normalized_root_rdt_mb = normalize_mb_string(
                    root_rdt_mb,
                    platform.sockets,
                    platform.rdt_information.mb_min_bandwidth,
                    platform.rdt_information.mb_bandwidth_gran)
                resctrl.cleanup_resctrl(root_rdt_l3, normalized_root_rdt_mb,
                                        self._remove_all_resctrl_groups)
            else:
                resctrl.cleanup_resctrl(root_rdt_l3, root_rdt_mb,
                                        self._remove_all_resctrl_groups)
        except InvalidAllocations as e:
            log.error('Cannot initialize RDT subsystem: %s', e)
            return False

        return True

    def _iterate_body(self, containers: Dict[Task, ContainerInterface],
                      platform: platforms.Platform,
                      tasks_data: TasksData, common_labels):
        """Allocator callback body."""

        current_allocations = _get_tasks_allocations(containers)
        _update_tasks_data_with_allocations(tasks_data, current_allocations)

        # Allocator callback.
        allocate_start = time.time()
        new_allocations, anomalies, extra_metrics = self._allocator.allocate(
            platform, tasks_data)
        allocate_duration = time.time() - allocate_start

        # Validate callback output.
        _validate_allocate_return_vals(new_allocations, anomalies, extra_metrics)

        log.debug('Anomalies detected: %d', len(anomalies))
        log.debug('Current allocations: %s', current_allocations)

        # Create context aware allocations objects for current allocations.
        current_allocations_values = TasksAllocationsValues.create(
            self._measurement_runner._rdt_enabled, current_allocations,
            self._measurement_runner._containers_manager.containers, platform)

        # Handle allocations: calculate changeset and target allocations.
        allocations_changeset_values = None
        target_allocations_values = current_allocations_values
        try:
            # Special validation step needed for Kubernetes.
            validate_shares_allocation_for_kubernetes(tasks=containers.keys(),
                                                      allocations=new_allocations)

            # Create and validate context aware allocations objects for new allocations.
            log.debug('New allocations: %s', new_allocations)
            new_allocations_values = TasksAllocationsValues.create(
                self._measurement_runner._rdt_enabled, new_allocations,
                self._measurement_runner._containers_manager.containers, platform)
            new_allocations_values.validate()

            # Calculate changeset and target_allocations.
            target_allocations_values, allocations_changeset_values = \
                new_allocations_values.calculate_changeset(current_allocations_values)
            target_allocations_values.validate()

            self._allocations_counter += len(new_allocations)

        except InvalidAllocations as e:
            # Handle any allocation validation error.
            # Log errors and restore current to generate proper metrics.
            log.error('Invalid allocations: %s', str(e))
            log.warning('Ignoring all allocations in this iteration due to validation error!')
            self._allocations_errors += 1
            target_allocations_values = current_allocations_values

        # Handle allocations: perform allocations based on changeset.
        if allocations_changeset_values:
            log.debug('Allocations changeset: %s', allocations_changeset_values)
            log.info('Performing allocations on %d tasks.',
                     len(allocations_changeset_values))
            allocations_changeset_values.perform_allocations()

        # Prepare anomaly metrics.
        anomaly_metrics = convert_anomalies_to_metrics(anomalies, tasks_data)
        update_anomalies_metrics_with_task_information(anomaly_metrics, tasks_data)

        # Store anomalies information.
        anomalies_package = MetricPackage(self._anomalies_storage)
        anomalies_package.add_metrics(
            anomaly_metrics,
            extra_metrics,
            self._anomalies_statistics.get_metrics(anomalies))
        anomalies_package.send(common_labels)

        # Prepare allocations metrics.
        allocations_metrics = target_allocations_values.generate_metrics()
        allocations_statistic_metrics = _get_allocations_statistics_metrics(
            self._allocations_counter, self._allocations_errors, allocate_duration)

        # Store allocations metrics.
        allocations_package = MetricPackage(self._allocations_storage)
        allocations_package.add_metrics(
            allocations_metrics,
            extra_metrics,
            allocations_statistic_metrics,
        )
        allocations_package.send(common_labels)
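# Example (hedged sketch, not part of the runner): with the composition-based
# AllocationRunner above, measurement concerns are configured on
# MeasurementRunner and allocation concerns on AllocationRunner, which
# registers its callbacks on the measurement runner. `StaticNode` and
# `NOPAllocator` are assumed stock implementations (import paths are
# assumptions); MeasurementRunner's remaining constructor arguments are left
# at their defaults because its full signature is not shown in this module.
def _example_composed_allocation_runner():
    from wca.nodes import StaticNode          # assumed import path
    from wca.allocators import NOPAllocator   # assumed import path

    measurement_runner = MeasurementRunner(node=StaticNode(tasks=[]))
    runner = AllocationRunner(
        measurement_runner=measurement_runner,
        allocator=NOPAllocator(),    # no-op allocation logic
    )
    # run() delegates to MeasurementRunner.run(), which calls back into the
    # _initialize_rdt() and _iterate_body() callbacks registered in __init__().
    return runner.run()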