def _soft_delete_stale_goals(self, goal_map, matching_goals): """Soft delete the stale goals :param goal_map: discovered goal map :type goal_map: :py:class:`~.GoalMapping` instance :param matching_goals: list of DB goals matching the goal_map :type matching_goals: list of :py:class:`~.objects.Goal` instances :returns: A list of soft deleted DB goals (subset of matching goals) :rtype: list of :py:class:`~.objects.Goal` instances """ goal_display_name = goal_map.display_name goal_name = goal_map.name goal_efficacy_spec = goal_map.efficacy_specification stale_goals = [] for matching_goal in matching_goals: if (matching_goal.efficacy_specification == goal_efficacy_spec and matching_goal.display_name == goal_display_name): LOG.info(_LI("Goal %s unchanged"), goal_name) else: LOG.info(_LI("Goal %s modified"), goal_name) matching_goal.soft_delete() stale_goals.append(matching_goal) return stale_goals
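# NOTE: The following is a minimal, self-contained sketch of the
# compare-and-soft-delete pattern used by _soft_delete_stale_goals above.
# GoalMap and GoalRecord are illustrative stand-ins, not Watcher objects.
from dataclasses import dataclass


@dataclass
class GoalMap:
    name: str
    display_name: str
    efficacy_specification: tuple


@dataclass
class GoalRecord:
    name: str
    display_name: str
    efficacy_specification: tuple
    deleted: bool = False

    def soft_delete(self):
        # Mark the row as deleted without physically removing it.
        self.deleted = True


def soft_delete_stale(goal_map, matching_records):
    """Return the DB records that no longer match the discovered goal map."""
    stale = []
    for record in matching_records:
        unchanged = (
            record.display_name == goal_map.display_name and
            record.efficacy_specification == goal_map.efficacy_specification)
        if not unchanged:
            record.soft_delete()
            stale.append(record)
    return stale


# The display name changed, so the DB row is stale and gets soft deleted.
discovered = GoalMap("dummy", "Dummy goal", ("indicator_a",))
db_rows = [GoalRecord("dummy", "Old dummy goal", ("indicator_a",))]
assert soft_delete_stale(discovered, db_rows) == db_rows
assert db_rows[0].deleted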
def execute(self): LOG.info(_LI("Starting purge command")) self._objects_map = self.find_objects_to_delete() if (self.max_number is not None and len(self._objects_map) > self.max_number): if self.delete_up_to_max_prompt(self._objects_map): self._objects_map = self._get_objects_up_to_limit() else: return _orphans_note = (_(" (orphans excluded)") if self.exclude_orphans else _(" (may include orphans)")) if not self.dry_run and self.confirmation_prompt(): self.do_delete() print(_("Purge results summary%s:") % _orphans_note) LOG.info(_LI("Purge results summary%s:"), _orphans_note) else: LOG.debug(self._objects_map) print(_("Here below is a table containing the objects " "that can be purged%s:") % _orphans_note) LOG.info("\n%s", self._objects_map.get_count_table()) print(self._objects_map.get_count_table()) LOG.info(_LI("Purge process completed"))
def sync(self): self.discovered_map = self._discover() goals_map = self.discovered_map["goals"] strategies_map = self.discovered_map["strategies"] scoringengines_map = self.discovered_map["scoringengines"] for goal_name, goal_map in goals_map.items(): if goal_map in self.available_goals_map: LOG.info(_LI("Goal %s already exists"), goal_name) continue self.goal_mapping.update(self._sync_goal(goal_map)) for strategy_name, strategy_map in strategies_map.items(): if (strategy_map in self.available_strategies_map and strategy_map.goal_name not in [g.name for g in self.goal_mapping.values()]): LOG.info(_LI("Strategy %s already exists"), strategy_name) continue self.strategy_mapping.update(self._sync_strategy(strategy_map)) for se_name, se_map in scoringengines_map.items(): if se_map in self.available_scoringengines_map: LOG.info(_LI("Scoring Engine %s already exists"), se_name) continue self.se_mapping.update(self._sync_scoringengine(se_map)) self._sync_objects() self._soft_delete_removed_scoringengines()
def main(): LOG.info(_LI('Watcher sync started.')) service.prepare_service(sys.argv, CONF) syncer = sync.Syncer() syncer.sync() LOG.info(_LI('Watcher sync finished.'))
def choose_instance_to_migrate(self, hosts):
    """Pick up an active instance to migrate from provided hosts

    :param hosts: list of dicts, each containing a node object
    """
    instances_tobe_migrate = []
    for nodemap in hosts:
        source_node = nodemap['node']
        source_instances = self.compute_model.mapping.get_node_instances(
            source_node)
        if source_instances:
            inlet_t = self.ceilometer.statistic_aggregation(
                resource_id=source_node.uuid,
                meter_name=self.meter_name_inlet_t,
                period=self._period,
                aggregate='avg')
            power = self.ceilometer.statistic_aggregation(
                resource_id=source_node.uuid,
                meter_name=self.meter_name_power,
                period=self._period,
                aggregate='avg')
            if (power < self.threshold_power and
                    inlet_t < self.threshold_inlet_t):
                # hardware issue, migrate all instances from this node
                for instance_id in source_instances:
                    try:
                        instance = (self.compute_model.
                                    get_instance_by_uuid(instance_id))
                        instances_tobe_migrate.append(instance)
                    except wexc.InstanceNotFound:
                        LOG.error(_LE("Instance not found; error: %s"),
                                  instance_id)
                return source_node, instances_tobe_migrate
            else:
                # migrate the first active instance
                for instance_id in source_instances:
                    try:
                        instance = (self.compute_model.
                                    get_instance_by_uuid(instance_id))
                        if (instance.state !=
                                element.InstanceState.ACTIVE.value):
                            LOG.info(
                                _LI("Instance not active, skipped: %s"),
                                instance.uuid)
                            continue
                        instances_tobe_migrate.append(instance)
                        return source_node, instances_tobe_migrate
                    except wexc.InstanceNotFound:
                        LOG.error(_LE("Instance not found; error: %s"),
                                  instance_id)
        else:
            LOG.info(_LI("No instances found on node: %s"),
                     source_node.uuid)
def choose_instance_to_migrate(self, hosts):
    """Pick up an active instance to migrate from provided hosts

    :param hosts: list of dicts, each containing a node object
    """
    instances_tobe_migrate = []
    for nodemap in hosts:
        source_node = nodemap['node']
        source_instances = self.compute_model.mapping.get_node_instances(
            source_node)
        if source_instances:
            inlet_t = self.ceilometer.statistic_aggregation(
                resource_id=source_node.uuid,
                meter_name=self.meter_name_inlet_t,
                period=self._period,
                aggregate='avg')
            power = self.ceilometer.statistic_aggregation(
                resource_id=source_node.uuid,
                meter_name=self.meter_name_power,
                period=self._period,
                aggregate='avg')
            if (power < self.threshold_power and
                    inlet_t < self.threshold_inlet_t):
                # hardware issue, migrate all instances from this node
                for instance_id in source_instances:
                    try:
                        instance = (self.compute_model.
                                    get_instance_from_id(instance_id))
                        instances_tobe_migrate.append(instance)
                    except wexc.InstanceNotFound:
                        LOG.error(_LE("Instance not found; error: %s"),
                                  instance_id)
                return source_node, instances_tobe_migrate
            else:
                # migrate the first active instance
                for instance_id in source_instances:
                    try:
                        instance = (self.compute_model.
                                    get_instance_from_id(instance_id))
                        if (instance.state !=
                                element.InstanceState.ACTIVE.value):
                            LOG.info(
                                _LI("Instance not active, skipped: %s"),
                                instance.uuid)
                            continue
                        instances_tobe_migrate.append(instance)
                        return source_node, instances_tobe_migrate
                    except wexc.InstanceNotFound:
                        LOG.error(_LE("Instance not found; error: %s"),
                                  instance_id)
        else:
            LOG.info(_LI("No instances found on node: %s"),
                     source_node.uuid)
def _soft_delete_stale_strategies(self, strategy_map, matching_strategies): strategy_name = strategy_map.name strategy_display_name = strategy_map.display_name stale_strategies = [] for matching_strategy in matching_strategies: if (matching_strategy.display_name == strategy_display_name and matching_strategy.goal_id not in self.goal_mapping): LOG.info(_LI("Strategy %s unchanged"), strategy_name) else: LOG.info(_LI("Strategy %s modified"), strategy_name) matching_strategy.soft_delete() stale_strategies.append(matching_strategy) return stale_strategies
def _soft_delete_stale_goals(self, goal_map, matching_goals): goal_name = goal_map.name goal_display_name = goal_map.display_name stale_goals = [] for matching_goal in matching_goals: if (matching_goal.display_name == goal_display_name and matching_goal.strategy_id not in self.strategy_mapping): LOG.info(_LI("Goal %s unchanged"), goal_name) else: LOG.info(_LI("Goal %s modified"), goal_name) matching_goal.soft_delete() stale_goals.append(matching_goal) return stale_goals
def _find_orphans(self): orphans = WatcherObjectsMap() filters = dict(deleted=False) audit_templates = objects.audit_template.AuditTemplate.list( self.ctx, filters=filters) audits = objects.audit.Audit.list(self.ctx, filters=filters) action_plans = objects.action_plan.ActionPlan.list( self.ctx, filters=filters) actions = objects.action.Action.list(self.ctx, filters=filters) audit_template_ids = set(at.id for at in audit_templates) orphans.audits = [ audit for audit in audits if audit.audit_template_id not in audit_template_ids] # Objects with orphan parents are themselves orphans audit_ids = [audit.id for audit in (a for a in audits if a not in orphans.audits)] orphans.action_plans = [ ap for ap in action_plans if ap.audit_id not in audit_ids] # Objects with orphan parents are themselves orphans action_plan_ids = [ap.id for ap in (a for a in action_plans if a not in orphans.action_plans)] orphans.actions = [ action for action in actions if action.action_plan_id not in action_plan_ids] LOG.debug("Orphans found:\n%s", orphans) LOG.info(_LI("Orphans found:\n%s"), orphans.get_count_table()) return orphans
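# NOTE: Illustrative, self-contained sketch of the parent/child orphan cascade
# implemented by _find_orphans above. Plain dicts stand in for DB rows; the
# field names mirror the snippet, the data is made up.
audits = [{"id": 1, "audit_template_id": 10},
          {"id": 2, "audit_template_id": 99}]
action_plans = [{"id": 5, "audit_id": 1}, {"id": 6, "audit_id": 2}]
actions = [{"id": 7, "action_plan_id": 5}, {"id": 8, "action_plan_id": 6}]
audit_template_ids = {10}

orphan_audits = [a for a in audits
                 if a["audit_template_id"] not in audit_template_ids]
valid_audit_ids = {a["id"] for a in audits if a not in orphan_audits}
orphan_plans = [p for p in action_plans
                if p["audit_id"] not in valid_audit_ids]
valid_plan_ids = {p["id"] for p in action_plans if p not in orphan_plans}
orphan_actions = [x for x in actions
                  if x["action_plan_id"] not in valid_plan_ids]

# Audit 2 has no parent template, so its action plan and action cascade too.
assert [a["id"] for a in orphan_audits] == [2]
assert [p["id"] for p in orphan_plans] == [6]
assert [x["id"] for x in orphan_actions] == [8]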
def choose_vm_to_migrate(self, hosts, avg_workload, workload_cache):
    """Pick up an active vm instance to migrate from provided hosts

    :param hosts: list of dicts, each containing a hypervisor object
    :param avg_workload: the average workload value over all hypervisors
    :param workload_cache: the map of VM uuid to workload
    """
    for hvmap in hosts:
        source_hypervisor = hvmap['hv']
        source_vms = self.model.get_mapping().get_node_vms(
            source_hypervisor)
        if source_vms:
            delta_workload = hvmap['workload'] - avg_workload
            min_delta = 1000000
            instance_id = None
            for vm_id in source_vms:
                try:
                    # consider only active VMs
                    vm = self.model.get_vm_from_id(vm_id)
                    if vm.state != vm_state.VMState.ACTIVE.value:
                        LOG.debug("VM not active; skipped: %s", vm.uuid)
                        continue
                    current_delta = delta_workload - workload_cache[vm_id]
                    if 0 <= current_delta < min_delta:
                        min_delta = current_delta
                        instance_id = vm_id
                except wexc.InstanceNotFound:
                    LOG.error(_LE("VM not found; error: %s"), vm_id)
            if instance_id:
                return source_hypervisor, self.model.get_vm_from_id(
                    instance_id)
        else:
            LOG.info(_LI("No VMs found on hypervisor: %s"),
                     source_hypervisor.uuid)
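# NOTE: Made-up numbers illustrating the selection rule in choose_vm_to_migrate
# above: among active VMs, pick the one whose workload is closest to, without
# exceeding, the host's excess workload over the cluster average.
delta_workload = 0.30  # host workload minus cluster average
workload_cache = {'vm-a': 0.50, 'vm-b': 0.25, 'vm-c': 0.10}

best_vm, min_delta = None, float('inf')
for vm_id, load in workload_cache.items():
    current_delta = delta_workload - load
    if 0 <= current_delta < min_delta:
        min_delta, best_vm = current_delta, vm_id

# vm-a is too big (negative delta); vm-b leaves the smallest remainder (0.05).
assert best_vm == 'vm-b'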
def _sync_goal(self, goal_map): goal_name = goal_map.name goal_mapping = dict() # Goals that are matching by name with the given discovered goal name matching_goals = [g for g in self.available_goals if g.name == goal_name] stale_goals = self._soft_delete_stale_goals(goal_map, matching_goals) if stale_goals or not matching_goals: goal = objects.Goal(self.ctx) goal.name = goal_name goal.display_name = goal_map.display_name goal.efficacy_specification = [ indicator._asdict() for indicator in goal_map.efficacy_specification] goal.create() LOG.info(_LI("Goal %s created"), goal_name) # Updating the internal states self.available_goals_map[goal] = goal_map # Map the old goal IDs to the new (equivalent) goal for matching_goal in matching_goals: goal_mapping[matching_goal.id] = goal return goal_mapping
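# NOTE: Toy illustration (made-up IDs) of the mapping returned by _sync_goal
# above: every stale goal ID points to the single newly created goal, so
# dependent objects (audit templates, audits) can later be re-pointed in bulk.
old_goal_ids = [3, 7]  # IDs of matching goals that turned out to be stale
new_goal = {"id": 12, "name": "server_consolidation"}
goal_mapping = {old_id: new_goal for old_id in old_goal_ids}

audit_template = {"name": "at1", "goal_id": 7}
if audit_template["goal_id"] in goal_mapping:
    audit_template["goal_id"] = goal_mapping[audit_template["goal_id"]]["id"]
assert audit_template["goal_id"] == 12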
def _sync_strategy(self, strategy_map): strategy_name = strategy_map.name strategy_display_name = strategy_map.display_name goal_name = strategy_map.goal_name parameters_spec = strategy_map.parameters_spec strategy_mapping = dict() # Strategies that are matching by name with the given # discovered strategy name matching_strategies = [s for s in self.available_strategies if s.name == strategy_name] stale_strategies = self._soft_delete_stale_strategies( strategy_map, matching_strategies) if stale_strategies or not matching_strategies: strategy = objects.Strategy(self.ctx) strategy.name = strategy_name strategy.display_name = strategy_display_name strategy.goal_id = objects.Goal.get_by_name(self.ctx, goal_name).id strategy.parameters_spec = parameters_spec strategy.create() LOG.info(_LI("Strategy %s created"), strategy_name) # Updating the internal states self.available_strategies_map[strategy] = strategy_map # Map the old strategy IDs to the new (equivalent) strategy for matching_strategy in matching_strategies: strategy_mapping[matching_strategy.id] = strategy return strategy_mapping
def _sync_scoringengine(self, scoringengine_map):
    scoringengine_name = scoringengine_map.name
    se_mapping = dict()
    # Scoring engines matching the discovered scoring engine by name
    matching_scoringengines = [se for se in self.available_scoringengines
                               if se.name == scoringengine_name]
    stale_scoringengines = self._soft_delete_stale_scoringengines(
        scoringengine_map, matching_scoringengines)

    if stale_scoringengines or not matching_scoringengines:
        scoringengine = objects.ScoringEngine(self.ctx)
        scoringengine.name = scoringengine_name
        scoringengine.description = scoringengine_map.description
        scoringengine.metainfo = scoringengine_map.metainfo
        scoringengine.create()
        LOG.info(_LI("Scoring Engine %s created"), scoringengine_name)

        # Updating the internal states
        self.available_scoringengines_map[scoringengine] = \
            scoringengine_map
        # Map the old scoring engine names to the new (equivalent) SE
        for matching_scoringengine in matching_scoringengines:
            se_mapping[matching_scoringengine.name] = scoringengine

    return se_mapping
def _soft_delete_removed_scoringengines(self): removed_se = [ se for se in self.available_scoringengines if se.name not in self.discovered_map['scoringengines']] for se in removed_se: LOG.info(_LI("Scoring Engine %s removed"), se.name) se.soft_delete()
def _soft_delete_stale_scoringengines( self, scoringengine_map, matching_scoringengines): se_name = scoringengine_map.name se_description = scoringengine_map.description se_metainfo = scoringengine_map.metainfo stale_scoringengines = [] for matching_scoringengine in matching_scoringengines: if (matching_scoringengine.description == se_description and matching_scoringengine.metainfo == se_metainfo): LOG.info(_LI("Scoring Engine %s unchanged"), se_name) else: LOG.info(_LI("Scoring Engine %s modified"), se_name) matching_scoringengine.soft_delete() stale_scoringengines.append(matching_scoringengine) return stale_scoringengines
def main(): watcher_service.prepare_service(sys.argv) LOG.info(_LI('Starting Watcher Applier service in PID %s'), os.getpid()) applier_service = watcher_service.Service(manager.ApplierManager) launcher = service.launch(CONF, applier_service) launcher.wait()
def publish_status_event(self, event, payload, request_id=None): if self.status_topic_handler: return self.status_topic_handler.publish_event( event, payload, request_id) else: LOG.info( _LI("No status notifier declared: notification '%s' not sent"), event)
def pre_execute(self): """Pre-execution phase This can be used to fetch some pre-requisites or data. """ LOG.info(_LI("Initializing Workload Balance Strategy")) if not self.compute_model: raise wexc.ClusterStateNotDefined()
def main(): service.prepare_service(sys.argv, CONF) host, port = cfg.CONF.api.host, cfg.CONF.api.port protocol = "http" if not CONF.api.enable_ssl_api else "https" # Build and start the WSGI app server = service.WSGIService('watcher-api', CONF.api.enable_ssl_api) if host == '127.0.0.1': LOG.info(_LI('serving on 127.0.0.1:%(port)s, ' 'view at %(protocol)s://127.0.0.1:%(port)s') % dict(protocol=protocol, port=port)) else: LOG.info(_LI('serving on %(protocol)s://%(host)s:%(port)s') % dict(protocol=protocol, host=host, port=port)) launcher = service.launch(CONF, server, workers=server.workers) launcher.wait()
def pre_execute(self): LOG.info(_LI("Initializing Server Consolidation")) if not self.compute_model: raise exception.ClusterStateNotDefined() if len(self.compute_model.get_all_compute_nodes()) == 0: raise exception.ClusterEmpty() LOG.debug(self.compute_model.to_string())
def main(): service.prepare_service(sys.argv) host, port = cfg.CONF.api.host, cfg.CONF.api.port protocol = "http" if not CONF.api.enable_ssl_api else "https" # Build and start the WSGI app server = service.WSGIService( 'watcher-api', CONF.api.enable_ssl_api) if host == '127.0.0.1': LOG.info(_LI('serving on 127.0.0.1:%(port)s, ' 'view at %(protocol)s://127.0.0.1:%(port)s') % dict(protocol=protocol, port=port)) else: LOG.info(_LI('serving on %(protocol)s://%(host)s:%(port)s') % dict(protocol=protocol, host=host, port=port)) launcher = service.launch(CONF, server, workers=server.workers) launcher.wait()
def choose_instance_to_migrate(self, hosts): """Pick up an active instance to migrate from provided hosts""" for instance_data in hosts: mig_source_node = instance_data['node'] instances_of_src = self.compute_model.get_node_instances( mig_source_node) for instance in instances_of_src: try: # select the first active instance to migrate if (instance.state != element.InstanceState.ACTIVE.value): LOG.info(_LI("Instance not active, skipped: %s"), instance.uuid) continue return mig_source_node, instance except wexc.InstanceNotFound as e: LOG.exception(e) LOG.info(_LI("Instance not found")) return None
def info(self, ctxt, publisher_id, event_type, payload, metadata): LOG.info( _LI("Event '%(event)s' received from %(publisher)s " "with metadata %(metadata)s") % dict(event=event_type, publisher=publisher_id, metadata=metadata)) instance_uuid = payload['instance_id'] instance = self.get_or_create_instance(instance_uuid) self.legacy_update_instance(instance, payload)
def main(): watcher_service.prepare_service(sys.argv) LOG.info(_LI('Starting Watcher Applier service in PID %s'), os.getpid()) applier_service = watcher_service.Service(manager.ApplierManager) # Only 1 process launcher = watcher_service.launch(CONF, applier_service) launcher.wait()
def _find_orphans(self): orphans = WatcherObjectsMap() filters = dict(deleted=False) goals = objects.Goal.list(self.ctx, filters=filters) strategies = objects.Strategy.list(self.ctx, filters=filters) audit_templates = objects.AuditTemplate.list(self.ctx, filters=filters) audits = objects.Audit.list(self.ctx, filters=filters) action_plans = objects.ActionPlan.list(self.ctx, filters=filters) actions = objects.Action.list(self.ctx, filters=filters) goal_ids = set(g.id for g in goals) orphans.strategies = [ strategy for strategy in strategies if strategy.goal_id not in goal_ids ] strategy_ids = [ s.id for s in (s for s in strategies if s not in orphans.strategies) ] orphans.audit_templates = [ audit_template for audit_template in audit_templates if audit_template.goal_id not in goal_ids or ( audit_template.strategy_id and audit_template.strategy_id not in strategy_ids) ] orphans.audits = [ audit for audit in audits if audit.goal_id not in goal_ids or ( audit.strategy_id and audit.strategy_id not in strategy_ids) ] # Objects with orphan parents are themselves orphans audit_ids = [ audit.id for audit in audits if audit not in orphans.audits ] orphans.action_plans = [ ap for ap in action_plans if ap.audit_id not in audit_ids or ap.strategy_id not in strategy_ids ] # Objects with orphan parents are themselves orphans action_plan_ids = [ ap.id for ap in action_plans if ap not in orphans.action_plans ] orphans.actions = [ action for action in actions if action.action_plan_id not in action_plan_ids ] LOG.debug("Orphans found:\n%s", orphans) LOG.info(_LI("Orphans found:\n%s"), orphans.get_count_table()) return orphans
def pre_execute(self): """Pre-execution phase This can be used to fetch some pre-requisites or data. """ LOG.info(_LI("Initializing Workload Balance Strategy")) if not self.compute_model: raise wexc.ClusterStateNotDefined() LOG.debug(self.compute_model.to_string())
def pre_execute(self): LOG.info(_LI("Initializing Workload Stabilization")) if not self.compute_model: raise exception.ClusterStateNotDefined() self.weights = self.input_parameters.weights self.metrics = self.input_parameters.metrics self.thresholds = self.input_parameters.thresholds self.host_choice = self.input_parameters.host_choice self.instance_metrics = self.input_parameters.instance_metrics self.retry_count = self.input_parameters.retry_count
def info(self, ctxt, publisher_id, event_type, payload, metadata): LOG.info( _LI("Event '%(event)s' received from %(publisher)s " "with metadata %(metadata)s") % dict(event=event_type, publisher=publisher_id, metadata=metadata)) node_data = payload['nova_object.data'] node_uuid = node_data['host'] try: node = self.get_or_create_node(node_uuid) self.update_compute_node(node, payload) except exception.ComputeNodeNotFound as exc: LOG.exception(exc)
def main(): watcher_service.prepare_service(sys.argv) LOG.info(_LI('Starting Watcher Decision Engine service in PID %s'), os.getpid()) syncer = sync.Syncer() syncer.sync() de_service = watcher_service.Service(manager.DecisionEngineManager) launcher = service.launch(CONF, de_service) launcher.wait()
def info(self, ctxt, publisher_id, event_type, payload, metadata): LOG.info( _LI("Event '%(event)s' received from %(publisher)s " "with metadata %(metadata)s") % dict(event=event_type, publisher=publisher_id, metadata=metadata)) LOG.debug(payload) instance_data = payload['nova_object.data'] instance_uuid = instance_data['uuid'] node_uuid = instance_data.get('host') instance = self.get_or_create_instance(instance_uuid, node_uuid) self.update_instance(instance, payload)
def choose_vm_to_migrate(self, hosts): """Pick up an active vm instance to migrate from provided hosts""" for hvmap in hosts: mig_src_hypervisor = hvmap['hv'] vms_of_src = self.model.get_mapping().get_node_vms( mig_src_hypervisor) if len(vms_of_src) > 0: for vm_id in vms_of_src: try: # select the first active VM to migrate vm = self.model.get_vm_from_id(vm_id) if vm.state != vm_state.VMState.ACTIVE.value: LOG.info(_LI("VM not active, skipped: %s"), vm.uuid) continue return mig_src_hypervisor, vm except wexc.InstanceNotFound as e: LOG.exception(e) LOG.info(_LI("VM not found")) return None
def do_execute(self):
    # the migration plan will be triggered when the outlet temperature
    # reaches the threshold
    self.threshold = self.input_parameters.threshold
    LOG.debug("Initializing Outlet temperature strategy with threshold=%d",
              self.threshold)
    hosts_need_release, hosts_target = self.group_hosts_by_outlet_temp()

    if len(hosts_need_release) == 0:
        # TODO(zhenzanz): return something right if there are no hot servers
        LOG.debug("No hosts require optimization")
        return self.solution

    if len(hosts_target) == 0:
        LOG.warning(_LW("No hosts under outlet temp threshold found"))
        return self.solution

    # choose the server with the highest outlet temperature
    hosts_need_release = sorted(hosts_need_release,
                                reverse=True,
                                key=lambda x: (x["outlet_temp"]))

    instance_to_migrate = self.choose_instance_to_migrate(
        hosts_need_release)
    # calculate the instance's cpu cores, memory and disk needs
    if instance_to_migrate is None:
        return self.solution

    mig_source_node, instance_src = instance_to_migrate
    dest_servers = self.filter_dest_servers(hosts_target, instance_src)
    # sort the filtered result by outlet temp
    # pick up the lowest one as dest server
    if len(dest_servers) == 0:
        # TODO(zhenzanz): maybe warn that there is no resource
        # for the instance.
        LOG.info(_LI("No proper target host could be found"))
        return self.solution

    dest_servers = sorted(dest_servers, key=lambda x: (x["outlet_temp"]))
    # always use the host with the lowest outlet temperature
    mig_destination_node = dest_servers[0]['node']

    # generate the solution to migrate the instance to the dest server
    if self.compute_model.migrate_instance(
            instance_src, mig_source_node, mig_destination_node):
        parameters = {
            'migration_type': 'live',
            'source_node': mig_source_node.uuid,
            'destination_node': mig_destination_node.uuid
        }
        self.solution.add_action(action_type=self.MIGRATION,
                                 resource_id=instance_src.uuid,
                                 input_parameters=parameters)
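# NOTE: Tiny example with invented temperatures showing the ordering used in
# do_execute above: hot hosts are handled from hottest to coolest, and the
# destination is the candidate with the lowest outlet temperature.
hosts_need_release = [{"node": "n1", "outlet_temp": 36.0},
                      {"node": "n2", "outlet_temp": 39.5}]
hosts_target = [{"node": "n3", "outlet_temp": 24.0},
                {"node": "n4", "outlet_temp": 28.0}]

hosts_need_release = sorted(hosts_need_release, reverse=True,
                            key=lambda x: x["outlet_temp"])
dest_servers = sorted(hosts_target, key=lambda x: x["outlet_temp"])

assert hosts_need_release[0]["node"] == "n2"  # hottest source first
assert dest_servers[0]["node"] == "n3"        # coolest destination wins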
def purge(age_in_days, max_number, goal, exclude_orphans, dry_run): """Removes soft deleted objects from the database :param age_in_days: Number of days since deletion (from today) to exclude from the purge. If None, everything will be purged. :type age_in_days: int :param max_number: Max number of objects expected to be deleted. Prevents the deletion if exceeded. No limit if set to None. :type max_number: int :param goal: UUID or name of the goal to purge. :type goal: str :param exclude_orphans: Flag to indicate whether or not you want to exclude orphans from deletion (default: False). :type exclude_orphans: bool :param dry_run: Flag to indicate whether or not you want to perform a dry run (no deletion). :type dry_run: bool """ try: if max_number and max_number < 0: raise exception.NegativeLimitError LOG.info(_LI("[options] age_in_days = %s"), age_in_days) LOG.info(_LI("[options] max_number = %s"), max_number) LOG.info(_LI("[options] goal = %s"), goal) LOG.info(_LI("[options] exclude_orphans = %s"), exclude_orphans) LOG.info(_LI("[options] dry_run = %s"), dry_run) uuid = PurgeCommand.get_goal_uuid(goal) cmd = PurgeCommand(age_in_days, max_number, uuid, exclude_orphans, dry_run) cmd.execute() except Exception as exc: LOG.exception(exc) print(exc) sys.exit(1)
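# NOTE: Self-contained illustration of the age_in_days semantics described in
# the purge() docstring above: rows soft deleted within the last N days are
# kept, older ones qualify for purging. The expiry rule and dates are made up
# for the example and do not reproduce Watcher's actual DB query.
import datetime


def qualifies_for_purge(deleted_at, age_in_days, today):
    """True if a soft-deleted row is old enough to be purged."""
    if age_in_days is None:  # no age filter: everything qualifies
        return True
    return deleted_at <= today - datetime.timedelta(days=age_in_days)


today = datetime.date(2017, 1, 31)
assert qualifies_for_purge(datetime.date(2017, 1, 1), 30, today)       # old
assert not qualifies_for_purge(datetime.date(2017, 1, 15), 30, today)  # recent
assert qualifies_for_purge(datetime.date(2017, 1, 30), None, today)    # no limit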
def sync(self): self.discovered_map = self._discover() goals_map = self.discovered_map["goals"] strategies_map = self.discovered_map["strategies"] for goal_name, goal_map in goals_map.items(): if goal_map in self.available_goals_map: LOG.info(_LI("Goal %s already exists"), goal_name) continue self.goal_mapping.update(self._sync_goal(goal_map)) for strategy_name, strategy_map in strategies_map.items(): if (strategy_map in self.available_strategies_map and strategy_map.goal_name not in [g.name for g in self.goal_mapping.values()]): LOG.info(_LI("Strategy %s already exists"), strategy_name) continue self.strategy_mapping.update(self._sync_strategy(strategy_map)) self._sync_audit_templates()
def _sync_objects(self): # First we find audit templates, audits and action plans that are stale # because their associated goal or strategy has been modified and we # update them in-memory self._find_stale_audit_templates_due_to_goal() self._find_stale_audit_templates_due_to_strategy() self._find_stale_audits_due_to_goal() self._find_stale_audits_due_to_strategy() self._find_stale_action_plans_due_to_strategy() self._find_stale_action_plans_due_to_audit() # Then we handle the case where an audit template, an audit or an # action plan becomes stale because its related goal does not # exist anymore. self._soft_delete_removed_goals() # Then we handle the case where an audit template, an audit or an # action plan becomes stale because its related strategy does not # exist anymore. self._soft_delete_removed_strategies() # Finally, we save into the DB the updated stale audit templates # and soft delete stale audits and action plans for stale_audit_template in self.stale_audit_templates_map.values(): stale_audit_template.save() LOG.info(_LI("Audit Template '%s' synced"), stale_audit_template.name) for stale_audit in self.stale_audits_map.values(): stale_audit.save() LOG.info(_LI("Stale audit '%s' synced and cancelled"), stale_audit.uuid) for stale_action_plan in self.stale_action_plans_map.values(): stale_action_plan.save() LOG.info(_LI("Stale action plan '%s' synced and cancelled"), stale_action_plan.uuid)
def choose_instance_to_migrate(self, hosts): """Pick up an active instance to migrate from provided hosts""" for instance_data in hosts: mig_source_node = instance_data['node'] instances_of_src = self.compute_model.mapping.get_node_instances( mig_source_node) if len(instances_of_src) > 0: for instance_id in instances_of_src: try: # select the first active instance to migrate instance = self.compute_model.get_instance_from_id( instance_id) if (instance.state != element.InstanceState.ACTIVE.value): LOG.info(_LI("Instance not active, skipped: %s"), instance.uuid) continue return mig_source_node, instance except wexc.InstanceNotFound as e: LOG.exception(e) LOG.info(_LI("Instance not found")) return None
def do_execute(self):
    # the migration plan will be triggered when the outlet temperature
    # reaches the threshold
    self.threshold = self.input_parameters.threshold
    LOG.debug("Initializing Outlet temperature strategy with threshold=%d",
              self.threshold)
    hosts_need_release, hosts_target = self.group_hosts_by_outlet_temp()

    if len(hosts_need_release) == 0:
        # TODO(zhenzanz): return something right if there are no hot servers
        LOG.debug("No hosts require optimization")
        return self.solution

    if len(hosts_target) == 0:
        LOG.warning(_LW("No hosts under outlet temp threshold found"))
        return self.solution

    # choose the server with the highest outlet temperature
    hosts_need_release = sorted(hosts_need_release,
                                reverse=True,
                                key=lambda x: (x["outlet_temp"]))

    instance_to_migrate = self.choose_instance_to_migrate(
        hosts_need_release)
    # calculate the instance's cpu cores, memory and disk needs
    if instance_to_migrate is None:
        return self.solution

    mig_source_node, instance_src = instance_to_migrate
    dest_servers = self.filter_dest_servers(hosts_target, instance_src)
    # sort the filtered result by outlet temp
    # pick up the lowest one as dest server
    if len(dest_servers) == 0:
        # TODO(zhenzanz): maybe warn that there is no resource
        # for the instance.
        LOG.info(_LI("No proper target host could be found"))
        return self.solution

    dest_servers = sorted(dest_servers, key=lambda x: (x["outlet_temp"]))
    # always use the host with the lowest outlet temperature
    mig_destination_node = dest_servers[0]['node']

    # generate the solution to migrate the instance to the dest server
    if self.compute_model.mapping.migrate_instance(
            instance_src, mig_source_node, mig_destination_node):
        parameters = {'migration_type': 'live',
                      'source_node': mig_source_node.uuid,
                      'destination_node': mig_destination_node.uuid}
        self.solution.add_action(action_type=self.MIGRATION,
                                 resource_id=instance_src.uuid,
                                 input_parameters=parameters)
def info(self, ctxt, publisher_id, event_type, payload, metadata): ctxt.request_id = metadata['message_id'] ctxt.project_domain = event_type LOG.info( _LI("Event '%(event)s' received from %(publisher)s " "with metadata %(metadata)s") % dict(event=event_type, publisher=publisher_id, metadata=metadata)) LOG.debug(payload) instance_uuid = payload['instance_id'] node_uuid = payload.get('node') instance = self.get_or_create_instance(instance_uuid, node_uuid) self.legacy_update_instance(instance, payload)
def _soft_delete_removed_strategies(self):
    removed_strategies = [
        s for s in self.available_strategies
        if s.name not in self.discovered_map['strategies']]

    for removed_strategy in removed_strategies:
        removed_strategy.soft_delete()
        filters = {"strategy_id": removed_strategy.id}
        invalid_ats = objects.AuditTemplate.list(self.ctx, filters=filters)
        for at in invalid_ats:
            LOG.info(
                _LI("Audit Template '%(audit_template)s' references a "
                    "strategy that does not exist"),
                {'audit_template': at.uuid})
            # In this case we can reset the strategy ID to None
            # so the audit template can still achieve the same goal
            # but with a different strategy
            if at.id not in self.stale_audit_templates_map:
                at.strategy_id = None
                self.stale_audit_templates_map[at.id] = at
            else:
                self.stale_audit_templates_map[at.id].strategy_id = None

        stale_audits = objects.Audit.list(
            self.ctx, filters=filters, eager=True)
        for audit in stale_audits:
            LOG.warning(
                _LW("Audit '%(audit)s' references a "
                    "strategy that does not exist"),
                {'audit': audit.uuid})
            if audit.id not in self.stale_audits_map:
                audit.state = objects.audit.State.CANCELLED
                self.stale_audits_map[audit.id] = audit
            else:
                self.stale_audits_map[
                    audit.id].state = objects.audit.State.CANCELLED

        stale_action_plans = objects.ActionPlan.list(
            self.ctx, filters=filters, eager=True)
        for action_plan in stale_action_plans:
            LOG.warning(
                _LW("Action Plan '%(action_plan)s' references a "
                    "strategy that does not exist"),
                {'action_plan': action_plan.uuid})
            if action_plan.id not in self.stale_action_plans_map:
                action_plan.state = objects.action_plan.State.CANCELLED
                self.stale_action_plans_map[action_plan.id] = action_plan
            else:
                self.stale_action_plans_map[
                    action_plan.id].state = (
                        objects.action_plan.State.CANCELLED)
def _find_orphans(self): orphans = WatcherObjectsMap() filters = dict(deleted=False) goals = objects.Goal.list(self.ctx, filters=filters) strategies = objects.Strategy.list(self.ctx, filters=filters) audit_templates = objects.AuditTemplate.list(self.ctx, filters=filters) audits = objects.Audit.list(self.ctx, filters=filters) action_plans = objects.ActionPlan.list(self.ctx, filters=filters) actions = objects.Action.list(self.ctx, filters=filters) goal_ids = set(g.id for g in goals) orphans.strategies = [ strategy for strategy in strategies if strategy.goal_id not in goal_ids] strategy_ids = [s.id for s in (s for s in strategies if s not in orphans.strategies)] orphans.audit_templates = [ audit_template for audit_template in audit_templates if audit_template.goal_id not in goal_ids or (audit_template.strategy_id and audit_template.strategy_id not in strategy_ids)] orphans.audits = [ audit for audit in audits if audit.goal_id not in goal_ids or (audit.strategy_id and audit.strategy_id not in strategy_ids)] # Objects with orphan parents are themselves orphans audit_ids = [audit.id for audit in audits if audit not in orphans.audits] orphans.action_plans = [ ap for ap in action_plans if ap.audit_id not in audit_ids] # Objects with orphan parents are themselves orphans action_plan_ids = [ap.id for ap in action_plans if ap not in orphans.action_plans] orphans.actions = [ action for action in actions if action.action_plan_id not in action_plan_ids] LOG.debug("Orphans found:\n%s", orphans) LOG.info(_LI("Orphans found:\n%s"), orphans.get_count_table()) return orphans
def execute(self, original_model):
    """Execute the strategy.

    This strategy produces a solution resulting in more
    efficient utilization of cluster resources using the following
    four phases:

    * Offload phase - handling over-utilized resources
    * Consolidation phase - handling under-utilized resources
    * Solution optimization - reducing the number of migrations
    * Deactivation of unused hypervisors

    :param original_model: root_model object
    """
    LOG.info(_LI('Executing Smart Strategy'))
    model = self.get_prediction_model(original_model)
    rcu = self.get_relative_cluster_utilization(model)
    self.ceilometer_vm_data_cache = dict()

    cc = {'cpu': 1.0, 'ram': 1.0, 'disk': 1.0}

    # Offloading phase
    self.offload_phase(model, cc)

    # Consolidation phase
    self.consolidation_phase(model, cc)

    # Optimize solution
    self.optimize_solution(model)

    # Deactivate unused hypervisors
    self.deactivate_unused_hypervisors(model)

    rcu_after = self.get_relative_cluster_utilization(model)
    info = {
        'number_of_migrations': self.number_of_migrations,
        'number_of_released_hypervisors':
            self.number_of_released_hypervisors,
        'relative_cluster_utilization_before': str(rcu),
        'relative_cluster_utilization_after': str(rcu_after)
    }

    LOG.debug(info)

    self.solution.model = model
    self.solution.efficacy = rcu_after['cpu']

    return self.solution
def info(self, ctxt, publisher_id, event_type, payload, metadata): LOG.info( _LI("Event '%(event)s' received from %(publisher)s " "with metadata %(metadata)s") % dict(event=event_type, publisher=publisher_id, metadata=metadata)) instance_uuid = payload['instance_id'] instance = self.get_or_create_instance(instance_uuid) try: node = self.get_or_create_node(payload['host']) except exception.ComputeNodeNotFound as exc: LOG.exception(exc) # If we can't create the node, we consider the instance as unmapped node = None self.delete_instance(instance, node)
def choose_vm_to_migrate(self, hosts):
    """Pick up an active vm instance to migrate from provided hosts

    :param hosts: list of dicts, each containing a hypervisor object
    """
    vms_tobe_migrate = []
    for hvmap in hosts:
        source_hypervisor = hvmap['hv']
        source_vms = self.model.get_mapping().get_node_vms(
            source_hypervisor)
        if source_vms:
            inlet_t = self.ceilometer.statistic_aggregation(
                resource_id=source_hypervisor.uuid,
                meter_name=self.meter_name_inlet_t,
                period=self._period,
                aggregate='avg')
            power = self.ceilometer.statistic_aggregation(
                resource_id=source_hypervisor.uuid,
                meter_name=self.meter_name_power,
                period=self._period,
                aggregate='avg')
            if (power < self.threshold_power and
                    inlet_t < self.threshold_inlet_t):
                # hardware issue, migrate all vms from this hypervisor
                for vm_id in source_vms:
                    try:
                        vm = self.model.get_vm_from_id(vm_id)
                        vms_tobe_migrate.append(vm)
                    except wexc.InstanceNotFound:
                        LOG.error(_LE("VM not found; error: %s"), vm_id)
                return source_hypervisor, vms_tobe_migrate
            else:
                # migrate the first active vm
                for vm_id in source_vms:
                    try:
                        vm = self.model.get_vm_from_id(vm_id)
                        if vm.state != vm_state.VMState.ACTIVE.value:
                            LOG.info(_LI("VM not active, skipped: %s"),
                                     vm.uuid)
                            continue
                        vms_tobe_migrate.append(vm)
                        return source_hypervisor, vms_tobe_migrate
                    except wexc.InstanceNotFound:
                        LOG.error(_LE("VM not found; error: %s"), vm_id)
        else:
            LOG.info(_LI("No VMs found on hypervisor: %s"),
                     source_hypervisor.uuid)
def main():
    watcher_service.prepare_service(sys.argv)

    LOG.info(_LI('Starting Watcher Decision Engine service in PID %s'),
             os.getpid())

    syncer = sync.Syncer()
    syncer.sync()

    de_service = watcher_service.Service(manager.DecisionEngineManager)
    bg_scheduler_service = scheduling.DecisionEngineSchedulingService()

    # Only 1 process
    launcher = watcher_service.launch(CONF, de_service)
    launcher.launch_service(bg_scheduler_service)
    launcher.wait()
def stop(self): try: self.rpcserver.stop() self.rpcserver.wait() except Exception as e: LOG.exception(_LE('Service error occurred when stopping the ' 'RPC server. Error: %s'), e) try: self.manager.del_host(deregister=self.deregister) except Exception as e: LOG.exception(_LE('Service error occurred when cleaning up ' 'the RPC manager. Error: %s'), e) super(RPCService, self).stop(graceful=True) LOG.info(_LI('Stopped RPC server for service %(service)s on host ' '%(host)s.'), {'service': self.topic, 'host': self.host})
def _sync_audit_templates(self): # First we find audit templates that are stale because their associated # goal or strategy has been modified and we update them in-memory self._find_stale_audit_templates_due_to_goal() self._find_stale_audit_templates_due_to_strategy() # Then we handle the case where an audit template became # stale because its related goal does not exist anymore. self._soft_delete_removed_goals() # Then we handle the case where an audit template became # stale because its related strategy does not exist anymore. self._soft_delete_removed_strategies() # Finally, we save into the DB the updated stale audit templates for stale_audit_template in self.stale_audit_templates_map.values(): stale_audit_template.save() LOG.info(_LI("Audit Template '%s' synced"), stale_audit_template.name)
def do_execute(self):
    """Execute the strategy.

    This strategy produces a solution resulting in more
    efficient utilization of cluster resources using the following
    four phases:

    * Offload phase - handling over-utilized resources
    * Consolidation phase - handling under-utilized resources
    * Solution optimization - reducing the number of migrations
    * Disabling unused nodes
    """
    LOG.info(_LI('Executing Smart Strategy'))
    model = self.compute_model.get_latest_cluster_data_model()
    rcu = self.get_relative_cluster_utilization(model)
    self.ceilometer_vm_data_cache = dict()

    cc = {'cpu': 1.0, 'ram': 1.0, 'disk': 1.0}

    # Offloading phase
    self.offload_phase(model, cc)

    # Consolidation phase
    self.consolidation_phase(model, cc)

    # Optimize solution
    self.optimize_solution(model)

    # Disable unused nodes
    self.disable_unused_nodes(model)

    rcu_after = self.get_relative_cluster_utilization(model)
    info = {
        'number_of_migrations': self.number_of_migrations,
        'number_of_released_nodes': self.number_of_released_nodes,
        'relative_cluster_utilization_before': str(rcu),
        'relative_cluster_utilization_after': str(rcu_after)
    }

    LOG.debug(info)
def start(self): super(RPCService, self).start() admin_context = context.RequestContext('admin', 'admin', is_admin=True) target = messaging.Target(topic=self.topic, server=self.host) endpoints = [self.manager] serializer = objects_base.IronicObjectSerializer() self.rpcserver = rpc.get_server(target, endpoints, serializer) self.rpcserver.start() self.handle_signal() self.manager.init_host() self.tg.add_dynamic_timer( self.manager.periodic_tasks, periodic_interval_max=cfg.CONF.periodic_interval, context=admin_context) LOG.info(_LI('Created RPC server for service %(service)s on host ' '%(host)s.'), {'service': self.topic, 'host': self.host})
def choose_instance_to_migrate(self, hosts, avg_workload, workload_cache):
    """Pick up an active instance to migrate from provided hosts

    :param hosts: list of dicts, each containing a node object
    :param avg_workload: the average workload value over all nodes
    :param workload_cache: the map of instance uuid to workload
    """
    for instance_data in hosts:
        source_node = instance_data['node']
        source_instances = self.compute_model.mapping.get_node_instances(
            source_node)
        if source_instances:
            delta_workload = instance_data['workload'] - avg_workload
            min_delta = 1000000
            instance_id = None
            for inst_id in source_instances:
                try:
                    # consider only active instances
                    instance = self.compute_model.get_instance_from_id(
                        inst_id)
                    if (instance.state !=
                            element.InstanceState.ACTIVE.value):
                        LOG.debug("Instance not active, skipped: %s",
                                  instance.uuid)
                        continue
                    current_delta = (
                        delta_workload - workload_cache[inst_id])
                    if 0 <= current_delta < min_delta:
                        min_delta = current_delta
                        instance_id = inst_id
                except wexc.InstanceNotFound:
                    LOG.error(_LE("Instance not found; error: %s"),
                              inst_id)
            if instance_id:
                return (source_node,
                        self.compute_model.get_instance_from_id(
                            instance_id))
        else:
            LOG.info(_LI("No instances found on node: %s"),
                     source_node.uuid)
def execute(self, origin_model):
    LOG.info(_LI("Initializing Workload Stabilization"))
    current_model = origin_model
    if origin_model is None:
        raise exception.ClusterStateNotDefined()
    migration = self.check_threshold(current_model)
    if migration:
        hosts_load = self.get_hosts_load(current_model)
        min_sd = 1
        balanced = False
        for vm_host in migration:
            dst_hp_disk = current_model.get_resource_from_id(
                resource.ResourceType.disk).get_capacity(
                    current_model.get_hypervisor_from_id(vm_host['host']))
            vm_disk = current_model.get_resource_from_id(
                resource.ResourceType.disk).get_capacity(
                    current_model.get_vm_from_id(vm_host['vm']))
            if vm_disk > dst_hp_disk:
                continue
            vm_load = self.calculate_migration_case(hosts_load,
                                                    vm_host['vm'],
                                                    vm_host['s_host'],
                                                    vm_host['host'],
                                                    current_model)
            weighted_sd = self.calculate_weighted_sd(vm_load[:-1])
            if weighted_sd < min_sd:
                min_sd = weighted_sd
                hosts_load = vm_load[-1]
                self.migrate(current_model, vm_host['vm'],
                             vm_host['s_host'], vm_host['host'])

            for metric, value in zip(self.metrics, vm_load[:-1]):
                if value < float(self.thresholds[metric]):
                    balanced = True
                    break
            if balanced:
                break
    return self.fill_solution(current_model)
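# NOTE: Minimal sketch of a "weighted standard deviation" criterion like the
# one minimized above. The helper name, weights and per-host loads are
# invented for illustration; the exact formula used by calculate_weighted_sd
# in Watcher may differ.
import math


def weighted_sd(per_metric_loads, weights):
    """Sum the per-metric standard deviations scaled by their weights."""
    total = 0.0
    for metric, loads in per_metric_loads.items():
        mean = sum(loads) / len(loads)
        sd = math.sqrt(sum((x - mean) ** 2 for x in loads) / len(loads))
        total += sd * weights[metric]
    return total


loads = {'cpu_util': [0.2, 0.8], 'memory.resident': [0.5, 0.5]}
weights = {'cpu_util': 0.7, 'memory.resident': 0.3}
# Only the unbalanced cpu_util metric contributes: 0.3 * 0.7 = 0.21
assert abs(weighted_sd(loads, weights) - 0.21) < 1e-9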
def _soft_delete_removed_strategies(self):
    removed_strategies = [
        s for s in self.available_strategies
        if s.name not in self.discovered_map['strategies']]

    for removed_strategy in removed_strategies:
        removed_strategy.soft_delete()
        filters = {"strategy_id": removed_strategy.id}
        invalid_ats = objects.AuditTemplate.list(self.ctx, filters=filters)
        for at in invalid_ats:
            LOG.info(
                _LI("Audit Template '%(audit_template)s' references a "
                    "strategy that does not exist"),
                {'audit_template': at.uuid})
            # In this case we can reset the strategy ID to None
            # so the audit template can still achieve the same goal
            # but with a different strategy
            if at.id not in self.stale_audit_templates_map:
                at.strategy_id = None
                self.stale_audit_templates_map[at.id] = at
            else:
                self.stale_audit_templates_map[at.id].strategy_id = None
def get_vm_load(self, vm_uuid):
    """Gather VM load statistics through Ceilometer.

    :param vm_uuid: VM for which the statistics are gathered.
    :return: dict
    """
    LOG.debug('get_vm_load started')
    vm_vcpus = self.model.get_resource_from_id(
        resource.ResourceType.cpu_cores).get_capacity(
            self.model.get_vm_from_id(vm_uuid))
    vm_load = {'uuid': vm_uuid, 'vcpus': vm_vcpus}
    for meter in self.metrics:
        avg_meter = self.ceilometer.statistic_aggregation(
            resource_id=vm_uuid,
            meter_name=meter,
            period="120",
            aggregate='min'
        )
        if avg_meter is None:
            raise exception.NoMetricValuesForVM(resource_id=vm_uuid,
                                                metric_name=meter)
        vm_load[meter] = avg_meter
    return vm_load