class InformationAggregator():
    def __init__(self):
        self.logger = logging.getLogger(self.__class__.__name__)
        self.COMPUTE_NODES = ['acticloud1', 'acticloud2', 'acticloud3', 'acticloud4']
        self.acticlouddb_client = ActiCloudDBClient()
        self.openstack_client = OpenstackClient()

    def is_gold_vm(self, vmid):
        return self.acticlouddb_client.is_gold_vm(vmid, 1)

    def get_vm_nr_vcpus(self, vmid):
        return self.acticlouddb_client.get_nr_vcpus(vmid)

    def get_vm_current_host(self, vmid):
        return self.openstack_client.get_vm_current_host(vmid)

    def getServerListByComputeNode(self, computeNode):
        return self.openstack_client.get_vms_by_hostname(computeNode)

    def getGoldServerListByComputeNode(self, computeNode):
        all_servers = self.getServerListByComputeNode(computeNode)
        gold_servers = [x for x in all_servers if self.is_gold_vm(x)]
        return gold_servers

    def getSilverServerListByComputeNode(self, computeNode):
        all_servers = self.getServerListByComputeNode(computeNode)
        silver_servers = [x for x in all_servers if not self.is_gold_vm(x)]
        return silver_servers

    def getVcpusByComputeNode(self, computeNode):
        gold_vcpus = self.acticlouddb_client.get_nr_gold_vcpus_by_hostname(computeNode)
        silver_vcpus = self.acticlouddb_client.get_nr_silver_vcpus_by_hostname(computeNode)
        return (gold_vcpus, silver_vcpus)

    def getServerList(self):
        return self.openstack_client.get_vms()

    def getComputeNodes(self):
        return list(self.COMPUTE_NODES)
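## Illustrative usage sketch, not part of the original module: querying the
## aggregator for the per-node vcpu load. Assumes the ActiCloudDB and
## Openstack services are reachable with the clients' default credentials;
## the print_cluster_load() helper itself is hypothetical.
def print_cluster_load():
    aggregator = InformationAggregator()
    for node in aggregator.getComputeNodes():
        (gold_vcpus, silver_vcpus) = aggregator.getVcpusByComputeNode(node)
        print("%s: %d gold / %d silver vcpus" % (node, gold_vcpus, silver_vcpus))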
class InformationAggregator():
    def __init__(self, compute_node_name, system_type):
        global INTERFERENCE_DETECTION_ENABLED
        self.logger = logging.getLogger(self.__class__.__name__)
        self.hostname = compute_node_name
        self.compute_node_name = compute_node_name
        self.system_type = system_type
        if system_type == "actistatic" or system_type == "actifull":
            if system_type == "actifull":
                INTERFERENCE_DETECTION_ENABLED = True
            self.system = ActiManagerSystem([2, 1, 10], compute_node_name)
        elif system_type == "gps":
            self.system = GoldPerSocketSystem([2, 1, 10], compute_node_name)
        elif system_type == "gno":
            self.system = GoldNotOversubscribedSystem([2, 1, 10], compute_node_name)
        else:
            self.logger.error("Wrong system_type given: %s", system_type)
            sys.exit(1)
        self.acticlouddb_client = ActiCloudDBClient()
        self.openstack_client = OpenstackClient()

    def get_system(self):
        return self.system

    def report_events(self):
        if "acti" in self.system_type:
            event_logger.log_event({'event': 'internal-profit-report',
                                    'profit-value': self.system.cur_profit,
                                    'hostname': self.hostname})
            esd_dict = dict()
            for vm in self.system.vms:
                esd_dict[vm] = [v.esd for v in self.system.vms[vm].vcpus]
            event_logger.log_event({'event': 'internal-esd-report',
                                    'values': esd_dict,
                                    'hostname': self.hostname})

    def get_pid_from_vm_id(self, vm):
        command = "nova show %s | grep -i 'OS-EXT-SRV-ATTR:instance_name' | awk -F'|' '{print $3}'" % str(vm.id)
        libvirt_instance_name = shell_command.run_shell_command(command).strip()
        command = "ssh %s ps -ef | grep %s | grep \"qemu-system\" | grep -v grep | awk '{print $2}'" % \
                  (self.hostname, libvirt_instance_name)
        pid = shell_command.run_shell_command(command).strip()
        return pid

    def get_dst_numa_node_from_pcpu(self, pcpu_id):
        ## The numa python module does not implement the numa_node_of_cpu()
        ## call of the numa(3) library, so we scan all nodes instead
        for i in range(0, numa.get_max_node() + 1):
            if pcpu_id in numa.node_to_cpus(i):
                return i

    def print_vcpu_to_pcpu_mapping(self):
        ## Size the physical CPU vector using the first VM's mapping
        p_mapping = []
        for vm in self.getServerListByComputeNode(self.compute_node_name):
            vm_mapping = self.getVMVcpuMapping(vm)
            for i in range(len(vm_mapping)):
                p_mapping.append(0)
            break
        ## Gold VMs contribute 10 to each pinned pcpu, silver VMs contribute 1
        for vm in self.getServerListByComputeNode(self.compute_node_name):
            vm_mapping = self.getVMVcpuMapping(vm)
            for i in range(len(vm_mapping)):
                if vm_mapping[i]:
                    if self.is_gold_vm(vm):
                        p_mapping[i] += 10
                    else:
                        p_mapping[i] += 1
        self.logger.info("Physical CPU mapping: %s", p_mapping)

    def is_noisy_vm(self, vm):
        return self.acticlouddb_client.is_noisy_vm(vm.id)

    def is_sensitive_vm(self, vm):
        return self.acticlouddb_client.is_sensitive_vm(vm.id)

    def is_gold_vm(self, vm):
        return self.acticlouddb_client.is_gold_vm(vm.id, 1)

    def get_vm_nr_vcpus(self, vm):
        return self.acticlouddb_client.get_nr_vcpus(vm.id)

    def get_cost_function(self, vm):
        return self.acticlouddb_client.cost_function(vm.id)

    def get_cpu_util(self, vm):
        return 1.0  # FIXME
        cpu_util = self.acticlouddb_client.cpu_util(vm.id)
        return cpu_util

    def update_moves(self, vm):
        # vm: Vm class instance
        prev_moves = self.acticlouddb_client.get_moves(vm.id)
        self.acticlouddb_client.set_moves(vm.id, prev_moves + vm.moves)

    def getServerListByComputeNode(self, computeNode):
        return self.openstack_client.get_vms_by_hostname(computeNode)

    def getServerList(self):
        return self.openstack_client.get_vms()

    def getVMVcpuMapping(self, vm):
        try:
            libvirt_instance_name = getattr(vm, 'OS-EXT-SRV-ATTR:instance_name')
            with libvirt_client.LibVirtConnection(self.hostname, "qemu+ssh") as libvconn:
                libvinstance = libvirt_client.LibVirtInstance(libvconn, str(libvirt_instance_name))
                return list(libvinstance.get_instance_mapping()[0])
        except Exception:
            self.logger.info("=================> Could not get vcpu mapping of VM %s", vm.id)
            return None
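## Illustrative sketch, not project code: combining the mapping helpers above
## to log the NUMA placement of every pinned vcpu on this compute node. The
## log_vcpu_numa_placement() helper and its loop are assumptions built only
## from methods defined in this class.
def log_vcpu_numa_placement(aggregator):
    for vm in aggregator.getServerListByComputeNode(aggregator.compute_node_name):
        mapping = aggregator.getVMVcpuMapping(vm)
        if mapping is None:
            continue
        for pcpu_id, pinned in enumerate(mapping):
            if pinned:
                node = aggregator.get_dst_numa_node_from_pcpu(pcpu_id)
                aggregator.logger.info("VM %s pinned on pcpu %d (NUMA node %s)",
                                       vm.id, pcpu_id, node)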
from benchmarks import *

sys.path.append('../common/')
from openstack_client import OpenstackClient
import event_logger

## Setup the logging facility
logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(message)s',
                    datefmt='%Y-%m-%d.%H:%M:%S')
logging.Formatter.converter = time.gmtime
logger = logging.getLogger("executor")

## Initialize the Openstack client
ost_client = OpenstackClient()

def get_image_by_bench(bench):
    img_name = bench['openstack_image']
    for img in ost_client.get_images():
        if img_name in img.name:
            return img

def spawn_vm(seq_num, vm_chars, wait_until_finished):
    ''' Spawns a VM and returns 0 if an error has occurred '''
    vcpus = vm_chars['nr_vcpus']
    is_gold = vm_chars['is_gold']
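## Illustrative sketch, assuming 'benchmarks' (from the star import above) is
## an iterable whose entries carry the 'openstack_image' key used by
## get_image_by_bench(). The run_example_workload() helper and the vm_chars
## values are hypothetical.
def run_example_workload():
    for seq_num, bench in enumerate(benchmarks):
        vm_chars = {'nr_vcpus': 2, 'is_gold': False,
                    'openstack_image': bench['openstack_image']}
        if spawn_vm(seq_num, vm_chars, wait_until_finished=True) == 0:
            logger.error("Failed to spawn VM %d for benchmark %s", seq_num, bench)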
class ACTiManagerExternal():
    def __init__(self, poll_period):
        self.logger = logging.getLogger(self.__class__.__name__)
        ## The queue where messages from the internal managers are stored
        self.messages = []
        self.openstack_client = OpenstackClient()
        self.information_aggregator = InformationAggregator()
        self.poll_period = poll_period

    def start_rabbitmq_thread(self):
        self.logger.info("Starting rabbitmq consumer thread")
        self.rabbitmq_thread_done = 0
        self.rabbitmq_thread = threading.Thread(target=self.rabbitmq_consumer_thread)
        self.rabbitmq_thread.start()

    def stop_rabbitmq_thread(self):
        self.logger.info("Stopping rabbitmq consumer thread")
        self.rabbitmq_thread_done = 1
        self.rabbitmq_thread.join()

    def rabbitmq_consumer_thread(self):
        pika_creds = pika.credentials.PlainCredentials(RABBITMQ_USERNAME, RABBITMQ_PASSWORD)
        pika_cp = pika.ConnectionParameters(RABBITMQ_IP, credentials=pika_creds)
        connection = pika.BlockingConnection(pika_cp)
        channel = connection.channel()
        channel.queue_declare(queue=RABBITMQ_ACTI_QUEUE_NAME)
        for message in channel.consume(RABBITMQ_ACTI_QUEUE_NAME,
                                       inactivity_timeout=5, auto_ack=True):
            if self.rabbitmq_thread_done:
                break
            if message == (None, None, None):
                continue
            method, properties, body = message
            self.logger.info("Received the following message: %s", body)
            self.messages.append(body)

    def check_internal_notification(self):
        if len(self.messages) == 0:
            return None
        msg = self.messages[0]
        self.messages = self.messages[1:]
        return msg

    def doMigrate(self, vm_uuid, dst):
        self.openstack_client.live_migrate_vm(vm_uuid, dst)

    def find_server_overload_dst(self, vmid):
        '''
        Returns the most appropriate destination host for the given VM.
        By "most appropriate" we mean:
        1. Not the VM's current host
        2. The least loaded (in vcpus) host
        3. On a tie, the host with the fewest gold vcpus
        4. On a second tie, the first of the remaining hosts
        '''
        self.logger.info("Starting find_server_overload_dst(%s)", vmid)
        servers = self.information_aggregator.getComputeNodes()

        ## Remove the VM's current host
        current_host = self.information_aggregator.get_vm_current_host(vmid)
        if current_host in servers:
            servers.remove(current_host)
        else:
            ## The current host is probably None and the VM has been deleted
            self.logger.info("Current host %s not in servers list %s",
                             current_host, servers)
            return None

        ## vcpus_per_server is a list of tuples with the following scheme:
        ## [ ("acticloud1", nr_total_vcpus, nr_gold_vcpus), (...), ... ]
        vcpus_per_server = []
        for server in servers:
            (gold_vcpus, silver_vcpus) = \
                self.information_aggregator.getVcpusByComputeNode(server)
            vcpus_per_server.append((server, gold_vcpus + silver_vcpus, gold_vcpus))

        ## Sort by nr_total_vcpus
        vcpus_per_server.sort(key=lambda x: x[1])
        ## Keep only those with the minimum nr_total_vcpus
        vcpus_per_server = [x for x in vcpus_per_server if x[1] == vcpus_per_server[0][1]]
        self.logger.info("vcpus_per_server: %s", vcpus_per_server)
        ## Now sort by nr_gold_vcpus
        vcpus_per_server.sort(key=lambda x: x[2])

        ## We now have our destination
        dst = vcpus_per_server[0][0]
        dst_gold_vcpus = vcpus_per_server[0][2]
        dst_silver_vcpus = vcpus_per_server[0][1] - dst_gold_vcpus

        ## Finally, check whether the given VM fits in the destination host
        vm_vcpus = self.information_aggregator.get_vm_nr_vcpus(vmid)
        if self.information_aggregator.is_gold_vm(vmid):
            if vm_vcpus <= (float(pcpus - dst_gold_vcpus) -
                            float(dst_silver_vcpus) / os_limit):
                return dst
        else:
            if vm_vcpus <= ((pcpus - dst_gold_vcpus) * os_limit - dst_silver_vcpus):
                return dst

        ## The VM does not fit in the destination host
        return None

    def handle_server_overload(self, events):
        servers = self.information_aggregator.getComputeNodes()
        ol_events = events['SERVER_OVERLOAD']
        messages = [0 for s in servers]
        for (host, vmid) in ol_events:
            ## If the VM's current host differs from the one that sent the
            ## message, the VM has probably already been migrated
            current_host = self.information_aggregator.get_vm_current_host(vmid)
            if current_host != host:
                continue
            messages[servers.index(host)] += 1

        ol_servers = sum([int(bool(x)) for x in messages])
        overload_pct = float(ol_servers) / len(servers)
        handled = [False for s in servers]
        actions = 0
        if overload_pct < overload_threshold1:
            actions = ol_servers
        elif overload_pct < overload_threshold2:
            actions = ol_servers / 2
        else:
            # complain to remote cloud
            return

        for (host, vmid) in ol_events:
            if not actions:
                break
            if not handled[servers.index(host)]:
                actions -= 1
                dst = self.find_server_overload_dst(vmid)
                if dst is None:
                    ## FIXME nowhere to put the new VM, possibly pause it??
                    continue
                self.doMigrate(vmid, dst)
                handled[servers.index(host)] = True

    def execute(self):
        while 1:
            events = dict()
            events["SERVER_OVERLOAD"] = []
            events["INTERFERENCE"] = []
            message = self.check_internal_notification()
            has_overload = False
            while message is not None:
                self.logger.info("GOT MESSAGE: %s", message)
                message_type = message.split(' ')[0]
                if message_type == "SERVER_OVERLOAD":
                    hostname = message.split(' ')[1]
                    hint_vmid = message.split(' ')[2]
                    events[message_type].append((hostname, hint_vmid))
                    has_overload = True
                elif message_type == "INTERFERENCE":
                    vmid = message.split()[1]
                    dst = self.find_server_overload_dst(vmid)
                    self.logger.info("Handling interference of VM %s by migrating to host %s",
                                     vmid, dst)
                    if dst is not None:
                        self.doMigrate(vmid, dst)
                    else:
                        self.logger.info("Could not find a valid destination host for VM %s",
                                         vmid)
                else:
                    pass
                message = self.check_internal_notification()
            if has_overload:
                self.handle_server_overload(events)
            self.logger.info("Sleeping for DP=%d seconds", self.poll_period)
            time.sleep(self.poll_period)
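## Illustrative entry point, assuming a 10-second decision period (DP); the
## project's actual argument parsing may differ.
if __name__ == "__main__":
    manager = ACTiManagerExternal(poll_period=10)
    manager.start_rabbitmq_thread()
    try:
        manager.execute()  # loops forever, draining internal notifications
    finally:
        manager.stop_rabbitmq_thread()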
from openstack_client import OpenstackClient

opts = [
    cfg.StrOpt('default_deploy_kernel_id',
               help='Deploy kernel image used by the synch mechanism'),
    cfg.StrOpt('default_deploy_ramdisk_id',
               help='Deploy ramdisk image used by the synch mechanism'),
    cfg.StrOpt('default_sync_driver',
               help='Default driver to synch with OneView'),
]

CONF = cfg.CONF
CONF.register_opts(opts, group='ironic')
CONF(default_config_files=['sync.conf'])

os_client = OpenstackClient()
ov_client = OneViewClient()
sh_api = ov_client.server_hardware_api

def get_config_options():
    return CONF

#===============================================================================
# Ironic actions
#===============================================================================

def get_ironic_client():
    return os_client._get_ironic_client()
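## Illustrative sketch only: reading back the options registered above through
## the 'ironic' group. The log_sync_config() helper is hypothetical; the
## values depend on the local sync.conf.
def log_sync_config():
    conf = get_config_options()
    print("deploy kernel:  %s" % conf.ironic.default_deploy_kernel_id)
    print("deploy ramdisk: %s" % conf.ironic.default_deploy_ramdisk_id)
    print("sync driver:    %s" % conf.ironic.default_sync_driver)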
class InterferenceDetector():
    def __init__(self, hostname):
        self.logger = logging.getLogger(self.__class__.__name__)
        self.hostname = hostname
        ## Initialize the acticloudDB client
        self.acticloudDB_client = ActiCloudDBClient()
        ## Openstack client to get the necessary information when needed
        self.openstack_client = OpenstackClient()
        ## VMs that are being monitored, indexed by uuid
        self.monitored_vms = dict()
        ## Spawned perf threads
        self.perf_threads = []

    # vm: openstack VM object
    def check_interference(self, vm):
        ## Find (if it exists) the healthy-state model of the current benchmark
        model = self._get_model(vm)

        ## If no model exists in our database, we cannot decide about interference
        if model == None:
            return False

        ## The model was found; we also need a recent perf output for the VM
        inp_file = self._get_perf_output_filename(vm)
        if not os.path.isfile(inp_file):
            self.logger.error("Could not find a current perf output file for VM %s", vm.id)
            return False

        ## Everything is in place, check for interference
        (num_of_clusters, train_axis, train_labels, model, dev,
         train_metr, pca, scaler1, scaler2) = model
        has_interference = HealthyStateModel.model_test_dy(train_axis, inp_file,
                               num_of_clusters, train_labels, model, dev, train_metr,
                               pca, scaler1, scaler2)
        return has_interference

    # vm: openstack VM object
    def add_vm(self, vm):
        if vm in self.monitored_vms:
            self.logger.info("VM %s is already being monitored", vm.id)
            return
        self.logger.info("Adding VM %s", vm.id)
        self.monitored_vms[vm] = dict()

    # vm: openstack VM object
    def remove_vm(self, vm):
        if not vm in self.monitored_vms:
            self.logger.info("VM %s is not being monitored", vm.id)
            return
        self.logger.info("Removing VM %s", vm.id)
        del self.monitored_vms[vm]

    def remove_vm_by_uuid(self, vm_uuid):
        for vm in self.monitored_vms:
            if vm_uuid == vm.id:
                self.remove_vm(vm)
                return
        self.logger.info("VM with UUID=%s not found in monitored VMs", vm_uuid)

    def remove_all_vms(self):
        self.monitored_vms = dict()

    ## Spawns a perf thread per monitored VM
    def start_perf_threads(self):
        for vm in self.monitored_vms:
            vm_uuid = vm.id
            vm_pid = self._get_pid_from_vm_id(vm)
            if vm_pid == -1:
                self.logger.info("Could not get the PID of VM %s", vm_uuid)
                continue
            t = threading.Thread(target=self._perf_thread_function,
                                 args=(vm, vm_uuid, vm_pid,))
            t.start()
            self.perf_threads.append(t)

    def stop_perf_threads(self):
        for t in self.perf_threads:
            t.join()

    # vm: openstack VM object
    def _get_model(self, vm):
        vm_name = vm.name
        tokens = vm_name.split("-")
        if len(tokens) < 4:
            model = None  ## Could not get the benchmark name, no model available
        else:
            bench_name = tokens[2] + "-" + tokens[3]  ## FIXME this is not correct for stress
            nr_vcpus = int(self.openstack_client.get_flavor_by_id(vm.flavor["id"]).vcpus)
            model = self.acticloudDB_client.get_model(bench_name, nr_vcpus)
        if model == None:
            self.logger.info("Healthy state model NOT FOUND for VM %s", vm.id)
        else:
            self.logger.info("Healthy state model FOUND for VM %s (db entry: %s, %d)",
                             vm.id, bench_name, nr_vcpus)
        return model

    def _get_perf_output_filename(self, vm):
        return perf_outputs_dir + "/" + vm.id + ".perf"
        ## FIXME the following are temporary
        # tokens = vm.name.split("-")
        # bench_name = tokens[2]
        # if "spec" in bench_name:
        #     bench_name = bench_name + "-" + tokens[3]
        # nr_vcpus = int(openstack_client.get_flavor_by_id(vm.flavor["id"]).vcpus)
        # return perf_outputs_dir + "/" + bench_name + "-" + str(nr_vcpus) + "vcpus.perf"

    def _write_perf_output_to_file(self, vm, vm_uuid, output):
        output_file = self._get_perf_output_filename(vm)
        f = open(output_file, "w")
        f.write(output)
        f.close()

    def _perf_thread_function(self, vm, vm_uuid, vm_pid):
        self.logger.info("Starting perf command for VM %s", vm_uuid)
        perf_cmd = ("ssh %(hostname)s perf kvm --guest stat -e %(events)s " +
                    "-I %(interval)s -p %(pid)d sleep %(runtime)d") % \
                   {'hostname': self.hostname, 'events': perf_events,
                    'interval': perf_interval, 'pid': vm_pid, 'runtime': perf_window}
        self.logger.info("Running perf command for VM %s (PID: %d)", vm.id, vm_pid)
        status, output = commands.getstatusoutput(perf_cmd)
        if status != 0:
            self.logger.info("Something went wrong with the perf command: %s", output)
            return
        self.monitored_vms[vm]['last_perf_output'] = output
        self._write_perf_output_to_file(vm, vm_uuid, output)

    ## vm: openstack VM object
    ## returns the VM's qemu PID as an int; on error -1 is returned
    def _get_pid_from_vm_id(self, vm):
        try:
            libvirt_instance_name = getattr(vm, 'OS-EXT-SRV-ATTR:instance_name')
            command = "ssh %s ps -ef | grep %s | grep \"qemu-system\" | grep -v grep | awk '{print $2}'" % \
                      (self.hostname, libvirt_instance_name)
            pid = commands.getoutput(command)
            return int(pid)
        except Exception:
            return -1
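## Illustrative sketch, not project code: one complete monitoring round built
## from the public methods above. The 'vms' argument and the alert() callback
## are assumptions.
def monitoring_round(detector, vms, alert):
    detector.remove_all_vms()
    for vm in vms:
        detector.add_vm(vm)
    detector.start_perf_threads()  # one perf thread per monitored VM
    detector.stop_perf_threads()   # join: each thread runs for one perf window
    for vm in list(detector.monitored_vms):
        if detector.check_interference(vm):
            alert(vm)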
## Setup the logging facility
logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                    format='%(asctime)s - %(name)20s - %(message)s',
                    datefmt='%Y-%m-%d.%H:%M:%S')
logging.Formatter.converter = time.gmtime
logger = logging.getLogger("interference-detector")

## Check and then read arguments
if len(sys.argv) < 2:
    logger.error("usage: %s <hostname>", sys.argv[0])
    sys.exit(1)
hostname = sys.argv[1]

## Initialize openstack client
openstack_client = OpenstackClient()

## Initialize Interference Detector
detector = InterferenceDetector(hostname)

while 1:
    ## Delete all previously monitored VMs
    detector.remove_all_vms()

    ## Add all the VMs to be monitored
    for vm in openstack_client.get_vms_by_hostname(hostname):
        if "acticloud" in vm.name:
            detector.add_vm(vm)

    ## Start and wait for perf threads to finish
    detector.start_perf_threads()
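    ## A plausible continuation of this loop, hedged as an assumption built
    ## only from the InterferenceDetector API above: wait for the perf windows
    ## to finish, then test every VM against its healthy-state model.
    # detector.stop_perf_threads()
    # for vm in list(detector.monitored_vms):
    #     if detector.check_interference(vm):
    #         logger.info("Interference detected on VM %s", vm.id)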