class ExpMachine(GuestMachine):
    """A guest VM that takes part in an experiment.

    On top of ``GuestMachine`` this class holds the application and the
    benchmark that are associated with the VM, a guest client, a benchmark
    executor thread, and the background threads / SSH connections that are
    started on behalf of the VM during the experiment.
    """

    def __init__(self,
                 vm_name,
                 output_path,
                 exp_config: DictConfig,
                 host_config: DictConfig,
                 application: application.Application,
                 benchmark: cloudexp.guest.application.benchmark.Benchmark,
                 guest_mom_config: Union[DictConfig, dict, None] = None,
                 shared_terminable: Terminable = None,
                 shared_deferred_start: DeferredStartThread = None,
                 **props):
        """
        Parameters:
        -----------
        vm_name : name of the VM; also used to name threads and log files
        output_path : folder in which remote-call log files are written
        exp_config : experiment configuration; the guest credentials are read from it
        host_config : host configuration; the libvirt URI and the guest-client
            port are read from it
        application : the application associated with this VM
        benchmark : the benchmark associated with this VM
        guest_mom_config : overrides applied on top of momguestd.DEFAULT_CONFIG
        shared_terminable : forwarded to GuestMachine and to the benchmark executor
        shared_deferred_start : forwarded to the benchmark executor
        props : extra properties, forwarded to GuestMachine. The optional keys
            'load_func', 'load_interval' and 'expected_load_rounds' are also
            read here and handed to the benchmark executor.
        """
        libvirt_uri = host_config.get('main', 'libvirt-hypervisor-uri')
        username = exp_config.get('guest-credentials', 'username')
        password = exp_config.get('guest-credentials', 'password')
        GuestMachine.__init__(self, vm_name, libvirt_uri, username, password,
                              application.get_image_name(),
                              shared_terminable=shared_terminable,
                              **props)

        self.output_path = output_path
        self.exp_config = exp_config
        self.host_conf = host_config
        self.guest_config = DictConfig(momguestd.DEFAULT_CONFIG,
                                       guest_mom_config)

        self.application = application
        self.benchmark = benchmark
        self.load_func = props.get('load_func', None)
        self.load_interval = props.get('load_interval', None)
        self.expected_load_rounds = props.get("expected_load_rounds", None)

        self.benchmark.set_guest_machine(self)
        self.application.set_guest_server_port(
            self.host_conf.get("guest-client", "port"))

        self.guest_client = load_guest_client(self.ip, self.vm_name,
                                              self.host_conf,
                                              base_name='exp-machine-client')
        self.benchmark_thread = BenchmarkExecutor(
            self.benchmark,
            self.load_func,
            self.load_interval,
            self.expected_load_rounds,
            name=self.vm_name,
            shared_terminable=shared_terminable,
            shared_deferred_start=shared_deferred_start)

        # Background threads, keyed by thread name
        self.threads = {}
        # Remote function call connections, keyed by call name
        self.ssh_connections = {}
        # Set only after init_experiment() completed without an exception
        self._is_ready_for_experiment = Event()

    def as_dict(self):
        """Return the GuestMachine description extended with experiment fields."""
        return dict(
            GuestMachine.as_dict(self),
            load_interval=self.load_interval,
            bm_cpus=self.get_benchmark_cpus(),
            guest_config=self.guest_config.get_dict(),
        )

    def get_benchmark_cpu_count(self):
        """Return the number of CPUs the benchmark requires."""
        return self.benchmark.get_required_cpus()

    def set_benchmark_cpus(self, bm_cpus):
        """Hand the given CPUs to the benchmark."""
        self.benchmark.set_benchmark_cpus(bm_cpus)

    def get_benchmark_cpus(self):
        """Return the CPUs currently assigned to the benchmark."""
        return self.benchmark.get_benchmark_cpus()

    def init_experiment(self):
        """Prepare the VM for the experiment and start the benchmark thread.

        The steps are executed in order; any exception is logged and
        re-raised, in which case the VM is not marked as ready.
        """
        self.logger.info("Initiating experiment on VM...")
        try:
            self.start_domain(restart_if_activated=False)
            self.wait_for_ssh_server()
            self.set_guest_host_name()
            self.set_vm_props()
            self.log_vm_sysctl_properties()
            self.install_framework()
            self.install_application()
            self.disable_cron()
            self.drop_caches()
            self.start_guest_server()
            self.guest_client.wait_for_server(shared_terminable=self)
            self.start_application()
            self.benchmark.wait_for_application()
            # Start benchmark thread
            self.benchmark_thread.start()
        except Exception as e:
            self.logger.exception("Failed to initiate experiment: %s", e)
            raise
        else:
            self._is_ready_for_experiment.set()

    @property
    def is_ready_for_experiment(self):
        """True once init_experiment() finished successfully."""
        return self._is_ready_for_experiment.is_set()

    def end_experiment(self):
        """Terminate the machine's activity and wait for the remote programs."""
        self.terminate()
        self.benchmark_thread.terminate()
        self.close_client()
        self._is_ready_for_experiment.clear()

        self.logger.info("Waiting for remote guest server to end...")
        self.join_guest_server(20)
        self.logger.info("Waiting for remote program to end...")
        self.join_application(20)

    def close_client(self):
        """Close the guest client, if there is one."""
        if self.guest_client is not None:
            self.guest_client.close()

    def disable_cron(self):
        """Stop the cron service on the guest and log the outcome."""
        self.logger.info("Disabling Cron...")
        ssh_cron = self.ssh("service", "cron", "stop", name="disable-cron")
        ssh_cron.communicate()  # short blocking
        if not ssh_cron.err:
            self.logger.info("Cron disabled: %s",
                             ssh_cron.out.replace("\n", " ").strip())
        else:
            self.logger.error("Fail to disable cron: out=%s, err=%s",
                              ssh_cron.out, ssh_cron.err)

    def drop_caches(self):
        """Run ``sync`` on the guest and then drop its caches.

        Failures are logged but do not raise.
        """
        ssh_sync = self.ssh("sync", name="sync-cache")
        ssh_sync.communicate()  # short blocking
        if ssh_sync.err:
            self.logger.error("Fail to sync cache to secondary memory: %s",
                              ssh_sync.err)

        ssh_drop = self.ssh("echo 3 > /proc/sys/vm/drop_caches",
                            name="drop-cache")
        # Wait for the command before inspecting its result, the same way
        # every other SSH call in this class does. Without this the
        # err/out attributes are read before the command has completed.
        ssh_drop.communicate()  # short blocking
        if ssh_drop.err:
            self.logger.error("Fail to drop caches: %s", ssh_drop.err)

        self.logger.info(
            "Dropped caches.%s%s",
            (" [sync: %s]" % ssh_sync.out) if ssh_sync.out else "",
            (" [drop: %s]" % ssh_drop.out) if ssh_drop.out else "")

    def start_background_thread(self,
                                target,
                                name_prefix=None,
                                args=(),
                                kwargs=None):
        """Start a daemon LoggedThread named ``<name_prefix>-<vm_name>``.

        Does nothing when the machine should no longer run. A failure to
        create or start the thread is logged and swallowed.
        """
        if not self.should_run:
            return

        name = f"{name_prefix}-{self.vm_name}"
        self.logger.info("Starting background thread: %s", name)
        try:
            t = LoggedThread(target=target,
                             name=name,
                             args=args,
                             kwargs=kwargs,
                             daemon=True)
            self.threads[name] = t
            t.start()
        except Exception as e:
            self.logger.exception("Failed to initiated thread '%s': %s", name,
                                  e)

    def _remote_function_call_thread(self, name, remote_function: Callable,
                                     *args, **kwargs):
        """Thread body: run a remote function and block until it ends.

        The connection is stored in ``self.ssh_connections[name]`` so that it
        can be terminated later. If the remote call returns while the machine
        should still be running, it is reported as a premature crash.
        """
        output_file = os.path.join(self.output_path,
                                   f'{name}-{self.vm_name}.log')
        conn = self.remote_function_call(remote_function,
                                         *args,
                                         output_file=output_file,
                                         **kwargs)
        self.ssh_connections[name] = conn
        out, err = conn.communicate()  # blocking
        if self.should_run:
            self.logger.error(
                "%s crashed before its time - [out]: %s, [err]: %s", name,
                out, err)

    def start_remote_function_call_thread(self, name,
                                          remote_function: Callable, *args,
                                          **kwargs):
        """Run ``remote_function`` on the guest from a background thread."""
        self.start_background_thread(self._remote_function_call_thread,
                                     name,
                                     args=(name, remote_function, *args),
                                     kwargs=kwargs)

    def start_application(self):
        """Start the application remotely in a background thread."""
        self.start_remote_function_call_thread(
            "application", remote_application, self.application,
            self.guest_config.get('logging', 'verbosity'))

    def start_guest_server(self):
        """Start the guest server remotely in a background thread."""
        self.start_remote_function_call_thread(
            "guest-server",
            remote_guest_server,
            self.guest_config.get('logging', 'verbosity'),
            self.guest_config,
            guest_name=self.vm_name)

    def terminate_ssh(self, ssh_name, timeout):
        """Terminate the named remote call (if still alive) and join it.

        Does nothing when no connection was registered under ``ssh_name``.
        """
        ssh_thread = self.ssh_connections.get(ssh_name, None)
        if ssh_thread is None:
            return
        if ssh_thread.is_alive():
            ssh_thread.terminate()
        ssh_thread.join(timeout)

    def join_application(self, timeout=None):
        """Terminate and join the remote application call."""
        self.terminate_ssh('application', timeout)

    def join_guest_server(self, timeout=None):
        """Terminate and join the remote guest server call."""
        self.terminate_ssh('guest-server', timeout)

    def install_framework(self):
        """Copy the repository to the guest and run its setup script.

        Raises:
            GuestMachineException: if the setup command produced error output.
        """
        self.logger.info("Copy and install Python's code...")
        exclude = '*.pyc', '*.pyo', '__pycache__', '.*'
        self.rsync_retry(cloudexp.repository_path,
                         '~',
                         name='framework',
                         exclude=exclude,
                         delete=True)

        args = ['guest_setup.py', 'develop', '--user']
        out, err = self.ssh(*args,
                            cwd=cloudexp.repository_name,
                            is_python=True,
                            is_module=False,
                            name="install-packages").communicate()
        if err:
            raise GuestMachineException(
                f"Could not install framework:\nOUT: {out}\nERR: {err}")

    def install_application(self):
        """Copy the application's binaries (if it has any) to the guest.

        The application may report no path (None), a single path string, or
        a list/tuple of paths.
        """
        bin_path = self.application.get_bin_path()
        if bin_path is None:
            self.logger.info(
                "Application does not require to install binaries...")
            return

        self.logger.info("Copy and install application's binaries...")
        if not isinstance(bin_path, (list, tuple)):
            assert isinstance(bin_path,
                              str), "Binaries path must be a string."
            bin_path = (bin_path, )

        for p in bin_path:
            self.rsync_retry(p, '~', name='app-binaries', delete=True)
class Experiment(LoggedObject, Terminable):
    """Runs a single experiment on a set of VMs for a fixed duration.

    The experiment owns the host machine CPU bookkeeping, the MOM executor,
    the libvirt connection and the list of experiment machines.
    """

    def __init__(self,
                 vms_desc: dict,
                 duration: int,
                 output_path: str,
                 host_mom_config: DictConfigType = None,
                 exp_config: DictConfigType = None,
                 extra_info: Optional[dict] = None):
        """
        Parameters:
        -----------
        vms_desc : descriptor of the virtual machines
        duration : the experiment duration
        output_path : the experiment output folder path
        host_mom_config : override configurations in default MOM config
        exp_config: override configurations in default experiment config
        extra_info : extra information to write to the info file
        """
        LoggedObject.__init__(self)
        Terminable.__init__(self)

        # Only this experiment will catch sigint and will terminate the experiment
        self.terminate_on_signal(signal.SIGINT)

        self.duration = duration
        self.output_path = output_path
        self.extra_info = extra_info

        cpu_settings.disable_hyper_threading()
        self.host_machine = HostMachine()
        self.host_machine.set_owner_cpu_count('system', 2)

        self.exp_config = DictConfig(DEFAULT_EXP_CONFIG, exp_config)

        self.mom_executor = MomThreadExecutor(host_mom_config,
                                              shared_terminable=self)
        self.mom = self.mom_executor.mom
        self.host_config = self.mom.config
        self.threads = []

        # Start the connection to libvirt
        libvirt_uri = self.host_config.get('main', 'libvirt-hypervisor-uri')
        self.conn = libvirt.open(libvirt_uri)
        self.host_name = self.conn.getHostname()
        self.max_vcpus = self.conn.getMaxVcpus(self.conn.getType())

        # Get the machines.
        # This will update the available CPU after pinning the CPUs to the VMs
        self.number_of_guests = 0
        self.vms = self.get_all_vms_instances(vms_desc)

        # Save test information file (best effort: a failure is only logged)
        try:
            self.write_info(self.output_path)
        except Exception as e:
            self.logger.error("Could not write info file: %s", e)

        self.logger.debug("Python version: %s", sys.version)
        self.logger.debug('Host name: %s', self.host_name)
        self.logger.debug('Max qemu vCPUs: %i', self.max_vcpus)
        self.logger.debug('Allocated CPUs: %s',
                          pprint.pformat(self.host_machine.cpu_owners))
        self.logger.info("Experiment initiated: %s", output_path)

    def keyboard_interrupt_handler(self, signalnum, frame):
        """Log the interrupt and delegate to the Terminable handler."""
        self.logger.warning(
            "Terminating experiment due to keyboard interrupt.")
        Terminable.keyboard_interrupt_handler(self, signalnum, frame)

    def as_dict(self):
        """Return a description of the experiment.

        Entries of ``extra_info`` are merged in: a key that is not used yet
        is added as is; a key that collides with a different value is added
        under ``<key>(extra-info)``; a colliding key with an equal value is
        skipped.
        """
        info = dict(
            duration=self.duration,
            vms={vm.vm_name: vm.as_dict()
                 for vm in self.vms},
            output_path=self.output_path,
            host_machine=self.host_machine.as_dict(),
            host_name=self.host_name,
            max_vcpus=self.max_vcpus,
            exp_config=self.exp_config.get_dict(),
            host_config=self.host_config.get_dict(),
        )

        # Add extra_info to info dict (with respect to overlapping keys)
        if self.extra_info:
            for inf_key, inf_val in self.extra_info.items():
                if inf_key not in info:
                    info[inf_key] = inf_val
                elif info[inf_key] != inf_val:
                    info[inf_key + "(extra-info)"] = inf_val
        return info

    def get_output_file_path(self, file_name):
        """Return the path of ``file_name`` inside the output folder."""
        return os.path.join(self.output_path, file_name)

    def get_machine(self, name, props):
        """Create an ExpMachine for ``name`` and count it as a guest.

        ``props`` is expanded into keyword arguments of ExpMachine.
        """
        self.number_of_guests += 1
        return ExpMachine(name,
                          self.output_path,
                          self.exp_config,
                          self.host_config,
                          shared_terminable=self,
                          shared_deferred_start=self.mom_executor,
                          **props)

    def get_all_vms_instances(self, vms_desc: dict):
        """Create all the machines and allocate host CPUs for them.

        The VM CPUs of all machines are allocated first, and only then the
        benchmark CPUs (registered under the owner ``<vm_name>-benchmark``).
        """
        ret = [
            self.get_machine(name, props) for name, props in vms_desc.items()
        ]

        for vm in ret:
            if vm.cpu_pin is None:
                # No explicit pinning: let the host machine pick the CPUs
                vm_cpus = self.host_machine.set_owner_cpu_count(
                    vm.vm_name, vm.get_required_cpu_count())
                vm.set_cpu_pin(vm_cpus)
            else:
                self.host_machine.set_owner_cpus(vm.vm_name, vm.cpu_pin)

        for vm in ret:
            bm_cpus = self.host_machine.set_owner_cpu_count(
                f"{vm.vm_name}-benchmark", vm.get_benchmark_cpu_count())
            vm.set_benchmark_cpus(bm_cpus)

        return ret

    def parallel_for_each_vm(self,
                             func,
                             thread_name_prefix='experiment-vm-pool'):
        """Call ``func(vm)`` for every VM using a thread pool.

        Results are consumed in VM order, so an exception raised by ``func``
        is re-raised here when its result is reached.
        """
        self.logger.info("Starting parallel VM call: %s", thread_name_prefix)
        with ThreadPoolExecutor(thread_name_prefix=thread_name_prefix) as e:
            for vm, res in zip(self.vms, e.map(func, self.vms)):
                self.logger.debug("%s: %s finished successfully: %s",
                                  vm.vm_name, thread_name_prefix, res)

    def close(self):
        """Close the libvirt connection and end the host machine's experiment."""
        if self.conn:
            self.conn.close()
        self.host_machine.end_experiment()

    def destroy_vms_experiment(self):
        """End the experiment on every VM and then destroy every domain.

        An error while ending the experiment is logged and does not prevent
        the domains from being destroyed. An error while destroying the
        domains propagates to the caller.
        """
        try:
            self.logger.info("End experiment for each VMs...")
            self.parallel_for_each_vm(lambda vm_m: vm_m.end_experiment(),
                                      "end-experiment")
        except Exception as ex:
            self.logger.exception("Error ending VMs: %s", ex)
            self.logger.critical("Experiment failed due to exception: %s", ex)
        finally:
            self.logger.info("Destroying VMs...")
            self.parallel_for_each_vm(lambda vm_m: vm_m.destroy_domain(),
                                      "destroy-domain")

    def start_experiment(self):
        """Run the whole experiment: set up, wait for the duration, tear down.

        Any exception during the run is logged and the domains are destroyed;
        the tear-down in end_everything() always runs.
        """
        # Fallback value in case run_everything() fails before returning
        begin_time = time.time()
        try:
            begin_time = self.run_everything()

            self.logger.info("Running experiment for duration of %s",
                             time_delta(self.duration))
            self.terminable_sleep(self.duration)
        except Exception as ex:
            self.logger.exception("EXCEPTION: %s", ex)
            self.logger.critical("Experiment failed due to exception: %s", ex)
            self.parallel_for_each_vm(lambda vm_m: vm_m.destroy_domain(),
                                      "destroy-domain")
        finally:
            end_time = time.time()
            self.logger.info("Experiment ended after %s",
                             time_delta(end_time - begin_time))
            self.end_everything()

    def write_vms_desc(self):
        """Write each VM's XML description to ``<vm_name>.xml`` in the output folder."""
        for vm in self.vms:
            with open(self.get_output_file_path(f"{vm.vm_name}.xml"),
                      "w") as f:
                f.write(vm.get_xml_desc())

    def destroy_all_active_vms(self):
        """Destroy every domain listed by the libvirt connection."""
        self.logger.info("Destroying active VMs...")
        domain_list = self.conn.listDomainsID()
        for guest_id in domain_list:
            dom = self.conn.lookupByID(guest_id)
            dom.destroy()

    def start_experiment_vms(self):
        """Start the domains of all the experiment's VMs, one after another."""
        self.logger.info("Starting VMs...")
        # libvirt needs domain to start serially:
        for vm in self.vms:
            vm.start_domain()

    def run_everything(self):
        """Set up the host and the VMs, then start MOM.

        Returns:
            The start time reported by the MOM executor.

        Raises:
            Exception: if any VM is not ready after its initiation.
        """
        cpu_settings.set_cpu_governor('performance')
        self.host_machine.begin_experiment()

        self.destroy_all_active_vms()
        self.start_experiment_vms()
        self.write_vms_desc()

        # all initiations can be done in parallel:
        self.parallel_for_each_vm(lambda vm_m: vm_m.init_experiment(),
                                  "init-experiment")

        # Assert the all initiation succeeded
        if not all(vm.is_ready_for_experiment for vm in self.vms):
            raise Exception("VMs are not ready for experiment")

        self.logger.info(
            "Start MOM and wait for VMs. When finish, trigger start event and load workers..."
        )
        self.mom_executor.start_and_wait_for_guests(self.number_of_guests)

        return self.mom_executor.start_time

    def end_everything(self):
        """Terminate MOM, tear down the VMs and restore the host CPU settings.

        The CPU settings are restored even if tearing down the VMs raises,
        so a failed tear-down does not leave the host with hyper-threading
        disabled and the 'performance' governor set.
        """
        try:
            try:
                self.logger.info("Ending MOM")
                self.mom_executor.terminate()
            except Exception as ex:
                self.logger.exception("Error ending MOM: %s", ex)
                self.logger.critical("Experiment failed due to exception: %s",
                                     ex)

            self.destroy_vms_experiment()

            try:
                if self.mom_executor.is_alive():
                    self.mom_executor.join(30)
            except Exception as ex:
                self.logger.exception("Error waiting for MOM: %s", ex)
                self.logger.critical("Experiment failed due to exception: %s",
                                     ex)
        finally:
            cpu_settings.enable_hyper_threading()
            cpu_settings.set_cpu_governor('powersave')