예제 #1
0
class ExpMachine(GuestMachine):
    """A guest VM that takes part in an experiment.

    On top of ``GuestMachine`` this class wires together the application
    that runs inside the guest, the benchmark that loads it, the guest
    client used to talk to the in-guest server, and the background threads
    that keep the remote processes alive for the experiment's lifetime.
    """

    def __init__(self,
                 vm_name,
                 output_path,
                 exp_config: DictConfig,
                 host_config: DictConfig,
                 application: application.Application,
                 benchmark: cloudexp.guest.application.benchmark.Benchmark,
                 guest_mom_config: Union[DictConfig, dict, None] = None,
                 shared_terminable: Terminable = None,
                 shared_deferred_start: DeferredStartThread = None,
                 **props):
        """
        Parameters:
        -----------
        vm_name : name of the VM, forwarded to GuestMachine
        output_path : folder in which the remote call logs are written
        exp_config : experiment config; the guest credentials are read from it
        host_config : host config; the libvirt URI and the guest-client port
            are read from it
        application : the application to run in the guest (also supplies the
            image name and the binaries to install)
        benchmark : the benchmark that is executed against the application
        guest_mom_config : combined with momguestd.DEFAULT_CONFIG into the
            guest config (presumably as overrides - confirm in DictConfig)
        shared_terminable : forwarded to GuestMachine and BenchmarkExecutor
        shared_deferred_start : forwarded to BenchmarkExecutor
        props : forwarded as-is to GuestMachine; 'load_func', 'load_interval'
            and 'expected_load_rounds' are additionally read here
        """
        libvirt_uri = host_config.get('main', 'libvirt-hypervisor-uri')
        username = exp_config.get('guest-credentials', 'username')
        password = exp_config.get('guest-credentials', 'password')
        GuestMachine.__init__(self,
                              vm_name,
                              libvirt_uri,
                              username,
                              password,
                              application.get_image_name(),
                              shared_terminable=shared_terminable,
                              **props)

        self.output_path = output_path
        self.exp_config = exp_config
        self.host_conf = host_config
        self.guest_config = DictConfig(momguestd.DEFAULT_CONFIG,
                                       guest_mom_config)

        self.application = application
        self.benchmark = benchmark

        # Load parameters are optional; missing ones are passed on as None.
        self.load_func = props.get('load_func', None)
        self.load_interval = props.get('load_interval', None)
        self.expected_load_rounds = props.get("expected_load_rounds", None)

        self.benchmark.set_guest_machine(self)
        self.application.set_guest_server_port(
            self.host_conf.get("guest-client", "port"))
        self.guest_client = load_guest_client(self.ip,
                                              self.vm_name,
                                              self.host_conf,
                                              base_name='exp-machine-client')
        self.benchmark_thread = BenchmarkExecutor(
            self.benchmark,
            self.load_func,
            self.load_interval,
            self.expected_load_rounds,
            name=self.vm_name,
            shared_terminable=shared_terminable,
            shared_deferred_start=shared_deferred_start)

        # Background threads keyed by thread name, and the SSH connections
        # of the long-running remote calls keyed by call name.
        self.threads = {}
        self.ssh_connections = {}
        # Set only after init_experiment() completed without an exception.
        self._is_ready_for_experiment = Event()

    def as_dict(self):
        """Return the GuestMachine description extended with the load
        interval, the benchmark CPUs and the guest configuration."""
        return dict(
            GuestMachine.as_dict(self),
            load_interval=self.load_interval,
            bm_cpus=self.get_benchmark_cpus(),
            guest_config=self.guest_config.get_dict(),
        )

    def get_benchmark_cpu_count(self):
        """Return the number of CPUs the benchmark requires."""
        return self.benchmark.get_required_cpus()

    def set_benchmark_cpus(self, bm_cpus):
        """Hand the given CPUs over to the benchmark."""
        self.benchmark.set_benchmark_cpus(bm_cpus)

    def get_benchmark_cpus(self):
        """Return the CPUs currently assigned to the benchmark."""
        return self.benchmark.get_benchmark_cpus()

    def init_experiment(self):
        """Bring the VM to a state in which the experiment can start.

        Starts the domain, prepares the guest (host name, properties,
        framework, application binaries, cron, caches), launches the guest
        server and the application and finally starts the benchmark thread.
        Any exception is logged and re-raised; the ready flag is set only
        when every step succeeded.
        """
        self.logger.info("Initiating experiment on VM...")

        try:
            self.start_domain(restart_if_activated=False)
            self.wait_for_ssh_server()
            self.set_guest_host_name()
            self.set_vm_props()
            self.log_vm_sysctl_properties()
            self.install_framework()
            self.install_application()
            self.disable_cron()
            self.drop_caches()

            self.start_guest_server()
            self.guest_client.wait_for_server(shared_terminable=self)

            self.start_application()
            self.benchmark.wait_for_application()

            # Start benchmark thread
            self.benchmark_thread.start()
        except Exception as e:
            self.logger.exception("Failed to initiate experiment: %s", e)
            # Bare raise re-raises the active exception unchanged.
            raise
        else:
            self._is_ready_for_experiment.set()

    @property
    def is_ready_for_experiment(self):
        """True once init_experiment() finished successfully and until
        end_experiment() is called."""
        return self._is_ready_for_experiment.is_set()

    def end_experiment(self):
        """Terminate this machine and its benchmark thread, close the guest
        client, clear the ready flag and stop the remote guest server and
        application connections."""
        self.terminate()
        self.benchmark_thread.terminate()
        self.close_client()
        self._is_ready_for_experiment.clear()

        self.logger.info("Waiting for remote guest server to end...")
        self.join_guest_server(20)
        self.logger.info("Waiting for remote program to end...")
        self.join_application(20)

    def close_client(self):
        """Close the guest client if there is one."""
        if self.guest_client is not None:
            self.guest_client.close()

    def disable_cron(self):
        """Stop the cron service in the guest and log the outcome."""
        self.logger.info("Disabling Cron...")
        ssh_cron = self.ssh("service", "cron", "stop", name="disable-cron")

        ssh_cron.communicate()  # short blocking

        if not ssh_cron.err:
            self.logger.info("Cron disabled: %s",
                             ssh_cron.out.replace("\n", " ").strip())
        else:
            self.logger.error("Fail to disable cron: out=%s, err=%s",
                              ssh_cron.out, ssh_cron.err)

    def drop_caches(self):
        """Run ``sync`` in the guest and then write 3 to
        /proc/sys/vm/drop_caches, logging any error output of either step."""
        ssh_sync = self.ssh("sync", name="sync-cache")
        ssh_sync.communicate()  # short blocking

        if ssh_sync.err:
            self.logger.error("Fail to sync cache to secondary memory: %s",
                              ssh_sync.err)

        ssh_drop = self.ssh("echo 3 > /proc/sys/vm/drop_caches",
                            name="drop-cache")
        # Wait for the command before inspecting its output, exactly as done
        # for the sync step above; otherwise err/out are read before the
        # command has completed.
        ssh_drop.communicate()  # short blocking

        if ssh_drop.err:
            self.logger.error("Fail to drop caches: %s", ssh_drop.err)

        self.logger.info(
            "Dropped caches.%s%s",
            (" [sync: %s]" % ssh_sync.out) if ssh_sync.out else "",
            (" [drop: %s]" % ssh_drop.out) if ssh_drop.out else "")

    def start_background_thread(self,
                                target,
                                name_prefix=None,
                                args=(),
                                kwargs=None):
        """Start ``target`` in a daemon LoggedThread named
        ``<name_prefix>-<vm_name>`` and register it in ``self.threads``.

        Does nothing when the machine should no longer run. A failure to
        create or start the thread is logged and not propagated.
        """
        if not self.should_run:
            return

        name = f"{name_prefix}-{self.vm_name}"

        self.logger.info("Starting background thread: %s", name)
        try:
            t = LoggedThread(target=target,
                             name=name,
                             args=args,
                             kwargs=kwargs,
                             daemon=True)
            self.threads[name] = t
            t.start()
        except Exception as e:
            self.logger.exception("Failed to initiated thread '%s': %s", name,
                                  e)

    def _remote_function_call_thread(self, name, remote_function: Callable,
                                     *args, **kwargs):
        """Thread body: run ``remote_function`` remotely and block until it
        ends.

        The connection is stored in ``self.ssh_connections[name]`` so it can
        be terminated later. The remote call is given
        ``<output_path>/<name>-<vm_name>.log`` as its output file. If the
        call returns while the machine should still be running, this is
        reported as a crash.
        """
        output_file = os.path.join(self.output_path,
                                   f'{name}-{self.vm_name}.log')
        conn = self.remote_function_call(remote_function,
                                         *args,
                                         output_file=output_file,
                                         **kwargs)
        self.ssh_connections[name] = conn
        out, err = conn.communicate()  # blocking
        if self.should_run:
            self.logger.error(
                "%s crashed before its time - [out]: %s, [err]: %s", name, out,
                err)

    def start_remote_function_call_thread(self, name,
                                          remote_function: Callable, *args,
                                          **kwargs):
        """Run ``remote_function`` remotely from a background thread; ``name``
        is used both as the thread name prefix and as the connection key."""
        self.start_background_thread(self._remote_function_call_thread,
                                     name,
                                     args=(name, remote_function, *args),
                                     kwargs=kwargs)

    def start_application(self):
        """Launch the application remotely in the background."""
        self.start_remote_function_call_thread(
            "application", remote_application, self.application,
            self.guest_config.get('logging', 'verbosity'))

    def start_guest_server(self):
        """Launch the guest server remotely in the background."""
        self.start_remote_function_call_thread("guest-server",
                                               remote_guest_server,
                                               self.guest_config.get(
                                                   'logging', 'verbosity'),
                                               self.guest_config,
                                               guest_name=self.vm_name)

    def terminate_ssh(self, ssh_name, timeout):
        """Terminate the registered connection ``ssh_name`` if it is still
        alive and join it with ``timeout``. Unknown names are ignored."""
        ssh_thread = self.ssh_connections.get(ssh_name, None)
        if ssh_thread is None:
            return
        if ssh_thread.is_alive():
            ssh_thread.terminate()
            ssh_thread.join(timeout)

    def join_application(self, timeout=None):
        """Terminate and join the remote application connection."""
        self.terminate_ssh('application', timeout)

    def join_guest_server(self, timeout=None):
        """Terminate and join the remote guest server connection."""
        self.terminate_ssh('guest-server', timeout)

    def install_framework(self):
        """Copy the framework repository to the guest and install it.

        Raises GuestMachineException when the install command produced
        error output.
        """
        self.logger.info("Copy and install Python's code...")

        # Skip byte-code, caches and hidden files when copying.
        exclude = '*.pyc', '*.pyo', '__pycache__', '.*'
        self.rsync_retry(cloudexp.repository_path,
                         '~',
                         name='framework',
                         exclude=exclude,
                         delete=True)

        args = ['guest_setup.py', 'develop', '--user']
        out, err = self.ssh(*args,
                            cwd=cloudexp.repository_name,
                            is_python=True,
                            is_module=False,
                            name="install-packages").communicate()
        if err:
            raise GuestMachineException(
                f"Could not install framework:\nOUT: {out}\nERR: {err}")

    def install_application(self):
        """Copy the application's binaries (if it has any) to the guest.

        The application may report no path (None), a single path string or
        a list/tuple of paths.
        """
        bin_path = self.application.get_bin_path()
        if bin_path is None:
            self.logger.info(
                "Application does not require to install binaries...")
            return
        self.logger.info("Copy and install application's binaries...")
        # Normalize a single path to a one-element tuple.
        if not isinstance(bin_path, (list, tuple)):
            assert isinstance(bin_path, str), "Binaries path must be a string."
            bin_path = (bin_path, )

        for p in bin_path:
            self.rsync_retry(p, '~', name='app-binaries', delete=True)
예제 #2
0
class Experiment(LoggedObject, Terminable):
    """Runs one experiment: prepares the host, starts the guest VMs and MOM,
    lets everything run for a fixed duration and tears it all down again."""

    def __init__(self,
                 vms_desc: dict,
                 duration: int,
                 output_path: str,
                 host_mom_config: DictConfigType = None,
                 exp_config: DictConfigType = None,
                 extra_info: Optional[dict] = None):
        """
        Parameters:
        -----------
        vms_desc : descriptor of the virtual machines
        duration : the experiment duration
        output_path : the experiment output folder path
        host_mom_config : override configurations in default MOM config
        exp_config: override configurations in default experiment config
        extra_info : extra information to write to the info file
        """
        LoggedObject.__init__(self)
        Terminable.__init__(self)
        # Only this experiment will catch sigint and will terminate the experiment
        self.terminate_on_signal(signal.SIGINT)

        self.duration = duration
        self.output_path = output_path
        self.extra_info = extra_info

        # Host preparation; end_everything() re-enables hyper-threading.
        cpu_settings.disable_hyper_threading()
        self.host_machine = HostMachine()
        self.host_machine.set_owner_cpu_count('system', 2)

        self.exp_config = DictConfig(DEFAULT_EXP_CONFIG, exp_config)

        self.mom_executor = MomThreadExecutor(host_mom_config,
                                              shared_terminable=self)
        self.mom = self.mom_executor.mom
        self.host_config = self.mom.config
        self.threads = []

        # Start the connection to libvirt
        libvirt_uri = self.host_config.get('main', 'libvirt-hypervisor-uri')
        self.conn = libvirt.open(libvirt_uri)

        self.host_name = self.conn.getHostname()
        self.max_vcpus = self.conn.getMaxVcpus(self.conn.getType())

        # Get the machines. This will update the available CPU after pinning the CPUs to the VMs
        self.number_of_guests = 0
        self.vms = self.get_all_vms_instances(vms_desc)

        # save test information file (best effort: a failure is only logged)
        try:
            self.write_info(self.output_path)
        except Exception as e:
            self.logger.error("Could not write info file: %s", e)

        self.logger.debug("Python version: %s", sys.version)
        self.logger.debug('Host name: %s', self.host_name)
        self.logger.debug('Max qemu vCPUs: %i', self.max_vcpus)
        self.logger.debug('Allocated CPUs: %s',
                          pprint.pformat(self.host_machine.cpu_owners))
        self.logger.info("Experiment initiated: %s", output_path)

    def keyboard_interrupt_handler(self, signalnum, frame):
        """Log the interrupt and delegate to Terminable's handler."""
        self.logger.warning(
            "Terminating experiment due to keyboard interrupt.")
        Terminable.keyboard_interrupt_handler(self, signalnum, frame)

    def as_dict(self):
        """Return a dict describing the experiment.

        Entries of ``extra_info`` are merged in: a new key is added as-is,
        while a key that clashes with a different value is stored under
        ``<key>(extra-info)`` so the original entry is kept.
        """
        info = dict(
            duration=self.duration,
            vms={vm.vm_name: vm.as_dict()
                 for vm in self.vms},
            output_path=self.output_path,
            host_machine=self.host_machine.as_dict(),
            host_name=self.host_name,
            max_vcpus=self.max_vcpus,
            exp_config=self.exp_config.get_dict(),
            host_config=self.host_config.get_dict(),
        )

        # Add extra_info to info dict (with respect to overlapping keys)
        if self.extra_info:
            for inf_key, inf_val in self.extra_info.items():
                if inf_key not in info:
                    info[inf_key] = inf_val
                elif info[inf_key] != inf_val:
                    info[inf_key + "(extra-info)"] = inf_val

        return info

    def get_output_file_path(self, file_name):
        """Return the path of ``file_name`` inside the output folder."""
        return os.path.join(self.output_path, file_name)

    def get_machine(self, name, props):
        """Create the ExpMachine for ``name`` and count it as a guest.

        ``props`` is expanded into keyword arguments of ExpMachine.
        """
        self.number_of_guests += 1
        return ExpMachine(name,
                          self.output_path,
                          self.exp_config,
                          self.host_config,
                          shared_terminable=self,
                          shared_deferred_start=self.mom_executor,
                          **props)

    def get_all_vms_instances(self, vms_desc: dict):
        """Create all machines described by ``vms_desc`` and allocate CPUs.

        CPUs for the VMs themselves are allocated first (honouring an
        explicit ``cpu_pin``), and only afterwards the CPUs for the
        benchmarks of each VM.
        """
        ret = [
            self.get_machine(name, props) for name, props in vms_desc.items()
        ]

        for vm in ret:
            if vm.cpu_pin is None:
                # No explicit pinning: let the host pick the CPUs.
                vm_cpus = self.host_machine.set_owner_cpu_count(
                    vm.vm_name, vm.get_required_cpu_count())
                vm.set_cpu_pin(vm_cpus)
            else:
                self.host_machine.set_owner_cpus(vm.vm_name, vm.cpu_pin)

        for vm in ret:
            bm_cpus = self.host_machine.set_owner_cpu_count(
                f"{vm.vm_name}-benchmark", vm.get_benchmark_cpu_count())
            vm.set_benchmark_cpus(bm_cpus)

        return ret

    def parallel_for_each_vm(self,
                             func,
                             thread_name_prefix='experiment-vm-pool'):
        """Call ``func(vm)`` for every VM using a thread pool and log each
        result. An exception raised by ``func`` is re-raised here when its
        result is consumed."""
        self.logger.info("Starting parallel VM call: %s", thread_name_prefix)
        with ThreadPoolExecutor(thread_name_prefix=thread_name_prefix) as e:
            for vm, res in zip(self.vms, e.map(func, self.vms)):
                self.logger.debug("%s: %s finished successfully: %s",
                                  vm.vm_name, thread_name_prefix, res)

    def close(self):
        """Close the libvirt connection and end the host machine's
        experiment state."""
        if self.conn:
            self.conn.close()

        self.host_machine.end_experiment()

    def destroy_vms_experiment(self):
        """End the experiment on every VM and then destroy all domains.

        A failure while ending the experiment is logged and does not
        prevent the domains from being destroyed.
        """
        try:
            self.logger.info("End experiment for each VMs...")
            self.parallel_for_each_vm(lambda vm_m: vm_m.end_experiment(),
                                      "end-experiment")
        except Exception as ex:
            self.logger.exception("Error ending VMs: %s", ex)
            self.logger.critical("Experiment failed due to exception: %s", ex)
        finally:
            self.logger.info("Destroying VMs...")
            self.parallel_for_each_vm(lambda vm_m: vm_m.destroy_domain(),
                                      "destroy-domain")

    def start_experiment(self):
        """Run the whole experiment.

        Sets everything up, sleeps for the experiment duration (the sleep
        is terminable) and always ends everything afterwards. An exception
        during setup or the run is logged and the domains are destroyed.
        """
        # Fallback start time in case run_everything() fails before MOM
        # reports its own start time.
        begin_time = time.time()
        try:
            begin_time = self.run_everything()
            self.logger.info("Running experiment for duration of %s",
                             time_delta(self.duration))
            self.terminable_sleep(self.duration)
        except Exception as ex:
            self.logger.exception("EXCEPTION: %s", ex)
            self.logger.critical("Experiment failed due to exception: %s", ex)
            self.parallel_for_each_vm(lambda vm_m: vm_m.destroy_domain(),
                                      "destroy-domain")
        finally:
            end_time = time.time()
            self.logger.info("Experiment ended after %s",
                             time_delta(end_time - begin_time))
            self.end_everything()

    def write_vms_desc(self):
        """Write each VM's XML description to ``<vm_name>.xml`` in the
        output folder."""
        for vm in self.vms:
            with open(self.get_output_file_path(f"{vm.vm_name}.xml"),
                      "w") as f:
                f.write(vm.get_xml_desc())

    def destroy_all_active_vms(self):
        """Destroy every domain listed by the libvirt connection's
        ``listDomainsID()``."""
        self.logger.info("Destroying active VMs...")
        domain_list = self.conn.listDomainsID()
        for guest_id in domain_list:
            dom = self.conn.lookupByID(guest_id)
            dom.destroy()

    def start_experiment_vms(self):
        """Start the domains of all experiment VMs, one after the other."""
        self.logger.info("Starting VMs...")
        # libvirt needs domain to start serially:
        for vm in self.vms:
            vm.start_domain()

    def run_everything(self):
        """Prepare the host, start and initialize all VMs, then start MOM.

        Returns the MOM executor's start time. Raises an Exception when at
        least one VM is not ready after initialization.
        """
        cpu_settings.set_cpu_governor('performance')
        self.host_machine.begin_experiment()
        self.destroy_all_active_vms()
        self.start_experiment_vms()
        self.write_vms_desc()

        # all initiations can be done in parallel:
        self.parallel_for_each_vm(lambda vm_m: vm_m.init_experiment(),
                                  "init-experiment")

        # Assert the all initiation succeeded
        if not all(vm.is_ready_for_experiment for vm in self.vms):
            raise Exception("VMs are not ready for experiment")

        self.logger.info(
            "Start MOM and wait for VMs. When finish, trigger start event and load workers..."
        )
        self.mom_executor.start_and_wait_for_guests(self.number_of_guests)
        return self.mom_executor.start_time

    def end_everything(self):
        """Stop MOM, tear down the VMs and restore the host CPU settings.

        The CPU settings are restored in a ``finally`` clause so the host
        gets hyper-threading and the 'powersave' governor back even when
        the VM teardown raises; the exception still propagates afterwards.
        """
        try:
            try:
                self.logger.info("Ending MOM")
                self.mom_executor.terminate()
            except Exception as ex:
                self.logger.exception("Error ending MOM: %s", ex)
                self.logger.critical("Experiment failed due to exception: %s",
                                     ex)

            self.destroy_vms_experiment()

            try:
                if self.mom_executor.is_alive():
                    self.mom_executor.join(30)
            except Exception as ex:
                self.logger.exception("Error waiting for MOM: %s", ex)
                self.logger.critical("Experiment failed due to exception: %s",
                                     ex)
        finally:
            cpu_settings.enable_hyper_threading()
            cpu_settings.set_cpu_governor('powersave')