def inner_main(mailer, on_first_invocation, config, args):
    """Set up the platform and manifest, then hand over to the
    ExecutionScheduler."""

    out_file = config.results_filename()
    out_file_exists = os.path.exists(out_file)

    instr_dir = util.get_instr_json_dir(config)
    instr_dir_exists = os.path.exists(instr_dir)

    envlog_dir = util.get_envlog_dir(config)
    envlog_dir_exists = os.path.exists(envlog_dir)

    if out_file_exists and not os.path.isfile(out_file):
        util.fatal(
            "Output file '%s' exists but is not a regular file" % out_file)

    if out_file_exists and on_first_invocation:
        util.fatal("Output results file '%s' already exists. "
                   "Move the file away before running Krun." % out_file)

    if instr_dir_exists and on_first_invocation:
        util.fatal("Instrumentation dir '%s' exists." % instr_dir)

    if envlog_dir_exists and on_first_invocation:
        util.fatal("Env log dir '%s' exists." % envlog_dir)

    if not out_file_exists and not on_first_invocation:
        util.fatal("No results file to resume. Expected '%s'" % out_file)

    # Initialise platform instance and assign to VM defs.
    # This needs to be done early, so VM sanity checks can run.
    platform = detect_platform(mailer, config)

    platform.quick_mode = args.quick
    platform.no_user_change = args.no_user_change
    platform.no_tickless_check = args.no_tickless_check
    platform.no_pstate_check = args.no_pstate_check
    platform.hardware_reboots = args.hardware_reboots

    # Create the instrumentation directory if required
    if on_first_invocation:
        # We only want to make a dir if >=1 VM is in instrumentation mode.
        for vm in config.VMS.itervalues():
            if vm['vm_def'].instrument:
                util.make_instr_dir(config)
                break

    debug("Checking platform preliminaries")
    platform.check_preliminaries()

    # Make a bit of noise if this is a virtualised environment
    if platform.is_virtual():
        warn("This appears to be a virtualised host. The results will be "
             "flawed. Use bare-metal for reliable results!")

    platform.collect_audit()

    # At this point the config file is OK, and on-disk state is consistent,
    # so let's daemonise (if requested).
    if args.daemonise:
        util.daemonise()

    if not on_first_invocation:
        # output file must exist, due to check above
        assert out_file_exists

        debug("Using pre-recorded initial temperature readings")
        manifest = ManifestManager(config, platform)

        platform_temps = {}
        for sensor, tup in manifest.starting_temperatures.iteritems():
            platform_temps[sensor] = tup[1]
        platform.starting_temperatures = platform_temps
    else:
        manifest = ManifestManager(config, platform, new_file=True)

        if manifest.num_execs_left == 0:
            # No executions, or all skipped
            fatal("Empty schedule!")

        try:
            info(("Wait %s secs to allow system to cool prior to "
                  "collecting initial temperature readings") %
                 config.TEMP_READ_PAUSE)

            # This part is wrapped in hooks, so that if daemons or networking
            # are taken down for process executions, then the initial
            # temperature reading gets the same treatment.
            util.run_shell_cmd_list(config.PRE_EXECUTION_CMDS)
            platform.sleep(config.TEMP_READ_PAUSE)

            debug("Taking fresh initial temperature readings")
            platform.starting_temperatures = \
                platform.take_temperature_readings()
            manifest.set_starting_temperatures(platform.starting_temperatures)

            # Write out an empty results file. After the initial reboot Krun
            # will expect this to exist.
            Results.ok_to_instantiate = True
            results = Results(config, platform)
            results.write_to_file()
        finally:
            util.run_shell_cmd_list(config.POST_EXECUTION_CMDS)

        log_path = config.log_filename(resume=False)
        util.log_and_mail(mailer, debug, "Benchmarking started",
                          "Benchmarking started.\nLogging to %s" % log_path,
                          bypass_limiter=True)

        util.reboot(manifest, platform)

    # Assign platform to VM defs -- needs to happen early for sanity checks
    util.assign_platform(config, platform)

    sanity_checks(config, platform)

    # Build job queue -- each job is an execution
    sched = ExecutionScheduler(config, mailer, platform, dry_run=args.dry_run)
    sched.run()
def run(self):
    """Benchmark execution starts here"""

    # In reboot mode, wait for the system to come up before we proceed
    if self.platform.hardware_reboots:
        debug("Waiting %s seconds for the system to come up." %
              str(STARTUP_WAIT_SECONDS))
        self.platform.sleep(STARTUP_WAIT_SECONDS)

    # Important that the dmesg is collected after the above startup wait.
    # Otherwise we get spurious dmesg changes.
    self.platform.collect_starting_dmesg()

    assert self.manifest.num_execs_left > 0
    self.platform.wait_for_temperature_sensors()

    bench, vm, variant = self.manifest.next_exec_key.split(":")
    key_pexec_idx = self.manifest.next_exec_key_index()
    job = ExecutionJob(self, vm, self.config.VMS[vm], bench, variant,
                       self.config.BENCHMARKS[bench], key_pexec_idx)

    # Default to error state. This is the value the finally block will see
    # if an exception is raised inside the try block, otherwise it is
    # re-assigned based on the result of running the benchmark.
    flag = 'E'

    # Run the pre-exec commands, the benchmark and the post-exec commands.
    # These are wrapped in a try/finally, so that the post-exec commands
    # are always executed, even if an exception has occurred. We only
    # reboot /after/ the post-exec commands have completed.
    results = None
    try:
        # Run the user's pre-process-execution commands. We can't put an
        # ETA estimate in the environment for the pre-commands as we have
        # not (and should not) load the results file into memory yet.
        #
        # It might seem tempting to move this outside the try block, to
        # ensure that post-hooks are only run if pre-hooks ran. We don't,
        # thus avoiding the case where only *part* of the pre-hooks run,
        # but the post-hooks then don't run.
        util.run_shell_cmd_list(self.config.PRE_EXECUTION_CMDS)

        # We collect rough execution times separate from real results. The
        # reason for this is that, even if a benchmark crashes it takes
        # time and we need to account for this when making estimates. A
        # crashing benchmark will give an empty list of iteration times,
        # meaning we can't use 'raw_exec_result' below for estimates.
        exec_start_time = time.time()
        measurements, instr_data, flag = job.run(self.mailer, self.dry_run)
        exec_end_time = time.time()

        # Only now is it OK to load the results file into memory.
        Results.ok_to_instantiate = True
        results = Results(self.config, self.platform,
                          results_file=self.config.results_filename())

        # Bail early if the process execution needs to be re-run.
        if flag == "O":
            util.run_shell_cmd_list(
                self.config.POST_EXECUTION_CMDS,
                extra_env=self._make_post_cmd_env(results)
            )
            info("Rebooting to re-run previous process execution")
            util.reboot(self.manifest, self.platform, update_count=False)
            # reboot() does not return
            raise RuntimeError("reached unreachable code!")

        # Store new result.
        results.append_exec_measurements(job.key, measurements, flag)

        # Store instrumentation data in a separate file
        if job.vm_info["vm_def"].instrument:
            key_exec_num = self.manifest.completed_exec_counts[job.key]
            util.dump_instr_json(job.key, key_exec_num, self.config,
                                 instr_data)

        eta_info = exec_end_time - exec_start_time
        if self.platform.hardware_reboots:
            # Add time taken to wait for system to come up if we are in
            # hardware-reboot mode.
            eta_info += STARTUP_WAIT_SECONDS
        results.eta_estimates[job.key].append(eta_info)
        self.manifest.update(flag)
    finally:
        # Run the user's post-process-execution commands with updated
        # ETA estimates. Important that this happens *after* dumping
        # results, as the user is likely copying intermediate results to
        # another host.

        # _make_post_cmd_env() needs the results to make an ETA. If an
        # exception occurred in the above try block, there's a chance that
        # they have not been loaded.
        if results is None:
            Results.ok_to_instantiate = True
            results = Results(self.config, self.platform,
                              results_file=self.config.results_filename())

        # If errors occurred, set error flag in results file
        if self.platform.check_dmesg_for_changes(self.manifest) or \
                flag == 'E':
            results.error_flag = True

        results.write_to_file()
        util.run_shell_cmd_list(
            self.config.POST_EXECUTION_CMDS,
            extra_env=self._make_post_cmd_env(results)
        )

    tfmt = self.get_overall_time_estimate_formatter(results)

    if self.manifest.eta_avail_idx == self.manifest.next_exec_idx:
        # We just found out roughly how long the session has left, mail out.
        msg = "ETA for current session now known: %s" % tfmt.finish_str
        util.log_and_mail(self.mailer, debug,
                          "ETA for Current Session Available",
                          msg, bypass_limiter=True)

    info("{:<25s}: {} ({} from now)".format(
        "Estimated completion (whole session)", tfmt.finish_str,
        tfmt.delta_str))

    info("%d executions left in scheduler queue" %
         self.manifest.num_execs_left)

    if self.manifest.num_execs_left > 0 and \
            self.manifest.eta_avail_idx > self.manifest.next_exec_idx:
        info("Executions until ETA known: %s" %
             (self.manifest.eta_avail_idx - self.manifest.next_exec_idx))

    # Although it would have been nice to have checked this prior to
    # running the execution, it depends on the results file, which we
    # should not load prior to the process execution.
    util.check_audit_unchanged(results, self.platform)

    assert self.manifest.num_execs_left >= 0
    if self.manifest.num_execs_left > 0:
        # print info about the next job
        benchmark, vm_name, variant = \
            self.manifest.next_exec_key.split(":")
        info("Next execution is '%s(%d)' (%s variant) under '%s'" %
             (benchmark, self.config.BENCHMARKS[benchmark],
              variant, vm_name))

        tfmt = self.get_exec_estimate_time_formatter(job.key, results)
        info("{:<35s}: {} ({} from now)".format(
            "Estimated completion (next execution)",
            tfmt.finish_str, tfmt.delta_str))

        info("Reboot in preparation for next execution")
        util.reboot(self.manifest, self.platform)
    elif self.manifest.num_execs_left == 0:
        self.platform.save_power()
        if self.config.ENABLE_PINNING:
            self.platform.clear_cpu_pinning()

        info("Done: Results dumped to %s" % self.config.results_filename())
        err_msg = "Errors/warnings occurred -- read the log!"
        if results.error_flag:
            warn(err_msg)

        msg = "Session completed. Log file at: '%s'" % (self.log_path)
        if results.error_flag:
            msg += "\n\n%s" % err_msg

        msg += "\n\nDon't forget to disable Krun at boot."

        util.log_and_mail(self.mailer, info, "Benchmarks Complete", msg,
                          bypass_limiter=True)