def run(plan, progress_file=None):
    """Runs experiments.

    In newest versions of this class the **plan** array must contain
    experiments with computed variables.

    :param list plan: List of experiments to perform.
    :param str progress_file: Optional file used by the progress reporter.
    """
    # Count number of active experiments in the plan. The execution loop below
    # runs every experiment whose 'exp.status' is absent or != 'disabled', so
    # the count must use the same rule (previously, experiments missing the
    # 'exp.status' key ran but were not counted as active).
    num_active_experiments = 0
    for experiment in plan:
        if experiment.get('exp.status') != 'disabled':
            num_active_experiments += 1

    num_experiments = len(plan)
    start_time = datetime.datetime.now()
    # Summary counters printed at the very end in '__key__=value' form.
    stats = {
        "launcher.total_experiments": num_experiments,
        "launcher.active_experiments": num_active_experiments,
        "launcher.failed_experiments": 0,
        "launcher.skipped_experiments": 0,
        "launcher.disabled_experiments": 0,
        "launcher.start_time": str(start_time)
    }

    # See if resource monitor needs to be run. Now, the assumption is that
    # if it's enabled for a first experiment, it's enabled for all others.
    resource_monitor = None
    if num_experiments > 0 and 'monitor.frequency' in plan[0] and plan[0]['monitor.frequency'] > 0:
        if not os.path.isdir(plan[0]['monitor.pid_folder']):
            os.makedirs(plan[0]['monitor.pid_folder'])
        resource_monitor = ResourceMonitor(plan[0]['monitor.launcher'],
                                           plan[0]['monitor.pid_folder'],
                                           plan[0]['monitor.frequency'],
                                           plan[0]['monitor.timeseries'])
        # The file must be created beforehand - this is required for docker to
        # keep correct access rights.
        resource_monitor.empty_pid_file()
        resource_monitor.run()

    # It's used for reporting progress to a user.
    progress_reporter = ProgressReporter(num_experiments, num_active_experiments, progress_file)

    # Setting handler for SIGUSR1 signal. Users can send this signal to this
    # script to gracefully terminate benchmarking process.
    print("--------------------------------------------------------------")
    print("Experimenter pid %d. \nRun this to gracefully terminate me:" % os.getpid())
    print("\tkill -USR1 %d" % os.getpid())
    print("I will terminate myself as soon as current benchmark finishes.")
    print("--------------------------------------------------------------")
    sys.stdout.flush()
    Launcher.must_exit = False

    def _sigusr1_handler(signum, frame):
        Launcher.must_exit = True
    signal.signal(signal.SIGUSR1, _sigusr1_handler)

    num_completed_experiments = 0
    for idx in range(num_experiments):
        if Launcher.must_exit:
            # 'logging.warn' is a deprecated alias of 'logging.warning'.
            logging.warning(
                "The SIGUSR1 signal has been caught, gracefully shutting down benchmarking process on experiment %d (out of %d)",
                idx, num_experiments)
            break
        experiment = plan[idx]
        # Is experiment disabled?
        if 'exp.status' in experiment and experiment['exp.status'] == 'disabled':
            # The condition tests 'exp.status'; the old message referenced a
            # non-existent 'exp.disabled' key.
            logging.info("Disabling experiment, exp.status is 'disabled'")
            stats['launcher.disabled_experiments'] += 1
            progress_reporter.report(experiment['exp.log_file'], 'disabled', counts=False)
            continue
        # If experiments have been ran, check if we need to re-run.
        if 'exp.log_file' in experiment and experiment['exp.log_file']:
            if isfile(experiment['exp.log_file']) and not Launcher.force_redo(experiment):
                logging.info("Skipping experiment, file (%s) exists", experiment['exp.log_file'])
                stats['launcher.skipped_experiments'] += 1
                progress_reporter.report(experiment['exp.log_file'], 'skipped', counts=True)
                continue
        # Track current progress
        progress_reporter.report_active(experiment['exp.log_file'])
        # Get script that runs experiment for this framework. If no 'framework_family' is
        # found, we can try to use exp.framework.
        framework_key = 'exp.framework_family'
        if framework_key not in experiment:
            framework_key = 'exp.framework'
        command = [experiment['%s.launcher' % (experiment[framework_key])]]
        # Do we need to manipulate arguments for launching process?
        launcher_args_key = '%s.launcher_args' % experiment[framework_key]
        if launcher_args_key in experiment:
            launcher_args = set(experiment[launcher_args_key].split(' '))
            # Typo fixed in message: 'laucnhing' -> 'launching'.
            logging.debug(
                'Only these arguments will be passed to launching process (%s): %s',
                command[0], str(launcher_args))
        else:
            launcher_args = None
        for param, param_val in experiment.items():
            if launcher_args is not None and param not in launcher_args:
                continue
            # Raise instead of 'assert': assertions are stripped when Python
            # runs with optimizations (-O), silently disabling this check.
            if isinstance(param_val, list):
                raise ValueError(
                    "Here, this must not be the list but (%s=%s)" % (param, str(param_val)))
            if not isinstance(param_val, bool):
                command.extend(
                    ['--%s' % (param.replace('.', '_')), str(param_val)])
            else:
                command.extend([
                    '--%s' % (param.replace('.', '_')),
                    ('true' if param_val else 'false')
                ])
        # Prepare environmental variables
        env_vars = copy.deepcopy(os.environ)
        env_vars.update(
            DictUtils.filter_by_key_prefix(experiment, 'runtime.env.', remove_prefix=True))
        # Run experiment in background and wait for complete
        worker = Worker(command, env_vars, experiment)
        worker.work(resource_monitor)
        if worker.ret_code != 0:
            stats['launcher.failed_experiments'] += 1
        num_completed_experiments += 1
        # Print progress
        if num_completed_experiments % 10 == 0:
            print("Done %d benchmarks out of %d" % (num_completed_experiments, num_active_experiments))
        progress_reporter.report_active_completed()

    end_time = datetime.datetime.now()
    stats['launcher.end_time'] = str(end_time)
    stats['launcher.hours'] = (end_time - start_time).total_seconds() / 3600
    if resource_monitor is not None:
        resource_monitor.stop()
    # Emit machine-parsable summary, one '__key__=value' line per counter.
    for key, val in stats.items():
        print('__%s__=%s' % (key, json.dumps(val)))
    progress_reporter.report_all_completed()
def run(plan, progress_file=None):
    """Runs experiments in `plan` one at a time.

    In newest versions of this class the `plan` array must contain experiments
    with computed variables.

    Args:
        plan (list): List of benchmarks to perform (list of dictionaries).
        progress_file (str): A file for a progress reporter. If None, no
            progress will be reported.
    """
    num_experiments = len(plan)
    # See if resource monitor needs to be run. Now, the assumption is that
    # if it's enabled for a first experiment, it's enabled for all others.
    resource_monitor = None
    if num_experiments > 0 and DictUtils.get(plan[0], 'monitor.frequency', 0) > 0:
        if not os.path.isdir(plan[0]['monitor.pid_folder']):
            os.makedirs(plan[0]['monitor.pid_folder'])
        resource_monitor = ResourceMonitor(plan[0]['monitor.launcher'],
                                           plan[0]['monitor.pid_folder'],
                                           plan[0]['monitor.frequency'],
                                           plan[0]['monitor.timeseries'])
        # The file must be created beforehand - this is required for docker to
        # keep correct access rights.
        resource_monitor.empty_pid_file()
        resource_monitor.run()

    # It's used for reporting progress to a user.
    num_active_experiments = 0
    for experiment in plan:
        if DictUtils.get(experiment, 'exp.status', '') not in ['disabled', 'inactive']:
            num_active_experiments += 1
    progress_tracker = ProgressTracker(num_experiments, num_active_experiments, progress_file)

    # Setting handler for SIGUSR1 signal. Users can send this signal to this
    # script to gracefully terminate benchmarking process.
    print("--------------------------------------------------------------")
    print("Experimenter pid %d. \nRun this to gracefully terminate me:" % os.getpid())
    print("\tkill -USR1 %d" % os.getpid())
    print("I will terminate myself as soon as current benchmark finishes.")
    print("--------------------------------------------------------------")
    sys.stdout.flush()
    Launcher.must_exit = False

    def _sigusr1_handler(signum, frame):
        Launcher.must_exit = True
    signal.signal(signal.SIGUSR1, _sigusr1_handler)

    for idx in range(num_experiments):
        if Launcher.must_exit:
            # 'logging.warn' is a deprecated alias of 'logging.warning'.
            logging.warning(
                "The SIGUSR1 signal has been caught, gracefully shutting down benchmarking "
                "process on experiment %d (out of %d)", idx, num_experiments)
            break
        experiment = plan[idx]
        # Is experiment disabled?
        if DictUtils.get(experiment, 'exp.status', '') in ('disabled', 'inactive'):
            # Lazy %-args: the message is only formatted if this level is emitted.
            logging.info("Will not run benchmark, reason: exp.status='%s'",
                         experiment['exp.status'])
            # Same '<none>' fallback as report_active below, so an experiment
            # without 'exp.log_file' does not raise KeyError while reporting.
            progress_tracker.report(DictUtils.get(experiment, 'exp.log_file', '<none>'),
                                    exec_status='inactive')
            continue
        # If experiments have been ran, check if we need to re-run.
        if DictUtils.get(experiment, 'exp.log_file', None) is not None:
            if isfile(experiment['exp.log_file']):
                bench_status = None
                no_rerun_msg = None
                # 'exp.rerun' is 'never' (default), or 'onfail' to re-run
                # benchmarks whose previous status was not 'ok'.
                rerun_condition = DictUtils.get(experiment, 'exp.rerun', 'never')
                if rerun_condition == 'never':
                    no_rerun_msg = "Will not run benchmark, reason: log file exists, exp.rerun='never'"
                elif rerun_condition == 'onfail':
                    bench_status = BenchData.status(experiment['exp.log_file'])
                    if bench_status == 'ok':
                        no_rerun_msg = "Will not run benchmark, reason: log file exists, exp.status='ok', "\
                                       "exp.rerun='onfail'"
                if no_rerun_msg is not None:
                    logging.info(no_rerun_msg)
                    progress_tracker.report(experiment['exp.log_file'],
                                            exec_status='skipped',
                                            bench_status=bench_status)
                    continue
        # Track current progress
        progress_tracker.report_active(DictUtils.get(experiment, 'exp.log_file', '<none>'))
        # Get script that runs experiment for this framework. If no 'framework_family' is
        # found, we can try to use exp.framework.
        framework_key = 'exp.framework_family'
        if framework_key not in experiment:
            framework_key = 'exp.framework'
        command = [experiment['%s.launcher' % (experiment[framework_key])]]
        # Do we need to manipulate arguments for launching process?
        launcher_args_key = '%s.launcher_args' % experiment[framework_key]
        if launcher_args_key in experiment:
            launcher_args = set(experiment[launcher_args_key].split(' '))
            logging.debug(
                'Only these arguments will be passed to launching process (%s): %s',
                command[0], str(launcher_args))
        else:
            launcher_args = None
        for param, param_val in experiment.items():
            if launcher_args is not None and param not in launcher_args:
                continue
            if isinstance(param_val, list):
                raise ValueError(
                    "Here, this must not be the list but (%s=%s)" % (param, str(param_val)))
            if not isinstance(param_val, bool):
                command.extend([
                    '--%s' % (param.replace('.', '_')),
                    ParamUtils.to_string(param_val)
                ])
            else:
                command.extend([
                    '--%s' % (param.replace('.', '_')),
                    ('true' if param_val else 'false')
                ])
        # Prepare environmental variables
        env_vars = copy.deepcopy(os.environ)
        env_vars.update(
            DictUtils.filter_by_key_prefix(experiment, 'runtime.env.', remove_prefix=True))
        # Run experiment in background and wait for complete
        worker = Worker(command, env_vars, experiment)
        worker.work(resource_monitor)
        # Print progress. Same '<none>' fallback as report_active above so a
        # missing 'exp.log_file' cannot raise KeyError here.
        progress_tracker.report(DictUtils.get(experiment, 'exp.log_file', '<none>'),
                                exec_status='completed')
        if progress_tracker.num_completed_benchmarks() % 10 == 0:
            print("Done %d benchmarks out of %d" % (progress_tracker.num_completed_benchmarks(),
                                                    num_active_experiments))
        sys.stdout.flush()

    # All benchmarks have been conducted.
    if resource_monitor is not None:
        resource_monitor.stop()
    progress_tracker.report_all_completed()
    progress_tracker.print_summary()