def __init__(self, path):
    self.path = path

    job_path = os.path.join(path, 'results.zip')
    if not os.path.exists(job_path):
        job_path = os.path.join(path, 'orig.zip')
        assert os.path.exists(job_path)

    self.job = ReadOnlyJob(job_path)
def _checkpoint(self, i):
    print("Fetching results of step {} at: ".format(i))
    print(datetime.datetime.now())

    for i, host in enumerate(self.hosts):
        if host == ':':
            command = "mv {local_scratch}/experiments/* ./experiments"
            self.execute_command(command, robust=True)

            command = "rm -rf {local_scratch}/experiments"
            self.execute_command(command, robust=True)

            command = "cp -ru {local_scratch}/{archive_root} ."
            self.execute_command(command, robust=True)
        else:
            command = (
                "rsync -az {rsync_verbosity} --timeout=300 -e \"ssh {ssh_options}\" "
                "{host}:{local_scratch}/experiments/ ./experiments".format(
                    host=host, **self.__dict__))
            self.execute_command(command, frmt=False, robust=True, output="loud")

            command = "rm -rf {local_scratch}/experiments"
            self.ssh_execute(command, host, robust=True, output="loud")

            command = (
                "rsync -az {rsync_verbosity} --timeout=300 -e \"ssh {ssh_options}\" "
                "{host}:{local_scratch}/{archive_root} .".format(
                    host=host, **self.__dict__))
            self.execute_command(command, frmt=False, robust=True, output="loud")

    self.execute_command("zip -rq results {archive_root}", robust=True)

    try:
        from dps.hyper import HyperSearch
        search = HyperSearch('.')
        with redirect_stream('stdout', 'results.txt', tee=False):
            search.print_summary(print_config=False, verbose=False)
        print(search.job.summary(verbose=False))
    except Exception:
        job_path = 'results.zip' if os.path.exists('results.zip') else 'orig.zip'
        assert os.path.exists(job_path)
        job = ReadOnlyJob(job_path)
        print(job.summary(verbose=False))
def __init__(
        self, name, input_zip, pattern, scratch, local_scratch_prefix='/tmp/dps/hyper/',
        n_nodes=1, tasks_per_node=12, cpus_per_task=1, mem_per_cpu="", gpu_set="",
        wall_time="1hour", cleanup_time="1min", slack_time="1min", add_date=True,
        dry_run=0, kind="slurm", env_vars=None, output_to_files=True, n_retries=0,
        copy_venv="", step_time_limit=None, ignore_gpu=False, ssh_options=None,
        loud_output=True, rsync_verbosity=0, copy_locally=True):

    args = locals().copy()
    del args['self']

    print("\nParallelSession args:")
    print(args)

    launch_venv = os.getenv('VIRTUAL_ENV')
    if launch_venv:
        launch_venv = os.path.split(launch_venv)[1]

    if ssh_options is None:
        ssh_options = (
            "-oPasswordAuthentication=no "
            "-oStrictHostKeyChecking=no "
            "-oConnectTimeout=5 "
            "-oServerAliveInterval=2"
        )

    if kind == "pbs":
        local_scratch_prefix = "\\$RAMDISK"

    assert kind in "slurm slurm-local".split()

    # Create directory to run the job from - should be on scratch.
    scratch = os.path.abspath(os.path.expandvars(scratch))

    es = ExperimentStore(scratch, prefix="run")

    job_dir = es.new_experiment(name, 0, add_date=add_date, force_fresh=1)
    job_dir.record_environment()

    with open(job_dir.path_for('run_kwargs.json'), 'w') as f:
        json.dump(args, f, default=str, indent=4, sort_keys=True)
    del f
    del args

    job_path = job_dir.path
    job_dir.make_directory('experiments')

    input_zip_stem = path_stem(input_zip)
    input_zip = shutil.copy(input_zip, job_dir.path_for("orig.zip"))
    input_zip_abs = process_path(input_zip)
    input_zip_base = os.path.basename(input_zip)
    archive_root = zip_root(input_zip)

    self.copy_files(
        job_dir, input_zip, archive_root,
        ["README.md", "sampled_configs.txt", "config.json", "config.pkl"])

    # storage local to each node, from the perspective of that node
    local_scratch = os.path.join(local_scratch_prefix, os.path.basename(job_path))

    output_to_files = "--output-to-files" if output_to_files else ""

    env = os.environ.copy()

    env_vars = env_vars or {}
    env.update({e: str(v) for e, v in env_vars.items()})
    env_vars = ' '.join('--env ' + k for k in env_vars)

    rsync_verbosity = "" if not rsync_verbosity else "-" + "v" * rsync_verbosity

    ro_job = ReadOnlyJob(input_zip)
    indices_to_run = sorted([op.idx for op in ro_job.ready_incomplete_ops(sort=False)])
    del ro_job
    n_jobs_to_run = len(indices_to_run)

    if n_jobs_to_run == 0:
        print("All jobs are finished! Exiting.")
        return

    dirty_hosts = set()

    n_tasks_per_step = n_nodes * tasks_per_node
    n_steps = int(np.ceil(n_jobs_to_run / n_tasks_per_step))

    node_file = " --sshloginfile nodefile.txt "

    wall_time_seconds, total_seconds_per_step, parallel_seconds_per_step, python_seconds_per_step = \
        self.compute_time_limits(wall_time, cleanup_time, slack_time, step_time_limit, n_steps)

    self.__dict__.update(locals())

    self.print_time_limits()
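# A minimal, standalone illustration of the step-count arithmetic used above (the
# numbers are hypothetical, chosen only for illustration): each step runs at most
# n_nodes * tasks_per_node sub-jobs, so the number of steps is the ceiling of the
# remaining job count over that figure.
import numpy as np

n_jobs_to_run = 50                           # e.g. 50 ready-but-incomplete ops
n_nodes, tasks_per_node = 2, 12
n_tasks_per_step = n_nodes * tasks_per_node  # 24 sub-jobs per step
n_steps = int(np.ceil(n_jobs_to_run / n_tasks_per_step))
assert n_steps == 3                          # steps of 24 + 24 + 2 sub-jobs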
def run(self):
    if self.dry_run:
        print("Dry run, so not running.")
        return

    # Have to jump through a hoop to get the proper node-local storage on cedar/graham.
    self.local_scratch_prefix = self.get_slurm_var("SLURM_TMPDIR")
    self.local_scratch = os.path.join(
        self.local_scratch_prefix, os.path.basename(self.job_path))

    # Compute new time limits based on the actual time remaining
    # (protect against e.g. job starting late).
    print("Time limits before adjustment:")
    self.print_time_limits()

    job_id = os.getenv("SLURM_JOBID")
    command = 'squeue -h -j {} -o "%L"'.format(job_id)
    returncode, stdout, stderr = self.execute_command(command, frmt=False, robust=False)

    days = 0
    if "-" in stdout:
        days, time = stdout.split("-")
        days = int(days)
    else:
        time = stdout

    time = time.split(":")

    hours = int(time[-3]) if len(time) > 2 else 0
    minutes = int(time[-2]) if len(time) > 1 else 0
    seconds = int(time[-1])

    wall_time_delta = datetime.timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)
    wall_time_seconds = int(wall_time_delta.total_seconds())

    print("Actual remaining walltime: {}".format(wall_time_delta))
    print("Time limits after adjustment:")

    (self.wall_time_seconds, self.total_seconds_per_step,
     self.parallel_seconds_per_step, self.python_seconds_per_step) = \
        self.compute_time_limits(
            wall_time_seconds, self.cleanup_time, self.slack_time, self.step_time_limit, self.n_steps)

    self.print_time_limits()

    with cd(self.job_path):
        print("\n" + ("=" * 80))
        job_start = datetime.datetime.now()
        print("Starting job at {}".format(job_start))

        job = ReadOnlyJob(self.input_zip)
        subjobs_remaining = sorted([op.idx for op in job.ready_incomplete_ops(sort=False)])

        n_failures = defaultdict(int)
        dead_jobs = set()

        i = 0
        while subjobs_remaining:
            step_start = datetime.datetime.now()

            print("\nStarting step {} at: ".format(i) + "=" * 90)
            print("{} ({} since start of job)".format(step_start, step_start - job_start))

            p = subprocess.run(
                'scontrol show hostnames $SLURM_JOB_NODELIST',
                stdout=subprocess.PIPE, shell=True)
            host_pool = list(set([host.strip() for host in p.stdout.decode().split('\n') if host]))

            self.hosts, n_tasks_for_step = self.recruit_hosts(
                host_pool, self.tasks_per_node, max_tasks=len(subjobs_remaining))

            indices_for_step = subjobs_remaining[:n_tasks_for_step]
            self._step(i, indices_for_step)
            self._checkpoint(i)

            job = ReadOnlyJob(self.archive_root)
            subjobs_remaining = set([op.idx for op in job.ready_incomplete_ops(sort=False)])

            for j in indices_for_step:
                if j in subjobs_remaining:
                    n_failures[j] += 1
                    if n_failures[j] > self.n_retries:
                        print("All {} attempts at completing job with index {} have failed, "
                              "permanently removing it from set of eligible jobs.".format(n_failures[j], j))
                        dead_jobs.add(j)

            subjobs_remaining = [idx for idx in subjobs_remaining if idx not in dead_jobs]
            subjobs_remaining = sorted(subjobs_remaining)

            i += 1

            print("Step duration: {}.".format(datetime.datetime.now() - step_start))

    self.execute_command("rm -rf {archive_root}", robust=True)

    print("Cleaning up dirty hosts...")
    command = "rm -rf {local_scratch}"
    for host in self.dirty_hosts:
        print("Cleaning host {}...".format(host))
        self.ssh_execute(command, host, robust=True)
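# A self-contained sketch (hypothetical helper name, not part of the class) of the
# remaining-walltime parsing performed in `run` above. `squeue -h -j <jobid> -o "%L"`
# prints the time left as "D-HH:MM:SS" when at least a day remains, otherwise as
# "[[HH:]MM:]SS"; either form is converted to a timedelta.
import datetime

def parse_remaining_walltime(squeue_output):
    days = 0
    if "-" in squeue_output:
        days, time = squeue_output.split("-")
        days = int(days)
    else:
        time = squeue_output

    parts = time.strip().split(":")
    hours = int(parts[-3]) if len(parts) > 2 else 0
    minutes = int(parts[-2]) if len(parts) > 1 else 0
    seconds = int(parts[-1])

    return datetime.timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)

# parse_remaining_walltime("1-02:03:04") -> 1 day, 2:03:04
# parse_remaining_walltime("45:00")      -> 0:45:00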
def __init__(
        self, name, input_zip, pattern, scratch, local_scratch_prefix='/tmp/dps/hyper/',
        ppn=12, cpp=1, pmem=None, wall_time="1hour", cleanup_time="1min",
        slack_time="1min", add_date=True, dry_run=0, parallel_exe=None,
        kind="parallel", host_pool=None, load_avg_threshold=8., min_hosts=None,
        max_hosts=1, env_vars=None, output_to_files=True, n_retries=0, gpu_set="",
        copy_venv="", python_startup=False, step_time_limit=None, ignore_gpu=False,
        ssh_options=None, loud_output=True, rsync_verbosity=0):

    args = locals().copy()
    del args['self']

    print("\nParallelSession args:")
    print(args)

    launch_venv = os.getenv('VIRTUAL_ENV')
    if launch_venv:
        launch_venv = os.path.split(launch_venv)[1]

    if not parallel_exe:
        parallel_exe = "$HOME/.local/bin/parallel"

    if ssh_options is None:
        ssh_options = (
            "-oPasswordAuthentication=no "
            "-oStrictHostKeyChecking=no "
            "-oConnectTimeout=5 "
            "-oServerAliveInterval=2"
        )

    if kind == "pbs":
        local_scratch_prefix = "\\$RAMDISK"

    assert kind in "parallel pbs slurm slurm-local".split()
    hpc = kind != "parallel"

    # Create directory to run the job from - should be on scratch.
    scratch = os.path.abspath(os.path.expandvars(scratch))

    es = ExperimentStore(scratch, prefix="run_search")

    job_dir = es.new_experiment(name, 0, add_date=add_date, force_fresh=1)
    job_dir.record_environment()

    with open(job_dir.path_for('run_kwargs.json'), 'w') as f:
        json.dump(args, f, default=str, indent=4, sort_keys=True)
    del f
    del args

    job_path = job_dir.path
    job_dir.make_directory('experiments')

    input_zip_stem = path_stem(input_zip)
    input_zip = shutil.copy(input_zip, job_dir.path_for("orig.zip"))
    input_zip_abs = process_path(input_zip)
    input_zip_base = os.path.basename(input_zip)
    archive_root = zip_root(input_zip)

    self.copy_files(
        job_dir, input_zip, archive_root,
        ["README.md", "sampled_configs.txt", "config.json", "config.pkl"])

    # storage local to each node, from the perspective of that node
    local_scratch = os.path.join(local_scratch_prefix, os.path.basename(job_path))

    output_to_files = "--output-to-files" if output_to_files else ""

    env = os.environ.copy()

    env_vars = env_vars or {}
    env.update({e: str(v) for e, v in env_vars.items()})
    env_vars = ' '.join('--env ' + k for k in env_vars)

    rsync_verbosity = "" if not rsync_verbosity else "-" + "v" * rsync_verbosity

    ro_job = ReadOnlyJob(input_zip)
    indices_to_run = sorted([op.idx for op in ro_job.ready_incomplete_ops(sort=False)])
    del ro_job
    n_jobs_to_run = len(indices_to_run)

    if n_jobs_to_run == 0:
        print("All jobs are finished! Exiting.")
        return

    dirty_hosts = set()

    if hpc:
        host_pool = []
        n_nodes = max_hosts
        n_procs = n_nodes * ppn
        n_steps = int(np.ceil(n_jobs_to_run / n_procs))
    else:
        self.__dict__.update(locals())

        host_pool = host_pool or DEFAULT_HOST_POOL
        if isinstance(host_pool, str):
            host_pool = host_pool.split()

        # Get an estimate of the number of hosts we'll have available.
        with cd(job_path):
            hosts, n_procs = self.recruit_hosts(
                hpc, min_hosts, max_hosts, host_pool, ppn, max_procs=np.inf)
        n_nodes = len(hosts)

        if n_jobs_to_run < n_procs:
            n_steps = 1
            n_nodes = int(np.ceil(n_jobs_to_run / ppn))
            n_procs = n_nodes * ppn
            hosts = hosts[:n_nodes]
        else:
            n_steps = int(np.ceil(n_jobs_to_run / n_procs))

    node_file = " --sshloginfile nodefile.txt "

    wall_time_seconds, total_seconds_per_step, parallel_seconds_per_step, python_seconds_per_step = \
        self.compute_time_limits(wall_time, cleanup_time, slack_time, step_time_limit, n_steps)

    self.__dict__.update(locals())

    self.print_time_limits()
def view_command(path, verbose):
    """ Implements the `view` sub-command, which prints a summary of a job. """
    job = ReadOnlyJob(path)
    print(job.summary(verbose=verbose))
class HyperSearch(object):
    """ Interface to a directory storing a hyper-parameter search.

    Approximately a `frozen`, read-only handle for a directory created by ParallelSession.

    """
    def __init__(self, path):
        self.path = path

        job_path = os.path.join(path, 'results.zip')
        if not os.path.exists(job_path):
            job_path = os.path.join(path, 'orig.zip')
            assert os.path.exists(job_path)

        self.job = ReadOnlyJob(job_path)

    @property
    def objects(self):
        return self.job.objects

    def dist_keys(self):
        """ The keys that were searched over. """
        distributions = self.objects.load_object('metadata', 'distributions')
        if isinstance(distributions, list):
            keys = set()
            for d in distributions:
                keys |= set(d.keys())
            keys = list(keys)
        else:
            distributions = Config(distributions)
            keys = list(distributions.keys())
        keys.append('idx')

        return sorted(set(keys))

    def dist(self):
        return self.objects.load_object('metadata', 'distributions')

    def sampled_configs(self):
        pass

    @property
    def experiment_paths(self):
        experiments_dir = os.path.join(self.path, 'experiments')
        exp_dirs = os.listdir(experiments_dir)
        return [os.path.join(experiments_dir, ed) for ed in exp_dirs]

    def extract_stage_data(self, fields=None, bare=False):
        """ Extract stage-by-stage data about the training runs.

        Parameters
        ----------
        fields: str or list of str
            Names of fields to extract data for. If not supplied, data for all fields is returned.
        bare: boolean
            If True, only returns the data. Otherwise, additionally returns the
            stage-by-stage config and meta-data.

        Returns
        -------
        A nested data structure containing the requested data.

        {param-setting-key: {(repeat, seed): (df, sc, md)}}

        where:
            df is a pandas DataFrame
            sc is a list giving the config for each stage
            md is a dictionary storing metadata

        """
        stage_data = defaultdict(dict)
        if isinstance(fields, str):
            fields = fields.split()

        config_keys = self.dist_keys()

        KeyTuple = namedtuple(self.__class__.__name__ + "Key", config_keys)

        for exp_path in self.experiment_paths:
            try:
                exp_data = FrozenTrainingLoopData(exp_path)

                md = {}
                md['host'] = exp_data.host
                for k in config_keys:
                    md[k] = exp_data.get_config_value(k)

                sc = []
                records = []
                for stage in exp_data.history:
                    record = stage.copy()

                    stage_config = record['stage_config'].copy()
                    sc.append(stage_config)
                    del record['stage_config']

                    record = AttrDict(record).flatten()

                    if 'best_path' in record:
                        del record['best_path']
                    if 'final_path' in record:
                        del record['final_path']

                    # Fix and filter keys
                    _record = {}
                    for k, v in record.items():
                        if k.startswith("best_"):
                            k = k[5:]

                        if (fields and k in fields) or not fields:
                            _record[k] = v

                    records.append(_record)

                key = KeyTuple(*(exp_data.get_config_value(k) for k in config_keys))

                repeat = exp_data.get_config_value("repeat")
                seed = exp_data.get_config_value("seed")

                if bare:
                    stage_data[key][(repeat, seed)] = pd.DataFrame.from_records(records)
                else:
                    stage_data[key][(repeat, seed)] = (pd.DataFrame.from_records(records), sc, md)

            except Exception:
                print("Exception raised while extracting stage data for path: {}".format(exp_path))
                traceback.print_exc()

        return stage_data

    def extract_step_data(self, mode, fields=None, stage=None):
        """ Extract per-step data across all experiments.

        Parameters
        ----------
        mode: str
            Data-collection mode to extract data from.
        fields: str
            Names of fields to extract data for. If not supplied, data for all fields is returned.
        stage: int or slice or tuple
            Specification of the stages to collect data for. If not supplied, data
            from all stages is returned.

        Returns
        -------
        A nested data structure containing the requested data.

        {param-setting-key: {(repeat, seed): pd.DataFrame()}}

        """
        step_data = defaultdict(dict)
        if isinstance(fields, str):
            fields = fields.split()

        config_keys = self.dist_keys()

        KeyTuple = namedtuple(self.__class__.__name__ + "Key", config_keys)

        for exp_path in self.experiment_paths:
            exp_data = FrozenTrainingLoopData(exp_path)

            _step_data = exp_data.step_data(mode, stage)
            if fields:
                try:
                    _step_data = _step_data[fields]
                except KeyError:
                    print("Valid keys are: {}".format(_step_data.keys()))
                    raise

            key = KeyTuple(*(exp_data.get_config_value(k) for k in config_keys))

            repeat = exp_data.get_config_value("repeat")
            seed = exp_data.get_config_value("seed")

            step_data[key][(repeat, seed)] = _step_data

        return step_data

    def print_summary(self, print_config=True, verbose=False, criteria=None, maximize=False):
        """ Gather the outputs of all completed ops and print a summary of them. """
        print("Summarizing search stored at {}.".format(os.path.realpath(self.path)))

        criteria_key = criteria if criteria else "stopping_criteria"
        if not criteria:
            config = self.objects.load_object('metadata', 'config')
            criteria_key, max_str = config['stopping_criteria'].split(',')
            maximize = max_str == "max"

        keys = self.dist_keys()
        stage_data = self.extract_stage_data()

        best = []
        all_keys = set()

        # For each parameter setting, identify the stage where it got the lowest/highest value for `criteria_key`.
        for i, (key, value) in enumerate(sorted(stage_data.items())):
            _best = []

            for (repeat, seed), (df, sc, md) in value.items():
                try:
                    idx = df[criteria_key].idxmax() if maximize else df[criteria_key].idxmin()
                except KeyError:
                    idx = -1

                record = dict(df.iloc[idx])
                if criteria_key not in record:
                    record[criteria_key] = -np.inf if maximize else np.inf

                for k in keys:
                    record[k] = md[k]

                all_keys |= record.keys()

                _best.append(record)

            _best = pd.DataFrame.from_records(_best)
            _best = _best.sort_values(criteria_key)
            sc = _best[criteria_key].mean()
            best.append((sc, _best))

        best = sorted(best, reverse=not maximize, key=lambda x: x[0])
        best = [df for _, df in best]

        _column_order = [criteria_key, 'seed', 'reason', 'n_steps', 'host']
        column_order = [c for c in _column_order if c in all_keys]
        remaining = [k for k in all_keys if k not in column_order and k not in keys]
        column_order = column_order + sorted(remaining)

        with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            print('\n' + '*' * 100)
            direction = "DECREASING" if not maximize else "INCREASING"
            print("RESULTS GROUPED BY PARAM VALUES, ORDER OF {} VALUE OF <{}>: ".format(direction, criteria_key))

            for i, b in enumerate(best):
                print('\n {}-th {} '.format(len(best) - i, "lowest" if not maximize else "highest") + '*' * 40)
                pprint({k: b[k].iloc[0] for k in keys})

                _column_order = [c for c in column_order if c in b.keys()]
                b = b[_column_order]
                with_stats = pd.merge(
                    b.transpose(), b.describe().transpose(),
                    left_index=True, right_index=True, how='outer')

                profile_rows = [k for k in with_stats.index if 'time' in k or 'duration' in k or 'memory' in k]
                other_rows = [k for k in with_stats.index if k not in profile_rows]

                print(tabulate(with_stats.loc[profile_rows], headers='keys', tablefmt='fancy_grid'))
                print(tabulate(with_stats.loc[other_rows], headers='keys', tablefmt='fancy_grid'))

        if print_config:
            print('\n' + '*' * 100)
            print("BASE CONFIG")
            print(self.objects.load_object('metadata', 'config'))

            print('\n' + '*' * 100)
            print("PARAMETER DISTRIBUTION")
            pprint(self.dist())

        print(self.job.summary(verbose=verbose))
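# A brief usage sketch of HyperSearch. The path, mode, and field names below are
# hypothetical, chosen only for illustration: point HyperSearch at a directory produced
# by ParallelSession (one containing results.zip or orig.zip) and pull out per-stage or
# per-step data, or print the built-in summary.
from dps.hyper import HyperSearch

search = HyperSearch('/path/to/search_dir')
print(search.dist_keys())  # parameter names that were searched over, plus 'idx'

# {param-setting-key: {(repeat, seed): (DataFrame, stage_configs, metadata)}}
stage_data = search.extract_stage_data()

# {param-setting-key: {(repeat, seed): DataFrame}} for a single data-collection mode
step_data = search.extract_step_data('val', fields='loss', stage=-1)

search.print_summary(print_config=False, verbose=False)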