Example #1
    def __init__(self, path):
        self.path = path

        job_path = os.path.join(path, 'results.zip')
        if not os.path.exists(job_path):
            job_path = os.path.join(path, 'orig.zip')
            assert os.path.exists(job_path)

        self.job = ReadOnlyJob(job_path)
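For context, a minimal usage sketch of this constructor (it matches HyperSearch.__init__, shown in full in Example #7); the path below is a placeholder and the import path is the one used in Example #2:

from dps.hyper import HyperSearch

search = HyperSearch('/path/to/search/dir')  # hypothetical directory produced by ParallelSession
print(search.job.summary(verbose=False))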
Example #2
    def _checkpoint(self, i):
        print("Fetching results of step {} at: ".format(i))
        print(datetime.datetime.now())

        # ':' is GNU parallel's sshlogin notation for the local machine.
        for host in self.hosts:
            if host == ':':
                command = "mv {local_scratch}/experiments/* ./experiments"
                self.execute_command(command, robust=True)

                command = "rm -rf {local_scratch}/experiments"
                self.execute_command(command, robust=True)

                command = "cp -ru {local_scratch}/{archive_root} ."
                self.execute_command(command, robust=True)
            else:
                command = (
                    "rsync -az {rsync_verbosity} --timeout=300 -e \"ssh {ssh_options}\" "
                    "{host}:{local_scratch}/experiments/ ./experiments".format(
                        host=host, **self.__dict__))
                self.execute_command(command,
                                     frmt=False,
                                     robust=True,
                                     output="loud")

                command = "rm -rf {local_scratch}/experiments"
                self.ssh_execute(command, host, robust=True, output="loud")

                command = (
                    "rsync -az {rsync_verbosity} --timeout=300 -e \"ssh {ssh_options}\" "
                    "{host}:{local_scratch}/{archive_root} .".format(
                        host=host, **self.__dict__))
                self.execute_command(command,
                                     frmt=False,
                                     robust=True,
                                     output="loud")

        self.execute_command("zip -rq results {archive_root}", robust=True)

        try:
            from dps.hyper import HyperSearch
            search = HyperSearch('.')
            with redirect_stream('stdout', 'results.txt', tee=False):
                search.print_summary(print_config=False, verbose=False)
            print(search.job.summary(verbose=False))
        except Exception:
            job_path = 'results.zip' if os.path.exists(
                'results.zip') else 'orig.zip'
            assert os.path.exists(job_path)
            job = ReadOnlyJob(job_path)
            print(job.summary(verbose=False))
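The commands above embed placeholders such as {local_scratch} and {archive_root}; judging by the frmt=False arguments and the explicit .format(**self.__dict__) calls, execute_command fills these in from the session's attributes by default. A rough, hypothetical sketch of that behaviour (not the real execute_command implementation):

import subprocess

def execute_command_sketch(session, command, frmt=True, robust=True):
    # Fill {local_scratch}, {archive_root}, ... in from the session's attributes.
    if frmt:
        command = command.format(**session.__dict__)
    result = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if not robust and result.returncode != 0:
        raise RuntimeError("Command failed: {}".format(command))
    return result.returncode, result.stdout.decode(), result.stderr.decode()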
Example #3
    def __init__(
            self, name, input_zip, pattern, scratch, local_scratch_prefix='/tmp/dps/hyper/',
            n_nodes=1, tasks_per_node=12, cpus_per_task=1, mem_per_cpu="", gpu_set="",
            wall_time="1hour", cleanup_time="1min", slack_time="1min",
            add_date=True, dry_run=0, kind="slurm", env_vars=None, output_to_files=True, n_retries=0,
            copy_venv="", step_time_limit=None, ignore_gpu=False, ssh_options=None,
            loud_output=True, rsync_verbosity=0, copy_locally=True):

        args = locals().copy()
        del args['self']

        print("\nParallelSession args:")
        print(args)

        launch_venv = os.getenv('VIRTUAL_ENV')
        if launch_venv:
            launch_venv = os.path.split(launch_venv)[1]

        if ssh_options is None:
            ssh_options = (
                "-oPasswordAuthentication=no "
                "-oStrictHostKeyChecking=no "
                "-oConnectTimeout=5 "
                "-oServerAliveInterval=2"
            )

        if kind == "pbs":
            local_scratch_prefix = "\\$RAMDISK"

        assert kind in "slurm slurm-local".split()

        # Create directory to run the job from - should be on scratch.
        scratch = os.path.abspath(os.path.expandvars(scratch))

        es = ExperimentStore(scratch, prefix="run")

        job_dir = es.new_experiment(name, 0, add_date=add_date, force_fresh=1)
        job_dir.record_environment()

        with open(job_dir.path_for('run_kwargs.json'), 'w') as f:
            json.dump(args, f, default=str, indent=4, sort_keys=True)
        # Remove temporaries so they aren't swept up by the locals() update below.
        del f
        del args

        job_path = job_dir.path
        job_dir.make_directory('experiments')

        input_zip_stem = path_stem(input_zip)
        input_zip = shutil.copy(input_zip, job_dir.path_for("orig.zip"))
        input_zip_abs = process_path(input_zip)
        input_zip_base = os.path.basename(input_zip)
        archive_root = zip_root(input_zip)

        self.copy_files(
            job_dir, input_zip, archive_root,
            ["README.md", "sampled_configs.txt", "config.json", "config.pkl"])

        # storage local to each node, from the perspective of that node
        local_scratch = os.path.join(local_scratch_prefix, os.path.basename(job_path))

        output_to_files = "--output-to-files" if output_to_files else ""

        env = os.environ.copy()

        env_vars = env_vars or {}

        env.update({e: str(v) for e, v in env_vars.items()})
        env_vars = ' '.join('--env ' + k for k in env_vars)

        rsync_verbosity = "" if not rsync_verbosity else "-" + "v" * rsync_verbosity

        ro_job = ReadOnlyJob(input_zip)
        indices_to_run = sorted([op.idx for op in ro_job.ready_incomplete_ops(sort=False)])
        del ro_job
        n_jobs_to_run = len(indices_to_run)
        if n_jobs_to_run == 0:
            print("All jobs are finished! Exiting.")
            return

        dirty_hosts = set()

        n_tasks_per_step = n_nodes * tasks_per_node
        n_steps = int(np.ceil(n_jobs_to_run / n_tasks_per_step))

        node_file = " --sshloginfile nodefile.txt "

        wall_time_seconds, total_seconds_per_step, parallel_seconds_per_step, python_seconds_per_step = \
            self.compute_time_limits(wall_time, cleanup_time, slack_time, step_time_limit, n_steps)

        self.__dict__.update(locals())

        self.print_time_limits()
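An illustrative construction of this slurm variant, assuming it is the class whose run() method appears in Example #4; every name, path and duration string below is a placeholder rather than a value from the source:

session = ParallelSession(
    'my_search', 'built_search.zip', '*.cfg', scratch='/scratch/user/dps_runs',
    n_nodes=2, tasks_per_node=12, wall_time='2hours', kind='slurm', rsync_verbosity=1)
session.run()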
Example #4
    def run(self):
        if self.dry_run:
            print("Dry run, so not running.")
            return

        # Have to jump through a hoop to get the proper node-local storage on cedar/graham.
        self.local_scratch_prefix = self.get_slurm_var("SLURM_TMPDIR")
        self.local_scratch = os.path.join(
            self.local_scratch_prefix,
            os.path.basename(self.job_path))

        # Compute new time limits based on the actual time remaining (protect against e.g. job starting late)

        print("Time limits before adjustment:")
        self.print_time_limits()

        job_id = os.getenv("SLURM_JOBID")
        command = 'squeue -h -j {} -o "%L"'.format(job_id)
        returncode, stdout, stderr = self.execute_command(command, frmt=False, robust=False)
        days = 0
        if "-" in stdout:
            days, time = stdout.split("-")
            days = int(days)
        else:
            time = stdout

        time = time.split(":")

        hours = int(time[-3]) if len(time) > 2 else 0
        minutes = int(time[-2]) if len(time) > 1 else 0
        seconds = int(time[-1])

        wall_time_delta = datetime.timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)
        wall_time_seconds = int(wall_time_delta.total_seconds())

        print("Actual remaining walltime: {}".format(wall_time_delta))
        print("Time limits after adjustment:")

        (self.wall_time_seconds, self.total_seconds_per_step,
         self.parallel_seconds_per_step, self.python_seconds_per_step) = \
            self.compute_time_limits(
                wall_time_seconds, self.cleanup_time, self.slack_time, self.step_time_limit, self.n_steps)

        self.print_time_limits()

        with cd(self.job_path):
            print("\n" + ("=" * 80))
            job_start = datetime.datetime.now()
            print("Starting job at {}".format(job_start))

            job = ReadOnlyJob(self.input_zip)
            subjobs_remaining = sorted([op.idx for op in job.ready_incomplete_ops(sort=False)])

            n_failures = defaultdict(int)
            dead_jobs = set()

            i = 0
            while subjobs_remaining:
                step_start = datetime.datetime.now()

                print("\nStarting step {} at: ".format(i) + "=" * 90)
                print("{} ({} since start of job)".format(step_start, step_start - job_start))

                p = subprocess.run(
                    'scontrol show hostnames $SLURM_JOB_NODELIST', stdout=subprocess.PIPE, shell=True)
                host_pool = list(set([host.strip() for host in p.stdout.decode().split('\n') if host]))

                self.hosts, n_tasks_for_step = self.recruit_hosts(
                    host_pool, self.tasks_per_node, max_tasks=len(subjobs_remaining))

                indices_for_step = subjobs_remaining[:n_tasks_for_step]
                self._step(i, indices_for_step)
                self._checkpoint(i)

                job = ReadOnlyJob(self.archive_root)

                subjobs_remaining = set([op.idx for op in job.ready_incomplete_ops(sort=False)])

                for j in indices_for_step:
                    if j in subjobs_remaining:
                        n_failures[j] += 1
                        if n_failures[j] > self.n_retries:
                            print("All {} attempts at completing job with index {} have failed, "
                                  "permanently removing it from set of eligible jobs.".format(n_failures[j], j))
                            dead_jobs.add(j)

                subjobs_remaining = [idx for idx in subjobs_remaining if idx not in dead_jobs]
                subjobs_remaining = sorted(subjobs_remaining)

                i += 1

                print("Step duration: {}.".format(datetime.datetime.now() - step_start))

            self.execute_command("rm -rf {archive_root}", robust=True)

        print("Cleaning up dirty hosts...")
        command = "rm -rf {local_scratch}"
        for host in self.dirty_hosts:
            print("Cleaning host {}...".format(host))
            self.ssh_execute(command, host, robust=True)
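The walltime adjustment above parses squeue's %L output ('[days-]hours:minutes:seconds', with the leading fields optional). The same logic as a standalone sketch, for reference:

import datetime

def parse_squeue_time_left(stdout):
    """ Parse squeue's remaining-time string into a timedelta, mirroring run() above. """
    days = 0
    hms = stdout.strip()
    if "-" in hms:
        days, hms = hms.split("-")
        days = int(days)
    parts = hms.split(":")
    hours = int(parts[-3]) if len(parts) > 2 else 0
    minutes = int(parts[-2]) if len(parts) > 1 else 0
    seconds = int(parts[-1])
    return datetime.timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)

# e.g. str(parse_squeue_time_left("1-02:03:04")) == '1 day, 2:03:04'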
Example #5
    def __init__(
            self, name, input_zip, pattern, scratch, local_scratch_prefix='/tmp/dps/hyper/', ppn=12, cpp=1,
            pmem=None, wall_time="1hour", cleanup_time="1min", slack_time="1min", add_date=True, dry_run=0,
            parallel_exe=None, kind="parallel", host_pool=None, load_avg_threshold=8., min_hosts=None,
            max_hosts=1, env_vars=None, output_to_files=True, n_retries=0, gpu_set="", copy_venv="",
            python_startup=False, step_time_limit=None, ignore_gpu=False, ssh_options=None, loud_output=True,
            rsync_verbosity=0):

        args = locals().copy()
        del args['self']

        print("\nParallelSession args:")
        print(args)

        launch_venv = os.getenv('VIRTUAL_ENV')
        if launch_venv:
            launch_venv = os.path.split(launch_venv)[1]

        if not parallel_exe:
            parallel_exe = "$HOME/.local/bin/parallel"

        if ssh_options is None:
            ssh_options = (
                "-oPasswordAuthentication=no "
                "-oStrictHostKeyChecking=no "
                "-oConnectTimeout=5 "
                "-oServerAliveInterval=2"
            )

        if kind == "pbs":
            local_scratch_prefix = "\\$RAMDISK"

        assert kind in "parallel pbs slurm slurm-local".split()
        hpc = kind != "parallel"

        # Create directory to run the job from - should be on scratch.
        scratch = os.path.abspath(os.path.expandvars(scratch))

        es = ExperimentStore(scratch, prefix="run_search")

        job_dir = es.new_experiment(name, 0, add_date=add_date, force_fresh=1)
        job_dir.record_environment()

        with open(job_dir.path_for('run_kwargs.json'), 'w') as f:
            json.dump(args, f, default=str, indent=4, sort_keys=True)
        # Remove temporaries so they aren't swept up by the locals() update below.
        del f
        del args

        job_path = job_dir.path
        job_dir.make_directory('experiments')

        input_zip_stem = path_stem(input_zip)
        input_zip = shutil.copy(input_zip, job_dir.path_for("orig.zip"))
        input_zip_abs = process_path(input_zip)
        input_zip_base = os.path.basename(input_zip)
        archive_root = zip_root(input_zip)

        self.copy_files(
            job_dir, input_zip, archive_root,
            ["README.md", "sampled_configs.txt", "config.json", "config.pkl"])

        # storage local to each node, from the perspective of that node
        local_scratch = os.path.join(local_scratch_prefix, os.path.basename(job_path))

        output_to_files = "--output-to-files" if output_to_files else ""

        env = os.environ.copy()

        env_vars = env_vars or {}

        env.update({e: str(v) for e, v in env_vars.items()})
        env_vars = ' '.join('--env ' + k for k in env_vars)

        rsync_verbosity = "" if not rsync_verbosity else "-" + "v" * rsync_verbosity

        ro_job = ReadOnlyJob(input_zip)
        indices_to_run = sorted([op.idx for op in ro_job.ready_incomplete_ops(sort=False)])
        del ro_job
        n_jobs_to_run = len(indices_to_run)
        if n_jobs_to_run == 0:
            print("All jobs are finished! Exiting.")
            return

        dirty_hosts = set()

        if hpc:
            host_pool = []
            n_nodes = max_hosts
            n_procs = n_nodes * ppn
            n_steps = int(np.ceil(n_jobs_to_run / n_procs))
        else:
            self.__dict__.update(locals())

            host_pool = host_pool or DEFAULT_HOST_POOL
            if isinstance(host_pool, str):
                host_pool = host_pool.split()

            # Get an estimate of the number of hosts we'll have available.
            with cd(job_path):
                hosts, n_procs = self.recruit_hosts(
                    hpc, min_hosts, max_hosts, host_pool,
                    ppn, max_procs=np.inf)
            n_nodes = len(hosts)

            if n_jobs_to_run < n_procs:
                n_steps = 1
                n_nodes = int(np.ceil(n_jobs_to_run / ppn))
                n_procs = n_nodes * ppn
                hosts = hosts[:n_nodes]
            else:
                n_steps = int(np.ceil(n_jobs_to_run / n_procs))

        node_file = " --sshloginfile nodefile.txt "

        wall_time_seconds, total_seconds_per_step, parallel_seconds_per_step, python_seconds_per_step = \
            self.compute_time_limits(wall_time, cleanup_time, slack_time, step_time_limit, n_steps)

        self.__dict__.update(locals())

        self.print_time_limits()
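And an illustrative construction of the non-HPC variant, which farms tasks out to a pool of ssh hosts via GNU parallel (falling back to DEFAULT_HOST_POOL when host_pool is not given); the host names and paths below are placeholders:

session = ParallelSession(
    'my_search', 'built_search.zip', '*.cfg', scratch='/data/dps_runs',
    kind='parallel', host_pool=['node01', 'node02'], ppn=8, max_hosts=2)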
Example #6
def view_command(path, verbose):
    """ Implements the `view` sub-command, which prints a summary of a job. """
    job = ReadOnlyJob(path)
    print(job.summary(verbose=verbose))
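A hypothetical way to wire this sub-command into a CLI with argparse; the project's actual argument-parsing plumbing is not shown here:

import argparse

parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers(dest='command')

view_parser = subparsers.add_parser('view', help='Print a summary of a job.')
view_parser.add_argument('path')
view_parser.add_argument('--verbose', action='store_true')

cli_args = parser.parse_args()
if cli_args.command == 'view':
    view_command(cli_args.path, cli_args.verbose)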
Example #7
class HyperSearch(object):
    """ Interface to a directory storing a hyper-parameter search.

    Approximately a `frozen`, read-only handle for a directory created by ParallelSession.

    """
    def __init__(self, path):
        self.path = path

        job_path = os.path.join(path, 'results.zip')
        if not os.path.exists(job_path):
            job_path = os.path.join(path, 'orig.zip')
            assert os.path.exists(job_path)

        self.job = ReadOnlyJob(job_path)

    @property
    def objects(self):
        return self.job.objects

    def dist_keys(self):
        """ The keys that were searched over. """
        distributions = self.objects.load_object('metadata', 'distributions')
        if isinstance(distributions, list):
            keys = set()
            for d in distributions:
                keys |= set(d.keys())
            keys = list(keys)
        else:
            distributions = Config(distributions)
            keys = list(distributions.keys())
        keys.append('idx')

        return sorted(set(keys))

    def dist(self):
        return self.objects.load_object('metadata', 'distributions')

    def sampled_configs(self):
        pass

    @property
    def experiment_paths(self):
        experiments_dir = os.path.join(self.path, 'experiments')
        exp_dirs = os.listdir(experiments_dir)
        return [os.path.join(experiments_dir, ed) for ed in exp_dirs]

    def extract_stage_data(self, fields=None, bare=False):
        """ Extract stage-by-stage data about the training runs.

        Parameters
        ----------
        fields: str or list of str
            Names of fields to extract. If not supplied, all fields are kept.
        bare: boolean
            If True, only returns the data. Otherwise, additionally returns the stage-by-stage config and meta-data.

        Returns
        -------
        A nested data structure containing the requested data.

        {param-setting-key: {(repeat, seed): (df, sc, md)}}

        where:
            df is a pandas DataFrame
            sc is a list giving the config for each stage
            md is a dictionary storing metadata

        """
        stage_data = defaultdict(dict)
        if isinstance(fields, str):
            fields = fields.split()

        config_keys = self.dist_keys()

        KeyTuple = namedtuple(self.__class__.__name__ + "Key", config_keys)

        for exp_path in self.experiment_paths:
            try:
                exp_data = FrozenTrainingLoopData(exp_path)

                md = {}
                md['host'] = exp_data.host
                for k in config_keys:
                    md[k] = exp_data.get_config_value(k)

                sc = []
                records = []
                for stage in exp_data.history:
                    record = stage.copy()

                    stage_config = record['stage_config'].copy()
                    sc.append(stage_config)
                    del record['stage_config']

                    record = AttrDict(record).flatten()

                    if 'best_path' in record:
                        del record['best_path']
                    if 'final_path' in record:
                        del record['final_path']

                    # Fix and filter keys
                    _record = {}
                    for k, v in record.items():
                        if k.startswith("best_"):
                            k = k[5:]

                        if (fields and k in fields) or not fields:
                            _record[k] = v

                    records.append(_record)

                key = KeyTuple(*(exp_data.get_config_value(k)
                                 for k in config_keys))

                repeat = exp_data.get_config_value("repeat")
                seed = exp_data.get_config_value("seed")

                if bare:
                    stage_data[key][(
                        repeat, seed)] = pd.DataFrame.from_records(records)
                else:
                    stage_data[key][(repeat, seed)] = (
                        pd.DataFrame.from_records(records), sc, md)

            except Exception:
                print(
                    "Exception raised while extracting stage data for path: {}"
                    .format(exp_path))
                traceback.print_exc()

        return stage_data

    def extract_step_data(self, mode, fields=None, stage=None):
        """ Extract per-step data across all experiments.

        Parameters
        ----------
        mode: str
            Data-collection mode to extract data from.
        fields: str
            Names of fields to extract data for. If not supplied, data for all
            fields is returned.
        stage: int or slice or tuple
            Specification of the stages to collect data for. If not supplied, data
            from all stages is returned.

        Returns
        -------

        A nested data structure containing the requested data.

        {param-setting-key: {(repeat, seed): pd.DataFrame()}}

        """
        step_data = defaultdict(dict)
        if isinstance(fields, str):
            fields = fields.split()

        config_keys = self.dist_keys()

        KeyTuple = namedtuple(self.__class__.__name__ + "Key", config_keys)

        for exp_path in self.experiment_paths:
            exp_data = FrozenTrainingLoopData(exp_path)

            _step_data = exp_data.step_data(mode, stage)

            if fields:
                try:
                    _step_data = _step_data[fields]
                except KeyError:
                    print("Valid keys are: {}".format(_step_data.keys()))
                    raise

            key = KeyTuple(*(exp_data.get_config_value(k)
                             for k in config_keys))

            repeat = exp_data.get_config_value("repeat")
            seed = exp_data.get_config_value("seed")

            step_data[key][(repeat, seed)] = _step_data

        return step_data

    def print_summary(self,
                      print_config=True,
                      verbose=False,
                      criteria=None,
                      maximize=False):
        """ Get all completed ops, get their outputs. Summarize em. """

        print("Summarizing search stored at {}.".format(
            os.path.realpath(self.path)))

        criteria_key = criteria if criteria else "stopping_criteria"
        if not criteria:
            config = self.objects.load_object('metadata', 'config')
            criteria_key, max_str = config['stopping_criteria'].split(',')
            maximize = max_str == "max"

        keys = self.dist_keys()
        stage_data = self.extract_stage_data()

        best = []
        all_keys = set()

        # For each parameter setting, identify the stage where it got the lowest/highest value for `criteria_key`.
        for i, (key, value) in enumerate(sorted(stage_data.items())):

            _best = []

            for (repeat, seed), (df, sc, md) in value.items():
                try:
                    idx = df[criteria_key].idxmax() if maximize else df[criteria_key].idxmin()
                except KeyError:
                    idx = -1

                record = dict(df.iloc[idx])
                if criteria_key not in record:
                    record[criteria_key] = -np.inf if maximize else np.inf
                for k in keys:
                    record[k] = md[k]

                all_keys |= record.keys()

                _best.append(record)

            _best = pd.DataFrame.from_records(_best)
            _best = _best.sort_values(criteria_key)
            sc = _best[criteria_key].mean()
            best.append((sc, _best))

        best = sorted(best, reverse=not maximize, key=lambda x: x[0])
        best = [df for _, df in best]

        _column_order = [criteria_key, 'seed', 'reason', 'n_steps', 'host']
        column_order = [c for c in _column_order if c in all_keys]
        remaining = [
            k for k in all_keys if k not in column_order and k not in keys
        ]
        column_order = column_order + sorted(remaining)

        with pd.option_context('display.max_rows', None, 'display.max_columns',
                               None):
            print('\n' + '*' * 100)
            direction = "DECREASING" if not maximize else "INCREASING"
            print(
                "RESULTS GROUPED BY PARAM VALUES, ORDER OF {} VALUE OF <{}>: ".
                format(direction, criteria_key))

            for i, b in enumerate(best):
                print('\n {}-th {} '.format(
                    len(best) - i, "lowest" if not maximize else "highest") +
                      '*' * 40)
                pprint({k: b[k].iloc[0] for k in keys})
                _column_order = [c for c in column_order if c in b.keys()]
                b = b[_column_order]
                with_stats = pd.merge(b.transpose(),
                                      b.describe().transpose(),
                                      left_index=True,
                                      right_index=True,
                                      how='outer')

                profile_rows = [
                    k for k in with_stats.index
                    if 'time' in k or 'duration' in k or 'memory' in k
                ]
                other_rows = [
                    k for k in with_stats.index if k not in profile_rows
                ]

                print(
                    tabulate(with_stats.loc[profile_rows],
                             headers='keys',
                             tablefmt='fancy_grid'))
                print(
                    tabulate(with_stats.loc[other_rows],
                             headers='keys',
                             tablefmt='fancy_grid'))

        if print_config:
            print('\n' + '*' * 100)
            print("BASE CONFIG")
            print(self.objects.load_object('metadata', 'config'))

            print('\n' + '*' * 100)
            print("PARAMETER DISTRIBUTION")
            pprint(self.dist())

        print(self.job.summary(verbose=verbose))
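Finally, a minimal usage sketch tying the HyperSearch methods together; the path, data-collection mode and field names are placeholders:

search = HyperSearch('/path/to/search/dir')
print(search.dist_keys())

# Per-stage records, keeping only a couple of (hypothetical) fields.
stage_data = search.extract_stage_data(fields='loss n_steps')

# Per-step records for one (hypothetical) data-collection mode.
step_data = search.extract_step_data('val', fields='loss')

search.print_summary(print_config=False, verbose=False)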