Example No. 1
    def __call__(self, inp):
        import os
        import datetime
        import dps
        from dps import cfg  # noqa
        from dps.config import DEFAULT_CONFIG
        from dps.utils import ExperimentStore
        os.nice(10)

        print("Entered _BuildDataset at: ")
        print(datetime.datetime.now())

        idx, seed, n_examples = inp
        print("idx: {}, seed: {}, n_examples: {}".format(idx, seed, n_examples))

        dps.reset_config()
        params = self.params.copy()
        params.update(seed=seed, n_examples=n_examples)

        with DEFAULT_CONFIG.copy():
            cfg.update_from_command_line()
            print(cfg)

            experiment_store = ExperimentStore(os.path.join(cfg.local_experiments_dir, cfg.env_name))
            exp_dir = experiment_store.new_experiment("", seed, add_date=1, force_fresh=1, update_latest=False)
            params["data_dir"] = exp_dir.path

            print(params)

            self.cls(**params)

        print("Leaving _BuildDataset at: ")
        print(datetime.datetime.now())
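
For context, a _BuildDataset instance is meant to be mapped over shard descriptions by a Job, as in Example No. 7 below. A minimal sketch, where MyDataset and the shard tuples are hypothetical:

    inputs = [(0, 1234, 500), (1, 5678, 500)]  # (idx, seed, n_examples) per shard
    job = Job(exp_dir.path)
    job.map(_BuildDataset(MyDataset, param_values), inputs)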
Example No. 2
    def __init__(
            self, name, input_zip, pattern, scratch, local_scratch_prefix='/tmp/dps/hyper/',
            n_nodes=1, tasks_per_node=12, cpus_per_task=1, mem_per_cpu="", gpu_set="",
            wall_time="1hour", cleanup_time="1min", slack_time="1min",
            add_date=True, dry_run=0, kind="slurm", env_vars=None, output_to_files=True, n_retries=0,
            copy_venv="", step_time_limit=None, ignore_gpu=False, ssh_options=None,
            loud_output=True, rsync_verbosity=0, copy_locally=True):

        args = locals().copy()
        del args['self']

        print("\nParallelSession args:")
        print(args)

        launch_venv = os.getenv('VIRTUAL_ENV')
        if launch_venv:
            launch_venv = os.path.split(launch_venv)[1]

        if ssh_options is None:
            ssh_options = (
                "-oPasswordAuthentication=no "
                "-oStrictHostKeyChecking=no "
                "-oConnectTimeout=5 "
                "-oServerAliveInterval=2"
            )

        if kind == "pbs":
            local_scratch_prefix = "\\$RAMDISK"

        assert kind in "slurm slurm-local".split()

        # Create directory to run the job from - should be on scratch.
        scratch = os.path.abspath(os.path.expandvars(scratch))

        es = ExperimentStore(scratch, prefix="run")

        job_dir = es.new_experiment(name, 0, add_date=add_date, force_fresh=1)
        job_dir.record_environment()

        with open(job_dir.path_for('run_kwargs.json'), 'w') as f:
            json.dump(args, f, default=str, indent=4, sort_keys=True)
        del f
        del args

        job_path = job_dir.path
        job_dir.make_directory('experiments')

        input_zip_stem = path_stem(input_zip)
        input_zip = shutil.copy(input_zip, job_dir.path_for("orig.zip"))
        input_zip_abs = process_path(input_zip)
        input_zip_base = os.path.basename(input_zip)
        archive_root = zip_root(input_zip)

        self.copy_files(
            job_dir, input_zip, archive_root,
            ["README.md", "sampled_configs.txt", "config.json", "config.pkl"])

        # storage local to each node, from the perspective of that node
        local_scratch = os.path.join(local_scratch_prefix, os.path.basename(job_path))

        output_to_files = "--output-to-files" if output_to_files else ""

        env = os.environ.copy()

        env_vars = env_vars or {}

        env.update({e: str(v) for e, v in env_vars.items()})
        env_vars = ' '.join('--env ' + k for k in env_vars)

        rsync_verbosity = "" if not rsync_verbosity else "-" + "v" * rsync_verbosity

        ro_job = ReadOnlyJob(input_zip)
        indices_to_run = sorted([op.idx for op in ro_job.ready_incomplete_ops(sort=False)])
        del ro_job
        n_jobs_to_run = len(indices_to_run)
        if n_jobs_to_run == 0:
            print("All jobs are finished! Exiting.")
            return

        dirty_hosts = set()

        n_tasks_per_step = n_nodes * tasks_per_node
        n_steps = int(np.ceil(n_jobs_to_run / n_tasks_per_step))

        node_file = " --sshloginfile nodefile.txt "

        wall_time_seconds, total_seconds_per_step, parallel_seconds_per_step, python_seconds_per_step = \
            self.compute_time_limits(wall_time, cleanup_time, slack_time, step_time_limit, n_steps)

        self.__dict__.update(locals())

        self.print_time_limits()
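
compute_time_limits is not shown in this excerpt. A rough sketch of what its outputs plausibly mean, judging only from the argument and result names (the equal per-step split and the subtraction order are assumptions, not the project's actual logic):

    def compute_time_limits_sketch(wall_time_s, cleanup_time_s, slack_time_s, n_steps):
        # Assumed budget: each step gets an equal share of the wall time;
        # the parallel executor stops early enough to leave cleanup time,
        # and the python processes stop slightly earlier still, leaving slack.
        total_seconds_per_step = wall_time_s // n_steps
        parallel_seconds_per_step = total_seconds_per_step - cleanup_time_s
        python_seconds_per_step = parallel_seconds_per_step - slack_time_s
        return (wall_time_s, total_seconds_per_step,
                parallel_seconds_per_step, python_seconds_per_step)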
Example No. 3
    def __init__(
            self, name, input_zip, pattern, scratch, local_scratch_prefix='/tmp/dps/hyper/', ppn=12, cpp=1,
            pmem=None, wall_time="1hour", cleanup_time="1min", slack_time="1min", add_date=True, dry_run=0,
            parallel_exe=None, kind="parallel", host_pool=None, load_avg_threshold=8., min_hosts=None,
            max_hosts=1, env_vars=None, output_to_files=True, n_retries=0, gpu_set="", copy_venv="",
            python_startup=False, step_time_limit=None, ignore_gpu=False, ssh_options=None, loud_output=True,
            rsync_verbosity=0):

        args = locals().copy()
        del args['self']

        print("\nParallelSession args:")
        print(args)

        launch_venv = os.getenv('VIRTUAL_ENV')
        if launch_venv:
            launch_venv = os.path.split(launch_venv)[1]

        if not parallel_exe:
            parallel_exe = "$HOME/.local/bin/parallel"

        if ssh_options is None:
            ssh_options = (
                "-oPasswordAuthentication=no "
                "-oStrictHostKeyChecking=no "
                "-oConnectTimeout=5 "
                "-oServerAliveInterval=2"
            )

        if kind == "pbs":
            local_scratch_prefix = "\\$RAMDISK"

        assert kind in "parallel pbs slurm slurm-local".split()
        hpc = kind != "parallel"

        # Create directory to run the job from - should be on scratch.
        scratch = os.path.abspath(os.path.expandvars(scratch))

        es = ExperimentStore(scratch, prefix="run_search")

        job_dir = es.new_experiment(name, 0, add_date=add_date, force_fresh=1)
        job_dir.record_environment()

        with open(job_dir.path_for('run_kwargs.json'), 'w') as f:
            json.dump(args, f, default=str, indent=4, sort_keys=True)
        del f
        del args

        job_path = job_dir.path
        job_dir.make_directory('experiments')

        input_zip_stem = path_stem(input_zip)
        input_zip = shutil.copy(input_zip, job_dir.path_for("orig.zip"))
        input_zip_abs = process_path(input_zip)
        input_zip_base = os.path.basename(input_zip)
        archive_root = zip_root(input_zip)

        self.copy_files(
            job_dir, input_zip, archive_root,
            ["README.md", "sampled_configs.txt", "config.json", "config.pkl"])

        # storage local to each node, from the perspective of that node
        local_scratch = os.path.join(local_scratch_prefix, os.path.basename(job_path))

        output_to_files = "--output-to-files" if output_to_files else ""

        env = os.environ.copy()

        env_vars = env_vars or {}

        env.update({e: str(v) for e, v in env_vars.items()})
        env_vars = ' '.join('--env ' + k for k in env_vars)

        rsync_verbosity = "" if not rsync_verbosity else "-" + "v" * rsync_verbosity

        ro_job = ReadOnlyJob(input_zip)
        indices_to_run = sorted([op.idx for op in ro_job.ready_incomplete_ops(sort=False)])
        del ro_job
        n_jobs_to_run = len(indices_to_run)
        if n_jobs_to_run == 0:
            print("All jobs are finished! Exiting.")
            return

        dirty_hosts = set()

        if hpc:
            host_pool = []
            n_nodes = max_hosts
            n_procs = n_nodes * ppn
            n_steps = int(np.ceil(n_jobs_to_run / n_procs))
        else:
            self.__dict__.update(locals())

            host_pool = host_pool or DEFAULT_HOST_POOL
            if isinstance(host_pool, str):
                host_pool = host_pool.split()

            # Get an estimate of the number of hosts we'll have available.
            with cd(job_path):
                hosts, n_procs = self.recruit_hosts(
                    hpc, min_hosts, max_hosts, host_pool,
                    ppn, max_procs=np.inf)
            n_nodes = len(hosts)

            if n_jobs_to_run < n_procs:
                n_steps = 1
                n_nodes = int(np.ceil(n_jobs_to_run / ppn))
                n_procs = n_nodes * ppn
                hosts = hosts[:n_nodes]
            else:
                n_steps = int(np.ceil(n_jobs_to_run / n_procs))

        node_file = " --sshloginfile nodefile.txt "

        wall_time_seconds, total_seconds_per_step, parallel_seconds_per_step, python_seconds_per_step = \
            self.compute_time_limits(wall_time, cleanup_time, slack_time, step_time_limit, n_steps)

        self.__dict__.update(locals())

        self.print_time_limits()
Example No. 4
def build_search(path,
                 name,
                 distributions,
                 config,
                 n_repeats,
                 n_param_settings=None,
                 _zip=True,
                 add_date=0,
                 do_local_test=True,
                 readme=""):
    """ Create a job implementing a hyper-parameter search.

    Parameters
    ----------
    path: str
        Path to the directory where the search archive will be saved.
    name: str
        Name for the search.
    distributions: dict (str -> (list or distribution))
        Distributions to sample from. Can also be a list of samples.
    config: Config instance
        The base configuration.
    n_repeats: int
        Number of different random seeds to run each sample with.
    n_param_settings: int
        Number of parameter settings to sample. If not supplied, all
        possibilities are generated.
    _zip: bool
        Whether to zip the created search directory.
    add_date: bool
        Whether to add the date to the name of the experiment directory.
    do_local_test: bool
        If True, run a short test using one of the sampled
        configs on the local machine to catch any dumb errors
        before starting the real experiment.
    readme: str
        String specifying the context/purpose of the search.

    """
    if config.get('seed', None) is None:
        config.seed = gen_seed()

    with NumpySeed(config.seed):
        es = ExperimentStore(path, prefix="build_search")

        count = 0
        base_name = name
        has_built = False
        while not has_built:
            try:
                exp_dir = es.new_experiment(name,
                                            config.seed,
                                            add_date=add_date,
                                            force_fresh=1)
                has_built = True
            except FileExistsError:
                name = "{}_{}".format(base_name, count)
                count += 1

        if readme:
            with open(exp_dir.path_for('README.md'), 'w') as f:
                f.write(readme)

        print(config)
        exp_dir.record_environment(config=config)

        print("Building parameter search at {}.".format(exp_dir.path))

        job = Job(exp_dir.path)

        new_configs = sample_configs(distributions, n_repeats,
                                     n_param_settings)

        with open(exp_dir.path_for("sampled_configs.txt"), "w") as f:
            f.write("\n".join("idx={}: {}".format(c["idx"], pformat(c))
                              for c in new_configs))

        print("{} configs were sampled for parameter search.".format(
            len(new_configs)))

        if do_local_test:
            print("\nStarting local test " + ("=" * 80))
            test_config = new_configs[0].copy()
            test_config.update(max_steps=1000, render_hook=None)
            _RunTrainingLoop(config)(test_config)
            print("Done local test " + ("=" * 80) + "\n")

        job.map(_RunTrainingLoop(config.copy()), new_configs)

        job.save_object('metadata', 'distributions', distributions)
        job.save_object('metadata', 'config', config)

        print(job.summary())

        if _zip:
            path = job.zip(delete=True)
            print("Zipped {} as {}.".format(exp_dir.path, path))
        else:
            path = exp_dir.path

        return path, len(new_configs)
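
A hypothetical invocation of build_search (the path, distribution values, and base_config are illustrative; base_config is assumed to be a Config instance):

    distributions = dict(lr=[1e-3, 1e-4], batch_size=[32, 64])
    archive_path, n_configs = build_search(
        "/tmp/searches", "lr_sweep", distributions, base_config,
        n_repeats=3, n_param_settings=4, do_local_test=False,
        readme="Sweep over learning rate and batch size.")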
Example No. 5
File: train.py Project: alcinos/dps
    def run(self, start_time):
        """ Run the training loop.

        Parameters
        ----------
        start_time: float or None
            Start time (in seconds since epoch) for measuring elapsed time
            when deciding whether to interrupt the training loop. If None,
            the current time is used.

        """
        if start_time is None:
            start_time = time.time()
        self.start_time = start_time

        self.timestamp("Entering TrainingLoop.run")

        prepare_func = cfg.get("prepare_func", None)
        if callable(prepare_func):
            prepare_func()  # Modify the config in arbitrary ways before training
        else:
            try:
                prepare_funcs = list(prepare_func)
            except (TypeError, ValueError):
                pass
            else:
                for f in prepare_funcs:
                    if callable(f):
                        f()

        self.curriculum = cfg.curriculum + []

        if cfg.seed is None or cfg.seed < 0:
            cfg.seed = gen_seed()

        # Create a directory to store the results of the training session.
        self.experiment_store = ExperimentStore(os.path.join(cfg.local_experiments_dir, cfg.env_name))
        exp_dir = self.experiment_store.new_experiment(
            self.exp_name, cfg.seed, add_date=1, force_fresh=1, update_latest=False)
        self.exp_dir = exp_dir
        cfg.path = exp_dir.path

        breaker = "-" * 40
        header = "{}\nREADME.md - {}\n{}\n\n\n".format(breaker, os.path.basename(exp_dir.path), breaker)
        readme = header + (cfg.readme if cfg.readme else "") + "\n\n"

        with open(exp_dir.path_for('README.md'), 'w') as f:
            f.write(readme)

        self.data = _TrainingLoopData(exp_dir)
        self.data.setup()

        frozen_data = None

        with ExitStack() as stack:
            if cfg.pdb:
                stack.enter_context(pdb_postmortem())
                print("`pdb` is turned on, so forcing robust=False")
                cfg.robust = False

            stack.enter_context(redirect_stream('stdout', self.data.path_for('stdout'), tee=cfg.tee))
            stack.enter_context(redirect_stream('stderr', self.data.path_for('stderr'), tee=cfg.tee))

            print("\n\n" + "=" * 80)
            self.timestamp("Starting training run (name={})".format(self.exp_name))

            print("\nDirectory for this training run is {}.".format(exp_dir.path))

            stack.enter_context(NumpySeed(cfg.seed))
            print("\nSet numpy random seed to {}.\n".format(cfg.seed))

            limiter = time_limit(
                self.time_remaining, verbose=True,
                timeout_callback=lambda limiter: print("Training run exceeded its time limit."))

            self.mpi_context = MPI_MasterContext(cfg.get('n_procs', 1), exp_dir)

            try:
                with limiter:
                    self._run()

            finally:
                self.data.summarize()

                self.timestamp("Done training run (name={})".format(self.exp_name))
                print("=" * 80)
                print("\n\n")

                frozen_data = self.data.freeze()

        self.timestamp("Leaving TrainingLoop.run")

        return frozen_data
Example No. 6
File: train.py Project: alcinos/dps
class TrainingLoop(object):
    """ A training loop.

    The behaviour of the training loop depends on the context stack that is active when it is
    run (i.e. `run` method is called), not the one that is active when it is created.

    Parameters
    ----------
    exp_name: str
        Name of the experiment, used as a prefix when creating a directory for storing data
        generated by the training run.

    """
    def __init__(self, exp_name=''):
        self.exp_name = exp_name or cfg.exp_name
        self.start_time = None

    @property
    def time_remaining(self):
        if cfg.max_time is None or cfg.max_time <= 0:
            return np.inf
        else:
            elapsed_time = time.time() - self.start_time
            return cfg.max_time - elapsed_time

    def edit_remaining_stage(self, idx, stage_config):
        if len(self.curriculum_remaining) < idx+1:
            for i in range(idx+1 - len(self.curriculum_remaining)):
                self.curriculum_remaining.append(dict())

        self.curriculum_remaining[idx].update(stage_config)

    def timestamp(self, message):
        print("{} ({}, {:.2f}s elapsed, {:.2f}s remaining)".format(
            message,
            datetime.datetime.now(),
            time.time() - self.start_time,
            self.time_remaining))

    def run(self, start_time):
        """ Run the training loop.

        Parameters
        ----------
        start_time: float or None
            Start time (in seconds since epoch) for measuring elapsed time
            when deciding whether to interrupt the training loop. If None,
            the current time is used.

        """
        if start_time is None:
            start_time = time.time()
        self.start_time = start_time

        self.timestamp("Entering TrainingLoop.run")

        prepare_func = cfg.get("prepare_func", None)
        if callable(prepare_func):
            prepare_func()  # Modify the config in arbitrary ways before training
        else:
            try:
                prepare_funcs = list(prepare_func)
            except (TypeError, ValueError):
                pass
            else:
                for f in prepare_funcs:
                    if callable(f):
                        f()

        self.curriculum = cfg.curriculum + []

        if cfg.seed is None or cfg.seed < 0:
            cfg.seed = gen_seed()

        # Create a directory to store the results of the training session.
        self.experiment_store = ExperimentStore(os.path.join(cfg.local_experiments_dir, cfg.env_name))
        exp_dir = self.experiment_store.new_experiment(
            self.exp_name, cfg.seed, add_date=1, force_fresh=1, update_latest=False)
        self.exp_dir = exp_dir
        cfg.path = exp_dir.path

        breaker = "-" * 40
        header = "{}\nREADME.md - {}\n{}\n\n\n".format(breaker, os.path.basename(exp_dir.path), breaker)
        readme = header + (cfg.readme if cfg.readme else "") + "\n\n"

        with open(exp_dir.path_for('README.md'), 'w') as f:
            f.write(readme)

        self.data = _TrainingLoopData(exp_dir)
        self.data.setup()

        frozen_data = None

        with ExitStack() as stack:
            if cfg.pdb:
                stack.enter_context(pdb_postmortem())
                print("`pdb` is turned on, so forcing robust=False")
                cfg.robust = False

            stack.enter_context(redirect_stream('stdout', self.data.path_for('stdout'), tee=cfg.tee))
            stack.enter_context(redirect_stream('stderr', self.data.path_for('stderr'), tee=cfg.tee))

            print("\n\n" + "=" * 80)
            self.timestamp("Starting training run (name={})".format(self.exp_name))

            print("\nDirectory for this training run is {}.".format(exp_dir.path))

            stack.enter_context(NumpySeed(cfg.seed))
            print("\nSet numpy random seed to {}.\n".format(cfg.seed))

            limiter = time_limit(
                self.time_remaining, verbose=True,
                timeout_callback=lambda limiter: print("Training run exceeded its time limit."))

            self.mpi_context = MPI_MasterContext(cfg.get('n_procs', 1), exp_dir)

            try:
                with limiter:
                    self._run()

            finally:
                self.data.summarize()

                self.timestamp("Done training run (name={})".format(self.exp_name))
                print("=" * 80)
                print("\n\n")

                frozen_data = self.data.freeze()

        self.timestamp("Leaving TrainingLoop.run")

        return frozen_data

    def _run(self):
        print(cfg.to_string())

        threshold_reached = True
        self.global_step = 0
        self.n_global_experiences = 0
        self.curriculum_remaining = self.curriculum + []
        self.curriculum_complete = []

        stage_idx = 0
        while self.curriculum_remaining:
            print("\n" + "=" * 50)
            self.timestamp("Starting stage {}".format(stage_idx))
            print("\n")

            if cfg.start_tensorboard:
                restart_tensorboard(self.experiment_store.path, cfg.tbport, cfg.reload_interval)

            stage_config = self.curriculum_remaining.pop(0)
            stage_config = Config(stage_config)

            self.data.start_stage(stage_idx, stage_config)

            with ExitStack() as stack:

                # --------------- Stage set-up -------------------

                print("\n" + "-" * 10 + " Stage set-up " + "-" * 10)

                print("\nNew config values for this stage are: \n{}\n".format(pformat(stage_config)))
                stack.enter_context(stage_config)

                stage_prepare_func = cfg.get("stage_prepare_func", None)
                if callable(stage_prepare_func):
                    stage_prepare_func()  # Modify the stage config in arbitrary ways before starting stage

                self.mpi_context.start_stage()

                # Configure and create session and graph for stage.
                session_config = tf.ConfigProto()
                session_config.intra_op_parallelism_threads = cfg.get('intra_op_parallelism_threads', 0)
                session_config.inter_op_parallelism_threads = cfg.get('inter_op_parallelism_threads', 0)
                session_config.log_device_placement = cfg.get('log_device_placement', 0)

                if cfg.use_gpu:
                    per_process_gpu_memory_fraction = getattr(cfg, 'per_process_gpu_memory_fraction', None)
                    if per_process_gpu_memory_fraction:
                        session_config.gpu_options.per_process_gpu_memory_fraction = per_process_gpu_memory_fraction

                    gpu_allow_growth = getattr(cfg, 'gpu_allow_growth', None)
                    if gpu_allow_growth:
                        session_config.gpu_options.allow_growth = gpu_allow_growth

                    print("Using GPU if available.")
                    print("Using {}% of GPU memory.".format(
                        100 * session_config.gpu_options.per_process_gpu_memory_fraction))
                    print("Allowing growth of GPU memory: {}".format(session_config.gpu_options.allow_growth))

                graph = tf.Graph()
                sess = tf.Session(graph=graph, config=session_config)

                # This HAS to come after the creation of the session, otherwise
                # it allocates all GPU memory if using the GPU.
                print("\nAvailable devices: ")
                from tensorflow.python.client import device_lib
                print(device_lib.list_local_devices())

                if not cfg.use_gpu:
                    print("Not using GPU.")
                    stack.enter_context(graph.device("/cpu:0"))

                stack.enter_context(graph.as_default())
                stack.enter_context(sess)
                stack.enter_context(sess.as_default())

                # Set the seed for the stage. Notice we generate a new tf seed for each stage.
                tf_seed = gen_seed()
                print("Setting tensorflow seed to generated seed: {}\n".format(tf_seed))
                tf.set_random_seed(tf_seed)

                # Set limit on CPU RAM for the stage
                cpu_ram_limit_mb = cfg.get("cpu_ram_limit_mb", None)
                if cpu_ram_limit_mb is not None:
                    stack.enter_context(memory_limit(cfg.cpu_ram_limit_mb))

                print("Building env...\n")

                # Maybe build env
                if stage_idx == 0 or not cfg.preserve_env:
                    if getattr(self, 'env', None):
                        self.env.close()

                    self.env = cfg.build_env()

                if hasattr(self.env, "print_memory_footprint"):
                    self.env.print_memory_footprint()

                print("\nDone building env.\n")
                print("Building updater...\n")

                import warnings
                with warnings.catch_warnings():
                    warnings.simplefilter('once')

                    if cfg.n_procs > 1:
                        updater = cfg.get_updater(self.env, mpi_context=self.mpi_context)
                    else:
                        updater = cfg.get_updater(self.env)

                    updater.stage_idx = stage_idx
                    updater.exp_dir = self.exp_dir

                    updater.build_graph()
                    print("\nDone building updater.\n")

                walk_variable_scopes(max_depth=3)

                # Maybe initialize network weights.
                # Let a *path_specification* be one of three things:
                #     1. An integer specifying a stage to load the best hypothesis from.
                #     2. A string of format: "stage_idx,kind" where `stage_idx` specifies a stage to load from
                #        and `kind` is either "final" or "best", specifying whether to load final or best
                #        hypothesis from that stage.
                #     3. A path on the filesystem that gives a prefix for a tensorflow checkpoint file to load from.
                #
                # Then cfg.load_path can either be a path_specification itself, in which case all variables
                # in the network will be loaded from that path_specification, or a dictionary mapping from
                # variable scope names to path specifications, in which case all variables in each supplied
                # variable scope name will be loaded from the path_specification paired with that scope name.
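                # Illustrative (hypothetical) values covering these forms:
                #     cfg.load_path = 2                # best hypothesis from stage 2
                #     cfg.load_path = "3,final"        # final hypothesis from stage 3
                #     cfg.load_path = "/ckpts/model"   # tensorflow checkpoint prefix
                #     cfg.load_path = {"encoder": 0, "decoder": "/ckpts/dec"}  # per-scope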
                load_path = cfg.load_path
                if load_path is not None:
                    if isinstance(load_path, str) or isinstance(load_path, int):
                        load_path = {"": load_path}

                    load_path = dict(load_path)

                    # Sort in increasing order, so that it if one variable scope lies within another scope,
                    # the outer scope gets loaded before the inner scope, rather than having the outer scope
                    # wipe out the inner scope.
                    items = sorted(load_path.items())

                    for var_scope, path in items:
                        variables = {v.name: v for v in trainable_variables(var_scope, for_opt=False)}
                        if not variables:
                            print("No variables to load in scope {}.".format(str(var_scope)))
                            continue

                        saver = tf.train.Saver(variables)

                        load_stage, kind = None, None

                        if isinstance(path, int):
                            load_stage = path
                            kind = "best"
                        elif isinstance(path, str):
                            try:
                                split = path.split(',')
                                load_stage = int(split[0])
                                kind = split[1] if len(split) > 1 else 'best'
                                assert kind in 'best final'.split(), "path={}".format(path)
                            except Exception:
                                load_stage, kind = None, None

                        if load_stage is not None:
                            if stage_idx == 0:
                                print(
                                    "Not loading var scope \"{}\" from stage {}, "
                                    "currently in stage 0.".format(var_scope, load_stage))
                                continue
                            else:
                                key = kind + '_path'
                                completed_history = self.data.history[:-1]
                                path = completed_history[load_stage][key]

                        path = os.path.realpath(path)

                        saver.restore(tf.get_default_session(), path)

                        print("Loading var scope \"{}\" from {}.".format(var_scope, path))
                else:
                    print("Using a fresh set of weights, not loading anything.")

                tf.train.get_or_create_global_step()
                sess.run(uninitialized_variables_initializer())
                sess.run(tf.assert_variables_initialized())

                for hook in cfg.hooks:
                    assert isinstance(hook, Hook)
                    hook.start_stage(self, updater, stage_idx)

                threshold_reached = False
                reason = None

                try:
                    # --------------- Run stage -------------------

                    start = time.time()
                    phys_memory_before = memory_usage(physical=True)
                    gpu_memory_before = gpu_memory_usage()

                    threshold_reached, reason = self._run_stage(stage_idx, updater)

                except KeyboardInterrupt:
                    reason = "User interrupt"

                except NotImplementedError as e:
                    # There is a bug in pdb_postmortem that prevents instances of `NotImplementedError`
                    # from being handled properly, so replace it with an instance of `Exception`.
                    if cfg.robust:
                        traceback.print_exc()
                        reason = "Exception occurred ({})".format(repr(e))
                    else:
                        raise Exception("NotImplemented") from e

                except Exception as e:
                    reason = "Exception occurred ({})".format(repr(e))
                    if cfg.robust:
                        traceback.print_exc()
                    else:
                        raise

                except Alarm:
                    reason = "Time limit exceeded"
                    raise

                finally:
                    phys_memory_after = memory_usage(physical=True)
                    gpu_memory_after = gpu_memory_usage()

                    self.data.record_values_for_stage(
                        stage_duration=time.time()-start,
                        phys_memory_before_mb=phys_memory_before,
                        phys_memory_delta_mb=phys_memory_after - phys_memory_before,
                        gpu_memory_before_mb=gpu_memory_before,
                        gpu_memory_delta_mb=gpu_memory_after - gpu_memory_before
                    )

                    self.data.record_values_for_stage(reason=reason)

                    print("\n" + "-" * 10 + " Optimization complete " + "-" * 10)
                    print("\nReason: {}.\n".format(reason))

                    final_path = self.data.path_for('weights/final_for_stage_{}'.format(stage_idx))
                    final_path = cfg.get('save_path', final_path)
                    final_path = updater.save(tf.get_default_session(), final_path)
                    self.data.record_values_for_stage(final_path=final_path)

                    # --------------- Maybe render performance of best hypothesis -------------------

                    do_final_testing = (
                        "Exception occurred" not in reason
                        and reason != "Time limit exceeded"
                        and 'best_path' in self.data.current_stage_record)

                    if do_final_testing:
                        try:
                            print("\n" + "-" * 10 + " Final testing/rendering " + "-" * 10)

                            print("Best hypothesis for this stage was found on "
                                  "step (l: {best_local_step}, g: {best_global_step}) "
                                  "with stopping criteria ({sc_name}) of {best_stopping_criteria}.".format(
                                      sc_name=self.stopping_criteria_name, **self.data.current_stage_record))

                            best_path = self.data.current_stage_record['best_path']
                            print("Loading best hypothesis for this stage "
                                  "from file {}...".format(best_path))
                            updater.restore(sess, best_path)

                            test_record = updater.evaluate(cfg.batch_size, mode="test")

                            for hook in cfg.hooks:
                                if hook.call_per_timestep and hook.final:
                                    hook_record = hook.step(self, updater)

                                    if hook_record:
                                        assert len(hook_record) == 1
                                        for k, d in dict(hook_record).items():
                                            test_record.update(d)

                            self.data.record_values_for_stage(
                                **{'_test_' + k: v for k, v in test_record.items()})

                            if cfg.render_step > 0 and cfg.render_hook is not None:
                                print("Rendering...")
                                cfg.render_hook(updater)
                                print("Done rendering.")

                        except BaseException:
                            print("Exception occurred while performing final testing/rendering: ")
                            traceback.print_exc()

                    else:
                        print("\n" + "-" * 10 + " Skipping final testing/rendering " + "-" * 10)

                    # --------------- Finish up the stage -------------------

                    self.data.end_stage(updater.n_updates)

                    print("\n" + "-" * 10 + " Running end-of-stage hooks " + "-" * 10 + "\n")
                    for hook in cfg.hooks:
                        hook.end_stage(self, stage_idx)

                    print()
                    self.timestamp("Done stage {}".format(stage_idx))
                    print("=" * 50)

                    stage_idx += 1
                    self.curriculum_complete.append(stage_config)

                if not (threshold_reached or cfg.power_through):
                    print("Failed to reach stopping criteria threshold on stage {} "
                          "of the curriculum, terminating.".format(stage_idx))
                    break

    def _run_stage(self, stage_idx, updater):
        """ Run main training loop for a stage of the curriculum. """

        threshold_reached = False
        reason = "NotStarted"

        # Parse stopping criteria, set up early stopping
        stopping_criteria = cfg.get("stopping_criteria", None)
        if not stopping_criteria:
            stopping_criteria = updater.stopping_criteria

        if isinstance(stopping_criteria, str):
            stopping_criteria = stopping_criteria.split(",")

        self.stopping_criteria_name = stopping_criteria[0]
        if "max" in stopping_criteria[1]:
            self.maximize_sc = True
        elif "min" in stopping_criteria[1]:
            self.maximize_sc = False
        else:
            raise Exception("Ambiguous stopping criteria specification: {}".format(stopping_criteria[1]))
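        # For illustration (hypothetical value): stopping_criteria = "loss,min"
        # gives stopping_criteria_name == "loss" and maximize_sc == False.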

        early_stop = EarlyStopHook(patience=cfg.patience, maximize=self.maximize_sc)

        # Start stage
        print("\n" + "-" * 10 + " Training begins " + "-" * 10)
        self.timestamp("")
        print()

        total_hooks_time = 0.0
        time_per_hook = 0.0

        total_eval_time = 0.0
        time_per_eval = 0.0

        total_train_time = 0.0
        time_per_example = 0.0
        time_per_update = 0.0

        n_eval = 0

        while True:
            # Check whether to keep training
            if updater.n_updates >= cfg.max_steps:
                reason = "Maximum number of steps-per-stage reached"
                break

            if updater.n_experiences >= cfg.max_experiences:
                reason = "Maximum number of experiences-per-stage reached"
                break

            local_step = updater.n_updates
            global_step = self.global_step

            if local_step > 0 and local_step % cfg.checkpoint_step == 0:
                self.data.dump_data(local_step)

            evaluate = (local_step % cfg.eval_step) == 0
            display = (local_step % cfg.display_step) == 0
            render = (cfg.render_step > 0
                      and (local_step % cfg.render_step) == 0
                      and (local_step > 0 or cfg.render_first))

            data_to_store = []

            # --------------- Run hooks -------------------

            hooks_start = time.time()

            for hook in cfg.hooks:
                if hook.call_per_timestep:
                    run_hook = local_step == 0 and hook.initial
                    run_hook |= local_step > 0 and local_step % hook.n == 0

                    if run_hook:
                        hook_record = hook.step(self, updater, local_step)

                        if hook_record:
                            data_to_store.extend(dict(hook_record).items())

            hooks_duration = time.time() - hooks_start

            if render and cfg.render_hook is not None:
                print("Rendering...")
                cfg.render_hook(updater)
                print("Done rendering.")

            if display:
                print("Displaying...")
                self.data.summarize_current_stage(
                    local_step, global_step, updater.n_experiences, self.n_global_experiences)
                print("\nMy PID: {}\n".format(os.getpid()))
                print("Physical memory use: {}mb".format(memory_usage(physical=True)))
                print("Virtual memory use: {}mb".format(memory_usage(physical=False)))

                print("Avg time per update: {}s".format(time_per_update))
                print("Avg time per eval: {}s".format(time_per_eval))
                print("Avg time for hooks: {}s".format(time_per_hook))

                if cfg.use_gpu:
                    print(nvidia_smi())

            # --------------- Possibly evaluate -------------------

            if evaluate:
                print("Evaluating...")
                eval_start_time = time.time()
                val_record = updater.evaluate(cfg.batch_size, mode="val")
                eval_duration = time.time() - eval_start_time
                print("Done evaluating")

                val_record["duration"] = eval_duration

                n_eval += 1
                total_eval_time += eval_duration
                time_per_eval = total_eval_time / n_eval

                data_to_store.append(("val", val_record))

                if self.stopping_criteria_name not in val_record:
                    print("Stopping criteria {} not in record returned "
                          "by updater, using 0.0.".format(self.stopping_criteria_name))

                stopping_criteria = val_record.get(self.stopping_criteria_name, 0.0)
                new_best, stop = early_stop.check(stopping_criteria, local_step, val_record)

                if new_best:
                    print("Storing new best on step (l={}, g={}), "
                          "constituting (l={}, g={}) experiences, "
                          "with stopping criteria ({}) of {}.".format(
                              local_step, global_step,
                              updater.n_experiences, self.n_global_experiences,
                              self.stopping_criteria_name, stopping_criteria))

                    best_path = self.data.path_for(
                        'weights/best_of_stage_{}'.format(stage_idx))
                    best_path = cfg.get('save_path', best_path)

                    weight_start = time.time()
                    best_path = updater.save(tf.get_default_session(), best_path)

                    print("Done saving weights, took {} seconds".format(time.time() - weight_start))

                    self.data.record_values_for_stage(
                        best_path=best_path, best_global_step=global_step)
                    self.data.record_values_for_stage(
                        **{'best_' + k: v for k, v in early_stop.best.items()})

                if stop:
                    print("Early stopping triggered.")
                    reason = "Early stopping triggered"
                    break

                if self.maximize_sc:
                    threshold_reached = stopping_criteria >= cfg.threshold
                else:
                    threshold_reached = stopping_criteria <= cfg.threshold

                if threshold_reached:
                    reason = "Stopping criteria threshold reached"
                    break

            # --------------- Perform an update -------------------

            if cfg.do_train:
                if local_step % 100 == 0:
                    print("Running update step {}...".format(local_step))

                update_start_time = time.time()

                _old_n_experiences = updater.n_experiences

                update_record = updater.update(cfg.batch_size)

                update_duration = time.time() - update_start_time
                update_record["train"]["duration"] = update_duration

                if local_step % 100 == 0:
                    print("Done update step.")

                    start = time.time()
                    update_record["train"]["memory_physical_mb"] = memory_usage(physical=True)
                    update_record["train"]["memory_virtual_mb"] = memory_usage(physical=False)
                    update_record["train"]["memory_gpu_mb"] = gpu_memory_usage()
                    print("Memory check duration: {}".format(time.time() - start))

                data_to_store.extend(dict(update_record).items())

                n_experiences_delta = updater.n_experiences - _old_n_experiences
                self.n_global_experiences += n_experiences_delta

                total_train_time += update_duration
                time_per_example = total_train_time / updater.n_experiences
                time_per_update = total_train_time / updater.n_updates

                total_hooks_time += hooks_duration
                time_per_hook = total_hooks_time / updater.n_updates

            # --------------- Store data -------------------

            records = defaultdict(dict)
            for mode, r in data_to_store:
                records[mode].update(r)

            self.data.store_step_data_and_summaries(
                stage_idx, local_step, global_step,
                updater.n_experiences, self.n_global_experiences,
                **records)

            self.data.record_values_for_stage(
                time_per_example=time_per_example,
                time_per_update=time_per_update,
                time_per_eval=time_per_eval,
                time_per_hook=time_per_hook,
                n_steps=local_step,
                n_experiences=updater.n_experiences,
            )

            self.global_step += 1

            # If `do_train` is False, we do no training and evaluate
            # exactly once, so only one iteration is required.
            if not cfg.do_train:
                reason = "`do_train` set to False"
                break

        return threshold_reached, reason
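
A minimal usage sketch for TrainingLoop, assuming an active config context that supplies the fields read above (exp_name, seed, curriculum, and so on); the values here are hypothetical:

    config = DEFAULT_CONFIG.copy()
    config.update(exp_name="demo", seed=-1, max_time=3600)
    with config:
        loop = TrainingLoop()
        frozen_data = loop.run(start_time=None)  # None -> start timing now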
Example No. 7
def make_dataset_in_parallel(run_kwargs, dataset_cls, param_values=None):
    """ Uses dps.hyper.parallel_session.ParallelSession to create a dataset in parallel. """

    # Get run_kwargs from command line
    sig = inspect.signature(ParallelSession.__init__)
    default_run_kwargs = sig.bind_partial()
    default_run_kwargs.apply_defaults()
    cl_run_kwargs = clify.command_line(default_run_kwargs.arguments).parse()
    run_kwargs.update(cl_run_kwargs)

    param_values = param_values or dataset_cls._capture_param_values()
    param_values = Config(param_values)
    seed = param_values["seed"]
    if seed is None or seed < 0:
        seed = gen_seed()

    n_examples = param_values["n_examples"]
    n_examples_per_shard = run_kwargs["n_examples_per_shard"]

    experiment_store = ExperimentStore(
        cfg.parallel_experiments_build_dir, prefix="build_{}".format(dataset_cls.__name__))

    count = 0
    name = "attempt=0"
    has_built = False
    while not has_built:
        try:
            exp_dir = experiment_store.new_experiment(name, seed, add_date=True, force_fresh=True)
            has_built = True
        except FileExistsError:
            count += 1
            name = "attempt_{}".format(count)

    print("Building dataset.")

    job = Job(exp_dir.path)
    n_examples_remaining = n_examples

    with NumpySeed(seed):
        inputs = []
        idx = 0
        while n_examples_remaining:
            seed = gen_seed()
            cur_n_examples = min(n_examples_remaining, n_examples_per_shard)
            n_examples_remaining -= cur_n_examples

            inputs.append((idx, seed, cur_n_examples))
            idx += 1

        job.map(_BuildDataset(dataset_cls, param_values), inputs)
        job.save_object('metadata', 'param_values', param_values)

    print(job.summary())
    archive_path = job.zip(delete=True)
    print("Zipped {} as {}.".format(exp_dir.path, archive_path))

    run_kwargs = run_kwargs.copy()

    del run_kwargs['n_examples_per_shard']

    run_kwargs.update(
        archive_path=archive_path, name=name, kind="parallel",
        parallel_exe=cfg.parallel_exe)
    parallel_session = submit_job(**run_kwargs)

    with cd(os.path.join(parallel_session.job_path, 'experiments')):
        dataset_files = []
        for dir_path, dirs, files in os.walk('.'):
            if not dir_path.startswith("./exp__seed="):
                continue

            df = [f for f in files if not f.endswith('.cfg')]
            assert len(df) == 1
            dataset_files.append(os.path.join(dir_path, df[0]))

        cached_filename = os.path.join(
            cfg.data_dir, "cached_datasets", dataset_cls.__name__,
            str(get_param_hash(param_values)))

        command = "cat " + " ".join(dataset_files) + " > " + cached_filename
        print("Running command: \n" + command)
        subprocess.run(command, shell=True, check=True)
        print("Done.")

        with open(cached_filename + ".cfg", 'w') as f:
            f.write(pprint.pformat(param_values))

    return parallel_session
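
A hypothetical call to make_dataset_in_parallel (the dataset class and run_kwargs values are illustrative; n_examples_per_shard is consumed by this function, and the remaining kwargs are forwarded to the parallel session):

    run_kwargs = dict(
        scratch="/scratch/me", wall_time="1hour", max_hosts=4,
        n_examples_per_shard=1000)
    session = make_dataset_in_parallel(run_kwargs, MyDataset)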