Example #1
def process_omniglot(data_dir, quiet):
    try:
        omniglot_dir = process_path(os.path.join(data_dir, 'omniglot'))

        if _validate_omniglot(omniglot_dir):
            print("Omniglot data seems to be present already.")
            return
        else:
            try:
                shutil.rmtree(omniglot_dir)
            except FileNotFoundError:
                pass

        os.makedirs(omniglot_dir, exist_ok=False)

        with cd(omniglot_dir):
            subprocess.run("git clone https://github.com/brendenlake/omniglot --depth=1".split(), check=True)

            with cd('omniglot/python'):
                with zipfile.ZipFile('images_evaluation.zip', 'r') as zip_ref:
                    zip_ref.extractall('.')

                with zipfile.ZipFile('images_background.zip', 'r') as zip_ref:
                    zip_ref.extractall('.')

            subprocess.run('mv omniglot/python/images_background/* .', shell=True, check=True)
            subprocess.run('mv omniglot/python/images_evaluation/* .', shell=True, check=True)
        print("Done setting up Omniglot data.")
    finally:
        try:
            shutil.rmtree(os.path.join(omniglot_dir, 'omniglot'))
        except FileNotFoundError:
            pass
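Every example on this page relies on a `cd` helper from dps.utils that temporarily changes the working directory. A minimal sketch of what such a context manager looks like, assuming it simply chdirs on entry and restores the previous directory on exit (an illustration, not the library's actual implementation):

import os
from contextlib import contextmanager

@contextmanager
def cd(path):
    # Assumed behavior: enter `path`, then restore the previous working
    # directory even if the body raises.
    old_dir = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(old_dir)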
Example #2
def download_backgrounds(data_dir):
    """
    Download backgrounds. Result is that a directory called `backgrounds` is
    created inside `data_dir` by cloning the repository at `background_url`.

    Parameters
    ----------
    data_dir: str
        Path to directory where files should be stored.

    """
    with cd(data_dir):
        if not os.path.exists('backgrounds'):
            command = "git clone {}".format(background_url).split()
            subprocess.run(command, check=True)
Example #3
def _download_emnist(data_dir):
    """
    Download the emnist data. Result is that a directory called "emnist_raw"
    is created inside `data_dir` which contains 4 files.

    Parameters
    ----------
    data_dir: str
        Path to directory where files should be stored.

    """
    emnist_raw_dir = os.path.join(data_dir, "emnist_raw")
    os.makedirs(emnist_raw_dir, exist_ok=True)

    with cd(emnist_raw_dir):
        if not os.path.exists('gzip.zip'):
            print("Downloading...")
            command = "wget --output-document=gzip.zip {}".format(
                emnist_url).split()
            subprocess.run(command, check=True)
        else:
            print("Found existing copy of gzip.zip, not downloading.")

        print("Extracting...")
        for fname in emnist_gz_names:
            if not os.path.exists(fname):
                subprocess.run('unzip gzip.zip gzip/{}'.format(fname),
                               shell=True,
                               check=True)
                shutil.move('gzip/{}'.format(fname), '.')
            else:
                print("{} already exists, skipping extraction.".format(fname))

        try:
            shutil.rmtree('gzip')
        except FileNotFoundError:
            pass

    return emnist_raw_dir
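The function above references the module-level names `emnist_url` and `emnist_gz_names`. For orientation, plausible definitions might look like the following; the exact URL and the choice of the byclass split are assumptions, not taken from this project's source:

# Assumed values -- the project may point at a different mirror or split.
emnist_url = 'https://biometrics.nist.gov/cs_links/EMNIST/gzip.zip'
emnist_gz_names = [
    'emnist-byclass-train-images-idx3-ubyte.gz',
    'emnist-byclass-train-labels-idx1-ubyte.gz',
    'emnist-byclass-test-images-idx3-ubyte.gz',
    'emnist-byclass-test-labels-idx1-ubyte.gz',
]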
Example #4
def submit_job(
        archive_path, category, exp_name, wall_time="1year", tasks_per_node=1, cpus_per_task=1, mem_per_cpu=0,
        queue="", kind="local", gpu_set="", project="rrg-bengioy-ad_gpu", installation_script_path=None,
        gpu_kind=None, **run_kwargs):

    assert kind in "slurm slurm-local".split()

    run_kwargs.update(
        wall_time=wall_time, tasks_per_node=tasks_per_node, cpus_per_task=cpus_per_task,
        kind=kind, gpu_set=gpu_set, mem_per_cpu=mem_per_cpu)

    run_kwargs['env_vars'] = dict(TF_CPP_MIN_LOG_LEVEL=3, CUDA_VISIBLE_DEVICES='-1')
    run_kwargs['dry_run'] = False

    scratch = os.path.join(cfg.parallel_experiments_run_dir, category)

    session = ParallelSession(exp_name, archive_path, 'map', scratch=scratch, **run_kwargs)

    job_path = session.job_path

    # Not strictly required if kind == "slurm-local", but do it anyway for completeness.
    with open(os.path.join(job_path, "session.pkl"), 'wb') as f:
        dill.dump(session, f, protocol=dill.HIGHEST_PROTOCOL, recurse=True)

    if kind == "slurm-local":
        session.run()
        return session

    if not installation_script_path:
        raise Exception("An installation_script_path is required when kind == 'slurm'.")

    installation_script_path = os.path.realpath(installation_script_path)

    entry_script = """#!/bin/bash
echo "Building venv..."
echo "Command: "
echo "srun -v --nodes=$SLURM_JOB_NUM_NODES --ntasks=$SLURM_JOB_NUM_NODES {installation_script_path}"
srun -v --nodes="$SLURM_JOB_NUM_NODES" --ntasks=$SLURM_JOB_NUM_NODES {installation_script_path}

echo "Sourcing venv..."
source "$SLURM_TMPDIR/env/bin/activate"

cd {job_path}

echo "Dropping into python..."
python run.py
""".format(installation_script_path=installation_script_path, job_path=job_path)
    with open(os.path.join(job_path, "run.sh"), 'w') as f:
        f.write(entry_script)

    python_script = """#!{}
import datetime
start = datetime.datetime.now()
print("Starting job at " + str(start))
import dill
with open("./session.pkl", "rb") as f:
    session = dill.load(f)
session.run()
end = datetime.datetime.now()
print("Finishing job at " + str(end))
print(str((end - start).total_seconds()) + " seconds elapsed between start and finish.")

""".format(sys.executable)
    with open(os.path.join(job_path, "run.py"), 'w') as f:
        f.write(python_script)

    wall_time_minutes = int(np.ceil(session.wall_time_seconds / 60))
    resources = "--nodes={} --ntasks-per-node={} --cpus-per-task={} --time={}".format(
        session.n_nodes, session.tasks_per_node, cpus_per_task, wall_time_minutes)

    if mem_per_cpu:
        resources = "{} --mem-per-cpu={}".format(resources, mem_per_cpu)

    if gpu_set:
        n_gpus = len([int(i) for i in gpu_set.split(',')])
        if gpu_kind:
            gpu_string = f"--gres=gpu:{gpu_kind}:{n_gpus}"
        else:
            gpu_string = f"--gres=gpu:{n_gpus}"

        resources = resources + ' ' + gpu_string

    email = "*****@*****.**"
    if queue:
        queue = "-p " + queue
    command = (
        "sbatch --job-name {exp_name} -D {job_path} --mail-type=ALL [email protected] "
        "-A {project} {queue} --export=ALL {resources} "
        "-o stdout -e stderr run.sh".format(
            exp_name=exp_name, job_path=job_path, email=email, project=project,
            queue=queue, resources=resources
        )
    )

    print("\n" + "~" * 40)
    print(command)

    with cd(job_path):
        subprocess.run(command.split())
    return session
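A hypothetical invocation of this function; every path and name below is made up for illustration:

session = submit_job(
    archive_path='/scratch/jobs/my_exp.zip',     # hypothetical archive of a dps Job
    category='datasets',
    exp_name='my_exp',
    tasks_per_node=4,
    cpus_per_task=2,
    mem_per_cpu=4000,
    kind='slurm',
    gpu_set='0,1',
    installation_script_path='install_env.sh',   # hypothetical venv-building script
)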
Example #5
    def run(self):
        if self.dry_run:
            print("Dry run, so not running.")
            return

        # Have to jump through a hoop to get the proper node-local storage on cedar/graham.
        self.local_scratch_prefix = self.get_slurm_var("SLURM_TMPDIR")
        self.local_scratch = os.path.join(
            self.local_scratch_prefix,
            os.path.basename(self.job_path))

        # Compute new time limits based on the actual time remaining (protect against e.g. job starting late)

        print("Time limits before adjustment:")
        self.print_time_limits()

        job_id = os.getenv("SLURM_JOBID")
        command = 'squeue -h -j {} -o "%L"'.format(job_id)
        returncode, stdout, stderr = self.execute_command(command, frmt=False, robust=False)
        days = 0
        if "-" in stdout:
            days, time = stdout.split("-")
            days = int(days)
        else:
            time = stdout

        time = time.split(":")

        hours = int(time[-3]) if len(time) > 2 else 0
        minutes = int(time[-2]) if len(time) > 1 else 0
        seconds = int(time[-1])

        wall_time_delta = datetime.timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)
        wall_time_seconds = int(wall_time_delta.total_seconds())

        print("Actual remaining walltime: {}".format(wall_time_delta))
        print("Time limits after adjustment:")

        (self.wall_time_seconds, self.total_seconds_per_step,
         self.parallel_seconds_per_step, self.python_seconds_per_step) = \
            self.compute_time_limits(
                wall_time_seconds, self.cleanup_time, self.slack_time, self.step_time_limit, self.n_steps)

        self.print_time_limits()

        with cd(self.job_path):
            print("\n" + ("=" * 80))
            job_start = datetime.datetime.now()
            print("Starting job at {}".format(job_start))

            job = ReadOnlyJob(self.input_zip)
            subjobs_remaining = sorted([op.idx for op in job.ready_incomplete_ops(sort=False)])

            n_failures = defaultdict(int)
            dead_jobs = set()

            i = 0
            while subjobs_remaining:
                step_start = datetime.datetime.now()

                print("\nStarting step {} at: ".format(i) + "=" * 90)
                print("{} ({} since start of job)".format(step_start, step_start - job_start))

                p = subprocess.run(
                    'scontrol show hostnames $SLURM_JOB_NODELIST', stdout=subprocess.PIPE, shell=True)
                host_pool = list(set([host.strip() for host in p.stdout.decode().split('\n') if host]))

                self.hosts, n_tasks_for_step = self.recruit_hosts(
                    host_pool, self.tasks_per_node, max_tasks=len(subjobs_remaining))

                indices_for_step = subjobs_remaining[:n_tasks_for_step]
                self._step(i, indices_for_step)
                self._checkpoint(i)

                job = ReadOnlyJob(self.archive_root)

                subjobs_remaining = set([op.idx for op in job.ready_incomplete_ops(sort=False)])

                for j in indices_for_step:
                    if j in subjobs_remaining:
                        n_failures[j] += 1
                        if n_failures[j] > self.n_retries:
                            print("All {} attempts at completing job with index {} have failed, "
                                  "permanently removing it from set of eligible jobs.".format(n_failures[j], j))
                            dead_jobs.add(j)

                subjobs_remaining = [idx for idx in subjobs_remaining if idx not in dead_jobs]
                subjobs_remaining = sorted(subjobs_remaining)

                i += 1

                print("Step duration: {}.".format(datetime.datetime.now() - step_start))

            self.execute_command("rm -rf {archive_root}", robust=True)

        print("Cleaning up dirty hosts...")
        command = "rm -rf {local_scratch}"
        for host in self.dirty_hosts:
            print("Cleaning host {}...".format(host))
            self.ssh_execute(command, host, robust=True)
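The walltime adjustment above parses squeue's `%L` output, which is `[days-]hours:minutes:seconds` with the leading fields omitted for short limits. The same parsing as a standalone function, convenient for checking the edge cases in isolation (`parse_time_left` is a hypothetical name, not part of the class):

import datetime

def parse_time_left(s):
    # squeue %L prints e.g. '2-03:04:05', '15:30' or '59'.
    days = 0
    if '-' in s:
        day_part, s = s.split('-')
        days = int(day_part)
    parts = s.strip().split(':')
    hours = int(parts[-3]) if len(parts) > 2 else 0
    minutes = int(parts[-2]) if len(parts) > 1 else 0
    seconds = int(parts[-1])
    return datetime.timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)

assert parse_time_left('2-03:04:05') == datetime.timedelta(days=2, hours=3, minutes=4, seconds=5)
assert parse_time_left('15:30') == datetime.timedelta(minutes=15, seconds=30)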
Example #6
    def __init__(
            self, name, input_zip, pattern, scratch, local_scratch_prefix='/tmp/dps/hyper/', ppn=12, cpp=1,
            pmem=None, wall_time="1hour", cleanup_time="1min", slack_time="1min", add_date=True, dry_run=0,
            parallel_exe=None, kind="parallel", host_pool=None, load_avg_threshold=8., min_hosts=None,
            max_hosts=1, env_vars=None, output_to_files=True, n_retries=0, gpu_set="", copy_venv="",
            python_startup=False, step_time_limit=None, ignore_gpu=False, ssh_options=None, loud_output=True,
            rsync_verbosity=0):

        args = locals().copy()
        del args['self']

        print("\nParallelSession args:")
        print(args)

        launch_venv = os.getenv('VIRTUAL_ENV')
        if launch_venv:
            launch_venv = os.path.split(launch_venv)[1]

        if not parallel_exe:
            parallel_exe = "$HOME/.local/bin/parallel"

        if ssh_options is None:
            ssh_options = (
                "-oPasswordAuthentication=no "
                "-oStrictHostKeyChecking=no "
                "-oConnectTimeout=5 "
                "-oServerAliveInterval=2"
            )

        if kind == "pbs":
            local_scratch_prefix = "\\$RAMDISK"

        assert kind in "parallel pbs slurm slurm-local".split()
        hpc = kind != "parallel"

        # Create directory to run the job from - should be on scratch.
        scratch = os.path.abspath(os.path.expandvars(scratch))

        es = ExperimentStore(scratch, prefix="run_search")

        job_dir = es.new_experiment(name, 0, add_date=add_date, force_fresh=1)
        job_dir.record_environment()

        with open(job_dir.path_for('run_kwargs.json'), 'w') as f:
            json.dump(args, f, default=str, indent=4, sort_keys=True)
        del f
        del args

        job_path = job_dir.path
        job_dir.make_directory('experiments')

        input_zip_stem = path_stem(input_zip)
        input_zip = shutil.copy(input_zip, job_dir.path_for("orig.zip"))
        input_zip_abs = process_path(input_zip)
        input_zip_base = os.path.basename(input_zip)
        archive_root = zip_root(input_zip)

        self.copy_files(
            job_dir, input_zip, archive_root,
            ["README.md", "sampled_configs.txt", "config.json", "config.pkl"])

        # storage local to each node, from the perspective of that node
        local_scratch = os.path.join(local_scratch_prefix, os.path.basename(job_path))

        output_to_files = "--output-to-files" if output_to_files else ""

        env = os.environ.copy()

        env_vars = env_vars or {}

        env.update({e: str(v) for e, v in env_vars.items()})
        env_vars = ' '.join('--env ' + k for k in env_vars)

        rsync_verbosity = "" if not rsync_verbosity else "-" + "v" * rsync_verbosity

        ro_job = ReadOnlyJob(input_zip)
        indices_to_run = sorted([op.idx for op in ro_job.ready_incomplete_ops(sort=False)])
        del ro_job
        n_jobs_to_run = len(indices_to_run)
        if n_jobs_to_run == 0:
            print("All jobs are finished! Exiting.")
            return

        dirty_hosts = set()

        if hpc:
            host_pool = []
            n_nodes = max_hosts
            n_procs = n_nodes * ppn
            n_steps = int(np.ceil(n_jobs_to_run / n_procs))
        else:
            self.__dict__.update(locals())

            host_pool = host_pool or DEFAULT_HOST_POOL
            if isinstance(host_pool, str):
                host_pool = host_pool.split()

            # Get an estimate of the number of hosts we'll have available.
            with cd(job_path):
                hosts, n_procs = self.recruit_hosts(
                    hpc, min_hosts, max_hosts, host_pool,
                    ppn, max_procs=np.inf)
            n_nodes = len(hosts)

            if n_jobs_to_run < n_procs:
                n_steps = 1
                n_nodes = int(np.ceil(n_jobs_to_run / ppn))
                n_procs = n_nodes * ppn
                hosts = hosts[:n_nodes]
            else:
                n_steps = int(np.ceil(n_jobs_to_run / n_procs))

        node_file = " --sshloginfile nodefile.txt "

        wall_time_seconds, total_seconds_per_step, parallel_seconds_per_step, python_seconds_per_step = \
            self.compute_time_limits(wall_time, cleanup_time, slack_time, step_time_limit, n_steps)

        self.__dict__.update(locals())

        self.print_time_limits()
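The `self.__dict__.update(locals())` idiom used above copies every local variable, arguments and intermediates alike, onto the instance in one shot. A tiny self-contained illustration of what it does:

class Demo:
    def __init__(self, a, b):
        c = a + b
        # Copies a, b and c -- and 'self' itself, which creates a
        # reference cycle -- into the instance's attribute dict.
        self.__dict__.update(locals())

d = Demo(1, 2)
assert (d.a, d.b, d.c) == (1, 2, 3)
assert d.self is d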
Example #7
def submit_job(
        archive_path, name, wall_time="1year", ppn=1, cpp=1, pmem=0,
        queue="", kind="local", gpu_set="", project="rpp-bengioy", **run_kwargs):

    assert kind in "pbs slurm slurm-local parallel".split()

    if "slurm" in kind and not pmem:
        raise Exception("Must supply a value for pmem (per-process-memory in mb) when using SLURM")

    run_kwargs.update(
        wall_time=wall_time, ppn=ppn, cpp=cpp, kind=kind,
        gpu_set=gpu_set, pmem=pmem)

    run_kwargs['env_vars'] = dict(TF_CPP_MIN_LOG_LEVEL=3, CUDA_VISIBLE_DEVICES='-1')
    run_kwargs['dry_run'] = False

    session = ParallelSession(
        name, archive_path, 'map', cfg.parallel_experiments_run_dir, **run_kwargs)

    job_path = session.job_path

    # Not strictly required if kind == "parallel", but do it anyway for completeness.
    with open(os.path.join(job_path, "session.pkl"), 'wb') as f:
        dill.dump(session, f, protocol=dill.HIGHEST_PROTOCOL, recurse=True)

    if kind in "parallel slurm-local".split():
        session.run()
        return session

    python_script = """#!{}
import datetime
start = datetime.datetime.now()
print("Starting job at " + str(start))
import dill
with open("./session.pkl", "rb") as f:
    session = dill.load(f)
session.run()
end = datetime.datetime.now()
print("Finishing job at " + str(end))
print(str((end - start).total_seconds()) + " seconds elapsed between start and finish.")

""".format(sys.executable)
    with open(os.path.join(job_path, "run.py"), 'w') as f:
        f.write(python_script)

    if kind == "pbs":
        resources = "nodes={}:ppn={},walltime={}".format(session.n_nodes, session.ppn, session.wall_time_seconds)
        if pmem:
            resources = "{},pmem={}mb".format(resources, pmem)

        email = "*****@*****.**"
        if queue:
            queue = "-q " + queue
        command = (
            "qsub -N {name} -d {job_path} -w {job_path} -m abe -M {email} "
            "-A {project} {queue} -V -l {resources} "
            "-j oe output.txt run.py".format(
                name=name, job_path=job_path, email=email, project=project,
                queue=queue, resources=resources
            )
        )

    elif kind == "slurm":
        wall_time_minutes = int(np.ceil(session.wall_time_seconds / 60))
        resources = "--nodes={} --ntasks-per-node={} --cpus-per-task={} --time={}".format(
            session.n_nodes, session.ppn, cpp, wall_time_minutes)

        if pmem:
            resources = "{} --mem-per-cpu={}mb".format(resources, pmem)

        if gpu_set:
            n_gpus = len([int(i) for i in gpu_set.split(',')])
            resources = "{} --gres=gpu:{}".format(resources, n_gpus)

        email = "*****@*****.**"
        if queue:
            queue = "-p " + queue
        command = (
            "sbatch --job-name {name} -D {job_path} --mail-type=ALL [email protected] "
            "-A {project} {queue} --export=ALL {resources} "
            "-o stdout -e stderr run.py".format(
                name=name, job_path=job_path, email=email, project=project,
                queue=queue, resources=resources
            )
        )

    else:
        raise Exception("Unknown kind: {}".format(kind))

    print("\n" + "~" * 40)
    print(command)

    with cd(job_path):
        subprocess.run(command.split())
    return session
Example #8
import argparse
import os
import socket
import subprocess

import dps
from dps.utils import cd

parser = argparse.ArgumentParser(
    "Test reinforcement learning on grid_arithmetic. "
    "Run for each new commit to make sure that it still works."
)
parser.add_argument("kind", choices="parallel slurm".split())
parser.add_argument("length", choices="short long".split())
parser.add_argument("queue", choices="cpu gpu".split())

args = parser.parse_args()

if args.kind == "parallel":
    pass
elif args.kind == "slurm":
    with cd(os.path.dirname(dps.__file__)):
        sha = subprocess.check_output("git rev-parse --verify --short HEAD".split()).decode().strip()

    hostname = socket.gethostname()
    if "gra" in hostname:
        resources = "--max-hosts=4 --ppn=8 --pmem=3800"
        gpu = "--gpu-set=0,1 --ignore-gpu=True"
    elif "cedar" in hostname:
        if args.queue == "gpu":
            resources = "--max-hosts=5 --ppn=6 --pmem=7700"
        else:
            resources = "--max-hosts=4 --ppn=8 --pmem=3800"
        gpu = "--gpu-set=0,1,2,3 --ignore-gpu=True"
    else:
        raise Exception("Unknown host: {}".format(hostname))
Example #9
def maybe_download_emnist(data_dir, quiet=0, shape=None):
    """
    Download emnist data if it hasn't already been downloaded. Do some
    post-processing to put it in a more useful format. End result is a directory
    called `emnist-byclass` which contains a separate pklz file for each emnist
    class.

    Pixel values of stored images are uint8 values up to 255.
    Images for each class are put into a numpy array with shape (n_images_in_class, 28, 28).
    This numpy array is pickled and stored in a zip file with name <class char>.pklz.

    Parameters
    ----------
    data_dir: str
         Directory where files should be stored.
    quiet: int
         Verbosity level; larger values produce less output.
    shape: tuple, optional
         If supplied, the stored images are additionally converted to this shape.

    """
    emnist_dir = os.path.join(data_dir, 'emnist')

    if _validate_emnist(emnist_dir):
        print("EMNIST data seems to be present already.")
    else:
        print("EMNIST data not found, downloading and processing...")
        try:
            shutil.rmtree(emnist_dir)
        except FileNotFoundError:
            pass

        raw_dir = _download_emnist(data_dir)

        with cd(raw_dir):
            images, labels = _emnist_load_helper(emnist_gz_names[0],
                                                 emnist_gz_names[1])
            images1, labels1 = _emnist_load_helper(emnist_gz_names[2],
                                                   emnist_gz_names[3])

        with cd(data_dir):
            os.makedirs('emnist', exist_ok=False)

            print("Processing...")
            with cd('emnist'):
                x = np.concatenate((images, images1), 0)
                y = np.concatenate((labels, labels1), 0)

                # Give images the right orientation so that plt.imshow(x[0]) just works.
                x = np.moveaxis(x.reshape(-1, 28, 28), 1, 2)

                for i in sorted(set(y.flatten())):
                    keep = y == i
                    x_i = x[keep.flatten(), :]
                    if i >= 36:
                        char = chr(i - 36 + ord('a'))
                    elif i >= 10:
                        char = chr(i - 10 + ord('A'))
                    else:
                        char = str(i)

                    if quiet >= 2:
                        pass
                    elif quiet == 1:
                        print(char)
                    elif quiet <= 0:
                        print(char)
                        print(image_to_string(x_i[0, ...]))

                    file_i = char + '.pklz'
                    with gzip.open(file_i, 'wb') as f:
                        dill.dump(x_i, f, protocol=dill.HIGHEST_PROTOCOL)

    if shape is not None:
        maybe_convert_emnist_shape(data_dir, shape)
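Given the storage format described in the docstring (one gzipped dill pickle per class), reading a single class back might look like the sketch below; `load_emnist_class` is a hypothetical helper, not part of the project:

import gzip
import os
import dill

def load_emnist_class(emnist_dir, char):
    # Returns the uint8 array of shape (n_images_in_class, 28, 28).
    with gzip.open(os.path.join(emnist_dir, char + '.pklz'), 'rb') as f:
        return dill.load(f)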
Example #10
File: download.py Project: alcinos/dps
def _validate_omniglot(path):
    if not os.path.isdir(path):
        return False

    with cd(path):
        return set(os.listdir(path)) == set(omniglot_alphabets)
Example #11
def make_dataset_in_parallel(run_kwargs, dataset_cls, param_values=None):
    """ Uses dps.hyper.parallel_session.ParallelSession to create a dataset in parallel. """

    # Get run_kwargs from command line
    sig = inspect.signature(ParallelSession.__init__)
    default_run_kwargs = sig.bind_partial()
    default_run_kwargs.apply_defaults()
    cl_run_kwargs = clify.command_line(default_run_kwargs.arguments).parse()
    run_kwargs.update(cl_run_kwargs)

    param_values = param_values or dataset_cls._capture_param_values()
    param_values = Config(param_values)
    seed = param_values["seed"]
    if seed is None or seed < 0:
        seed = gen_seed()

    n_examples = param_values["n_examples"]
    n_examples_per_shard = run_kwargs["n_examples_per_shard"]

    experiment_store = ExperimentStore(
        cfg.parallel_experiments_build_dir, prefix="build_{}".format(dataset_cls.__name__))

    count = 0
    name = "attempt=0"
    has_built = False
    while not has_built:
        try:
            exp_dir = experiment_store.new_experiment(name, seed, add_date=True, force_fresh=True)
            has_built = True
        except FileExistsError:
            count += 1
            name = "attempt_{}".format(count)

    print("Building dataset.")

    job = Job(exp_dir.path)
    n_examples_remaining = n_examples

    with NumpySeed(seed):
        inputs = []
        idx = 0
        while n_examples_remaining:
            seed = gen_seed()
            cur_n_examples = min(n_examples_remaining, n_examples_per_shard)
            n_examples_remaining -= cur_n_examples

            inputs.append((idx, seed, cur_n_examples))
            idx += 1

        job.map(_BuildDataset(dataset_cls, param_values), inputs)
        job.save_object('metadata', 'param_values', param_values)

    print(job.summary())
    archive_path = job.zip(delete=True)
    print("Zipped {} as {}.".format(exp_dir.path, archive_path))

    run_kwargs = run_kwargs.copy()

    del run_kwargs['n_examples_per_shard']

    run_kwargs.update(
        archive_path=archive_path, name=name, kind="parallel",
        parallel_exe=cfg.parallel_exe)
    parallel_session = submit_job(**run_kwargs)

    with cd(os.path.join(parallel_session.job_path, 'experiments')):
        dataset_files = []
        for dir_path, dirs, files in os.walk('.'):
            if not dir_path.startswith("./exp__seed="):
                continue

            df = [f for f in files if not f.endswith('.cfg')]
            assert len(df) == 1
            dataset_files.append(os.path.join(dir_path, df[0]))

        cached_filename = os.path.join(
            cfg.data_dir, "cached_datasets", dataset_cls.__name__,
            str(get_param_hash(param_values)))

        command = "cat " + " ".join(dataset_files) + " > " + cached_filename
        print("Running command: \n" + command)
        subprocess.run(command, shell=True, check=True)
        print("Done.")

        with open(cached_filename + ".cfg", 'w') as f:
            f.write(pprint.pformat(param_values))

    return parallel_session
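A hypothetical call, assuming a dataset class exposing the interface this function expects (`MyDataset` and all values below are illustrative):

# `MyDataset._capture_param_values()` must yield 'seed' and 'n_examples';
# 'n_examples_per_shard' controls how many examples each worker builds.
run_kwargs = dict(max_hosts=2, ppn=4, n_examples_per_shard=1000)
session = make_dataset_in_parallel(run_kwargs, MyDataset)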