예제 #1
0
def register_barrier():
    """Register a 16-worker barrier job so jobs don't run simultaneously."""
    # The barrier script itself does nothing but occupy the workers.
    barrier_env = {"clone": {"code_package": code_package}, "script": "barrier.py"}
    register_job(
        user="******",
        project="sgd",
        experiment=experiment,
        job="barrier",
        priority=priority,
        n_workers=16,
        config_overrides={},
        runtime_environment=barrier_env,
        annotations={"description": description},
    )
예제 #2
0
def schedule(name, config, skip_existing=False):
    """Register a job called *name* whose config is *config* merged over base_config.

    When skip_existing is true and Mongo already holds a job with this
    project/name/experiment, nothing is registered. Prints the sbatch command
    that would launch the registered job.
    """
    if skip_existing:
        existing = {"project": project, "job": name, "experiment": experiment}
        if mongo.job.count_documents(existing) > 0:
            # Already registered — nothing to do.
            return
    merged = {**base_config, **config}
    n_workers = merged["n_workers"]
    job_id = register_job(
        user="******",
        project=project,
        experiment=experiment,
        job=name,
        n_workers=n_workers,
        priority=10,
        config_overrides=merged,
        runtime_environment={"clone": {"code_package": code_package}, "script": script},
        annotations={"description": description},
    )
    print(
        f'sbatch --ntasks {n_workers} --job-name="{name}" --gpus-per-task=1 --cpus-per-task=8 --wrap="srun jobrun {job_id} --mpi"'
    )
예제 #3
0
def schedule(name, config, skip_existing=True):
    """Register a job unless one with the same name/experiment/learning-rate exists.

    Appends the new job id to the module-level ``ids`` list.

    NOTE(review): ``n_workers`` is read from the enclosing scope here, unlike
    the sibling ``schedule()`` that takes it from config — confirm intended.
    """
    config = {**base_config, **config}
    if skip_existing:
        duplicate_query = {
            "project": project,
            "job": name,
            "experiment": experiment,
            "config.learning_rate": config["learning_rate"],
        }
        if mongo.job.count_documents(duplicate_query) > 0:
            # A matching run already exists — skip registration.
            return
    job_id = register_job(
        user="******",
        project=project,
        experiment=experiment,
        job=name,
        n_workers=n_workers,
        priority=10,
        config_overrides=config,
        runtime_environment={"clone": {"code_package": code_package}, "script": script},
        annotations={"description": description},
    )
    ids.append(job_id)
예제 #4
0
 # Register one SGD training run; explicit overrides are layered with the
 # shared config helpers (later entries win on key collisions).
 job_id = register_job(
     user="******",
     project="sgd",
     experiment=experiment,
     job=name,
     n_workers=n_workers,
     priority=10,
     config_overrides={
         "seed": seed,
         "distributed_backend": "nccl",
         "optimizer_scale_lr_with_factor": n_workers,
         "num_epochs": 300,
         "log_verbosity": 1,
         **shared.sgd_config(learning_rate, momentum=0.9, weight_decay=0.0001),
         **shared.optimizer_config(reducer),
         "optimizer_reducer_compression": 1 / compression,
     },
     runtime_environment={
         "clone": {"code_package": code_package},
         "script": "train.py",
     },
     annotations={"description": description},
 )
예제 #5
0
    # Job name encodes the reducer, worker count (zero-padded), and learning rate,
    # e.g. "rank1_04workers_lr0.1".
    name = f"{reducer}_{n_workers:02d}workers_lr{learning_rate}"
    # Skip registration if a job with this name already exists for the experiment.
    if mongo.job.count_documents({"job": name, "experiment": experiment}) > 0:
        # We have this one already
        continue
    # Register the training run; shared.* helpers supply the base language-modeling
    # and optimizer configuration, with explicit overrides layered first.
    job_id = register_job(
        user="******",
        project="sgd",
        experiment=experiment,
        job=name,
        n_workers=n_workers,
        priority=20,
        config_overrides={
            "seed": seed,
            "distributed_backend": "nccl",
            # Scale the learning rate with the number of workers.
            "optimizer_scale_lr_with_factor": n_workers,
            **shared.language_modeling_base(),
            **shared.sgd_config(learning_rate, momentum=0.0, weight_decay=0.0),
            **shared.optimizer_config(reducer),
        },
        runtime_environment={
            "clone": {
                "code_package": code_package
            },
            "script": "train.py"
        },
        annotations={"description": description},
    )
    # Record the new job id for later reference.
    print("{} - {}".format(job_id, name))
    registered_ids.append(job_id)

# kubernetes_schedule_job_queue(

# kubernetes_schedule_job_queue(
예제 #6
0
 # Register one backend-comparison run. nccl jobs get +100 priority so they
 # are scheduled ahead of gloo jobs with the same seed.
 job_id = register_job(
     user="******",
     project="sgd",
     experiment=experiment,
     job=name,
     priority=seed + (100 if backend == "nccl" else 0),
     n_workers=n_workers,
     config_overrides={
         "seed": 10000 + seed,
         "optimizer_scale_lr_with_factor": n_workers,
         "distributed_backend": backend,
         "log_verbosity": log_level,
         "num_epochs": 10,
         **shared.sgd_config(0.1, momentum=0.9, weight_decay=0.0001),
         **shared.optimizer_config(reducer),
     },
     runtime_environment={
         "clone": {"code_package": code_package},
         "script": "train.py",
     },
     annotations={"description": description},
 )
예제 #7
0
sleep(0.1)

# One timing job per (backend, worker-count) combination, each followed by a
# barrier job so measurements do not overlap.
for backend in ["nccl", "gloo"]:
    for n_workers in [2, 4, 8, 16]:
        name = f"time_{n_workers}workers_{backend}"
        already_registered = (
            mongo.job.count_documents({"job": name, "experiment": experiment}) > 0
        )
        if already_registered:
            # We have this one already
            continue
        timing_config = {
            "distributed_backend": backend,
            "repetitions": 20,
            "device": "cuda",
            "n_workers": n_workers,
        }
        job_id = register_job(
            user="******",
            project="sgd",
            experiment=experiment,
            job=name,
            n_workers=n_workers,
            priority=priority,
            config_overrides=timing_config,
            runtime_environment={"clone": {"code_package": code_package}, "script": "timings.py"},
            annotations={"description": description},
        )
        print("{} - {}".format(job_id, name))
        registered_ids.append(job_id)

        sleep(0.1)
        register_barrier()
        sleep(0.1)


예제 #8
0
# Ship the current working tree so workers can clone it.
code_package, files_uploaded = upload_code_package(".", excludes=excluded_files)
print("Uploaded {} files.".format(len(files_uploaded)))

# Register one transfer-learning job per pretrained-model noise level.
for noise_level in ["0.00", "0.25", "0.50", "1.00"]:
    cfg = dict(
        pretrained_noise_level=noise_level,
        model_path=f"/raw/vogels/locuslab-smoothing-pretrained-models/imagenet/resnet50/noise_{noise_level}/checkpoint.pth.tar",
    )
    job_id = register_job(
        user="******",
        project="adversarial-transfer-learning",
        experiment="does-random-noise-also-help",
        job=f"noise_{noise_level}",
        priority=10,
        config_overrides=cfg,
        runtime_environment={
            "clone": {"code_package": code_package},
            "script": "main.py",
        },
        annotations={
            "description":
            "Using pretrained ImageNet models from https://github.com/locuslab/smoothing, we want to see if Gaussian input perturbations have the same effect as adversarial ones."
        },
    )

    print(f"jobrun {job_id}")