def main(smoke_test: bool = False):
    """Run the long-running, large-checkpoint Tune release workload.

    Args:
        smoke_test: When True, run a short sanity-check version of the
            workload (1 h trials, 70 min budget) instead of the full
            24 h run.
    """
    ray.init(address="auto")

    num_samples = 16
    results_per_second = 1 / 60  # one result per minute

    # Bug fix: the smoke test must be the *short* configuration. The
    # previous conditions were inverted, which made the smoke test run
    # for 24 hours (86400 s / 90000 s budget) and the full test for
    # only one hour.
    trial_length_s = 3600 if smoke_test else 86400
    max_runtime = 4200 if smoke_test else 90000

    callback = ProgressCallback()

    timed_tune_run(
        name="long running large checkpoints",
        num_samples=num_samples,
        results_per_second=results_per_second,
        trial_length_s=trial_length_s,
        max_runtime=max_runtime,
        checkpoint_freq_s=900,  # Once every 15 minutes
        checkpoint_size_b=int(0.75 * 1000**3),  # ~0.75 GB per checkpoint
        keep_checkpoints_num=2,  # 2 * 16 * 4 = 128 GB
        resources_per_trial={"cpu": 1},
        sync_config=tune.SyncConfig(syncer="auto"),
        callbacks=[callback])
# Example #2
        # distribution for resampling
        "lr": lambda: np.random.uniform(0.001, 1),
        # allow perturbations within this set of categorical values
        "momentum": [0.8, 0.9, 0.99],
    })

# Console reporter that surfaces the two validation metrics under
# short column aliases.
reporter = CLIReporter()
for metric, alias in (("val_loss", "loss"), ("val_accuracy", "acc")):
    reporter.add_metric_column(metric, alias)

# Search space: sample the learning rate, keep momentum fixed; the
# placement keys start out as None.
search_config = {
    "lr": tune.choice([0.001, 0.01, 0.1]),
    "momentum": 0.8,
    "head_location": None,
    "worker_locations": None
}

# Stop after a single iteration when smoke testing; otherwise run to
# completion.
stop_criteria = {"training_iteration": 1} if args.smoke_test else None

analysis = tune.run(
    TorchTrainable,
    num_samples=4,
    config=search_config,
    max_failures=-1,  # used for fault tolerance
    checkpoint_freq=2,  # used for fault tolerance
    progress_reporter=reporter,
    scheduler=pbt_scheduler,
    callbacks=[
        FailureInjectorCallback(time_between_checks=90),
        ProgressCallback()
    ],
    stop=stop_criteria)

print(analysis.get_best_config(metric="val_loss", mode="min"))
# Example #3
    # ensure that checkpointing works.
    # Population Based Training scheduler: every 2 training iterations a
    # trial may be perturbed by resampling "lr" from the given range.
    pbt = create_scheduler(
        "pbt",
        perturbation_interval=2,
        hyperparam_mutations={
            "lr": tune.uniform(0.001, 0.1),
        },
    )

    # Launch the Horovod trainable under PBT, minimizing "loss".
    # keep_checkpoints_num=1 retains at most one checkpoint per trial.
    analysis = tune.run(
        horovod_trainable,
        metric="loss",
        mode="min",
        keep_checkpoints_num=1,
        scheduler=pbt,
        config={
            # Smoke test: single fixed lr; full run: grid over 0.1 .. 0.9.
            "lr":
            0.1 if args.smoke_test else tune.grid_search(
                [0.1 * i for i in range(1, 10)]),
            "batch_size":
            64,
            # Dataset is placed in the Ray object store once; trials
            # receive the object ref through their config.
            "data":
            ray.put(dataset),
        },
        num_samples=1,
        # Stop after one iteration when smoke testing; otherwise run until
        # the trainable itself finishes.
        stop={"training_iteration": 1} if args.smoke_test else None,
        callbacks=[ProgressCallback()],  # FailureInjectorCallback()
        fail_fast=True,
    )
    print("Best hyperparameters found were: ", analysis.best_config)
# Example #4
#         num_gpus=0,
#         resources={str(i): 2},
#         object_store_memory=object_store_memory,
#         redis_max_memory=redis_max_memory,
#         dashboard_host="0.0.0.0")
# ray.init(address=cluster.address)

# Make sure we start a fresh local cluster instead of attaching to an
# existing one through RAY_ADDRESS. pop() with a default is equivalent
# to the membership-check-then-delete pattern.
os.environ.pop("RAY_ADDRESS", None)

ray.init(num_cpus=10)
# Run the workload.

# IMPALA on CartPole with remote worker envs; the huge batch wait time
# forces workers to batch as many env steps as possible.
impala_config = {
    "num_workers": 8,
    "num_gpus": 0,
    "num_envs_per_worker": 5,
    "remote_worker_envs": True,
    "remote_env_batch_wait_ms": 99999999,
    "rollout_fragment_length": 50,
    "train_batch_size": 100,
}

run_experiments(
    {
        "impala": {
            "run": "IMPALA",
            "env": "CartPole-v0",
            "config": impala_config,
        },
    },
    callbacks=[ProgressCallback()])
# Example #5
    hyperparam_mutations={
        # distribution for resampling
        "lr": lambda: np.random.uniform(0.001, 1),
        # allow perturbations within this set of categorical values
        "momentum": [0.8, 0.9, 0.99],
    },
)

# CLI progress reporter with shortened column names for the validation
# metrics emitted by the trainable.
reporter = CLIReporter()
column_aliases = {"val_loss": "loss", "val_accuracy": "acc"}
for metric_name, short_name in column_aliases.items():
    reporter.add_metric_column(metric_name, short_name)

# Inject failures periodically to exercise fault tolerance, and report
# progress while the experiment runs.
trial_callbacks = [
    FailureInjectorCallback(time_between_checks=90),
    ProgressCallback(),
]

analysis = tune.run(
    TorchTrainable,
    num_samples=4,
    config={
        "lr": tune.choice([0.001, 0.01, 0.1]),
        "momentum": 0.8,
        "head_location": None,
        "worker_locations": None,
    },
    max_failures=-1,  # used for fault tolerance
    checkpoint_freq=2,  # used for fault tolerance
    progress_reporter=reporter,
    scheduler=pbt_scheduler,
    callbacks=trial_callbacks,
    stop={"training_iteration": 1} if args.smoke_test else None,
)

print(analysis.get_best_config(metric="val_loss", mode="min"))