Example #1
import time
from typing import Tuple


def train_torch_ray_air(
    *,
    config: dict,
    num_workers: int = 4,
    cpus_per_worker: int = 8,
    use_gpu: bool = False,
) -> Tuple[float, float]:
    # Launched by main(); performs one full training run using Ray AIR.
    from ray.train.torch import TorchTrainer
    from ray.air.config import ScalingConfig

    def train_loop(config):
        train_func(use_ray=True, config=config)

    start_time = time.monotonic()
    trainer = TorchTrainer(
        train_loop_per_worker=train_loop,
        train_loop_config=config,
        scaling_config=ScalingConfig(
            trainer_resources={"CPU": 0},
            num_workers=num_workers,
            resources_per_worker={"CPU": cpus_per_worker},
            use_gpu=use_gpu,
        ),
    )
    result = trainer.fit()
    time_taken = time.monotonic() - start_time

    print(f"Last result: {result.metrics}")
    return time_taken, result.metrics["loss"]
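This snippet assumes a `train_func` defined elsewhere in the same benchmark script. A minimal, hypothetical sketch of such a function, matching the call above (the body is illustrative only, not the original implementation):

import torch
from ray.air import session


def train_func(*, use_ray: bool, config: dict):
    # Illustrative stand-in for the real per-worker training loop.
    model = torch.nn.Linear(1, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=config.get("lr", 1e-2))
    for _ in range(config.get("epochs", 1)):
        x = torch.randn(8, 1)
        loss = ((model(x) - x) ** 2).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if use_ray:
            # Report metrics so that result.metrics["loss"] is populated.
            session.report({"loss": loss.item()})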
Example #2
def test_torch_e2e_state_dict(ray_start_4_cpus):
    def train_func():
        state_dict = torch.nn.Linear(1, 1).state_dict()
        train.save_checkpoint(model=state_dict)

    scaling_config = {"num_workers": 2}
    trainer = TorchTrainer(train_loop_per_worker=train_func,
                           scaling_config=scaling_config)
    result = trainer.fit()

    # If loading from a state dict, a model definition must be passed in.
    with pytest.raises(ValueError):
        TorchPredictor.from_checkpoint(result.checkpoint)

    class TorchScorer:
        def __init__(self):
            self.pred = TorchPredictor.from_checkpoint(result.checkpoint,
                                                       model=torch.nn.Linear(
                                                           1, 1))

        def __call__(self, x):
            return self.pred.predict(x, dtype=torch.float)

    predict_dataset = ray.data.range(3)
    predictions = predict_dataset.map_batches(TorchScorer,
                                              batch_format="pandas",
                                              compute="actors")
    assert predictions.count() == 3
Example #3
def train_linear(num_workers=2, use_gpu=False, epochs=3):
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer = TorchTrainer(
        train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    results = trainer.fit()

    print(results.metrics)
    return results
Example #4
def train_fashion_mnist(num_workers=2, use_gpu=False):
    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config={
            "lr": 1e-3,
            "batch_size": 64,
            "epochs": 4
        },
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    result = trainer.fit()
    print(f"Last result: {result.metrics}")
Example #5
def main(data_size_gb: int, num_epochs=2, num_workers=1):
    data_url = f"s3://air-example-data-2/{data_size_gb}G-image-data-synthetic-raw"
    print("Running Pytorch image model training with "
          f"{data_size_gb}GB data from {data_url}")
    print(f"Training for {num_epochs} epochs with {num_workers} workers.")
    start = time.time()
    # Enable cross-host NCCL for larger-scale tests.
    runtime_env = {"env_vars": {"NCCL_SOCKET_IFNAME": "ens3"}}
    ray.init(runtime_env=runtime_env)
    dataset = ray.data.read_datasource(ImageFolderDatasource(),
                                       paths=[data_url])

    preprocessor = BatchMapper(preprocess_image_with_label)

    trainer = TorchTrainer(
        train_loop_per_worker=train_loop_per_worker,
        train_loop_config={
            "batch_size": 64,
            "num_epochs": num_epochs
        },
        datasets={"train": dataset},
        preprocessor=preprocessor,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=True),
    )
    trainer.fit()

    total_time_s = round(time.time() - start, 2)

    # For structured output integration with internal tooling
    results = {"data_size_gb": data_size_gb, "num_epochs": num_epochs}
    results["perf_metrics"] = [
        {
            "perf_metric_name": "total_time_s",
            "perf_metric_value": total_time_s,
            "perf_metric_type": "LATENCY",
        },
        {
            "perf_metric_name": "throughput_MB_s",
            "perf_metric_value": round(
                num_epochs * data_size_gb * 1024 / total_time_s, 2
            ),
            "perf_metric_type": "THROUGHPUT",
        },
    ]

    test_output_json = os.environ.get("TEST_OUTPUT_JSON",
                                      "/tmp/release_test_out.json")
    with open(test_output_json, "wt") as f:
        json.dump(results, f)

    print(results)
Example #6
def test_checkpoint_freq(ray_start_4_cpus):
    # checkpoint_frequency is not supported by the trainer, so expect an error.
    trainer = TorchTrainer(
        train_loop_per_worker=lambda config: None,
        scaling_config=ray.air.ScalingConfig(num_workers=1),
        run_config=ray.air.RunConfig(
            checkpoint_config=ray.air.CheckpointConfig(
                checkpoint_frequency=2,
            ),
        ),
    )
    with pytest.raises(TuneError):
        trainer.fit()
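Checkpointing for data-parallel trainers happens inside the user's training loop (e.g., via session.report), so a driver-side checkpoint_frequency cannot be enforced; and because trainer.fit() executes through Ray Tune internally, the failure surfaces as a TuneError.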
Example #7
def train_linear(num_workers=2, use_gpu=False):
    datasets, dataset_configs = get_datasets_and_configs()

    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}
    trainer = TorchTrainer(
        train_func,
        train_loop_config=config,
        datasets=datasets,
        dataset_config=dataset_configs,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    results = trainer.fit()
    print(results.metrics)
    return results
Example #8
def train_linear(num_workers=2, use_gpu=False, epochs=3):
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config={
            "num_workers": num_workers,
            "use_gpu": use_gpu
        },
    )
    result = trainer.fit()

    print(result.metrics)
    return result.metrics
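Note that some of these examples pass scaling_config as a plain dict, as here, while others construct a ScalingConfig object; older Ray AIR releases accepted the dict form, but newer ones expect the dataclass, so match the form to the Ray version in use.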
Example #9
def tune_linear(num_workers, num_samples, use_gpu):
    train_dataset, val_dataset = get_datasets()

    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}

    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
        datasets={
            "train": train_dataset,
            "validation": val_dataset
        },
    )

    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "lr": tune.loguniform(1e-4, 1e-1),
                "batch_size": tune.choice([4, 16, 32]),
                "epochs": 3,
            }
        },
        tune_config=TuneConfig(num_samples=num_samples,
                               metric="loss",
                               mode="min"),
    )
    result_grid = tuner.fit()
    best_result = result_grid.get_best_result()
    print(best_result)
    return best_result
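The values under param_space["train_loop_config"] override the matching keys of the config passed to the trainer on a per-trial basis, so the lr, batch_size, and epochs given in config act only as defaults outside of tuning.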
Example #10
def test_tuner_with_torch_trainer(self):
    """Test a successful run using torch trainer."""
    shutil.rmtree(
        os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_torch"), ignore_errors=True
    )
    # The following two should be tunable.
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 10}
    scaling_config = {"num_workers": 1, "use_gpu": False}
    trainer = TorchTrainer(
        train_loop_per_worker=linear_train_func,
        train_loop_config=config,
        scaling_config=scaling_config,
    )
    param_space = {
        "scaling_config": {
            "num_workers": tune.grid_search([1, 2]),
        },
        "train_loop_config": {
            "batch_size": tune.grid_search([4, 8]),
            "epochs": tune.grid_search([5, 10]),
        },
    }
    tuner = Tuner(
        trainable=trainer,
        run_config=RunConfig(name="test_tuner"),
        param_space=param_space,
        tune_config=TuneConfig(mode="min", metric="loss"),
    )
    results = tuner.fit()
    assert len(results) == 8
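The expected trial count follows from the grid: 2 values for num_workers × 2 for batch_size × 2 for epochs = 8 combinations.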
Example #11
def main(num_workers=2, use_gpu=False):
    trainer = TorchTrainer(
        train_func,
        train_loop_config={
            "lr": 1e-3,
            "batch_size": 64,
            "epochs": 4
        },
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
        run_config=RunConfig(callbacks=[
            MLflowLoggerCallback(experiment_name="train_fashion_mnist")
        ]),
    )
    final_results = trainer.fit()

    print("Final metrics: ", final_results.metrics)
Example #12
def test_torch_linear(ray_start_4_cpus, num_workers):
    def train_func(config):
        result = linear_train_func(config)
        assert len(result) == epochs
        assert result[-1]["loss"] < result[0]["loss"]

    epochs = 3
    scaling_config = {"num_workers": num_workers}
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=scaling_config,
    )
    trainer.fit()
Example #13
def train_linear(num_workers=2, use_gpu=False):
    train_dataset, val_dataset = get_datasets()
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}

    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
        datasets={
            "train": train_dataset,
            "validation": val_dataset
        },
    )

    result = trainer.fit()
    print(result.metrics)
    return result
Example #14
def train_gnn(
    num_workers=2, use_gpu=False, epochs=3, global_batch_size=32, dataset="reddit"
):

    per_worker_batch_size = global_batch_size // num_workers

    trainer = TorchTrainer(
        train_loop_per_worker=train_loop_per_worker,
        train_loop_config={
            "num_epochs": epochs,
            "batch_size": per_worker_batch_size,
            "dataset_fn": gen_reddit_dataset
            if dataset == "reddit"
            else gen_fake_dataset(),
        },
        scaling_config={"num_workers": num_workers, "use_gpu": use_gpu},
    )
    result = trainer.fit()
    print(result.metrics)
Example #15
def train_linear(num_workers=1,
                 num_hidden_layers=1,
                 use_auto_transfer=True,
                 epochs=3):
    config = {
        "lr": 1e-2,
        "hidden_size": num_hidden_layers,
        "batch_size": 4096,
        "epochs": epochs,
        "use_auto_transfer": use_auto_transfer,
    }
    trainer = TorchTrainer(
        train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(use_gpu=True, num_workers=num_workers),
    )
    results = trainer.fit()

    print(results.metrics)
    return results
Example #16
def main():
    args = parse_args()
    config = {"args": args}

    if args.start_local or args.address or args.num_workers > 1 or args.use_gpu:
        if args.start_local:
            # Start a local Ray runtime.
            ray.init(num_cpus=args.num_workers + 2)
        else:
            # Connect to a Ray cluster for distributed training.
            ray.init(address=args.address)
        trainer = TorchTrainer(
            train_func,
            train_loop_config=config,
            scaling_config=ScalingConfig(num_workers=args.num_workers,
                                         use_gpu=args.use_gpu),
        )
        results = trainer.fit()
        print(results.metrics)
    else:
        # Run training locally.
        train_func(config)
Example #17
def test_torch_e2e(ray_start_4_cpus):
    def train_func():
        model = torch.nn.Linear(1, 1)
        session.report({}, checkpoint=Checkpoint.from_dict(dict(model=model)))

    scaling_config = ScalingConfig(num_workers=2)
    trainer = TorchTrainer(
        train_loop_per_worker=train_func, scaling_config=scaling_config
    )
    result = trainer.fit()

    predict_dataset = ray.data.range(3)

    class TorchScorer:
        def __init__(self):
            self.pred = TorchPredictor.from_checkpoint(result.checkpoint)

        def __call__(self, x):
            return self.pred.predict(x, dtype=torch.float)

    predictions = predict_dataset.map_batches(
        TorchScorer, batch_format="pandas", compute="actors"
    )
    assert predictions.count() == 3
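Unlike Example #2, the checkpoint here stores the whole nn.Module rather than a bare state dict, so TorchPredictor.from_checkpoint can reconstruct the model without an explicit model definition.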
Example #18
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(f"epoch: {epoch}, loss: {loss.item()}")


# __torch_distributed_end__

if __name__ == "__main__":
    # __torch_single_run_begin__

    train_func()

    # __torch_single_run_end__

    # __torch_trainer_begin__

    from ray.train.torch import TorchTrainer
    from ray.air.config import ScalingConfig

    # For GPU Training, set `use_gpu` to True.
    use_gpu = False

    trainer = TorchTrainer(train_func_distributed,
                           scaling_config=ScalingConfig(num_workers=4,
                                                        use_gpu=use_gpu))

    results = trainer.fit()

    # __torch_trainer_end__
Example #19
parser = argparse.ArgumentParser()
parser.add_argument(
    "--smoke-test",
    action="store_true",
    default=False,
    help="Finish quickly for training.",
)
args = parser.parse_args()

ray.init(address="auto" if not args.smoke_test else None, log_to_driver=True)
num_training_workers = 1 if args.smoke_test else 3

trainer = TorchTrainer(
    train_func,
    scaling_config=ScalingConfig(
        num_workers=num_training_workers,
        use_gpu=not args.smoke_test,
    ),
    torch_config=TorchConfig(backend="gloo"),
)

pbt_scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    perturbation_interval=1,
    hyperparam_mutations={
        "train_loop_config": {
            # distribution for resampling
            "lr": lambda: np.random.uniform(0.001, 1),
            # allow perturbations within this set of categorical values
            "momentum": [0.8, 0.9, 0.99],
        }
    },
)
Example #20
# __config_1__
import ray
from ray.train.torch import TorchTrainer
from ray.air.config import DatasetConfig

train_ds = ray.data.range_tensor(1000)
valid_ds = ray.data.range_tensor(100)
test_ds = ray.data.range_tensor(100)

my_trainer = TorchTrainer(
    lambda: None,  # No-op training loop.
    scaling_config={"num_workers": 2},
    datasets={
        "train": train_ds,
        "valid": valid_ds,
        "test": test_ds,
    },
    dataset_config={
        "valid": DatasetConfig(split=True),
        "test": DatasetConfig(split=True),
    },
)
print(my_trainer.get_dataset_config())
# -> {'train': DatasetConfig(fit=True, split=True, ...),
#     'valid': DatasetConfig(fit=False, split=True, ...),
#     'test': DatasetConfig(fit=False, split=True, ...), ...}
# __config_1_end__
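Only the "train" dataset is fitted by the preprocessor and split across workers by default; the explicit DatasetConfig(split=True) entries opt "valid" and "test" into splitting as well, as the printed defaults above show.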

# __config_2__
import ray
from ray.train.torch import TorchTrainer
Example #21
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    for _ in range(epochs):
        train_epoch(train_dataloader, model, loss_fn, optimizer)
        loss = validate_epoch(test_dataloader, model, loss_fn)
        train.report(loss=loss)


num_workers = 2
use_gpu = False

trainer = TorchTrainer(
    train_loop_per_worker=train_func,
    train_loop_config={"lr": 1e-3, "batch_size": 64, "epochs": 4},
    scaling_config={"num_workers": num_workers, "use_gpu": use_gpu},
)
result = trainer.fit()
print(f"Last result: {result.metrics}")
# __air_pytorch_train_end__


# # __air_pytorch_batchpred_start__
# import random
# from ray.air.batch_predictor import BatchPredictor
# from ray.air.predictors.integrations.torch import TorchPredictor

# batch_predictor = BatchPredictor.from_checkpoint(result.checkpoint, TorchPredictor)

# items = [{"x": random.uniform(0, 1)} for _ in range(10)]
Example #22
            optimizer.step()
        loss = train_loss.item()
        session.report({"loss": loss},
                       checkpoint=TorchCheckpoint.from_model(model))


num_features = len(train_dataset.schema().names) - 1

trainer = TorchTrainer(
    train_loop_per_worker=train_loop_per_worker,
    train_loop_config={
        "batch_size": 128,
        "num_epochs": 20,
        "num_features": num_features,
        "lr": 0.001,
    },
    scaling_config=ScalingConfig(
        num_workers=3,  # Number of workers to use for data parallelism.
        use_gpu=False,
        trainer_resources={"CPU": 0},  # so that the example works on Colab.
    ),
    datasets={"train": train_dataset},
    preprocessor=preprocessor,
)
# Execute training.
result = trainer.fit()
print(f"Last result: {result.metrics}")
# Last result: {'loss': 0.6559339960416158, ...}
# __air_pytorch_train_end__

# __air_pytorch_tuner_start__
from ray import tune
Example #23
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    for _ in range(epochs):
        train_epoch(train_dataloader, model, loss_fn, optimizer)
        loss = validate_epoch(test_dataloader, model, loss_fn)
        session.report(dict(loss=loss))


num_workers = 2
use_gpu = False

trainer = TorchTrainer(
    train_loop_per_worker=train_func,
    train_loop_config={
        "lr": 1e-3,
        "batch_size": 64,
        "epochs": 4
    },
    scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
)
result = trainer.fit()
print(f"Last result: {result.metrics}")
# __air_pytorch_train_end__

# # __air_pytorch_batchpred_start__
# import random
# from ray.train.batch_predictor import BatchPredictor
# from ray.train.torch import TorchPredictor

# batch_predictor = BatchPredictor.from_checkpoint(result.checkpoint, TorchPredictor)
Example #24
import ray.data


def train_loop_per_worker():
    # By default, bulk loading is used and returns a Dataset object.
    data_shard = session.get_dataset_shard("train")

    # Manually iterate over the data 10 times (10 epochs).
    for _ in range(10):
        for batch in data_shard.iter_batches():
            print("Do some training on batch", batch)


trainer = TorchTrainer(
    train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=1),
    datasets={"train": ray.data.range_tensor(1000)},
)
trainer.fit()
# __config_scaling_1_end__
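With num_workers=1, the shard returned by session.get_dataset_shard("train") is the entire dataset; with more workers, each worker receives a disjoint split by default.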

# __config_scaling_2__
from ray.air import session
from ray.train.torch import TorchTrainer
import ray.data
from ray.air.config import ScalingConfig
from ray import tune
from ray.tune.tuner import Tuner
from ray.tune.tune_config import TuneConfig

Example #25
from ray.air import ScalingConfig, RunConfig, session
from ray.train.torch import TorchTrainer
from ray.tune.integration.mlflow import MLflowLoggerCallback
from ray.tune.logger import TBXLoggerCallback


def train_func():
    for i in range(3):
        session.report(dict(epoch=i))


trainer = TorchTrainer(
    train_func,
    scaling_config=ScalingConfig(num_workers=2),
    run_config=RunConfig(
        callbacks=[
            MLflowLoggerCallback(experiment_name="train_experiment"),
            TBXLoggerCallback(),
        ],
    ),
)

# Run the training function, logging all intermediate results
# to MLflow and TensorBoard.
result = trainer.fit()

# For MLflow logs:

# MLflow logs are saved by default in an `mlflow` directory
# in the current working directory.

# $ cd mlflow
Example #26
        help="Finish quickly for testing.",
    )
    parser.add_argument("--use-gpu",
                        action="store_true",
                        default=False,
                        help="Enables GPU training")

    args, _ = parser.parse_known_args()
    if args.smoke_test:
        ray.init(num_cpus=4)
    else:
        ray.init(address=args.address)

    trainer = TorchTrainer(
        train_func,
        scaling_config=ScalingConfig(num_workers=args.num_workers,
                                     use_gpu=args.use_gpu),
    )
    pbt_scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        perturbation_interval=1,
        hyperparam_mutations={
            "train_loop_config": {
                # distribution for resampling
                "lr": lambda: np.random.uniform(0.001, 1),
                # allow perturbations within this set of categorical values
                "momentum": [0.8, 0.9, 0.99],
            }
        },
    )
Example #27
# __config_1__
import ray
from ray.train.torch import TorchTrainer
from ray.air.config import ScalingConfig, DatasetConfig

train_ds = ray.data.range_tensor(1000)
valid_ds = ray.data.range_tensor(100)
test_ds = ray.data.range_tensor(100)

my_trainer = TorchTrainer(
    lambda: None,  # No-op training loop.
    scaling_config=ScalingConfig(num_workers=2),
    datasets={
        "train": train_ds,
        "valid": valid_ds,
        "test": test_ds,
    },
    dataset_config={
        "valid": DatasetConfig(split=True),
        "test": DatasetConfig(split=True),
    },
)
print(my_trainer.get_dataset_config())
# -> {'train': DatasetConfig(fit=True, split=True, ...),
#     'valid': DatasetConfig(fit=False, split=True, ...),
#     'test': DatasetConfig(fit=False, split=True, ...), ...}
# __config_1_end__

# __config_2__
import ray
from ray.train.torch import TorchTrainer