Example #1
def get_dataset():
    # Train dataset should be sharded.
    train_dataset = session.get_dataset_shard("train")
    assert train_dataset.count() == num_train_data / scale_config.num_workers
    # All other datasets should not be sharded.
    val_dataset = session.get_dataset_shard("val")
    assert val_dataset.count() == num_val_data
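
The assertions above follow from how the trainer is configured: by default Ray AIR splits only the "train" dataset across workers, while every other dataset is passed whole to each worker. Below is a minimal sketch of a trainer set up that way, assuming Ray AIR's TorchTrainer, ScalingConfig, and DatasetConfig APIs; the dataset sizes and worker count are illustrative, not from the original.

import ray
from ray.air.config import DatasetConfig, ScalingConfig
from ray.train.torch import TorchTrainer

num_train_data, num_val_data = 128, 64

trainer = TorchTrainer(
    train_loop_per_worker=get_dataset,  # the check above runs on each worker
    scaling_config=ScalingConfig(num_workers=2),
    # Restates the default for clarity: "train" is split, "val" is not.
    dataset_config={"val": DatasetConfig(split=False)},
    datasets={
        "train": ray.data.range(num_train_data),
        "val": ray.data.range(num_val_data),
    },
)
trainer.fit()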
Example #2
def train_loop_per_worker():
    data_shard = session.get_dataset_shard("train")
    if expect_ds:
        assert isinstance(data_shard, Dataset), data_shard
    else:
        assert isinstance(data_shard, DatasetPipeline), data_shard
    for k, v in expect_sizes.items():
        shard = session.get_dataset_shard(k)
        if v == -1:
            assert shard is None, shard
        else:
            if isinstance(shard, DatasetPipeline):
                assert next(shard.iter_epochs()).count() == v, shard
            else:
                assert shard.count() == v, shard
Example #3
def train_func(config: dict):
    strategy = tf.distribute.MultiWorkerMirroredStrategy()
    with strategy.scope():
        # Model building/compiling need to be within `strategy.scope()`.
        multi_worker_model = build_model()
        multi_worker_model.compile(
            optimizer=tf.keras.optimizers.SGD(
                learning_rate=config.get("lr", 1e-3)),
            loss=tf.keras.losses.mean_squared_error,
            metrics=[tf.keras.metrics.mean_squared_error],
        )

    dataset = session.get_dataset_shard("train")

    for _ in range(config.get("epoch", 3)):
        tf_dataset = prepare_dataset_shard(
            dataset.to_tf(
                label_column="y",
                output_signature=(
                    tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
                    tf.TensorSpec(shape=(None), dtype=tf.float32),
                ),
                batch_size=32,
            ))
        multi_worker_model.fit(tf_dataset, callbacks=[Callback()])
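
Example #3 only defines the per-worker loop; to run it, the function is handed to a distributed TensorFlow trainer together with the "train" dataset it reads via get_dataset_shard. A minimal sketch, assuming Ray AIR's TensorflowTrainer API; the toy dataset with columns "x" and "y" and the config values are made up for illustration (the loop also assumes build_model and Callback are defined elsewhere).

import ray
from ray.air.config import ScalingConfig
from ray.train.tensorflow import TensorflowTrainer

# Hypothetical toy dataset: one feature column "x" and a label column "y".
train_ds = ray.data.from_items(
    [{"x": float(i), "y": 2.0 * i} for i in range(64)])

trainer = TensorflowTrainer(
    train_loop_per_worker=train_func,
    train_loop_config={"lr": 1e-3, "epoch": 3},
    scaling_config=ScalingConfig(num_workers=2),
    datasets={"train": train_ds},
)
result = trainer.fit()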
Example #4
def train_loop_per_worker():
    data_shard = session.get_dataset_shard("train")
    assert isinstance(data_shard, DatasetPipeline), data_shard
    results = []
    for epoch in data_shard.iter_epochs(2):
        results.append(epoch.take())
    check_results_fn(data_shard, results)
Example #5
def train_func(config):
    batch_size = config.get("batch_size", 32)
    hidden_size = config.get("hidden_size", 1)
    lr = config.get("lr", 1e-2)
    epochs = config.get("epochs", 3)

    train_dataset_shard = session.get_dataset_shard("train")
    validation_dataset = session.get_dataset_shard("validation")

    model = nn.Linear(1, hidden_size)
    model = train.torch.prepare_model(model)

    loss_fn = nn.MSELoss()

    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    results = []

    for _ in range(epochs):
        train_torch_dataset = train_dataset_shard.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=torch.float,
            batch_size=batch_size,
        )
        validation_torch_dataset = validation_dataset.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=torch.float,
            batch_size=batch_size,
        )

        device = train.torch.get_device()

        train_epoch(train_torch_dataset, model, loss_fn, optimizer, device)
        if session.get_world_rank() == 0:
            result = validate_epoch(validation_torch_dataset, model, loss_fn,
                                    device)
        else:
            result = {}
        results.append(result)
        session.report(result,
                       checkpoint=Checkpoint.from_dict(dict(model=model)))

    return results
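
As with the TensorFlow example, this loop is meant to be driven by a trainer that supplies the "train" and "validation" datasets it consumes. A minimal sketch, assuming Ray AIR's TorchTrainer API; the toy linear-regression data and hyperparameters are illustrative.

import ray
from ray.air.config import ScalingConfig
from ray.train.torch import TorchTrainer

# Hypothetical toy data with feature column "x" and label column "y".
train_ds = ray.data.from_items(
    [{"x": float(i), "y": 2.0 * i} for i in range(200)])
val_ds = ray.data.from_items(
    [{"x": float(i), "y": 2.0 * i} for i in range(40)])

trainer = TorchTrainer(
    train_loop_per_worker=train_func,
    train_loop_config={"batch_size": 32, "hidden_size": 1, "lr": 1e-2, "epochs": 3},
    scaling_config=ScalingConfig(num_workers=2),
    datasets={"train": train_ds, "validation": val_ds},
)
result = trainer.fit()
print(result.metrics)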
Example #6
def train_loop_per_worker():
    # By default, bulk loading is used and returns a Dataset object.
    data_shard = session.get_dataset_shard("train")

    # Manually iterate over the data 10 times (10 epochs).
    for _ in range(10):
        for batch in data_shard.iter_batches():
            print("Do some training on batch", batch)
Example #7
def train_func(config):
    batch_size = config.get("batch_size", 32)
    hidden_size = config.get("hidden_size", 1)
    lr = config.get("lr", 1e-2)
    epochs = config.get("epochs", 3)

    train_dataset_pipeline_shard = session.get_dataset_shard("train")
    validation_dataset_pipeline_shard = session.get_dataset_shard("validation")

    model = nn.Linear(1, hidden_size)
    model = train.torch.prepare_model(model)

    loss_fn = nn.MSELoss()

    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    train_dataset_iterator = train_dataset_pipeline_shard.iter_epochs()
    validation_dataset_iterator = validation_dataset_pipeline_shard.iter_epochs(
    )

    for _ in range(epochs):
        train_dataset = next(train_dataset_iterator)
        validation_dataset = next(validation_dataset_iterator)

        train_torch_dataset = train_dataset.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=torch.float,
            batch_size=batch_size,
        )
        validation_torch_dataset = validation_dataset.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=torch.float,
            batch_size=batch_size,
        )

        device = train.torch.get_device()

        train_epoch(train_torch_dataset, model, loss_fn, optimizer, device)
        result = validate_epoch(validation_torch_dataset, model, loss_fn,
                                device)
        session.report(result)
Example #8
def train_loop_per_worker():
    # A DatasetPipeline object is returned when `use_stream_api` is set.
    data_shard: DatasetPipeline = session.get_dataset_shard("train")

    # Use iter_epochs(10) to iterate over 10 epochs of data.
    for epoch in data_shard.iter_epochs(10):
        for batch in epoch.iter_batches():
            print("Do some training on batch", batch)

    # View the stats for performance debugging.
    print(data_shard.stats())
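
The streaming behavior shown here is opted into on the trainer side: get_dataset_shard("train") returns a DatasetPipeline only when the dataset's ingest is configured for streaming. A minimal sketch of that configuration, assuming the DatasetConfig(use_stream_api=True) option from the same Ray releases that hand out DatasetPipeline shards; the dataset and worker count are illustrative.

import ray
from ray.air.config import DatasetConfig, ScalingConfig
from ray.train.torch import TorchTrainer

trainer = TorchTrainer(
    train_loop_per_worker=train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=2),
    # Stream the "train" dataset so each worker receives a DatasetPipeline
    # instead of a fully materialized Dataset.
    dataset_config={"train": DatasetConfig(use_stream_api=True)},
    datasets={"train": ray.data.range(1000)},
)
trainer.fit()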
Example #9
def train_loop_per_worker(config):
    raw_model = resnet18(pretrained=True)
    model = train.torch.prepare_model(raw_model)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

    train_dataset_shard = session.get_dataset_shard("train")

    for epoch in range(config["num_epochs"]):
        running_loss = 0.0
        for i, data in enumerate(
                train_dataset_shard.iter_batches(
                    batch_size=config["batch_size"], batch_format="numpy")):
            # get the inputs; data is a list of [inputs, labels]
            inputs = torch.as_tensor(data["image"],
                                     dtype=torch.float32).to(device="cuda")
            labels = torch.as_tensor(data["label"],
                                     dtype=torch.int64).to(device="cuda")
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print(
                    f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}"
                )
                running_loss = 0.0

        session.report(
            dict(running_loss=running_loss),
            checkpoint=TorchCheckpoint.from_model(model),
        )
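
Because the loop reports a TorchCheckpoint built with from_model, the trained model can be recovered from the result after fitting. A minimal sketch, assuming Ray AIR's TorchCheckpoint API; result stands for the hypothetical Result object a TorchTrainer running this loop would return (trainer setup omitted).

from ray.train.torch import TorchCheckpoint

# The checkpoint was created with TorchCheckpoint.from_model(), so the full
# model can be restored directly from the final reported checkpoint.
checkpoint = TorchCheckpoint.from_checkpoint(result.checkpoint)
model = checkpoint.get_model()
model.eval()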
Example #10
def train_func(config: dict):

    per_worker_batch_size = config.get("batch_size", 64)
    epochs = config.get("epochs", 3)

    dataset_shard = session.get_dataset_shard("train")

    strategy = tf.distribute.MultiWorkerMirroredStrategy()

    with strategy.scope():
        # Model building/compiling need to be within `strategy.scope()`.
        multi_worker_model = build_autoencoder_model()
        learning_rate = config.get("lr", 0.001)
        multi_worker_model.compile(
            loss=tf.keras.losses.BinaryCrossentropy(),
            optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
            metrics=[
                "binary_crossentropy",
            ],
        )

    results = []
    for epoch in range(epochs):
        tf_dataset = prepare_dataset_shard(
            dataset_shard.to_tf(
                feature_columns=["image"],
                label_column="label",
                output_signature=(
                    tf.TensorSpec(shape=(None, 784), dtype=tf.float32),
                    tf.TensorSpec(shape=(None, 784), dtype=tf.float32),
                ),
                batch_size=per_worker_batch_size,
            ))
        history = multi_worker_model.fit(
            tf_dataset, callbacks=[TrainCheckpointReportCallback()])
        results.append(history.history)
    return results
Example #11
def train_func(config):
    batch_size = config.get("batch_size", 64)
    epochs = config.get("epochs", 3)

    strategy = tf.distribute.MultiWorkerMirroredStrategy()
    with strategy.scope():
        # Model building/compiling need to be within `strategy.scope()`.
        multi_worker_model = build_and_compile_model(config)

    dataset_pipeline = session.get_dataset_shard("train")
    dataset_iterator = dataset_pipeline.iter_epochs()

    for _ in range(epochs):
        dataset = next(dataset_iterator)
        tf_dataset = prepare_dataset_shard(
            dataset.to_tf(
                label_column="y",
                output_signature=(
                    tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
                    tf.TensorSpec(shape=(None), dtype=tf.float32),
                ),
                batch_size=batch_size,
            ))
        multi_worker_model.fit(tf_dataset, callbacks=[TrainReportCallback()])
Example #12
def train_loop_per_worker():
    data_shard = session.get_dataset_shard("train")
    assert isinstance(data_shard, Dataset), data_shard
    results = data_shard.take()
    check_results_fn(data_shard, results)
Example #13
File: check_ingest.py  Project: parasj/ray
def train_loop_per_worker():
    import pandas as pd

    rank = session.get_world_rank()
    data_shard = session.get_dataset_shard("train")
    start = time.perf_counter()
    epochs_read, batches_read, bytes_read = 0, 0, 0
    batch_delays = []

    def generate_epochs(data: Union[Dataset, DatasetPipeline],
                        epochs: int):
        if isinstance(data, DatasetPipeline):
            for epoch in data.iter_epochs(epochs):
                yield epoch
        else:
            # Dataset
            for _ in range(epochs):
                yield data

    print("Starting train loop on worker", rank)
    for epoch_data in generate_epochs(data_shard, num_epochs):
        epochs_read += 1
        batch_start = time.perf_counter()
        for batch in epoch_data.iter_batches(
                prefetch_blocks=prefetch_blocks,
                batch_size=batch_size):
            batch_delay = time.perf_counter() - batch_start
            batch_delays.append(batch_delay)
            batches_read += 1
            if isinstance(batch, pd.DataFrame):
                bytes_read += int(
                    batch.memory_usage(index=True, deep=True).sum())
            elif isinstance(batch, np.ndarray):
                bytes_read += batch.nbytes
            else:
                # NOTE: sys.getsizeof() isn't recursive; for a list of
                # non-primitive objects it only counts the object pointers.
                bytes_read += sys.getsizeof(batch)
            session.report(
                dict(
                    bytes_read=bytes_read,
                    batches_read=batches_read,
                    epochs_read=epochs_read,
                    batch_delay=batch_delay,
                ))
            batch_start = time.perf_counter()
    delta = time.perf_counter() - start

    print("Time to read all data", delta, "seconds")
    print(
        "P50/P95/Max batch delay (s)",
        np.quantile(batch_delays, 0.5),
        np.quantile(batch_delays, 0.95),
        np.max(batch_delays),
    )
    print("Num epochs read", epochs_read)
    print("Num batches read", batches_read)
    print("Num bytes read", round(bytes_read / (1024 * 1024), 2), "MiB")
    print("Mean throughput",
          round(bytes_read / (1024 * 1024) / delta, 2), "MiB/s")

    if rank == 0:
        print("Ingest stats from rank=0:\n\n{}".format(data_shard.stats()))
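
This loop is the ingest benchmark from check_ingest.py; it does no real training, only measures how fast data reaches the workers. Ray ships a small wrapper trainer around it, so invoking it might look roughly like the following sketch (assuming the DummyTrainer from ray.air.util.check_ingest; the constructor arguments num_epochs, prefetch_blocks, and batch_size are assumptions inferred from the loop above and may differ by Ray version).

import ray
from ray.air.config import ScalingConfig
from ray.air.util.check_ingest import DummyTrainer

# Benchmark ingest throughput on synthetic tensor data; no model is trained.
trainer = DummyTrainer(
    scaling_config=ScalingConfig(num_workers=2),
    datasets={"train": ray.data.range_tensor(5000, shape=(80, 80, 3))},
    num_epochs=1,       # assumed kwarg, mirrors num_epochs in the loop
    prefetch_blocks=1,  # assumed kwarg, mirrors prefetch_blocks
    batch_size=None,    # assumed kwarg, mirrors batch_size
)
trainer.fit()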
Example #14
def _huggingface_train_loop_per_worker(config):
    """Per-worker training loop for HuggingFace Transformers."""
    trainer_init_per_worker = config.pop("_trainer_init_per_worker")

    # Env vars necessary for HF to setup DDP
    os.environ["RANK"] = str(session.get_world_rank())
    os.environ["WORLD_SIZE"] = str(session.get_world_size())
    os.environ["LOCAL_RANK"] = str(session.get_local_rank())

    train_dataset = session.get_dataset_shard(TRAIN_DATASET_KEY)
    eval_dataset = session.get_dataset_shard(EVALUATION_DATASET_KEY)

    train_torch_dataset, eval_torch_dataset = process_datasets(
        train_dataset,
        eval_dataset,
    )

    trainer: transformers.trainer.Trainer = trainer_init_per_worker(
        train_torch_dataset, eval_torch_dataset, **config)

    if trainer.args.push_to_hub and not trainer.args.hub_token:
        warnings.warn(
            "You have set `push_to_hub=True` but didn't specify `hub_token`. "
            "Pushing to hub will most likely fail, as the credentials will not "
            "be automatically propagated from the local enviroment to the Ray Actors. "
            "If that happens, specify `hub_token` in `TrainingArguments`.")

    if (trainer.args.evaluation_strategy == "steps"
            or trainer.args.save_strategy == "steps"
            or trainer.args.logging_strategy == "steps"):
        raise ValueError(
            "'steps' value for `evaluation_strategy`, `logging_strategy` "
            "or `save_strategy` is not yet supported.")

    trainer = wrap_transformers_trainer(trainer)

    # Ensure no HF logging callbacks are added: aside from duplicating the
    # functionality of our callbacks, the Wandb callback causes training to
    # freeze.
    integration_callbacks = transformers.trainer.get_reporting_integration_callbacks(
        trainer.args.report_to)
    for callback in integration_callbacks:
        trainer.pop_callback(callback)

    trainer.add_callback(TrainReportCallback)

    checkpoint = session.get_checkpoint()
    checkpoint_path = None
    remove_checkpoint_path = False
    if checkpoint:
        assert isinstance(checkpoint, Checkpoint)
        checkpoint_dict = checkpoint.to_dict()
        source_ip = checkpoint_dict[NODE_IP_KEY]
        source_path = checkpoint_dict[CHECKPOINT_PATH_ON_NODE_KEY]
        target_ip = get_node_ip_address()
        if source_ip == target_ip:
            checkpoint_path = source_path
        else:
            checkpoint_path = tempfile.mkdtemp(
                suffix=Path(trainer.args.output_dir).name)
            remove_checkpoint_path = True
            sync_dir_between_nodes(
                source_ip=source_ip,
                source_path=source_path,
                target_ip=target_ip,
                target_path=checkpoint_path,
                return_futures=False,
                max_size_bytes=None,
            )
    trainer.train(resume_from_checkpoint=checkpoint_path)
    if remove_checkpoint_path:
        shutil.rmtree(checkpoint_path, ignore_errors=True)
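
The function above is the internal per-worker loop that Ray's HuggingFace integration runs; users do not call it directly. A minimal sketch of the user-facing side, assuming Ray AIR's HuggingFaceTrainer API; ray_train_ds and ray_eval_ds are hypothetical Ray Datasets, and the trainer_init_per_worker body is left as a stub.

from ray.air.config import ScalingConfig
from ray.train.huggingface import HuggingFaceTrainer


def trainer_init_per_worker(train_dataset, eval_dataset, **config):
    # Build the model and transformers.TrainingArguments here and return a
    # transformers.Trainer wired to the two per-worker Torch datasets.
    ...


trainer = HuggingFaceTrainer(
    trainer_init_per_worker=trainer_init_per_worker,
    scaling_config=ScalingConfig(num_workers=2, use_gpu=True),
    datasets={"train": ray_train_ds, "evaluation": ray_eval_ds},
)
result = trainer.fit()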
Example #15
def train_func(config):
    use_gpu = config["use_gpu"]
    num_epochs = config["num_epochs"]
    batch_size = config["batch_size"]
    num_layers = config["num_layers"]
    num_hidden = config["num_hidden"]
    dropout_every = config["dropout_every"]
    dropout_prob = config["dropout_prob"]
    num_features = config["num_features"]

    print("Defining model, loss, and optimizer...")

    # Setup device.
    device = torch.device(f"cuda:{session.get_local_rank()}"
                          if use_gpu and torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    # Setup data.
    train_dataset_pipeline = session.get_dataset_shard("train")
    train_dataset_epoch_iterator = train_dataset_pipeline.iter_epochs()
    test_dataset = session.get_dataset_shard("test")
    test_torch_dataset = test_dataset.to_torch(label_column="label",
                                               batch_size=batch_size,
                                               drop_last=True)

    net = Net(
        n_layers=num_layers,
        n_features=num_features,
        num_hidden=num_hidden,
        dropout_every=dropout_every,
        drop_prob=dropout_prob,
    ).to(device)
    print(net.parameters)

    net = train.torch.prepare_model(net)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(net.parameters(), weight_decay=0.0001)

    print("Starting training...")
    for epoch in range(num_epochs):
        train_dataset = next(train_dataset_epoch_iterator)

        train_torch_dataset = train_dataset.to_torch(label_column="label",
                                                     batch_size=batch_size)

        train_running_loss, train_num_correct, train_num_total = train_epoch(
            train_torch_dataset, net, device, criterion, optimizer)
        train_acc = train_num_correct / train_num_total
        print(f"epoch [{epoch + 1}]: training accuracy: "
              f"{train_num_correct} / {train_num_total} = {train_acc:.4f}")

        test_running_loss, test_num_correct, test_num_total = test_epoch(
            test_torch_dataset, net, device, criterion)
        test_acc = test_num_correct / test_num_total
        print(f"epoch [{epoch + 1}]: testing accuracy: "
              f"{test_num_correct} / {test_num_total} = {test_acc:.4f}")

        # Checkpoint model.
        module = net.module if isinstance(net,
                                          DistributedDataParallel) else net
        checkpoint = Checkpoint.from_dict(dict(model=module.state_dict()))

        # Record and log stats.
        print(f"session report on {session.get_world_rank()}")
        session.report(
            dict(
                train_acc=train_acc,
                train_loss=train_running_loss,
                test_acc=test_acc,
                test_loss=test_running_loss,
            ),
            checkpoint=checkpoint,
        )