Example #1
    def to_air_checkpoint(self) -> Optional[Checkpoint]:
        from ray.tune.trainable.util import TrainableUtil

        checkpoint_data = self.dir_or_data

        if not checkpoint_data:
            return None

        if isinstance(checkpoint_data, ray.ObjectRef):
            checkpoint_data = ray.get(checkpoint_data)

        if isinstance(checkpoint_data, str):
            checkpoint_dir = TrainableUtil.find_checkpoint_dir(checkpoint_data)
            checkpoint = Checkpoint.from_directory(checkpoint_dir)
        elif isinstance(checkpoint_data, bytes):
            with tempfile.TemporaryDirectory() as tmpdir:
                TrainableUtil.create_from_pickle(checkpoint_data, tmpdir)
                # Double wrap in checkpoint so we hold the data in memory and
                # can remove the temp directory
                checkpoint = Checkpoint.from_dict(
                    Checkpoint.from_directory(tmpdir).to_dict())
        elif isinstance(checkpoint_data, dict):
            checkpoint = Checkpoint.from_dict(checkpoint_data)
        else:
            raise RuntimeError(
                f"Unknown checkpoint data type: {type(checkpoint_data)}")

        return checkpoint
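
The method above branches on the three representations checkpoint data can take: a directory path, raw bytes, or a dict. Below is a minimal sketch (not part of the source) of the dict and bytes round trips it relies on, assuming ray.air.Checkpoint from Ray 2.x:

from ray.air import Checkpoint

# Dict-backed checkpoint: round-trips through from_dict()/to_dict().
ckpt = Checkpoint.from_dict({"iter": 3})
assert ckpt.to_dict()["iter"] == 3

# Bytes-backed checkpoint: to_bytes() serializes the checkpoint so it can be
# shipped between processes and rebuilt with from_bytes().
blob = ckpt.to_bytes()
assert Checkpoint.from_bytes(blob).to_dict()["iter"] == 3
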
Example #2
def test_retry(ray_start_4_cpus):
    def train_func():
        ckpt = session.get_checkpoint()
        restored = bool(ckpt)  # Does a previous checkpoint exist?
        itr = 0
        if ckpt:
            ckpt = ckpt.to_dict()
            itr = ckpt["iter"] + 1

        for i in range(itr, 4):
            if i == 2 and not restored:
                raise Exception("try to fail me")
            session.report(
                dict(test=i, training_iteration=i),
                checkpoint=Checkpoint.from_dict(dict(iter=i)),
            )

    trainer = DataParallelTrainer(
        train_func,
        backend_config=TestConfig(),
        scaling_config=ScalingConfig(num_workers=1),
    )
    tuner = Tuner(
        trainer, run_config=RunConfig(failure_config=FailureConfig(max_failures=3))
    )

    analysis = tuner.fit()._experiment_analysis
    checkpoint_path = analysis.trials[0].checkpoint.dir_or_data
    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["iter"] == 3

    trial_dfs = list(analysis.trial_dataframes.values())
    assert len(trial_dfs[0]["training_iteration"]) == 4
Example #3
def test_reuse_checkpoint(ray_start_4_cpus):
    def train_func(config):
        itr = 0
        ckpt = session.get_checkpoint()
        if ckpt is not None:
            ckpt = ckpt.to_dict()
            itr = ckpt["iter"] + 1

        for i in range(itr, config["max_iter"]):
            session.report(
                dict(test=i, training_iteration=i),
                checkpoint=Checkpoint.from_dict(dict(iter=i)),
            )

    trainer = DataParallelTrainer(
        train_func,
        backend_config=TestConfig(),
        scaling_config=ScalingConfig(num_workers=1),
    )
    tuner = Tuner(
        trainer,
        param_space={"train_loop_config": {"max_iter": 5}},
    )
    [trial] = tuner.fit()._experiment_analysis.trials
    checkpoint_path = trial.checkpoint.dir_or_data
    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["iter"] == 4

    tuner = Tuner(
        trainer,
        param_space={"train_loop_config": {"max_iter": 10}},
    ).restore(trial.local_dir)
    analysis = tuner.fit()._experiment_analysis
    trial_dfs = list(analysis.trial_dataframes.values())
    assert len(trial_dfs[0]["training_iteration"]) == 5
Example #4
    def __init__(
        self, checkpoint: Checkpoint, predictor_cls: Type[Predictor], **predictor_kwargs
    ):
        # Store as object ref so we only serialize it once for all map workers
        self.checkpoint_ref = checkpoint.to_object_ref()
        self.predictor_cls = predictor_cls
        self.predictor_kwargs = predictor_kwargs
Example #5
def load_checkpoint_from_path(checkpoint_to_load: Union[str, Path]) -> Dict:
    """Utility function to load a checkpoint Dict from a path."""
    checkpoint_path = Path(checkpoint_to_load).expanduser()
    if not checkpoint_path.exists():
        raise ValueError(f"Checkpoint path {checkpoint_path} does not exist.")
    checkpoint = Checkpoint.from_directory(str(checkpoint_path))
    return checkpoint.to_dict()
Example #6
    def restore(self, trial: Trial) -> None:
        """Restores training state from a given model checkpoint.

        Args:
            trial: The trial to be restored.

        Raises:
            RuntimeError: This error is raised if no runner is found.
            AbortTrialExecution: This error is raised if the trial is
                ineligible for restoration, given the Tune input arguments.
        """
        checkpoint = trial.checkpoint
        if checkpoint.dir_or_data is None:
            return
        if trial.runner is None:
            raise RuntimeError(
                "Trial {}: Unable to restore - no runner found.".format(trial))
        checkpoint_dir = checkpoint.dir_or_data
        node_ip = checkpoint.node_ip
        if checkpoint.storage_mode == CheckpointStorage.MEMORY:
            logger.debug("Trial %s: Attempting restore from object", trial)
            # Note that we don't store the remote since in-memory checkpoints
            # don't guarantee fault tolerance and don't need to be waited on.
            with self._change_working_directory(trial):
                trial.runner.restore_from_object.remote(checkpoint_dir)
        else:
            logger.debug("Trial %s: Attempting restore from %s", trial,
                         checkpoint_dir)
            if (trial.uses_cloud_checkpointing or not trial.sync_on_checkpoint
                    or not os.path.exists(checkpoint_dir)):
                # If using cloud checkpointing, trial will get cp from cloud.
                # If not syncing to driver, assume it has access to the cp
                # on the local fs.
                with self._change_working_directory(trial):
                    remote = trial.runner.restore.remote(
                        checkpoint_dir, node_ip)
            elif trial.sync_on_checkpoint:
                # This provides FT backwards compatibility in the
                # case where no cloud checkpoints are provided.
                logger.debug("Trial %s: Reading checkpoint into memory", trial)
                checkpoint_path = TrainableUtil.find_checkpoint_dir(
                    checkpoint_dir)
                obj = Checkpoint.from_directory(checkpoint_path).to_bytes()
                with self._change_working_directory(trial):
                    remote = trial.runner.restore_from_object.remote(obj)
            else:
                raise _AbortTrialExecution(
                    "Pass in `sync_on_checkpoint=True` for driver-based trial"
                    "restoration. Pass in an `upload_dir` for remote "
                    "storage-based restoration")

            self._futures[remote] = (_ExecutorEventType.RESTORING_RESULT,
                                     trial)
            trial.restoring_from = checkpoint
Example #7
    def train_func(config):
        itr = 0
        ckpt = session.get_checkpoint()
        if ckpt is not None:
            ckpt = ckpt.to_dict()
            itr = ckpt["iter"] + 1

        for i in range(itr, config["max_iter"]):
            session.report(
                dict(test=i, training_iteration=i),
                checkpoint=Checkpoint.from_dict(dict(iter=i)),
            )
Example #8
def test_tune_checkpoint_legacy(ray_start_4_cpus):
    def train_func():
        for i in range(10):
            train.report(test=i)
        train.save_checkpoint(hello="world")

    trainer = Trainer(TestConfig(), num_workers=1)
    TestTrainable = trainer.to_tune_trainable(train_func)

    [trial] = tune.run(TestTrainable).trials
    checkpoint_path = trial.checkpoint.dir_or_data
    assert os.path.exists(checkpoint_path)
    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["hello"] == "world"
Example #9
    def to_air_checkpoint(self) -> Optional[Checkpoint]:
        from ray.tune.trainable.util import TrainableUtil

        checkpoint_data = self.dir_or_data

        if not checkpoint_data:
            return None

        if isinstance(checkpoint_data, ray.ObjectRef):
            checkpoint_data = ray.get(checkpoint_data)

        if isinstance(checkpoint_data, str):
            try:
                checkpoint_dir = TrainableUtil.find_checkpoint_dir(
                    checkpoint_data)
            except FileNotFoundError:
                if log_once("checkpoint_not_available"):
                    logger.error(
                        f"The requested checkpoint is not available on this node, "
                        f"most likely because you are using Ray client or disabled "
                        f"checkpoint synchronization. To avoid this, enable checkpoint "
                        f"synchronization to cloud storage by specifying a "
                        f"`SyncConfig`. The checkpoint may be available on a different "
                        f"node - please check this location on worker nodes: "
                        f"{checkpoint_data}")
                return None
            checkpoint = Checkpoint.from_directory(checkpoint_dir)
        elif isinstance(checkpoint_data, bytes):
            checkpoint = Checkpoint.from_bytes(checkpoint_data)
        elif isinstance(checkpoint_data, dict):
            checkpoint = Checkpoint.from_dict(checkpoint_data)
        else:
            raise RuntimeError(
                f"Unknown checkpoint data type: {type(checkpoint_data)}")

        return checkpoint
Example #10
    def train_func():
        ckpt = session.get_checkpoint()
        restored = bool(ckpt)  # Does a previous checkpoint exist?
        itr = 0
        if ckpt:
            ckpt = ckpt.to_dict()
            itr = ckpt["iter"] + 1

        for i in range(itr, 4):
            if i == 2 and not restored:
                raise Exception("try to fail me")
            session.report(
                dict(test=i, training_iteration=i),
                checkpoint=Checkpoint.from_dict(dict(iter=i)),
            )
Example #11
def test_resume_from_checkpoint():
    with pytest.raises(ValueError):
        DummyTrainer(resume_from_checkpoint="invalid")

    with pytest.raises(ValueError):
        DummyTrainer(resume_from_checkpoint=False)

    with pytest.raises(ValueError):
        DummyTrainer(resume_from_checkpoint=True)

    with pytest.raises(ValueError):
        DummyTrainer(resume_from_checkpoint={})

    # Succeed
    DummyTrainer(resume_from_checkpoint=None)

    # Succeed
    DummyTrainer(resume_from_checkpoint=Checkpoint.from_dict({"empty": ""}))
Example #12
    def commit(self, path: Optional[Path] = None) -> None:
        """Commit checkpoint to disk, if needed.

        Args:
            path: Path to commit checkpoint to.
        """
        if self.storage_mode == CheckpointStorage.MEMORY:
            # Do not persist memory checkpoints
            return

        if not path:
            # If no path is given, skip
            return

        if not isinstance(self.dir_or_data, dict):
            # Only persist dictionaries
            return

        checkpoint = Checkpoint.from_dict(self.dir_or_data)
        self.dir_or_data = checkpoint.to_directory(str(path))
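
The commit() method above persists a dict checkpoint by materializing it at the given path. A minimal sketch of that step in isolation, assuming ray.air.Checkpoint and a hypothetical temporary target directory:

import tempfile

from ray.air import Checkpoint

# Materialize a dict-backed checkpoint to disk, as commit() does for
# non-memory checkpoints.
target_dir = tempfile.mkdtemp("commit_demo")  # hypothetical location
written_path = Checkpoint.from_dict({"iter": 2}).to_directory(target_dir)

# The written directory can be wrapped again later and read back as a dict.
assert Checkpoint.from_directory(written_path).to_dict()["iter"] == 2
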
Example #13
def test_reuse_checkpoint_legacy(ray_start_4_cpus):
    def train_func(config):
        itr = 0
        ckpt = train.load_checkpoint()
        if ckpt is not None:
            itr = ckpt["iter"] + 1

        for i in range(itr, config["max_iter"]):
            train.save_checkpoint(iter=i)
            train.report(test=i, training_iteration=i)

    trainer = Trainer(TestConfig(), num_workers=1)
    TestTrainable = trainer.to_tune_trainable(train_func)

    [trial] = tune.run(TestTrainable, config={"max_iter": 5}).trials
    checkpoint_path = trial.checkpoint.dir_or_data
    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["iter"] == 4
    analysis = tune.run(TestTrainable, config={"max_iter": 10}, restore=checkpoint_path)
    trial_dfs = list(analysis.trial_dataframes.values())
    assert len(trial_dfs[0]["training_iteration"]) == 5
Example #14
def test_tune_checkpoint(ray_start_4_cpus):
    def train_func():
        for i in range(9):
            session.report(dict(test=i))
        session.report(
            dict(test=i + 1), checkpoint=Checkpoint.from_dict(dict(hello="world"))
        )

    trainer = DataParallelTrainer(
        train_func,
        backend_config=TestConfig(),
        scaling_config=ScalingConfig(num_workers=1),
    )
    tuner = Tuner(
        trainer,
        param_space={"train_loop_config": {"max_iter": 5}},
    )

    [trial] = tuner.fit()._experiment_analysis.trials
    checkpoint_path = trial.checkpoint.dir_or_data
    assert os.path.exists(checkpoint_path)
    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["hello"] == "world"
Example #15
def test_retry_legacy(ray_start_4_cpus):
    def train_func():
        ckpt = train.load_checkpoint()
        restored = bool(ckpt)  # Does a previous checkpoint exist?
        itr = 0
        if ckpt:
            itr = ckpt["iter"] + 1

        for i in range(itr, 4):
            if i == 2 and not restored:
                raise Exception("try to fail me")
            train.save_checkpoint(iter=i)
            train.report(test=i, training_iteration=i)

    trainer = Trainer(TestConfig(), num_workers=1)
    TestTrainable = trainer.to_tune_trainable(train_func)

    analysis = tune.run(TestTrainable, max_failures=3)
    checkpoint_path = analysis.trials[0].checkpoint.dir_or_data
    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["iter"] == 3

    trial_dfs = list(analysis.trial_dataframes.values())
    assert len(trial_dfs[0]["training_iteration"]) == 4
Example #16
def train_func(config):
    use_gpu = config["use_gpu"]
    num_epochs = config["num_epochs"]
    batch_size = config["batch_size"]
    num_layers = config["num_layers"]
    num_hidden = config["num_hidden"]
    dropout_every = config["dropout_every"]
    dropout_prob = config["dropout_prob"]
    num_features = config["num_features"]

    print("Defining model, loss, and optimizer...")

    # Setup device.
    device = torch.device(f"cuda:{session.get_local_rank()}"
                          if use_gpu and torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    # Setup data.
    train_dataset_pipeline = session.get_dataset_shard("train")
    train_dataset_epoch_iterator = train_dataset_pipeline.iter_epochs()
    test_dataset = session.get_dataset_shard("test")
    test_torch_dataset = test_dataset.to_torch(label_column="label",
                                               batch_size=batch_size,
                                               drop_last=True)

    net = Net(
        n_layers=num_layers,
        n_features=num_features,
        num_hidden=num_hidden,
        dropout_every=dropout_every,
        drop_prob=dropout_prob,
    ).to(device)
    print(net.parameters)

    net = train.torch.prepare_model(net)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(net.parameters(), weight_decay=0.0001)

    print("Starting training...")
    for epoch in range(num_epochs):
        train_dataset = next(train_dataset_epoch_iterator)

        train_torch_dataset = train_dataset.to_torch(label_column="label",
                                                     batch_size=batch_size)

        train_running_loss, train_num_correct, train_num_total = train_epoch(
            train_torch_dataset, net, device, criterion, optimizer)
        train_acc = train_num_correct / train_num_total
        print(f"epoch [{epoch + 1}]: training accuracy: "
              f"{train_num_correct} / {train_num_total} = {train_acc:.4f}")

        test_running_loss, test_num_correct, test_num_total = test_epoch(
            test_torch_dataset, net, device, criterion)
        test_acc = test_num_correct / test_num_total
        print(f"epoch [{epoch + 1}]: testing accuracy: "
              f"{test_num_correct} / {test_num_total} = {test_acc:.4f}")

        # Checkpoint model.
        module = net.module if isinstance(net,
                                          DistributedDataParallel) else net
        checkpoint = Checkpoint.from_dict(dict(model=module.state_dict()))

        # Record and log stats.
        print(f"session report on {session.get_world_rank()}")
        session.report(
            dict(
                train_acc=train_acc,
                train_loss=train_running_loss,
                test_acc=test_acc,
                test_loss=test_running_loss,
            ),
            checkpoint=checkpoint,
        )
Example #17
    def __init__(self):
        checkpoint = Checkpoint.from_object_ref(checkpoint_ref)
        self.predictor = predictor_cls.from_checkpoint(
            checkpoint, **predictor_kwargs
        )
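
Examples #4 and #17 together show the object-ref pattern: the driver serializes the checkpoint into the object store once with to_object_ref(), and each worker rebuilds it with from_object_ref() instead of receiving its own copy. A rough sketch of that round trip, assuming ray.air.Checkpoint and a running Ray instance:

import ray
from ray.air import Checkpoint

ray.init(ignore_reinit_error=True)

# Put the checkpoint into the object store once on the driver side.
checkpoint_ref = Checkpoint.from_dict({"iter": 7}).to_object_ref()

# Wherever the ref is shipped (e.g. inside a map worker's __init__, as in
# Example #17), the Checkpoint can be reconstructed from it.
assert Checkpoint.from_object_ref(checkpoint_ref).to_dict()["iter"] == 7
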
Example #18
    def train_func():
        for i in range(9):
            session.report(dict(test=i))
        session.report(
            dict(test=i + 1), checkpoint=Checkpoint.from_dict(dict(hello="world"))
        )
Example #19
def function_trainable_directory(config):
    tmpdir = tempfile.mkdtemp("checkpoint_test")
    with open(os.path.join(tmpdir, "data.json"), "w") as f:
        json.dump({"checkpoint_data": 5}, f)
    session.report({"metric": 4}, checkpoint=Checkpoint.from_directory(tmpdir))
Example #20
def function_trainable_dict(config):
    session.report({"metric": 2},
                   checkpoint=Checkpoint.from_dict({"checkpoint_data": 3}))