def __init__(
    self,
    local_path: Optional[str] = None,
    cloud_path: Optional[str] = None,
):
    _TrialCheckpoint.__init__(self)

    # Checkpoint does not allow empty data, but TrialCheckpoint
    # did. To keep backwards compatibility, we use a placeholder URI
    # here, and manually set self._uri and self._local_path later.
    PLACEHOLDER = "s3://placeholder"
    Checkpoint.__init__(self, uri=PLACEHOLDER)

    # Reset local variables
    self._uri = None
    self._local_path = None

    self._cloud_path_tcp = None
    self._local_path_tcp = None

    locations = set()
    if local_path:
        # Add _tcp to not conflict with Checkpoint._local_path
        self._local_path_tcp = local_path
        if os.path.exists(local_path):
            self._local_path = local_path
        locations.add(local_path)
    if cloud_path:
        self._cloud_path_tcp = cloud_path
        self._uri = cloud_path
        locations.add(cloud_path)
    self._locations = locations
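# A minimal usage sketch (not from the source; paths are hypothetical). Note
# that `local_path` is only adopted as `_local_path` if it exists on disk,
# while `cloud_path` always becomes the checkpoint URI.
checkpoint = TrialCheckpoint(
    local_path="/tmp/my_trial/checkpoint_000010",
    cloud_path="s3://my-bucket/my_trial/checkpoint_000010",
)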
def test_preprocessor_in_checkpoint(ray_start_4_cpus, tmpdir):
    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)

    class DummyPreprocessor(Preprocessor):
        def __init__(self):
            super().__init__()
            self.is_same = True

        def fit(self, dataset):
            self.fitted_ = True

        def _transform_pandas(self, df: "pd.DataFrame") -> "pd.DataFrame":
            return df

    trainer = LightGBMTrainer(
        scaling_config=scale_config,
        label_column="target",
        params=params,
        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
        preprocessor=DummyPreprocessor(),
    )
    result = trainer.fit()

    # Move checkpoint to a different directory.
    checkpoint_dict = result.checkpoint.to_dict()
    checkpoint = Checkpoint.from_dict(checkpoint_dict)
    checkpoint_path = checkpoint.to_directory(tmpdir)
    resume_from = Checkpoint.from_directory(checkpoint_path)

    model, preprocessor = load_checkpoint(resume_from)
    assert get_num_trees(model) == 10
    assert preprocessor.is_same
    assert preprocessor.fitted_
def test_resume_from_checkpoint(ray_start_4_cpus, tmpdir):
    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)
    trainer = LightGBMTrainer(
        scaling_config=scale_config,
        label_column="target",
        params=params,
        num_boost_round=5,
        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
    )
    result = trainer.fit()
    checkpoint = result.checkpoint
    model, _ = load_checkpoint(checkpoint)
    assert get_num_trees(model) == 5

    # Move checkpoint to a different directory.
    checkpoint_dict = result.checkpoint.to_dict()
    checkpoint = Checkpoint.from_dict(checkpoint_dict)
    checkpoint_path = checkpoint.to_directory(tmpdir)
    resume_from = Checkpoint.from_directory(checkpoint_path)

    trainer = LightGBMTrainer(
        scaling_config=scale_config,
        label_column="target",
        params=params,
        num_boost_round=5,
        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
        resume_from_checkpoint=resume_from,
    )
    result = trainer.fit()
    checkpoint = result.checkpoint
    model, _ = load_checkpoint(checkpoint)
    assert get_num_trees(model) == 10
def _convert_directory_checkpoint_to_sync_if_needed(
    self, checkpoint: Checkpoint
) -> Checkpoint:
    """Replace the directory checkpoint with a node ip & path dict checkpoint.

    This dict checkpoint will be used to sync the directory.
    If we were to use a directory checkpoint directly, it would get
    deepcopied & serialized unnecessarily."""
    with checkpoint.as_directory() as checkpoint_path:
        # Load checkpoint from path.
        checkpoint_path = Path(checkpoint_path).expanduser().absolute()
        if not checkpoint_path.joinpath(TUNE_CHECKPOINT_ID).exists():
            # If the ID file is missing, we assume that this is already
            # a sync checkpoint.
            dict_checkpoint = checkpoint.to_dict()
            if (
                NODE_IP_KEY not in dict_checkpoint
                or CHECKPOINT_PATH_ON_NODE_KEY not in dict_checkpoint
            ):
                raise ValueError(
                    "Wrong checkpoint format. Ensure the checkpoint is a "
                    "result of `HuggingFaceTrainer`."
                )
            return checkpoint
        with open(checkpoint_path.joinpath(TUNE_CHECKPOINT_ID), "r") as f:
            tune_checkpoint_id = int(f.read())

        return Checkpoint.from_dict(
            {
                NODE_IP_KEY: get_node_ip_address(),
                CHECKPOINT_PATH_ON_NODE_KEY: str(checkpoint_path),
                TUNE_CHECKPOINT_ID: tune_checkpoint_id,
            }
        )
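# A hedged sketch of the sync dict checkpoint built above. The keys are the
# constants referenced in the code; the concrete values are illustrative only.
sync_checkpoint = Checkpoint.from_dict(
    {
        NODE_IP_KEY: "10.0.0.1",  # hypothetical node IP
        CHECKPOINT_PATH_ON_NODE_KEY: "/home/ray/checkpoint_000003",  # hypothetical
        TUNE_CHECKPOINT_ID: 3,
    }
)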
def testLocalCheckpointSerde(self):
    # Local checkpoints are converted to bytes on serialization. Currently
    # this is a pickled dict, so we compare with a dict checkpoint.
    source_checkpoint = Checkpoint.from_dict({"checkpoint_data": 5})
    with source_checkpoint.as_directory() as tmpdir:
        checkpoint = Checkpoint.from_directory(tmpdir)
        self._testCheckpointSerde(
            checkpoint, *source_checkpoint.get_internal_representation()
        )
def testObjRefCheckpointSerde(self):
    # Obj ref checkpoints are dict checkpoints put into the Ray object
    # store, but they have their own data representation (the obj ref).
    # We thus compare with the actual obj ref checkpoint.
    source_checkpoint = Checkpoint.from_dict({"checkpoint_data": 5})
    obj_ref = source_checkpoint.to_object_ref()
    checkpoint = Checkpoint.from_object_ref(obj_ref)

    self._testCheckpointSerde(
        checkpoint, *checkpoint.get_internal_representation()
    )
def testBytesCheckpointSerde(self):
    # Bytes checkpoints are just dict checkpoints constructed
    # from pickled data, so we compare with the source dict checkpoint.
    source_checkpoint = Checkpoint.from_dict({"checkpoint_data": 5})
    blob = source_checkpoint.to_bytes()
    checkpoint = Checkpoint.from_bytes(blob)

    self._testCheckpointSerde(
        checkpoint, *source_checkpoint.get_internal_representation()
    )
def testLocalCheckpointSerde(self):
    # Local checkpoints are converted to bytes on serialization. Currently
    # this is a pickled dict, so we compare with a dict checkpoint.
    source_checkpoint = Checkpoint.from_dict({"checkpoint_data": 5})
    tmpdir = source_checkpoint.to_directory()
    self.addCleanup(shutil.rmtree, tmpdir)

    checkpoint = Checkpoint.from_directory(tmpdir)
    self._testCheckpointSerde(
        checkpoint, *source_checkpoint.get_internal_representation()
    )
def get_best_checkpoint(
    self, trial: Trial, metric: Optional[str] = None, mode: Optional[str] = None
) -> Optional[Checkpoint]:
    """Gets the best persistent checkpoint path of the provided trial.

    Any checkpoints with an associated metric value of ``nan`` will be
    filtered out.

    Args:
        trial: The log directory of a trial, or a trial instance.
        metric: key of trial info to return, e.g. "mean_accuracy".
            "training_iteration" is used by default if no value was
            passed to ``self.default_metric``.
        mode: One of [min, max]. Defaults to ``self.default_mode``.

    Returns:
        :class:`Checkpoint <ray.ml.Checkpoint>` object.
    """
    metric = metric or self.default_metric or TRAINING_ITERATION
    mode = self._validate_mode(mode)

    checkpoint_paths = self.get_trial_checkpoints_paths(trial, metric)

    # Filter out nan. Sorting nan values leads to undefined behavior.
    checkpoint_paths = [
        (path, metric) for path, metric in checkpoint_paths if not is_nan(metric)
    ]

    if not checkpoint_paths:
        logger.error(f"No checkpoints have been found for trial {trial}.")
        return None

    a = -1 if mode == "max" else 1
    best_path_metrics = sorted(checkpoint_paths, key=lambda x: a * x[1])

    best_path, best_metric = best_path_metrics[0]
    cloud_path = self._parse_cloud_path(best_path)

    if self._legacy_checkpoint:
        return TrialCheckpoint(local_path=best_path, cloud_path=cloud_path)

    if cloud_path:
        # Prefer cloud path over local path for downstream processing
        return Checkpoint.from_uri(cloud_path)
    elif os.path.exists(best_path):
        return Checkpoint.from_directory(best_path)
    else:
        logger.error(
            f"No checkpoint locations for {trial} available on "
            f"this node. To avoid this, you "
            f"should enable checkpoint synchronization with the "
            f"`sync_config` argument in Ray Tune. "
            f"The checkpoint may be available on a different node - "
            f"please check this location on worker nodes: {best_path}"
        )
        return None
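# A minimal usage sketch, assuming an analysis object exposing this method
# and a finished `trial` (both hypothetical names here):
best_ckpt = analysis.get_best_checkpoint(trial, metric="mean_accuracy", mode="max")
if best_ckpt is not None:
    # Materialize the checkpoint locally, whether it came from cloud or disk.
    local_dir = best_ckpt.to_directory()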
def from_checkpoint(
    cls,
    checkpoint: Checkpoint,
    *,
    pipeline: Optional[Type[Pipeline]] = None,
    **pipeline_kwargs,
) -> "HuggingFacePredictor":
    """Instantiate the predictor from a Checkpoint.

    The checkpoint is expected to be a result of ``HuggingFaceTrainer``.

    Args:
        checkpoint: The checkpoint to load the model and
            preprocessor from. It is expected to be from the result of a
            ``HuggingFaceTrainer`` run.
        pipeline: A ``transformers.pipelines.Pipeline`` class to use.
            If not specified, will use the ``pipeline`` abstraction
            wrapper.
        **pipeline_kwargs: Any kwargs to pass to the pipeline
            initialization. If ``pipeline`` is None, this must contain
            the 'task' argument. Cannot contain 'model'.
    """
    if not pipeline and "task" not in pipeline_kwargs:
        raise ValueError(
            "If `pipeline` is not specified, 'task' must be passed as a kwarg."
        )
    pipeline = pipeline or pipeline_factory
    with checkpoint.as_directory() as checkpoint_path:
        preprocessor = load_preprocessor_from_dir(checkpoint_path)
        pipeline = pipeline(model=checkpoint_path, **pipeline_kwargs)
    return HuggingFacePredictor(
        pipeline=pipeline,
        preprocessor=preprocessor,
    )
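# A minimal usage sketch (assumed names). Since no `pipeline` class is given,
# the 'task' kwarg is mandatory, as enforced by the check above.
predictor = HuggingFacePredictor.from_checkpoint(
    checkpoint,  # a Checkpoint from a HuggingFaceTrainer run
    task="text-classification",
)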
def create_checkpoint(
    preprocessor: Optional[Preprocessor] = None, config: Optional[dict] = None
) -> Checkpoint:
    rl_trainer = RLTrainer(
        algorithm=_DummyTrainer,
        config=config or {},
        preprocessor=preprocessor,
    )
    rl_trainable_cls = rl_trainer.as_trainable()
    rl_trainable = rl_trainable_cls()

    with tempfile.TemporaryDirectory() as checkpoint_dir:
        checkpoint_file = rl_trainable.save(checkpoint_dir)
        checkpoint_path = TrainableUtil.find_checkpoint_dir(checkpoint_file)
        checkpoint_data = Checkpoint.from_directory(checkpoint_path).to_dict()

    return Checkpoint.from_dict(checkpoint_data)
def _load_checkpoint(
    checkpoint: Checkpoint, trainer_name: str
) -> Tuple[Any, Optional[Preprocessor]]:
    """Load a Ray Train Checkpoint.

    This is a private API.

    Args:
        checkpoint: The checkpoint to load the weights and
            preprocessor from.
        trainer_name: Trainer class name to use in error message.

    Returns:
        The model or weights and AIR preprocessor contained within.
    """
    checkpoint_dict = checkpoint.to_dict()
    preprocessor = checkpoint_dict.get(PREPROCESSOR_KEY, None)
    if MODEL_KEY not in checkpoint_dict:
        raise RuntimeError(
            f"No item with key: {MODEL_KEY} is found in the "
            f"Checkpoint. Make sure this key exists when saving the "
            f"checkpoint in ``{trainer_name}``."
        )
    model = checkpoint_dict[MODEL_KEY]
    return model, preprocessor
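# A hedged sketch of the dict layout `_load_checkpoint` expects. MODEL_KEY and
# PREPROCESSOR_KEY are the constants used above; `model` and `preprocessor`
# stand in for real objects.
checkpoint = Checkpoint.from_dict({MODEL_KEY: model, PREPROCESSOR_KEY: preprocessor})
model, preprocessor = _load_checkpoint(checkpoint, "MyTrainer")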
def from_checkpoint(
    cls,
    checkpoint: Checkpoint,
    model_definition: Union[Callable[[], tf.keras.Model], Type[tf.keras.Model]],
) -> "TensorflowPredictor":
    """Instantiate the predictor from a Checkpoint.

    The checkpoint is expected to be a result of ``TensorflowTrainer``.

    Args:
        checkpoint: The checkpoint to load the model and
            preprocessor from. It is expected to be from the result of a
            ``TensorflowTrainer`` run.
        model_definition: A callable that returns a TensorFlow Keras
            model to use. Model weights will be loaded from the
            checkpoint.
    """
    checkpoint_dict = checkpoint.to_dict()
    preprocessor = checkpoint_dict.get(PREPROCESSOR_KEY, None)
    if MODEL_KEY not in checkpoint_dict:
        raise RuntimeError(
            f"No item with key: {MODEL_KEY} is found in the "
            f"Checkpoint. Make sure this key exists when saving the "
            f"checkpoint in ``TensorflowTrainer``."
        )
    model_weights = checkpoint_dict[MODEL_KEY]
    return TensorflowPredictor(
        model_definition=model_definition,
        model_weights=model_weights,
        preprocessor=preprocessor,
    )
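# A minimal usage sketch, assuming a hypothetical `build_model` callable that
# returns a tf.keras.Model with the same architecture used in training:
predictor = TensorflowPredictor.from_checkpoint(checkpoint, build_model)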
def testDataCheckpointSerde(self):
    # Data checkpoints keep the same internal representation, including
    # their data.
    checkpoint = Checkpoint.from_dict({"checkpoint_data": 5})

    self._testCheckpointSerde(
        checkpoint, *checkpoint.get_internal_representation()
    )
def testUriCheckpointSerde(self):
    # URI checkpoints keep the same internal representation, pointing to
    # a remote location.
    checkpoint = Checkpoint.from_uri("s3://some/bucket")

    self._testCheckpointSerde(
        checkpoint, *checkpoint.get_internal_representation()
    )
def from_checkpoint(
    cls, checkpoint: Checkpoint, model: Optional[torch.nn.Module] = None
) -> "TorchPredictor":
    """Instantiate the predictor from a Checkpoint.

    The checkpoint is expected to be a result of ``TorchTrainer``.

    Args:
        checkpoint: The checkpoint to load the model and
            preprocessor from. It is expected to be from the result of a
            ``TorchTrainer`` run.
        model: If the checkpoint contains a model state dict, and not
            the model itself, then the state dict will be loaded to this
            ``model``.
    """
    checkpoint_dict = checkpoint.to_dict()
    preprocessor = checkpoint_dict.get(PREPROCESSOR_KEY, None)
    if MODEL_KEY not in checkpoint_dict:
        raise RuntimeError(
            f"No item with key: {MODEL_KEY} is found in the "
            f"Checkpoint. Make sure this key exists when saving the "
            f"checkpoint in ``TorchTrainer``."
        )
    model = load_torch_model(
        saved_model=checkpoint_dict[MODEL_KEY], model_definition=model
    )
    return TorchPredictor(model=model, preprocessor=preprocessor)
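# A minimal usage sketch. If the checkpoint holds only a state dict, the
# architecture must be supplied; `MyModel` is a hypothetical torch.nn.Module.
predictor = TorchPredictor.from_checkpoint(checkpoint, model=MyModel())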
def load_checkpoint(
    checkpoint: Checkpoint,
    env: Optional[EnvType] = None,
) -> Tuple[Policy, Optional[Preprocessor]]:
    """Load a Checkpoint from ``RLTrainer``.

    Args:
        checkpoint: The checkpoint to load the policy and
            preprocessor from. It is expected to be from the result of a
            ``RLTrainer`` run.
        env: Optional environment to instantiate the trainer with. If not
            given, it is parsed from the saved trainer configuration
            instead.

    Returns:
        The policy and AIR preprocessor contained within.
    """
    with checkpoint.as_directory() as checkpoint_path:
        trainer_class_path = os.path.join(checkpoint_path, RL_TRAINER_CLASS_FILE)
        config_path = os.path.join(checkpoint_path, RL_CONFIG_FILE)

        if not os.path.exists(trainer_class_path):
            raise ValueError(
                f"RLPredictor only works with checkpoints created by "
                f"RLTrainer. The checkpoint you specified is missing the "
                f"`{RL_TRAINER_CLASS_FILE}` file."
            )

        if not os.path.exists(config_path):
            raise ValueError(
                f"RLPredictor only works with checkpoints created by "
                f"RLTrainer. The checkpoint you specified is missing the "
                f"`{RL_CONFIG_FILE}` file."
            )

        with open(trainer_class_path, "rb") as fp:
            trainer_cls = cpickle.load(fp)

        with open(config_path, "rb") as fp:
            config = cpickle.load(fp)

        checkpoint_data_path = None
        for file in os.listdir(checkpoint_path):
            if file.startswith("checkpoint") and not file.endswith(".tune_metadata"):
                checkpoint_data_path = os.path.join(checkpoint_path, file)

        if not checkpoint_data_path:
            raise ValueError(
                f"Could not find checkpoint data in RLlib checkpoint. "
                f"Found files: {list(os.listdir(checkpoint_path))}"
            )

        preprocessor = load_preprocessor_from_dir(checkpoint_path)

        config.get("evaluation_config", {}).pop("in_evaluation", None)
        trainer = trainer_cls(config=config, env=env)
        trainer.restore(checkpoint_data_path)

        policy = trainer.get_policy()
        return policy, preprocessor
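# A minimal usage sketch (assumed names). `Policy.compute_single_action`
# returns an (action, state_outs, info) tuple in RLlib.
policy, preprocessor = load_checkpoint(result.checkpoint, env="CartPole-v0")
action, _, _ = policy.compute_single_action(observation)  # observation assumed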
def load_checkpoint(
    checkpoint: Checkpoint,
    model: Union[Type[transformers.modeling_utils.PreTrainedModel], torch.nn.Module],
    tokenizer: Optional[Type[transformers.PreTrainedTokenizer]] = None,
    *,
    tokenizer_kwargs: Optional[Dict[str, Any]] = None,
    **pretrained_model_kwargs,
) -> Tuple[
    Union[transformers.modeling_utils.PreTrainedModel, torch.nn.Module],
    transformers.training_args.TrainingArguments,
    Optional[transformers.PreTrainedTokenizer],
    Optional[Preprocessor],
]:
    """Load a Checkpoint from ``HuggingFaceTrainer``.

    Args:
        checkpoint: The checkpoint to load the model and
            preprocessor from. It is expected to be from the result of a
            ``HuggingFaceTrainer`` run.
        model: Either a ``transformers.PreTrainedModel`` class
            (e.g. ``AutoModelForCausalLM``), or a PyTorch model to load
            the weights to. This should be the same model used for
            training.
        tokenizer: A ``transformers.PreTrainedTokenizer`` class to load
            the model tokenizer to. If not specified, the tokenizer will
            not be loaded. Will throw an exception if specified, but no
            tokenizer was found in the checkpoint.
        tokenizer_kwargs: Dict of kwargs to pass to
            ``tokenizer.from_pretrained`` call. Ignored if ``tokenizer``
            is None.
        **pretrained_model_kwargs: Kwargs to pass to
            ``model.from_pretrained`` call. Ignored if ``model`` is not a
            ``transformers.PreTrainedModel`` class.

    Returns:
        The model, ``TrainingArguments``, tokenizer and AIR preprocessor
        contained within. Those can be used to initialize a
        ``transformers.Trainer`` object locally.
    """
    tokenizer_kwargs = tokenizer_kwargs or {}
    with checkpoint.as_directory() as checkpoint_path:
        preprocessor = load_preprocessor_from_dir(checkpoint_path)
        if isinstance(model, torch.nn.Module):
            state_dict = torch.load(
                os.path.join(checkpoint_path, WEIGHTS_NAME), map_location="cpu"
            )
            model = load_torch_model(saved_model=state_dict, model_definition=model)
        else:
            model = model.from_pretrained(checkpoint_path, **pretrained_model_kwargs)
        if tokenizer:
            tokenizer = tokenizer.from_pretrained(checkpoint_path, **tokenizer_kwargs)
        training_args_path = os.path.join(checkpoint_path, TRAINING_ARGS_NAME)
        if os.path.exists(training_args_path):
            with open(training_args_path, "rb") as f:
                training_args = torch.load(f, map_location="cpu")
        else:
            training_args = None
    return model, training_args, tokenizer, preprocessor
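# A minimal usage sketch, assuming the checkpoint was produced by
# HuggingFaceTrainer with a causal LM and a saved tokenizer:
from transformers import AutoModelForCausalLM, AutoTokenizer

model, training_args, tokenizer, preprocessor = load_checkpoint(
    checkpoint, AutoModelForCausalLM, tokenizer=AutoTokenizer
)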
def _trial_to_result(self, trial: Trial) -> Result:
    result = Result(
        checkpoint=Checkpoint.from_directory(trial.checkpoint.value)
        if trial.checkpoint.value
        else None,
        metrics=trial.last_result,
        error=self._populate_exception(trial),
    )
    return result
def _prepare_dict_checkpoint(self) -> Checkpoint:
    # Create checkpoint from dict
    checkpoint = Checkpoint.from_dict(self.checkpoint_dict_data)
    self.assertIsInstance(checkpoint, Checkpoint)
    self.assertTrue(checkpoint._data_dict)
    self.assertEqual(
        checkpoint._data_dict["metric"], self.checkpoint_dict_data["metric"]
    )
    return checkpoint
def _prepare_fs_checkpoint(self) -> Checkpoint:
    # Create checkpoint from fs
    checkpoint = Checkpoint.from_directory(self.checkpoint_dir)
    self.assertIsInstance(checkpoint, Checkpoint)
    self.assertIsInstance(checkpoint._local_path, str)
    self.assertEqual(checkpoint._local_path, self.checkpoint_dir)
    return checkpoint
def test_predict_from_checkpoint_no_preprocessor(model):
    checkpoint = Checkpoint.from_dict({MODEL_KEY: model})
    predictor = TorchPredictor.from_checkpoint(checkpoint)

    data_batch = np.array([[1], [2], [3]])
    predictions = predictor.predict(data_batch)

    assert len(predictions) == 3
    assert predictions.to_numpy().flatten().tolist() == [2, 4, 6]
def test_init(model, preprocessor):
    predictor = TorchPredictor(model=model, preprocessor=preprocessor)

    checkpoint = {MODEL_KEY: model, PREPROCESSOR_KEY: preprocessor}
    checkpoint_predictor = TorchPredictor.from_checkpoint(
        Checkpoint.from_dict(checkpoint)
    )

    assert checkpoint_predictor.model == predictor.model
    assert checkpoint_predictor.preprocessor == predictor.preprocessor
def test_predict_array():
    checkpoint = {MODEL_KEY: weights}
    predictor = TensorflowPredictor.from_checkpoint(
        Checkpoint.from_dict(checkpoint), build_model
    )

    data_batch = np.array([[1], [2], [3]])
    predictions = predictor.predict(data_batch)

    assert len(predictions) == 3
    assert predictions.to_numpy().flatten().tolist() == [1, 2, 3]
def write_checkpoint(self, checkpoint: Dict):
    self.add_tune_checkpoint_id(checkpoint)

    # Add the preprocessor to the checkpoint.
    checkpoint[PREPROCESSOR_KEY] = self.preprocessor

    checkpoint_obj = Checkpoint.from_dict(checkpoint)
    # If inside a Tune Trainable, then checkpoint with Tune.
    with tune.checkpoint_dir(step=self._latest_checkpoint_id) as checkpoint_dir:
        checkpoint_obj.to_directory(path=checkpoint_dir)
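# A hedged round-trip sketch: the dict written above can later be recovered
# from the Tune checkpoint directory (names assumed for illustration).
restored = Checkpoint.from_directory(checkpoint_dir).to_dict()
preprocessor = restored[PREPROCESSOR_KEY]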
def test_fs_delete_at_uri(self):
    """Test that the clear bucket utility works."""
    checkpoint = self._prepare_fs_checkpoint()

    # Upload the checkpoint to the cloud URI, then delete it there.
    location = checkpoint.to_uri(self.cloud_uri)
    delete_at_uri(location)

    checkpoint = Checkpoint.from_uri(location)
    with self.assertRaises(FileNotFoundError):
        checkpoint.to_directory()
def test_init():
    preprocessor = DummyPreprocessor()
    preprocessor.attr = 1
    predictor = LightGBMPredictor(model=model, preprocessor=preprocessor)

    with tempfile.TemporaryDirectory() as tmpdir:
        # This somewhat convoluted procedure is the same as in the
        # Trainers. The reason for saving the model to disk instead of
        # directly into the dict as bytes is that all callbacks follow
        # the save-to-disk logic. GBDT models are small enough that IO
        # should not be an issue.
        model.save_model(os.path.join(tmpdir, MODEL_KEY))
        checkpoint = Checkpoint.from_dict({PREPROCESSOR_KEY: preprocessor})
        checkpoint.to_directory(path=tmpdir)
        checkpoint = Checkpoint.from_directory(tmpdir)

        checkpoint_predictor = LightGBMPredictor.from_checkpoint(checkpoint)

    assert get_num_trees(checkpoint_predictor.model) == get_num_trees(predictor.model)
    assert checkpoint_predictor.preprocessor.attr == predictor.preprocessor.attr
def test_predict_no_preprocessor():
    with tempfile.TemporaryDirectory() as tmpdir:
        with open(os.path.join(tmpdir, MODEL_KEY), "wb") as f:
            cpickle.dump(model, f)

        checkpoint = Checkpoint.from_directory(tmpdir)
        predictor = SklearnPredictor.from_checkpoint(checkpoint)

    data_batch = np.array([[1, 2], [3, 4], [5, 6]])
    predictions = predictor.predict(data_batch)

    assert len(predictions) == 3
def train_func(config, checkpoint_dir=None):
    # config already contains merged values.
    # Instantiate new Trainer in Trainable.
    trainer = trainer_cls(**config)

    if checkpoint_dir:
        trainer.resume_from_checkpoint = Checkpoint.from_directory(checkpoint_dir)

    trainer.setup()
    trainer.preprocess_datasets()
    trainer.training_loop()
def _trial_to_result(self, trial: Trial) -> Result:
    if trial.checkpoint.value:
        checkpoint_dir = TrainableUtil.find_checkpoint_dir(trial.checkpoint.value)
        checkpoint = Checkpoint.from_directory(checkpoint_dir)
    else:
        checkpoint = None

    result = Result(
        checkpoint=checkpoint,
        metrics=trial.last_result.copy(),
        error=self._populate_exception(trial),
    )
    return result