class TestBasic(DataParallelTrainer):
    _dataset_config = {
        "train": DatasetConfig(split=True, required=True),
        "test": DatasetConfig(),
        "baz": DatasetConfig(split=True),
    }

    def __init__(
        self, num_workers: int, expect_ds: bool, expect_sizes: Optional[dict], **kwargs
    ):
        def train_loop_per_worker():
            data_shard = session.get_dataset_shard("train")
            if expect_ds:
                assert isinstance(data_shard, Dataset), data_shard
            else:
                assert isinstance(data_shard, DatasetPipeline), data_shard
            for k, v in expect_sizes.items():
                shard = session.get_dataset_shard(k)
                if v == -1:
                    assert shard is None, shard
                else:
                    if isinstance(shard, DatasetPipeline):
                        assert next(shard.iter_epochs()).count() == v, shard
                    else:
                        assert shard.count() == v, shard

        kwargs.pop("scaling_config", None)
        super().__init__(
            train_loop_per_worker=train_loop_per_worker,
            scaling_config=ScalingConfig(num_workers=num_workers),
            **kwargs,
        )
def get_datasets_and_configs(
    a=5, b=10, size=1000, split=0.8
) -> Tuple[Dict[str, Dataset], Dict[str, DatasetConfig]]:
    def get_dataset(a, b, size) -> Dataset:
        items = [i / size for i in range(size)]
        dataset = ray.data.from_items([{"x": x, "y": a * x + b} for x in items])
        return dataset

    dataset = get_dataset(a, b, size)
    train_dataset, validation_dataset = dataset.random_shuffle().split_proportionately(
        [split]
    )

    datasets = {
        "train": train_dataset,
        "validation": validation_dataset,
    }

    # Use dataset pipelining
    dataset_configs = {
        "train": DatasetConfig(use_stream_api=True),
        "validation": DatasetConfig(use_stream_api=True),
    }

    return datasets, dataset_configs
def test_use_stream_api_config(ray_start_4_cpus):
    ds = ray.data.range(10)

    # Single worker basic case.
    test = TestBasic(
        1,
        False,
        {"train": 10, "test": 10},
        dataset_config={"train": DatasetConfig(use_stream_api=True)},
        datasets={"train": ds, "test": ds},
    )
    test.fit()

    # Two worker split pipeline.
    test = TestBasic(
        2,
        False,
        {"train": 5, "test": 10},
        dataset_config={"train": DatasetConfig(use_stream_api=True)},
        datasets={"train": ds, "test": ds},
    )
    test.fit()
def test_global_shuffle(ray_start_4_cpus):
    def checker(shard, results):
        assert len(results[0]) == 5, results
        assert results[0] != results[1], results
        stats = shard.stats()
        assert str(shard) == "DatasetPipeline(num_windows=inf, num_stages=1)", shard
        assert "Stage 1 read->randomize_block_order->random_shuffle" in stats, stats

    ds = ray.data.range_table(5)
    test = TestStream(
        checker,
        datasets={"train": ds},
        dataset_config={"train": DatasetConfig(global_shuffle=True)},
    )
    test.fit()

    def checker(shard, results):
        assert len(results) == 5, results
        stats = shard.stats()
        assert "Stage 1 read->random_shuffle" in stats, stats

    ds = ray.data.range_table(5)
    test = TestBatch(
        checker,
        datasets={"train": ds},
        dataset_config={"train": DatasetConfig(global_shuffle=True)},
    )
    test.fit()
def test_stream_finite_window_nocache_prep(ray_start_4_cpus):
    def rand(x):
        return [random.random() for _ in range(len(x))]

    prep = BatchMapper(rand)
    ds = ray.data.range_table(5)

    # Test the default 1GiB window size.
    def checker(shard, results):
        results = [sorted(r) for r in results]
        assert int(results[0][0]) != results[0][0]
        assert len(results[0]) == 5, results
        assert results[0] != results[1], results
        stats = shard.stats()
        assert str(shard) == "DatasetPipeline(num_windows=inf, num_stages=1)", shard
        assert (
            "Stage 1 read->randomize_block_order->map_batches: 5/5 blocks executed "
            in stats
        ), stats

    test = TestStream(
        checker,
        preprocessor=prep,
        datasets={"train": ds},
        dataset_config={"train": DatasetConfig()},
    )
    test.fit()

    # Test a smaller window size.
    def checker(shard, results):
        results = [sorted(r) for r in results]
        assert int(results[0][0]) != results[0][0]
        assert len(results[0]) == 5, results
        assert results[0] != results[1], results
        stats = shard.stats()
        assert str(shard) == "DatasetPipeline(num_windows=inf, num_stages=1)", shard
        assert (
            "Stage 1 read->randomize_block_order->map_batches: 1/1 blocks executed "
            in stats
        ), stats

    test = TestStream(
        checker,
        preprocessor=prep,
        datasets={"train": ds},
        dataset_config={"train": DatasetConfig(stream_window_size=10)},
    )
    test.fit()
def test_randomize_block_order(ray_start_4_cpus):
    def checker(shard, results):
        stats = shard.stats()
        assert "randomize_block_order: 5/5 blocks executed in 0s" in stats, stats

    ds = ray.data.range_table(5)
    test = TestStream(
        checker,
        datasets={"train": ds},
    )
    test.fit()

    def checker(shard, results):
        stats = shard.stats()
        assert "randomize_block_order" not in stats, stats

    ds = ray.data.range_table(5)
    test = TestStream(
        checker,
        datasets={"train": ds},
        dataset_config={"train": DatasetConfig(randomize_block_order=False)},
    )
    test.fit()

    def checker(shard, results):
        assert len(results) == 5, results
        stats = shard.stats()
        assert "randomize_block_order: 5/5 blocks executed" in stats, stats

    ds = ray.data.range_table(5)
    test = TestBatch(
        checker,
        datasets={"train": ds},
    )
    test.fit()
def test_fit_transform_config(ray_start_4_cpus):
    ds = ray.data.range_table(10)

    def drop_odd(rows):
        key = list(rows)[0]
        return rows[(rows[key] % 2 == 0)]

    prep = BatchMapper(drop_odd)

    # Single worker basic case.
    test = TestBasic(
        1,
        True,
        {"train": 5, "test": 5},
        dataset_config={},
        datasets={"train": ds, "test": ds},
        preprocessor=prep,
    )
    test.fit()

    # No transform for test.
    test = TestBasic(
        1,
        True,
        {"train": 5, "test": 10},
        dataset_config={"test": DatasetConfig(transform=False)},
        datasets={"train": ds, "test": ds},
        preprocessor=prep,
    )
    test.fit()
def run_ingest_bulk(dataset, num_workers, num_cpus_per_worker):
    dummy_prep = BatchMapper(lambda df: df * 2)
    trainer = DummyTrainer(
        scaling_config=ScalingConfig(
            num_workers=num_workers,
            trainer_resources={"CPU": 0},
            resources_per_worker={"CPU": num_cpus_per_worker},
            _max_cpu_fraction_per_node=0.1,
        ),
        datasets={"train": dataset},
        preprocessor=dummy_prep,
        num_epochs=1,
        prefetch_blocks=1,
        dataset_config={"train": DatasetConfig(split=True)},
    )
    trainer.fit()
class TestBatch(DataParallelTrainer):
    _dataset_config = {
        "train": DatasetConfig(split=True, required=True, use_stream_api=False),
    }

    def __init__(self, check_results_fn, **kwargs):
        def train_loop_per_worker():
            data_shard = train.get_dataset_shard("train")
            assert isinstance(data_shard, Dataset), data_shard
            results = data_shard.take()
            check_results_fn(data_shard, results)

        super().__init__(
            train_loop_per_worker=train_loop_per_worker,
            scaling_config={"num_workers": 1},
            **kwargs,
        )
def test_basic(ray_start_4_cpus):
    ds = ray.data.range_table(10)

    # Single worker basic case.
    test = TestBasic(
        1,
        True,
        {"train": 10, "test": 10},
        dataset_config={},
        datasets={"train": ds, "test": ds},
    )
    test.fit()

    # Single worker, no test ds.
    test = TestBasic(
        1, True, {"train": 10, "test": -1}, dataset_config={}, datasets={"train": ds}
    )
    test.fit()

    # Two workers, train split.
    test = TestBasic(
        2, True, {"train": 5, "test": 10}, datasets={"train": ds, "test": ds}
    )
    test.fit()

    # Two workers, wild split.
    test = TestWildcard(
        2, True, {"train": 5, "wild": 5}, datasets={"train": ds, "wild": ds}
    )
    test.fit()

    # Two workers, both split.
    test = TestBasic(
        2,
        True,
        {"train": 5, "test": 5},
        dataset_config={"test": DatasetConfig(split=True)},
        datasets={"train": ds, "test": ds},
    )
    # Test get config.
    assert test.get_dataset_config()["train"].split
    assert test.get_dataset_config()["test"].split
    test.fit()
class TestStream(DataParallelTrainer):
    _dataset_config = {
        "train": DatasetConfig(split=True, required=True, use_stream_api=True),
    }

    def __init__(self, check_results_fn, **kwargs):
        def train_loop_per_worker():
            data_shard = train.get_dataset_shard("train")
            assert isinstance(data_shard, DatasetPipeline), data_shard
            results = []
            for epoch in data_shard.iter_epochs(2):
                results.append(epoch.take())
            check_results_fn(data_shard, results)

        super().__init__(
            train_loop_per_worker=train_loop_per_worker,
            scaling_config={"num_workers": 1},
            **kwargs,
        )
def get_datasets_and_configs(
    a=5, b=10, size=1000
) -> Tuple[Dict[str, Dataset], Dict[str, DatasetConfig]]:
    def get_dataset(a, b, size) -> Dataset:
        items = [i / size for i in range(size)]
        dataset = ray.data.from_items([{"x": x, "y": a * x + b} for x in items])
        return dataset

    datasets = {"train": get_dataset(a, b, size)}

    # Use dataset pipelining
    dataset_configs = {
        "train": DatasetConfig(use_stream_api=True),
    }

    return datasets, dataset_configs
def test_stream_inf_window_cache_prep(ray_start_4_cpus):
    def checker(shard, results):
        results = [sorted(r) for r in results]
        assert len(results[0]) == 5, results
        assert results[0] == results[1], results
        stats = shard.stats()
        assert str(shard) == "DatasetPipeline(num_windows=inf, num_stages=1)", shard
        assert "Stage 1 read->map_batches: 5/5 blocks executed " in stats, stats

    def rand(x):
        return [random.random() for _ in range(len(x))]

    prep = BatchMapper(rand)
    ds = ray.data.range_table(5)
    test = TestStream(
        checker,
        preprocessor=prep,
        datasets={"train": ds},
        dataset_config={"train": DatasetConfig(stream_window_size=-1)},
    )
    test.fit()
class DataParallelTrainer(BaseTrainer):
    """A Trainer for data parallel training.

    You should subclass this Trainer if your Trainer follows the SPMD (single
    program, multiple data) programming paradigm - you want multiple processes
    to run the same function, but on different data.

    This Trainer runs the function ``train_loop_per_worker`` on multiple Ray
    Actors.

    The ``train_loop_per_worker`` function is expected to take in either 0 or 1
    arguments:

    .. code-block:: python

        def train_loop_per_worker():
            ...

    .. code-block:: python

        def train_loop_per_worker(config: Dict):
            ...

    If ``train_loop_per_worker`` accepts an argument, then
    ``train_loop_config`` will be passed in as the argument. This is useful if you
    want to tune the values in ``train_loop_config`` as hyperparameters.

    If the ``datasets`` dict contains a training dataset (denoted by
    the "train" key), then it will be split into multiple dataset
    shards that can then be accessed by ``ray.train.get_dataset_shard("train")``
    inside ``train_loop_per_worker``. All the other datasets will not be split and
    ``ray.train.get_dataset_shard(...)`` will return the entire Dataset.

    Inside the ``train_loop_per_worker`` function, you can use any of the
    :ref:`Ray Train function utils <train-api-func-utils>`.

    .. code-block:: python

        def train_loop_per_worker():
            # Report intermediate results for callbacks or logging.
            train.report(...)

            # Checkpoints the provided args as restorable state.
            train.save_checkpoint(...)

            # Returns dict of last saved checkpoint.
            train.load_checkpoint()

            # Returns the Ray Dataset shard for the given key.
            train.get_dataset_shard("my_dataset")

            # Returns the total number of workers executing training.
            train.get_world_size()

            # Returns the rank of this worker.
            train.get_world_rank()

            # Returns the rank of the worker on the current node.
            train.get_local_rank()

    **How do I use ``DataParallelTrainer`` or any of its subclasses?**

    Example:

    .. code-block:: python

        import ray
        from ray import train

        def train_loop_for_worker():
            dataset_shard_for_this_worker = train.get_dataset_shard("train")

            assert len(dataset_shard_for_this_worker) == 1

        train_dataset = ray.data.from_items([1, 2, 3])
        assert len(train_dataset) == 3
        trainer = DataParallelTrainer(
            train_loop_for_worker,
            scaling_config={"num_workers": 3},
            datasets={"train": train_dataset},
        )
        result = trainer.fit()

    **How do I develop on top of ``DataParallelTrainer``?**

    In many cases, using DataParallelTrainer directly is sufficient to execute
    functions on multiple actors.

    However, you may want to subclass ``DataParallelTrainer`` and create a custom
    Trainer for the following 2 use cases:

    - **Use Case 1:** You want to do data parallel training, but want to have
      a predefined ``training_loop_per_worker``.

    - **Use Case 2:** You want to implement a custom
      :ref:`Training backend <train-api-backend-interfaces>` that automatically
      handles additional setup or teardown logic on each actor, so that the users of
      this new trainer do not have to implement this logic. For example, a
      ``TensorflowTrainer`` can be built on top of ``DataParallelTrainer``
      that automatically handles setting the proper environment variables for
      distributed Tensorflow on each actor.

    For 1, you can set a predefined training loop in __init__:
    .. code-block:: python

        from ray.train.data_parallel_trainer import DataParallelTrainer

        class MyDataParallelTrainer(DataParallelTrainer):
            def __init__(self, *args, **kwargs):
                predefined_train_loop_per_worker = lambda: 1
                super().__init__(predefined_train_loop_per_worker, *args, **kwargs)

    For 2, you can implement the ``ray.train.Backend`` and
    ``ray.train.BackendConfig`` interfaces.

    .. code-block:: python

        from dataclasses import dataclass
        from ray.train.backend import Backend, BackendConfig

        class MyBackend(Backend):
            def on_start(self, worker_group, backend_config):
                def set_env_var(env_var_value):
                    import os
                    os.environ["MY_ENV_VAR"] = env_var_value

                worker_group.execute(set_env_var, backend_config.env_var)

        @dataclass
        class MyBackendConfig(BackendConfig):
            env_var: str = "default_value"

            def backend_cls(self):
                return MyBackend

        class MyTrainer(DataParallelTrainer):
            def __init__(
                self, train_loop_per_worker, my_backend_config: MyBackendConfig, **kwargs
            ):
                super().__init__(
                    train_loop_per_worker,
                    backend_config=my_backend_config,
                    **kwargs,
                )

    Args:
        train_loop_per_worker: The training function to execute.
            This can either take in no arguments or a ``config`` dict.
        train_loop_config: Configurations to pass into
            ``train_loop_per_worker`` if it accepts an argument.
        backend_config: Configuration for setting up a Backend (e.g. Torch,
            Tensorflow, Horovod) on each worker to enable distributed
            communication. If no Backend should be set up, then set this to None.
        scaling_config: Configuration for how to scale data parallel training.
        dataset_config: Configuration for dataset ingest. This is merged with the
            default dataset config for the given trainer (`cls._dataset_config`).
        run_config: Configuration for the execution of the training run.
        datasets: Any Ray Datasets to use for training. Use
            the key "train" to denote which dataset is the training
            dataset. If a ``preprocessor`` is provided and has not already been fit,
            it will be fit on the training dataset. All datasets will be transformed
            by the ``preprocessor`` if one is provided.
        preprocessor: A ray.data.Preprocessor to preprocess the
            provided datasets.
        resume_from_checkpoint: A checkpoint to resume training from.
""" _checkpoint_manager_cls: Type[ TuneCheckpointManager ] = _DataParallelCheckpointManager _scaling_config_allowed_keys = BaseTrainer._scaling_config_allowed_keys + [ "num_workers", "resources_per_worker", "use_gpu", "placement_strategy", ] _dataset_config = { TRAIN_DATASET_KEY: DatasetConfig(fit=True, split=True), WILDCARD_KEY: DatasetConfig(split=False), } def __init__( self, train_loop_per_worker: Union[Callable[[], None], Callable[[Dict], None]], *, train_loop_config: Optional[Dict] = None, backend_config: Optional[BackendConfig] = None, scaling_config: Optional[ScalingConfig] = None, dataset_config: Optional[Dict[str, DatasetConfig]] = None, run_config: Optional[RunConfig] = None, datasets: Optional[Dict[str, GenDataset]] = None, preprocessor: Optional["Preprocessor"] = None, resume_from_checkpoint: Optional[Checkpoint] = None, ): if not ray.is_initialized(): ray.init() self._train_loop_per_worker = train_loop_per_worker self._train_loop_config = train_loop_config backend_config = ( backend_config if backend_config is not None else BackendConfig() ) self._backend_config = backend_config self._dataset_config = DatasetConfig.validated( DatasetConfig.merge(self._dataset_config, dataset_config), datasets ) self._ingest_spec = DataParallelIngestSpec( dataset_config=self._dataset_config, ) super(DataParallelTrainer, self).__init__( scaling_config=scaling_config, run_config=run_config, datasets=datasets, preprocessor=preprocessor, resume_from_checkpoint=resume_from_checkpoint, ) def _validate_attributes(self): super()._validate_attributes() if ( not self.scaling_config.get("use_gpu", False) and "GPU" in ray.available_resources() ): logger.info( "GPUs are detected in your Ray cluster, but GPU " "training is not enabled for this trainer. To enable " "GPU training, make sure to set `use_gpu` to True " "in your scaling config." ) if "num_workers" not in self.scaling_config: raise ValueError("You must specify the 'num_workers' in scaling_config.") if self.scaling_config["num_workers"] <= 0: raise ValueError( "'num_workers' in `scaling_config` must be a positive " f"integer. Received {self.scaling_config['num_workers']}" ) self._validate_train_loop_per_worker( self._train_loop_per_worker, "train_loop_per_worker" ) def preprocess_datasets(self) -> None: # Evaluate all datasets. self.datasets = {k: d() if callable(d) else d for k, d in self.datasets.items()} self.datasets = self._ingest_spec.preprocess_datasets( self.preprocessor, self.datasets ) def _validate_train_loop_per_worker( self, train_loop_per_worker: Callable, fn_name: str ) -> None: num_params = len(inspect.signature(train_loop_per_worker).parameters) if num_params > 1: raise ValueError( f"{fn_name} should take in 0 or 1 arguments, " f"but it accepts {num_params} arguments instead." 
            )

    def training_loop(self) -> None:
        scaling_config_dataclass = self._validate_and_get_scaling_config_data_class(
            self.scaling_config
        )

        train_loop_per_worker = construct_train_func(
            self._train_loop_per_worker,
            self._train_loop_config,
            fn_arg_name="train_loop_per_worker",
        )

        additional_resources_per_worker = (
            scaling_config_dataclass.additional_resources_per_worker
        )

        trial_info = TrialInfo(
            name=session.get_trial_name(),
            id=session.get_trial_id(),
            resources=session.get_trial_resources(),
            logdir=os.getcwd(),
        )

        backend_executor = BackendExecutor(
            backend_config=self._backend_config,
            trial_info=trial_info,
            num_workers=scaling_config_dataclass.num_workers,
            num_cpus_per_worker=scaling_config_dataclass.num_cpus_per_worker,
            num_gpus_per_worker=scaling_config_dataclass.num_gpus_per_worker,
            additional_resources_per_worker=additional_resources_per_worker,
            max_retries=0,
        )

        checkpoint_manager = self._checkpoint_manager_cls(
            preprocessor=self.preprocessor
        )

        # Start the remote actors.
        backend_executor.start(initialization_hook=None)

        training_iterator = TrainingIterator(
            backend_executor=backend_executor,
            backend_config=self._backend_config,
            train_func=train_loop_per_worker,
            dataset_spec=self._ingest_spec,
            checkpoint_manager=checkpoint_manager,
            checkpoint=self.resume_from_checkpoint,
            checkpoint_strategy=None,
        )

        for results in training_iterator:
            # TODO(ml-team): add ability to report results from multiple workers.
            first_worker_results = results[0]
            tune.report(**first_worker_results)

        # Shutdown workers.
        backend_executor.shutdown()

    def get_dataset_config(self) -> Dict[str, DatasetConfig]:
        """Return a copy of this Trainer's final dataset configs.

        Returns:
            The merged default + user-supplied dataset config.
        """
        return self._dataset_config.copy()
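
# The class below is a minimal sketch of "Use Case 1" from the docstring above
# (a subclass that ships a predefined ``train_loop_per_worker``). The name
# ``_ExamplePredefinedLoopTrainer`` is hypothetical and nothing else in this
# module references it; it is included only as an illustration.
class _ExamplePredefinedLoopTrainer(DataParallelTrainer):
    def __init__(self, **kwargs):
        def train_loop_per_worker():
            # Each worker reads only its own shard of the "train" dataset.
            shard = session.get_dataset_shard("train")
            for batch in shard.iter_batches():
                pass  # A real trainer would run its training step here.

        super().__init__(train_loop_per_worker=train_loop_per_worker, **kwargs)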
class TestWildcard(TestBasic):
    _dataset_config = {
        "train": DatasetConfig(split=True, required=True),
        "*": DatasetConfig(split=True),
    }
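
# A small illustrative sketch (assumed helper, not part of the original suite):
# the "*" wildcard entry applies to every dataset key that is not configured
# explicitly, so a dataset passed under e.g. the "wild" key is split across
# workers just like "train". The function name below is hypothetical.
def _wildcard_config_sketch(ds):
    trainer = TestWildcard(
        2, True, {"train": 5, "wild": 5}, datasets={"train": ds, "wild": ds}
    )
    config = trainer.get_dataset_config()
    assert config["train"].split
    assert config["*"].split  # Covers "wild" and any other unlisted dataset.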
class HuggingFaceTrainer(TorchTrainer):
    """A Trainer for data parallel HuggingFace Transformers on PyTorch training.

    This Trainer runs the ``transformers.Trainer.train()`` method on multiple
    Ray Actors. The training is carried out in a distributed fashion through PyTorch
    DDP. These actors already have the necessary torch process group
    configured for distributed PyTorch training.

    The training function run on every Actor will first initialize the
    specified ``trainer_init_per_worker`` function to obtain an instantiated
    ``transformers.Trainer`` object. The ``trainer_init_per_worker`` function
    will have access to preprocessed train and evaluation datasets.

    If the ``datasets`` dict contains a training dataset (denoted by
    the "train" key), then it will be split into multiple dataset
    shards, with each Actor training on a single shard.
    All the other datasets will not be split.

    Please note that if you use a custom ``transformers.Trainer`` subclass,
    the ``get_train_dataloader`` method will be wrapped around to disable
    sharding by ``transformers.IterableDatasetShard``, as the dataset will
    already be sharded on the Ray AIR side.

    HuggingFace loggers will be automatically disabled, and the ``local_rank``
    argument in ``TrainingArguments`` will be automatically set. Please note
    that if you want to use CPU training, you will need to set the ``no_cuda``
    argument in ``TrainingArguments`` manually - otherwise, an exception
    (segfault) may be thrown. Furthermore, the 'steps' value for
    ``save_strategy``, ``logging_strategy`` and ``evaluation_strategy`` is not
    yet supported.

    This Trainer requires the ``transformers>=4.19.0`` package.

    Example:
        .. code-block:: python

            # Based on
            # huggingface/notebooks/examples/language_modeling_from_scratch.ipynb

            # Hugging Face imports
            from datasets import load_dataset
            import transformers
            from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

            import ray
            from ray.train.huggingface import HuggingFaceTrainer

            model_checkpoint = "gpt2"
            tokenizer_checkpoint = "sgugger/gpt2-like-tokenizer"
            block_size = 128

            datasets = load_dataset("wikitext", "wikitext-2-raw-v1")
            tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

            def tokenize_function(examples):
                return tokenizer(examples["text"])

            tokenized_datasets = datasets.map(
                tokenize_function, batched=True, num_proc=1, remove_columns=["text"]
            )

            def group_texts(examples):
                # Concatenate all texts.
                concatenated_examples = {
                    k: sum(examples[k], []) for k in examples.keys()
                }
                total_length = len(concatenated_examples[list(examples.keys())[0]])
                # We drop the small remainder, we could add padding if the model
                # supported it.
                # instead of this drop, you can customize this part to your needs.
                total_length = (total_length // block_size) * block_size
                # Split by chunks of max_len.
                result = {
                    k: [
                        t[i : i + block_size]
                        for i in range(0, total_length, block_size)
                    ]
                    for k, t in concatenated_examples.items()
                }
                result["labels"] = result["input_ids"].copy()
                return result

            lm_datasets = tokenized_datasets.map(
                group_texts,
                batched=True,
                batch_size=1000,
                num_proc=1,
            )
            ray_train_ds = ray.data.from_huggingface(lm_datasets["train"])
            ray_evaluation_ds = ray.data.from_huggingface(
                lm_datasets["evaluation"]
            )

            def trainer_init_per_worker(train_dataset, eval_dataset, **config):
                model_config = AutoConfig.from_pretrained(model_checkpoint)
                model = AutoModelForCausalLM.from_config(model_config)
                args = transformers.TrainingArguments(
                    output_dir=f"{model_checkpoint}-wikitext2",
                    evaluation_strategy="epoch",
                    learning_rate=2e-5,
                    weight_decay=0.01,
                )
                return transformers.Trainer(
                    model=model,
                    args=args,
                    train_dataset=train_dataset,
                    eval_dataset=eval_dataset,
                )

            scaling_config = {"num_workers": 3}
            # If using GPUs, use the below scaling config instead.
            # scaling_config = {"num_workers": 3, "use_gpu": True}
            trainer = HuggingFaceTrainer(
                trainer_init_per_worker=trainer_init_per_worker,
                scaling_config=scaling_config,
                datasets={"train": ray_train_ds, "evaluation": ray_evaluation_ds},
            )
            result = trainer.fit()

    Args:
        trainer_init_per_worker: The function that returns an instantiated
            ``transformers.Trainer`` object and takes in the following arguments:
            train ``Torch.Dataset``, optional evaluation ``Torch.Dataset``
            and config as kwargs. The Torch Datasets are automatically
            created by converting the Ray Datasets internally before
            they are passed into the function.
        datasets: Any Ray Datasets to use for training. Use
            the key "train" to denote which dataset is the training
            dataset and (optionally) key "evaluation" to denote the evaluation
            dataset. Can only contain a training dataset
            and up to one extra dataset to be used for evaluation.
            If a ``preprocessor`` is provided and has not already been fit,
            it will be fit on the training dataset. All datasets will be
            transformed by the ``preprocessor`` if one is provided.
        trainer_init_config: Configurations to pass into
            ``trainer_init_per_worker`` as kwargs.
        torch_config: Configuration for setting up the PyTorch backend. If set to
            None, use the default configuration. This replaces the
            ``backend_config`` arg of ``DataParallelTrainer``. Same as in
            ``TorchTrainer``.
        scaling_config: Configuration for how to scale data parallel training.
        dataset_config: Configuration for dataset ingest.
        run_config: Configuration for the execution of the training run.
        preprocessor: A ray.data.Preprocessor to preprocess the
            provided datasets.
        resume_from_checkpoint: A checkpoint to resume training from.
""" _checkpoint_manager_cls = _DataParallelSyncingCheckpointManager _dataset_config = { "train": DatasetConfig(fit=True, split=False, required=True), "evaluation": DatasetConfig(split=False), } def __init__( self, trainer_init_per_worker: Callable[ [TorchDataset, Optional[TorchDataset], Any], transformers.trainer.Trainer], *, datasets: Dict[str, GenDataset], trainer_init_config: Optional[Dict] = None, torch_config: Optional[TorchConfig] = None, scaling_config: Optional[ScalingConfig] = None, dataset_config: Optional[Dict[str, DatasetConfig]] = None, run_config: Optional[RunConfig] = None, preprocessor: Optional["Preprocessor"] = None, resume_from_checkpoint: Optional[Checkpoint] = None, ): # Functionality required for HuggingFaceTrainer only added in this # version if LooseVersion(transformers.__version__) < LooseVersion("4.19.0"): raise RuntimeError( "HuggingFaceTrainer requires transformers>=4.19.0, but you " f"have {transformers.__version__} which is incompatible. " "Update on all nodes with `pip install -U 'transformers>=4.19.0'`." ) self._validate_trainer_init_per_worker(trainer_init_per_worker, "trainer_init_per_worker") trainer_init_config = trainer_init_config.copy( ) if trainer_init_config else {} if "_trainer_init_per_worker" in trainer_init_config: raise ValueError( "'_trainer_init_per_worker' is a reserved key in `trainer_init_config`." ) trainer_init_config[ "_trainer_init_per_worker"] = trainer_init_per_worker super().__init__( train_loop_per_worker=_huggingface_train_loop_per_worker, train_loop_config=trainer_init_config, torch_config=torch_config, scaling_config=scaling_config, dataset_config=dataset_config, run_config=run_config, datasets=datasets, preprocessor=preprocessor, resume_from_checkpoint=resume_from_checkpoint, ) def _validate_trainer_init_per_worker(self, trainer_init_per_worker: Callable, fn_name: str) -> None: num_params = len(inspect.signature(trainer_init_per_worker).parameters) if num_params < 3: raise ValueError(f"{fn_name} should take in at least 3 arguments, " f"but it accepts {num_params} arguments instead.") def _validate_attributes(self): for key, conf in self._dataset_config.items(): if conf.use_stream_api: raise ValueError( "HuggingFaceTrainer does not support `use_stream_api`.") gpus_per_worker = self.scaling_config.get("num_gpus_per_worker", 0) if gpus_per_worker > 1: raise ValueError( f"You have assigned {gpus_per_worker} GPUs per worker. " "This is not supported by HuggingFace, which expects " "one GPU per worker in DDP mode and will fail " "if more are assigned.") if gpus_per_worker != int(gpus_per_worker): raise ValueError( f"You have assigned {gpus_per_worker} GPUs per worker, " "but fractional GPUs are not supported by HuggingFace.") super()._validate_attributes() def _convert_directory_checkpoint_to_sync_if_needed( self, checkpoint: Checkpoint) -> Checkpoint: """Replace the directory checkpoint with a node ip & path dict checkpoint. This dict checkpoint will be used to sync the directory. If we were to use a directory checkpoint directly, it would get deepcopied & serialized unnecessarily.""" with checkpoint.as_directory() as checkpoint_path: # Load checkpoint from path. 
            checkpoint_path = Path(checkpoint_path).expanduser().absolute()
            if not checkpoint_path.joinpath(TUNE_CHECKPOINT_ID).exists():
                # If the ID file is missing, we assume that this is already
                # a sync checkpoint
                dict_checkpoint = checkpoint.to_dict()
                if (
                    NODE_IP_KEY not in dict_checkpoint
                    or CHECKPOINT_PATH_ON_NODE_KEY not in dict_checkpoint
                ):
                    raise ValueError(
                        "Wrong checkpoint format. Ensure the checkpoint is a "
                        "result of `HuggingFaceTrainer`."
                    )
                return checkpoint
            with open(checkpoint_path.joinpath(TUNE_CHECKPOINT_ID), "r") as f:
                tune_checkpoint_id = int(f.read())

            return Checkpoint.from_dict(
                {
                    NODE_IP_KEY: get_node_ip_address(),
                    CHECKPOINT_PATH_ON_NODE_KEY: str(checkpoint_path),
                    TUNE_CHECKPOINT_ID: tune_checkpoint_id,
                }
            )

    def setup(self) -> None:
        if self.resume_from_checkpoint:
            self.resume_from_checkpoint = (
                self._convert_directory_checkpoint_to_sync_if_needed(
                    self.resume_from_checkpoint
                )
            )

    def as_trainable(self) -> Type[Trainable]:
        original_param_dict = self._param_dict.copy()
        resume_from_checkpoint: Optional[Checkpoint] = self._param_dict.get(
            "resume_from_checkpoint", None
        )
        if resume_from_checkpoint:
            self._param_dict[
                "resume_from_checkpoint"
            ] = self._convert_directory_checkpoint_to_sync_if_needed(
                resume_from_checkpoint
            )
        try:
            ret = super().as_trainable()
        finally:
            self._param_dict = original_param_dict
        return ret
from ray.air.config import DatasetConfig

train_ds = ray.data.range_tensor(1000)
valid_ds = ray.data.range_tensor(100)
test_ds = ray.data.range_tensor(100)

my_trainer = TorchTrainer(
    lambda: None,  # No-op training loop.
    scaling_config={"num_workers": 2},
    datasets={
        "train": train_ds,
        "valid": valid_ds,
        "test": test_ds,
    },
    dataset_config={
        "valid": DatasetConfig(split=True),
        "test": DatasetConfig(split=True),
    },
)
print(my_trainer.get_dataset_config())
# -> {'train': DatasetConfig(fit=True, split=True, ...),
#     'valid': DatasetConfig(fit=False, split=True, ...),
#     'test': DatasetConfig(fit=False, split=True, ...), ...}
# __config_1_end__

# __config_2__
import ray
from ray.train.torch import TorchTrainer
from ray.air.config import DatasetConfig

train_ds = ray.data.range_tensor(1000)
# An example preprocessor chain that just scales all values by 4.0 in two stages.
preprocessor = Chain(
    BatchMapper(lambda df: df * 2),
    BatchMapper(lambda df: df * 2),
)

# Setup the dummy trainer that prints ingest stats.
# Run and print ingest stats.
trainer = DummyTrainer(
    scaling_config={"num_workers": 1, "use_gpu": False},
    datasets={"train": dataset},
    preprocessor=preprocessor,
    runtime_seconds=30,  # Stop after this amount of time or 1 epoch is read.
    prefetch_blocks=1,  # Number of blocks to prefetch when reading data.
    dataset_config={"valid": DatasetConfig(transform=False)},
    batch_size=None,
)
print("Dataset config", trainer.get_dataset_config())
trainer.fit()

# Print memory stats (you can also use "ray memory --stats-only" to monitor this
# during the middle of the run).
try:
    print(
        "Memory stats at end of ingest:\n\n{}".format(
            ray._private.internal_api.memory_summary(stats_only=True)
        )
    )
except Exception:
    print("Error getting Ray memory stats")
    BatchMapper(lambda df: df * 2),
    BatchMapper(lambda df: df * 2),
)

# Setup the dummy trainer that prints ingest stats.
# Run and print ingest stats.
trainer = DummyTrainer(
    scaling_config={"num_workers": 1, "use_gpu": False},
    datasets={"train": dataset},
    preprocessor=preprocessor,
    num_epochs=args.num_epochs,
    prefetch_blocks=args.prefetch_blocks,
    dataset_config={"train": DatasetConfig(use_stream_api=args.use_stream_api)},
    batch_size=None,
)
print("Dataset config", trainer.get_dataset_config())
trainer.fit()

# Print memory stats (you can also use "ray memory --stats-only" to monitor this
# during the middle of the run).
try:
    print(
        "Memory stats at end of ingest:\n\n{}".format(
            ray._private.internal_api.memory_summary(stats_only=True)
        )
    )
except Exception:
    print("Error getting Ray memory stats")
resources_per_worker = {"CPU": 0, "GPU": 1} if use_gpu else None trainer = TorchTrainer( train_func, train_loop_config=config, datasets=datasets, scaling_config=ScalingConfig( num_workers=num_workers, use_gpu=use_gpu, resources_per_worker=resources_per_worker, ), run_config=RunConfig(callbacks=callbacks), dataset_config={ "train": DatasetConfig(use_stream_api=True, stream_window_size=-1, global_shuffle=True) }, ) results = trainer.fit() state_dict = results.checkpoint.to_dict()["model"] def load_model_func(): num_layers = config["num_layers"] num_hidden = config["num_hidden"] dropout_every = config["dropout_every"] dropout_prob = config["dropout_prob"] num_features = config["num_features"] model = Net( n_layers=num_layers,