Example 1
def run_ingest_bulk(dataset, num_workers, num_cpus_per_worker):
    dummy_prep = BatchMapper(lambda df: df * 2)
    trainer = DummyTrainer(
        scaling_config=ScalingConfig(
            num_workers=num_workers,
            trainer_resources={"CPU": 0},
            resources_per_worker={"CPU": num_cpus_per_worker},
            _max_cpu_fraction_per_node=0.1,
        ),
        datasets={"train": dataset},
        preprocessor=dummy_prep,
        num_epochs=1,
        prefetch_blocks=1,
        dataset_config={"train": DatasetConfig(split=True)},
    )
    trainer.fit()
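
A minimal invocation sketch (not part of the original example): it assumes the imports used by the function above (ray, DummyTrainer, BatchMapper, ScalingConfig, DatasetConfig) are in scope; the dataset and worker counts are illustrative.

import ray

ds = ray.data.range_table(1000)  # illustrative dataset
run_ingest_bulk(ds, num_workers=2, num_cpus_per_worker=1)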
Example 2
def test_global_shuffle(ray_start_4_cpus):
    def checker(shard, results):
        assert len(results[0]) == 5, results
        assert results[0] != results[1], results
        stats = shard.stats()
        assert str(shard) == "DatasetPipeline(num_windows=inf, num_stages=1)", shard
        assert "Stage 1 read->random_shuffle" in stats, stats

    ds = ray.data.range(5)
    test = TestStream(
        checker,
        datasets={"train": ds},
        dataset_config={"train": DatasetConfig(global_shuffle=True)},
    )
    test.fit()
Example 3
class TestBatch(DataParallelTrainer):
    _dataset_config = {
        "train": DatasetConfig(split=True, required=True, use_stream_api=False),
    }

    def __init__(self, check_results_fn, **kwargs):
        def train_loop_per_worker():
            data_shard = train.get_dataset_shard("train")
            assert isinstance(data_shard, Dataset), data_shard
            results = data_shard.take()
            check_results_fn(data_shard, results)

        super().__init__(
            train_loop_per_worker=train_loop_per_worker,
            scaling_config={"num_workers": 1},
            **kwargs,
        )
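
A hedged usage sketch for TestBatch, mirroring the TestStream pattern from Example 2; the test name, fixture, dataset, and assertion are illustrative.

def test_batch_usage_sketch(ray_start_4_cpus):
    def checker(shard, results):
        # Bulk (non-stream) ingest hands the worker a plain Dataset, so
        # `results` here is simply the output of `shard.take()`.
        assert len(results) == 5, results

    ds = ray.data.range(5)
    test = TestBatch(checker, datasets={"train": ds})
    test.fit()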
Example 4
def test_basic(ray_start_4_cpus):
    ds = ray.data.range_table(10)

    # Single worker basic case.
    test = TestBasic(
        1,
        True,
        {"train": 10, "test": 10},
        dataset_config={},
        datasets={"train": ds, "test": ds},
    )
    test.fit()

    # Single worker, no test ds.
    test = TestBasic(
        1, True, {"train": 10, "test": -1}, dataset_config={}, datasets={"train": ds}
    )
    test.fit()

    # Two workers, train split.
    test = TestBasic(
        2, True, {"train": 5, "test": 10}, datasets={"train": ds, "test": ds}
    )
    test.fit()

    # Two workers, wild split.
    test = TestWildcard(
        2, True, {"train": 5, "wild": 5}, datasets={"train": ds, "wild": ds}
    )
    test.fit()

    # Two workers, both split.
    test = TestBasic(
        2,
        True,
        {"train": 5, "test": 5},
        dataset_config={"test": DatasetConfig(split=True)},
        datasets={"train": ds, "test": ds},
    )
    # Test get config.
    assert test.get_dataset_config()["train"].split
    assert test.get_dataset_config()["test"].split
    test.fit()
Example 5
class TestStream(DataParallelTrainer):
    _dataset_config = {
        "train": DatasetConfig(split=True, required=True, use_stream_api=True),
    }

    def __init__(self, check_results_fn, **kwargs):
        def train_loop_per_worker():
            data_shard = train.get_dataset_shard("train")
            assert isinstance(data_shard, DatasetPipeline), data_shard
            results = []
            for epoch in data_shard.iter_epochs(2):
                results.append(epoch.take())
            check_results_fn(data_shard, results)

        super().__init__(
            train_loop_per_worker=train_loop_per_worker,
            scaling_config={"num_workers": 1},
            **kwargs,
        )
Example 6
def test_fit_transform_config(ray_start_4_cpus):
    ds = ray.data.range(10)

    def drop_odd(rows):
        key = list(rows)[0]
        return rows[(rows[key] % 2 == 0)]

    prep = BatchMapper(drop_odd)

    # Single worker basic case.
    test = TestBasic(
        1,
        True,
        {
            "train": 5,
            "test": 5
        },
        dataset_config={},
        datasets={
            "train": ds,
            "test": ds
        },
        preprocessor=prep,
    )
    test.fit()

    # No transform for test.
    test = TestBasic(
        1,
        True,
        {
            "train": 5,
            "test": 10
        },
        dataset_config={"test": DatasetConfig(transform=False)},
        datasets={
            "train": ds,
            "test": ds
        },
        preprocessor=prep,
    )
    test.fit()
Example 7
def get_datasets_and_configs(
        a=5,
        b=10,
        size=1000) -> Tuple[Dict[str, Dataset], Dict[str, DatasetConfig]]:
    def get_dataset(a, b, size) -> Dataset:
        items = [i / size for i in range(size)]
        dataset = ray.data.from_items([{
            "x": x,
            "y": a * x + b
        } for x in items])
        return dataset

    datasets = {"train": get_dataset(a, b, size)}

    # Use dataset pipelining
    dataset_configs = {
        "train": DatasetConfig(use_stream_api=True),
    }

    return datasets, dataset_configs
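
A hedged sketch of plugging the returned dicts into a trainer; TorchTrainer and the no-op training loop are illustrative stand-ins borrowed from Example 13.

from ray.train.torch import TorchTrainer  # as in Example 13

datasets, dataset_configs = get_datasets_and_configs(a=5, b=10, size=1000)
trainer = TorchTrainer(
    lambda: None,  # No-op training loop, for illustration only.
    scaling_config={"num_workers": 2},
    datasets=datasets,
    dataset_config=dataset_configs,
)
trainer.fit()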
Example 8
def test_stream_inf_window_cache_prep(ray_start_4_cpus):
    def checker(shard, results):
        results = [sorted(r) for r in results]
        assert len(results[0]) == 5, results
        assert results[0] == results[1], results
        stats = shard.stats()
        assert str(shard) == "DatasetPipeline(num_windows=inf, num_stages=1)", shard
        assert "Stage 1 read->map_batches: 5/5 blocks executed " in stats, stats

    def rand(x):
        return [random.random() for _ in range(len(x))]

    prep = BatchMapper(rand)
    ds = ray.data.range_table(5)
    test = TestStream(
        checker,
        preprocessor=prep,
        datasets={"train": ds},
        dataset_config={"train": DatasetConfig(stream_window_size=-1)},
    )
    test.fit()
Example 9
    # An example preprocessor chain that scales all values by 4.0 in two stages.
    preprocessor = Chain(
        BatchMapper(lambda df: df * 2),
        BatchMapper(lambda df: df * 2),
    )

    # Set up the dummy trainer that prints ingest stats.
    # Run and print ingest stats.
    trainer = DummyTrainer(
        scaling_config={
            "num_workers": 1,
            "use_gpu": False
        },
        datasets={"train": dataset},
        preprocessor=preprocessor,
        num_epochs=args.num_epochs,
        prefetch_blocks=args.prefetch_blocks,
        dataset_config={
            "train": DatasetConfig(use_stream_api=args.use_stream_api)
        },
        batch_size=None,
    )
    print("Dataset config", trainer.get_dataset_config())
    trainer.fit()

    # Print memory stats (you can also use "ray memory --stats-only" to monitor
    # this during the run).
    try:
        print("Memory stats at end of ingest:\n\n{}".format(
            ray._private.internal_api.memory_summary(stats_only=True)))
    except Exception:
        print("Error getting Ray memory stats")
Example 10
class HuggingFaceTrainer(TorchTrainer):
    """A Trainer for data parallel HuggingFace Transformers on PyTorch training.

    This Trainer runs the ``transformers.Trainer.train()`` method on multiple
    Ray Actors. The training is carried out in a distributed fashion through PyTorch
    DDP. These actors already have the necessary torch process group
    configured for distributed PyTorch training.

    The training function executed on every Actor will first run the
    specified ``trainer_init_per_worker`` function to obtain an instantiated
    ``transformers.Trainer`` object. The ``trainer_init_per_worker`` function
    will have access to preprocessed train and evaluation datasets.

    If the ``datasets`` dict contains a training dataset (denoted by
    the "train" key), then it will be split into multiple dataset
    shards, with each Actor training on a single shard.
    All the other datasets will not be split.

    Please note that if you use a custom ``transformers.Trainer`` subclass,
    the ``get_train_dataloader`` method will be wrapped to disable sharding
    sharding by ``transformers.IterableDatasetShard``, as the dataset will
    already be sharded on the Ray AIR side.

    HuggingFace loggers will be automatically disabled, and the ``local_rank``
    argument in ``TrainingArguments`` will be automatically set. Please note
    that if you want to use CPU training, you will need to set the ``no_cuda``
    argument in ``TrainingArguments`` manually; otherwise, a segfault
    may occur. Furthermore, the 'steps' value for ``save_strategy``,
    ``logging_strategy``, and ``evaluation_strategy`` is not yet supported.
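
    For CPU-only training, a minimal sketch of such a setup (``no_cuda`` is an
    existing ``TrainingArguments`` flag; the other values are illustrative):

    .. code-block:: python

        args = transformers.TrainingArguments(
            output_dir="my-model",
            no_cuda=True,  # run on CPU to avoid the segfault described above
        )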

    This Trainer requires ``transformers>=4.19.0`` package.

    Example:
        .. code-block:: python

            # Based on
            # huggingface/notebooks/examples/language_modeling_from_scratch.ipynb

            # Hugging Face imports
            from datasets import load_dataset
            import transformers
            from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

            import ray
            from ray.train.huggingface import HuggingFaceTrainer

            model_checkpoint = "gpt2"
            tokenizer_checkpoint = "sgugger/gpt2-like-tokenizer"
            block_size = 128

            datasets = load_dataset("wikitext", "wikitext-2-raw-v1")
            tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

            def tokenize_function(examples):
                return tokenizer(examples["text"])

            tokenized_datasets = datasets.map(
                tokenize_function, batched=True, num_proc=1, remove_columns=["text"]
            )

            def group_texts(examples):
                # Concatenate all texts.
                concatenated_examples = {
                    k: sum(examples[k], []) for k in examples.keys()
                }
                total_length = len(concatenated_examples[list(examples.keys())[0]])
                # We drop the small remainder, we could add padding if the model
                # supported it.
                # instead of this drop, you can customize this part to your needs.
                total_length = (total_length // block_size) * block_size
                # Split by chunks of max_len.
                result = {
                    k: [
                        t[i : i + block_size]
                        for i in range(0, total_length, block_size)
                    ]
                    for k, t in concatenated_examples.items()
                }
                result["labels"] = result["input_ids"].copy()
                return result

            lm_datasets = tokenized_datasets.map(
                group_texts,
                batched=True,
                batch_size=1000,
                num_proc=1,
            )
            ray_train_ds = ray.data.from_huggingface(lm_datasets["train"])
            ray_evaluation_ds = ray.data.from_huggingface(
                lm_datasets["evaluation"]
            )

            def trainer_init_per_worker(train_dataset, eval_dataset, **config):
                model_config = AutoConfig.from_pretrained(model_checkpoint)
                model = AutoModelForCausalLM.from_config(model_config)
                args = transformers.TrainingArguments(
                    output_dir=f"{model_checkpoint}-wikitext2",
                    evaluation_strategy="epoch",
                    learning_rate=2e-5,
                    weight_decay=0.01,
                )
                return transformers.Trainer(
                    model=model,
                    args=args,
                    train_dataset=train_dataset,
                    eval_dataset=eval_dataset,
                )

            scaling_config = {"num_workers": 3}
            # If using GPUs, use the below scaling config instead.
            # scaling_config = {"num_workers": 3, "use_gpu": True}
            trainer = HuggingFaceTrainer(
                trainer_init_per_worker=trainer_init_per_worker,
                scaling_config=scaling_config,
                datasets={"train": ray_train_ds, "evaluation": ray_evaluation_ds},
            )
            result = trainer.fit()

    Args:
        trainer_init_per_worker: The function that returns an instantiated
            ``transformers.Trainer`` object and takes in the following arguments:
            train ``Torch.Dataset``, optional evaluation ``Torch.Dataset``
            and config as kwargs. The Torch Datasets are automatically
            created by converting the Ray Datasets internally before
            they are passed into the function.
        datasets: Any Ray Datasets to use for training. Use
            the key "train" to denote which dataset is the training
            dataset and (optionally) key "evaluation" to denote the evaluation
            dataset. Can only contain a training dataset
            and up to one extra dataset to be used for evaluation.
            If a ``preprocessor`` is provided and has not already been fit,
            it will be fit on the training dataset. All datasets will be
            transformed by the ``preprocessor`` if one is provided.
        trainer_init_config: Configurations to pass into
            ``trainer_init_per_worker`` as kwargs.
        torch_config: Configuration for setting up the PyTorch backend. If set to
            None, use the default configuration. This replaces the ``backend_config``
            arg of ``DataParallelTrainer``. Same as in ``TorchTrainer``.
        scaling_config: Configuration for how to scale data parallel training.
        dataset_config: Configuration for dataset ingest.
        run_config: Configuration for the execution of the training run.
        preprocessor: A ray.data.Preprocessor to preprocess the
            provided datasets.
        resume_from_checkpoint: A checkpoint to resume training from.
    """

    _checkpoint_manager_cls = _DataParallelSyncingCheckpointManager

    _dataset_config = {
        "train": DatasetConfig(fit=True, split=False, required=True),
        "evaluation": DatasetConfig(split=False),
    }

    def __init__(
        self,
        trainer_init_per_worker: Callable[
            [TorchDataset, Optional[TorchDataset], Any],
            transformers.trainer.Trainer],
        *,
        datasets: Dict[str, GenDataset],
        trainer_init_config: Optional[Dict] = None,
        torch_config: Optional[TorchConfig] = None,
        scaling_config: Optional[ScalingConfig] = None,
        dataset_config: Optional[Dict[str, DatasetConfig]] = None,
        run_config: Optional[RunConfig] = None,
        preprocessor: Optional["Preprocessor"] = None,
        resume_from_checkpoint: Optional[Checkpoint] = None,
    ):

        # The functionality required by HuggingFaceTrainer was only added in
        # this transformers version.
        if LooseVersion(transformers.__version__) < LooseVersion("4.19.0"):
            raise RuntimeError(
                "HuggingFaceTrainer requires transformers>=4.19.0, but you "
                f"have {transformers.__version__} which is incompatible. "
                "Update on all nodes with `pip install -U 'transformers>=4.19.0'`."
            )

        self._validate_trainer_init_per_worker(trainer_init_per_worker,
                                               "trainer_init_per_worker")

        trainer_init_config = (
            trainer_init_config.copy() if trainer_init_config else {}
        )
        if "_trainer_init_per_worker" in trainer_init_config:
            raise ValueError(
                "'_trainer_init_per_worker' is a reserved key in `trainer_init_config`."
            )
        trainer_init_config["_trainer_init_per_worker"] = trainer_init_per_worker

        super().__init__(
            train_loop_per_worker=_huggingface_train_loop_per_worker,
            train_loop_config=trainer_init_config,
            torch_config=torch_config,
            scaling_config=scaling_config,
            dataset_config=dataset_config,
            run_config=run_config,
            datasets=datasets,
            preprocessor=preprocessor,
            resume_from_checkpoint=resume_from_checkpoint,
        )

    def _validate_trainer_init_per_worker(self,
                                          trainer_init_per_worker: Callable,
                                          fn_name: str) -> None:
        num_params = len(inspect.signature(trainer_init_per_worker).parameters)
        if num_params < 3:
            raise ValueError(f"{fn_name} should take in at least 3 arguments, "
                             f"but it accepts {num_params} arguments instead.")

    def _validate_attributes(self):
        for key, conf in self._dataset_config.items():
            if conf.use_stream_api:
                raise ValueError(
                    "HuggingFaceTrainer does not support `use_stream_api`.")
        gpus_per_worker = self.scaling_config.get("num_gpus_per_worker", 0)
        if gpus_per_worker > 1:
            raise ValueError(
                f"You have assigned {gpus_per_worker} GPUs per worker. "
                "This is not supported by HuggingFace, which expects "
                "one GPU per worker in DDP mode and will fail "
                "if more are assigned.")
        if gpus_per_worker != int(gpus_per_worker):
            raise ValueError(
                f"You have assigned {gpus_per_worker} GPUs per worker, "
                "but fractional GPUs are not supported by HuggingFace.")

        super()._validate_attributes()

    def _convert_directory_checkpoint_to_sync_if_needed(
            self, checkpoint: Checkpoint) -> Checkpoint:
        """Replace the directory checkpoint with a node ip & path dict checkpoint.

        This dict checkpoint will be used to sync the directory.
        If we were to use a directory checkpoint directly, it would get deepcopied &
        serialized unnecessarily."""
        with checkpoint.as_directory() as checkpoint_path:
            # Load checkpoint from path.
            checkpoint_path = Path(checkpoint_path).expanduser().absolute()
            if not checkpoint_path.joinpath(TUNE_CHECKPOINT_ID).exists():
                # If the ID file is missing, we assume that this is already
                # a sync checkpoint
                dict_checkpoint = checkpoint.to_dict()
                if (NODE_IP_KEY not in dict_checkpoint
                        or CHECKPOINT_PATH_ON_NODE_KEY not in dict_checkpoint):
                    raise ValueError(
                        "Wrong checkpoint format. Ensure the checkpoint is a "
                        "result of `HuggingFaceTrainer`.")
                return checkpoint
            with open(checkpoint_path.joinpath(TUNE_CHECKPOINT_ID), "r") as f:
                tune_checkpoint_id = int(f.read())

            return Checkpoint.from_dict({
                NODE_IP_KEY: get_node_ip_address(),
                CHECKPOINT_PATH_ON_NODE_KEY: str(checkpoint_path),
                TUNE_CHECKPOINT_ID: tune_checkpoint_id,
            })

    def setup(self) -> None:
        if self.resume_from_checkpoint:
            self.resume_from_checkpoint = (
                self._convert_directory_checkpoint_to_sync_if_needed(
                    self.resume_from_checkpoint))

    def as_trainable(self) -> Type[Trainable]:
        original_param_dict = self._param_dict.copy()
        resume_from_checkpoint: Optional[Checkpoint] = self._param_dict.get(
            "resume_from_checkpoint", None)
        if resume_from_checkpoint:
            self._param_dict["resume_from_checkpoint"] = (
                self._convert_directory_checkpoint_to_sync_if_needed(
                    resume_from_checkpoint
                )
            )
        try:
            ret = super().as_trainable()
        finally:
            self._param_dict = original_param_dict
        return ret
Example 11
class DataParallelTrainer(BaseTrainer):
    """A Trainer for data parallel training.

    You should subclass this Trainer if your Trainer follows the SPMD (single
    program, multiple data) programming paradigm: you want multiple processes
    to run the same function, but on different data.

    This Trainer runs the function ``train_loop_per_worker`` on multiple Ray
    Actors.

    The ``train_loop_per_worker`` function is expected to take in either 0 or 1
    arguments:

    .. code-block:: python

        def train_loop_per_worker():
            ...

    .. code-block:: python

        def train_loop_per_worker(config: Dict):
            ...

    If ``train_loop_per_worker`` accepts an argument, then
    ``train_loop_config`` will be passed in as the argument. This is useful if you
    want to tune the values in ``train_loop_config`` as hyperparameters.
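
    For example, a minimal sketch (the ``lr`` key is illustrative):

    .. code-block:: python

        def train_loop_per_worker(config):
            lr = config["lr"]  # value supplied via train_loop_config
            ...

        trainer = DataParallelTrainer(
            train_loop_per_worker,
            train_loop_config={"lr": 0.01},
            scaling_config={"num_workers": 2},
        )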

    If the ``datasets`` dict contains a training dataset (denoted by
    the "train" key), then it will be split into multiple dataset
    shards that can then be accessed by ``ray.train.get_dataset_shard("train")`` inside
    ``train_loop_per_worker``. All the other datasets will not be split and
    ``ray.train.get_dataset_shard(...)`` will return the entire Dataset.

    Inside the ``train_loop_per_worker`` function, you can use any of the
    :ref:`Ray Train function utils <train-api-func-utils>`.

    .. code-block:: python

        def train_loop_per_worker():
            # Report intermediate results for callbacks or logging.
            train.report(...)

            # Checkpoints the provided args as restorable state.
            train.save_checkpoint(...)

            # Returns dict of last saved checkpoint.
            train.load_checkpoint()

            # Returns the Ray Dataset shard for the given key.
            train.get_dataset_shard("my_dataset")

            # Returns the total number of workers executing training.
            train.get_world_size()

            # Returns the rank of this worker.
            train.get_world_rank()

            # Returns the rank of the worker on the current node.
            train.get_local_rank()

    **How do I use ``DataParallelTrainer`` or any of its subclasses?**

    Example:

    .. code-block:: python

        import ray
        from ray import train

        def train_loop_for_worker():
            dataset_shard_for_this_worker = train.get_dataset_shard("train")

            assert dataset_shard_for_this_worker.count() == 1

        train_dataset = ray.data.from_items([1, 2, 3])
        assert train_dataset.count() == 3
        trainer = DataParallelTrainer(
            train_loop_for_worker,
            scaling_config={"num_workers": 3},
            datasets={"train": train_dataset},
        )
        result = trainer.fit()

    **How do I develop on top of ``DataParallelTrainer``?**

    In many cases, using DataParallelTrainer directly is sufficient to execute
    functions on multiple actors.

    However, you may want to subclass ``DataParallelTrainer`` and create a custom
    Trainer for the following 2 use cases:

      - **Use Case 1:** You want to do data parallel training, but want to have
        a predefined ``train_loop_per_worker``.

      - **Use Case 2:** You want to implement a custom :ref:`Training backend
        <train-api-backend-interfaces>` that automatically handles
        additional setup or teardown logic on each actor, so that the users of this
        new trainer do not have to implement this logic. For example, a
        ``TensorflowTrainer`` can be built on top of ``DataParallelTrainer``
        that automatically handles setting the proper environment variables for
        distributed Tensorflow on each actor.

    For use case 1, you can set a predefined training loop in ``__init__``:

    .. code-block:: python

        from ray.train.data_parallel_trainer import DataParallelTrainer

        class MyDataParallelTrainer(DataParallelTrainer):
            def __init__(self, *args, **kwargs):
                predefined_train_loop_per_worker = lambda: 1
                super().__init__(predefined_train_loop_per_worker, *args, **kwargs)


    For use case 2, you can implement the ``ray.train.Backend`` and
    ``ray.train.BackendConfig`` interfaces.

    .. code-block:: python

        from dataclasses import dataclass
        from ray.train.backend import Backend, BackendConfig

        class MyBackend(Backend):
            def on_start(self, worker_group, backend_config):
                def set_env_var(env_var_value):
                    import os
                    os.environ["MY_ENV_VAR"] = env_var_value

                worker_group.execute(set_env_var, backend_config.env_var)

        @dataclass
        class MyBackendConfig(BackendConfig):
            env_var: str = "default_value"

            def backend_cls(self):
                return MyBackend

        class MyTrainer(DataParallelTrainer):
            def __init__(
                self,
                train_loop_per_worker,
                my_backend_config: MyBackendConfig,
                **kwargs,
            ):

                super().__init__(
                    train_loop_per_worker,
                    backend_config=my_backend_config, **kwargs)

    Args:
        train_loop_per_worker: The training function to execute.
            This can either take in no arguments or a ``config`` dict.
        train_loop_config: Configurations to pass into
            ``train_loop_per_worker`` if it accepts an argument.
        backend_config: Configuration for setting up a Backend (e.g. Torch,
            Tensorflow, Horovod) on each worker to enable distributed
            communication. If no Backend should be set up, then set this to None.
        scaling_config: Configuration for how to scale data parallel training.
        dataset_config: Configuration for dataset ingest. This is merged with the
            default dataset config for the given trainer (`cls._dataset_config`).
        run_config: Configuration for the execution of the training run.
        datasets: Any Ray Datasets to use for training. Use
            the key "train" to denote which dataset is the training
            dataset. If a ``preprocessor`` is provided and has not already been fit,
            it will be fit on the training dataset. All datasets will be transformed
            by the ``preprocessor`` if one is provided.
        preprocessor: A ray.data.Preprocessor to preprocess the
            provided datasets.
        resume_from_checkpoint: A checkpoint to resume training from.
    """

    _checkpoint_manager_cls: Type[
        TuneCheckpointManager
    ] = _DataParallelCheckpointManager

    _scaling_config_allowed_keys = BaseTrainer._scaling_config_allowed_keys + [
        "num_workers",
        "resources_per_worker",
        "use_gpu",
        "placement_strategy",
    ]

    _dataset_config = {
        TRAIN_DATASET_KEY: DatasetConfig(fit=True, split=True),
        WILDCARD_KEY: DatasetConfig(split=False),
    }

    def __init__(
        self,
        train_loop_per_worker: Union[Callable[[], None], Callable[[Dict], None]],
        *,
        train_loop_config: Optional[Dict] = None,
        backend_config: Optional[BackendConfig] = None,
        scaling_config: Optional[ScalingConfig] = None,
        dataset_config: Optional[Dict[str, DatasetConfig]] = None,
        run_config: Optional[RunConfig] = None,
        datasets: Optional[Dict[str, GenDataset]] = None,
        preprocessor: Optional["Preprocessor"] = None,
        resume_from_checkpoint: Optional[Checkpoint] = None,
    ):
        if not ray.is_initialized():
            ray.init()

        self._train_loop_per_worker = train_loop_per_worker
        self._train_loop_config = train_loop_config

        backend_config = (
            backend_config if backend_config is not None else BackendConfig()
        )
        self._backend_config = backend_config
        self._dataset_config = DatasetConfig.validated(
            DatasetConfig.merge(self._dataset_config, dataset_config), datasets
        )
        self._ingest_spec = DataParallelIngestSpec(
            dataset_config=self._dataset_config,
        )

        super(DataParallelTrainer, self).__init__(
            scaling_config=scaling_config,
            run_config=run_config,
            datasets=datasets,
            preprocessor=preprocessor,
            resume_from_checkpoint=resume_from_checkpoint,
        )

    def _validate_attributes(self):
        super()._validate_attributes()

        if (
            not self.scaling_config.get("use_gpu", False)
            and "GPU" in ray.available_resources()
        ):
            logger.info(
                "GPUs are detected in your Ray cluster, but GPU "
                "training is not enabled for this trainer. To enable "
                "GPU training, make sure to set `use_gpu` to True "
                "in your scaling config."
            )

        if "num_workers" not in self.scaling_config:
            raise ValueError("You must specify the 'num_workers' in scaling_config.")

        if self.scaling_config["num_workers"] <= 0:
            raise ValueError(
                "'num_workers' in `scaling_config` must be a positive "
                f"integer. Received {self.scaling_config['num_workers']}"
            )

        self._validate_train_loop_per_worker(
            self._train_loop_per_worker, "train_loop_per_worker"
        )

    def preprocess_datasets(self) -> None:
        # Evaluate all datasets.
        self.datasets = {k: d() if callable(d) else d for k, d in self.datasets.items()}
        self.datasets = self._ingest_spec.preprocess_datasets(
            self.preprocessor, self.datasets
        )

    def _validate_train_loop_per_worker(
        self, train_loop_per_worker: Callable, fn_name: str
    ) -> None:
        num_params = len(inspect.signature(train_loop_per_worker).parameters)
        if num_params > 1:
            raise ValueError(
                f"{fn_name} should take in 0 or 1 arguments, "
                f"but it accepts {num_params} arguments instead."
            )

    def training_loop(self) -> None:
        scaling_config_dataclass = self._validate_and_get_scaling_config_data_class(
            self.scaling_config
        )

        train_loop_per_worker = construct_train_func(
            self._train_loop_per_worker,
            self._train_loop_config,
            fn_arg_name="train_loop_per_worker",
        )

        additional_resources_per_worker = (
            scaling_config_dataclass.additional_resources_per_worker
        )

        trial_info = TrialInfo(
            name=session.get_trial_name(),
            id=session.get_trial_id(),
            resources=session.get_trial_resources(),
            logdir=os.getcwd(),
        )

        backend_executor = BackendExecutor(
            backend_config=self._backend_config,
            trial_info=trial_info,
            num_workers=scaling_config_dataclass.num_workers,
            num_cpus_per_worker=scaling_config_dataclass.num_cpus_per_worker,
            num_gpus_per_worker=scaling_config_dataclass.num_gpus_per_worker,
            additional_resources_per_worker=additional_resources_per_worker,
            max_retries=0,
        )

        checkpoint_manager = self._checkpoint_manager_cls(
            preprocessor=self.preprocessor
        )

        # Start the remote actors.
        backend_executor.start(initialization_hook=None)

        training_iterator = TrainingIterator(
            backend_executor=backend_executor,
            backend_config=self._backend_config,
            train_func=train_loop_per_worker,
            dataset_spec=self._ingest_spec,
            checkpoint_manager=checkpoint_manager,
            checkpoint=self.resume_from_checkpoint,
            checkpoint_strategy=None,
        )

        for results in training_iterator:
            # TODO(ml-team): add ability to report results from multiple workers.
            first_worker_results = results[0]

            tune.report(**first_worker_results)

        # Shutdown workers.
        backend_executor.shutdown()

    def get_dataset_config(self) -> Dict[str, DatasetConfig]:
        """Return a copy of this Trainer's final dataset configs.

        Returns:
            The merged default + user-supplied dataset config.
        """
        return self._dataset_config.copy()
Example 12
class TestWildcard(TestBasic):
    _dataset_config = {
        "train": DatasetConfig(split=True, required=True),
        "*": DatasetConfig(split=True),
    }
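
A hedged usage sketch, mirroring the wildcard case from Example 4: any dataset key other than "train" (here the illustrative "wild" key) matches the "*" entry and is therefore also split across workers.

ds = ray.data.range_table(10)
# With two workers, both "train" and the wildcard-matched "wild" dataset
# are split into shards of 5 rows each.
test = TestWildcard(
    2, True, {"train": 5, "wild": 5}, datasets={"train": ds, "wild": ds}
)
test.fit()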
Example 13
import ray
from ray.train.torch import TorchTrainer
from ray.air.config import DatasetConfig

train_ds = ray.data.range_tensor(1000)
valid_ds = ray.data.range_tensor(100)
test_ds = ray.data.range_tensor(100)

my_trainer = TorchTrainer(
    lambda: None,  # No-op training loop.
    scaling_config={"num_workers": 2},
    datasets={
        "train": train_ds,
        "valid": valid_ds,
        "test": test_ds,
    },
    dataset_config={
        "valid": DatasetConfig(split=True),
        "test": DatasetConfig(split=True),
    },
)
print(my_trainer.get_dataset_config())
# -> {'train': DatasetConfig(fit=True, split=True, ...),
#     'valid': DatasetConfig(fit=False, split=True, ...),
#     'test': DatasetConfig(fit=False, split=True, ...), ...}
# __config_1_end__

# __config_2__
import ray
from ray.train.torch import TorchTrainer
from ray.air.config import DatasetConfig

train_ds = ray.data.range_tensor(1000)
Example 14
    # An example preprocessor chain that just scales all values by 4.0 in two stages.
    preprocessor = Chain(
        BatchMapper(lambda df: df * 2),
        BatchMapper(lambda df: df * 2),
    )

    # Set up the dummy trainer that prints ingest stats.
    # Run and print ingest stats.
    trainer = DummyTrainer(
        scaling_config={
            "num_workers": 1,
            "use_gpu": False
        },
        datasets={"train": dataset},
        preprocessor=preprocessor,
        runtime_seconds=30,  # Stop after this amount of time or 1 epoch is read.
        prefetch_blocks=1,  # Number of blocks to prefetch when reading data.
        dataset_config={"valid": DatasetConfig(transform=False)},
        batch_size=None,
    )
    print("Dataset config", trainer.get_dataset_config())
    trainer.fit()

    # Print memory stats (you can also use "ray memory --stats-only" to monitor
    # this during the run).
    try:
        print("Memory stats at end of ingest:\n\n{}".format(
            ray._private.internal_api.memory_summary(stats_only=True)))
    except Exception:
        print("Error getting Ray memory stats")
Example 15
    resources_per_worker = {"CPU": 0, "GPU": 1} if use_gpu else None

    trainer = TorchTrainer(
        train_func,
        train_loop_config=config,
        datasets=datasets,
        scaling_config=ScalingConfig(
            num_workers=num_workers,
            use_gpu=use_gpu,
            resources_per_worker=resources_per_worker,
        ),
        run_config=RunConfig(callbacks=callbacks),
        dataset_config={
            "train":
            DatasetConfig(use_stream_api=True,
                          stream_window_size=-1,
                          global_shuffle=True)
        },
    )
    results = trainer.fit()
    state_dict = results.checkpoint.to_dict()["model"]

    def load_model_func():
        num_layers = config["num_layers"]
        num_hidden = config["num_hidden"]
        dropout_every = config["dropout_every"]
        dropout_prob = config["dropout_prob"]
        num_features = config["num_features"]

        model = Net(
            n_layers=num_layers,