Example #1
    def __init__(
        self,
        *,
        train_loop_per_worker: Union[Callable[[], None], Callable[[Dict], None]],
        train_loop_config: Optional[Dict] = None,
        backend_config: Optional[BackendConfig] = None,
        scaling_config: Optional[ScalingConfig] = None,
        run_config: Optional[RunConfig] = None,
        datasets: Optional[Dict[str, GenDataset]] = None,
        preprocessor: Optional[Preprocessor] = None,
        resume_from_checkpoint: Optional[Checkpoint] = None,
    ):
        # Connect to an existing Ray cluster, or start one locally on first use.
        if not ray.is_initialized():
            ray.init()

        self.train_loop_per_worker = train_loop_per_worker
        self.train_loop_config = train_loop_config

        # Fall back to the default backend configuration when none is given.
        backend_config = (
            backend_config if backend_config is not None else BackendConfig()
        )
        self.backend_config = backend_config

        super(DataParallelTrainer, self).__init__(
            scaling_config=scaling_config,
            run_config=run_config,
            datasets=datasets,
            preprocessor=preprocessor,
            resume_from_checkpoint=resume_from_checkpoint,
        )
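Because every parameter in this variant sits after the bare *, the trainer must be constructed with keyword arguments only. A minimal usage sketch under that assumption; the loop body and worker count are illustrative, and the ScalingConfig import path (shown here as the dataclass from ray.air.config) varies across Ray versions:

from ray.air.config import ScalingConfig
from ray.train.data_parallel_trainer import DataParallelTrainer

def train_loop_per_worker():
    # Placeholder training step; a real loop would run the framework's
    # forward/backward pass and report metrics.
    print("training on one worker")

# All arguments are keyword-only in this variant.
trainer = DataParallelTrainer(
    train_loop_per_worker=train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=2),  # assumes the dataclass form
)
result = trainer.fit()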
Example #2
    def __init__(
        self,
        train_loop_per_worker: Union[Callable[[], None], Callable[[Dict], None]],
        train_loop_config: Optional[Dict] = None,
        backend_config: Optional[BackendConfig] = None,
        scaling_config: Optional[ScalingConfig] = None,
        run_config: Optional[RunConfig] = None,
        datasets: Optional[Dict[str, GenDataset]] = None,
        preprocessor: Optional[Preprocessor] = None,
        resume_from_checkpoint: Optional[Checkpoint] = None,
    ):
        if not ray.is_initialized():
            ray.init()

        self.train_loop_per_worker = train_loop_per_worker
        self.train_loop_config = train_loop_config

        super(DataParallelTrainer, self).__init__(
            scaling_config=scaling_config,
            run_config=run_config,
            datasets=datasets,
            preprocessor=preprocessor,
            resume_from_checkpoint=resume_from_checkpoint,
        )

        if (
            not self.scaling_config.get("use_gpu", False)
            and "GPU" in ray.available_resources()
        ):
            logger.info(
                "GPUs are detected in your Ray cluster, but GPU "
                "training is not enabled for this trainer. To enable "
                "GPU training, make sure to set `use_gpu` to True "
                "in your scaling config."
            )

        if "num_workers" not in self.scaling_config:
            raise ValueError("You must specify the 'num_workers' in scaling_config.")

        if self.scaling_config["num_workers"] <= 0:
            raise ValueError(
                "'num_workers' in `scaling_config` must be a positive "
                f"integer. Received {self.scaling_config['num_workers']}"
            )

        # The training loop may take either no arguments or a single dict
        # (the train_loop_config); anything else is rejected up front.
        num_params = len(inspect.signature(self.train_loop_per_worker).parameters)
        if num_params > 1:
            raise ValueError(
                f"train_loop_per_worker should take in 0 or 1 arguments, "
                f"but it accepts {num_params} arguments instead."
            )

        backend_config = (
            backend_config if backend_config is not None else BackendConfig()
        )
        self.backend_config = backend_config
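This variant reads scaling_config as a plain dict and validates num_workers by hand, so a dict literal is what it expects. A hedged sketch of a call that passes the checks above; the learning rate, config keys, and worker count are illustrative:

from ray.train.data_parallel_trainer import DataParallelTrainer

def train_loop_per_worker(config):
    # At most one parameter is allowed; it receives train_loop_config.
    lr = config["lr"]
    print(f"training with lr={lr}")

trainer = DataParallelTrainer(
    train_loop_per_worker,
    train_loop_config={"lr": 1e-3},
    scaling_config={"num_workers": 2},  # dict-style, as this version expects
)

# scaling_config={"num_workers": 0} would raise ValueError here, as would a
# train_loop_per_worker that accepts more than one argument.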
Example #3
    def __init__(
        self,
        train_loop_per_worker: Union[Callable[[], None], Callable[[Dict], None]],
        *,
        train_loop_config: Optional[Dict] = None,
        backend_config: Optional[BackendConfig] = None,
        scaling_config: Optional[ScalingConfig] = None,
        dataset_config: Optional[Dict[str, DatasetConfig]] = None,
        run_config: Optional[RunConfig] = None,
        datasets: Optional[Dict[str, GenDataset]] = None,
        preprocessor: Optional["Preprocessor"] = None,
        resume_from_checkpoint: Optional[Checkpoint] = None,
    ):
        if not ray.is_initialized():
            ray.init()

        self._train_loop_per_worker = train_loop_per_worker
        self._train_loop_config = train_loop_config

        backend_config = (
            backend_config if backend_config is not None else BackendConfig()
        )
        self._backend_config = backend_config
        # self._dataset_config still resolves to the class-level default at
        # this point; merge it with the user-provided overrides, then
        # validate the result against the supplied datasets.
        self._dataset_config = DatasetConfig.validated(
            DatasetConfig.merge(self._dataset_config, dataset_config), datasets
        )
        self._ingest_spec = DataParallelIngestSpec(
            dataset_config=self._dataset_config,
        )

        super(DataParallelTrainer, self).__init__(
            scaling_config=scaling_config,
            run_config=run_config,
            datasets=datasets,
            preprocessor=preprocessor,
            resume_from_checkpoint=resume_from_checkpoint,
        )
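This variant additionally merges a per-dataset dataset_config with the class-level defaults. A sketch of how the trainer might be constructed with a Ray Dataset, assuming the Ray 2.x AIR-era API; the dataset contents, worker count, and split setting are illustrative:

import ray
from ray.air.config import DatasetConfig, ScalingConfig
from ray.train.data_parallel_trainer import DataParallelTrainer

def train_loop_per_worker():
    # Each worker would read its shard of the "train" dataset here,
    # using the dataset-shard API of the matching Ray version.
    pass

train_ds = ray.data.from_items([{"x": i, "y": 2 * i} for i in range(100)])

trainer = DataParallelTrainer(
    train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=2),
    # Override the class-level default: split the "train" dataset
    # across workers.
    dataset_config={"train": DatasetConfig(split=True)},
    datasets={"train": train_ds},
)
result = trainer.fit()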