Example #1
        def predict(
            X_test: List[str],
            test_batch_size: int,
            model_cls: Any,
            model_params: Dict[str, Any],
            labels: List[str],
            checkpoint: Union[bytes, Path],
            checkpoint_name: Optional[str],
            master_ip: str,
            gobbli_dir: Optional[Path] = None,
            log_level: Union[int, str] = logging.WARNING,
            distributed: bool = False,
        ) -> pd.DataFrame:
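            """
            Run prediction for a gobbli model on a (possibly remote) Ray worker.

            Rebuilds the model from ``model_cls``/``model_params``, restores the
            given checkpoint, and returns a DataFrame of predicted class
            probabilities for ``X_test``.
            """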

            logger = init_worker_env(gobbli_dir=gobbli_dir,
                                     log_level=log_level)
            use_gpu, nvidia_visible_devices = init_gpu_config()

            worker_ip = get_worker_ip()
            if not distributed and worker_ip != master_ip:
                raise RuntimeError(
                    "Experiments must be started with distributed = True to run "
                    "tasks on remote workers.")

            clf = model_cls(
                **model_params,
                use_gpu=use_gpu,
                nvidia_visible_devices=nvidia_visible_devices,
                logger=logger,
            )

            # Building isn't strictly necessary when the build step only
            # downloads pretrained weights we weren't going to use anyway, but
            # some models need it, e.g. BERT has to download its vocabulary
            # files and config
            clf.build()

            # Use the current working directory (CWD) as the base for the
            # tempdir, assuming the CWD is covered by any bind mounts/volumes
            # the user created when running this in a Docker container. If it
            # isn't part of a host mount, the checkpoint files won't be visible
            # inside the model's container.
            with tempfile.TemporaryDirectory(dir=".") as tempdir:
                tempdir_path = Path(tempdir)

                checkpoint_path: Optional[Path] = None
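                # Materialize the checkpoint for the container: serialized
                # bytes (sent from a remote worker) are unpacked into the
                # tempdir, while a local Path is used as-is. Bytes without a
                # checkpoint_name are silently ignored.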
                if isinstance(checkpoint, bytes):
                    if checkpoint_name is not None:
                        blob_to_dir(checkpoint, tempdir_path)
                        checkpoint_path = tempdir_path / checkpoint_name
                elif isinstance(checkpoint, Path):
                    checkpoint_path = checkpoint
                elif checkpoint is None:
                    pass
                else:
                    raise TypeError(
                        f"invalid checkpoint type: '{type(checkpoint)}'")

                predict_input = gobbli.io.PredictInput(
                    X=X_test,
                    labels=labels,
                    checkpoint=checkpoint_path,
                    predict_batch_size=test_batch_size,
                )
                predict_output = clf.predict(predict_input)

            return predict_output.y_pred_proba
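
A minimal driver-side sketch of how a task like this might be invoked through Ray. The `ray.remote` wrapping, the `MajorityClassifier` import path, and all argument values below are illustrative assumptions, not part of the example above:

        import ray
        from gobbli.model.majority import MajorityClassifier  # assumed import path

        ray.init()
        predict_task = ray.remote(predict)  # assumes predict isn't already wrapped

        result_ref = predict_task.remote(
            X_test=["a short document", "another document"],
            test_batch_size=32,
            model_cls=MajorityClassifier,
            model_params={},
            labels=["neg", "pos"],
            checkpoint=None,  # no trained checkpoint; use default weights
            checkpoint_name=None,
            master_ip="127.0.0.1",  # IP of the node launching the task
        )
        y_pred_proba = ray.get(result_ref)  # pd.DataFrame of per-class probabilities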
Example #2
        def train(
            X_train: Any,
            y_train: Any,
            X_valid: Any,
            y_valid: Any,
            train_batch_size: int,
            valid_batch_size: int,
            num_train_epochs: int,
            model_cls: Any,
            model_params: Dict[str, Any],
            master_ip: str,
            gobbli_dir: Optional[Path] = None,
            log_level: Union[int, str] = logging.WARNING,
            distributed: bool = False,
        ) -> RemoteTrainResult:
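            """
            Train a gobbli model on a (possibly remote) Ray worker.

            Rebuilds the model from ``model_cls``/``model_params``, runs
            training and validation, and returns a RemoteTrainResult pointing
            at the resulting checkpoint so the master node can retrieve it.
            """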

            logger = init_worker_env(gobbli_dir=gobbli_dir,
                                     log_level=log_level)
            use_gpu, nvidia_visible_devices = init_gpu_config()

            worker_ip = get_worker_ip()
            if not distributed and worker_ip != master_ip:
                raise RuntimeError(
                    "Experiments must be started with distributed = True to run "
                    "tasks on remote workers.")

            clf = model_cls(
                **model_params,
                use_gpu=use_gpu,
                nvidia_visible_devices=nvidia_visible_devices,
                logger=logger,
            )

            clf.build()

            train_input = gobbli.io.TrainInput(
                X_train=X_train,
                y_train=y_train,
                X_valid=X_valid,
                y_valid=y_valid,
                train_batch_size=train_batch_size,
                valid_batch_size=valid_batch_size,
                num_train_epochs=num_train_epochs,
            )
            train_output = clf.train(train_input)
            checkpoint = train_output.checkpoint
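            # Record the checkpoint filename now, before the checkpoint is
            # (possibly) serialized to bytes below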
            checkpoint_name = getattr(checkpoint, "name", None)

            if distributed:
                # Serialize the checkpoint directory to bytes, since remote
                # workers don't share a filesystem with the master node
                checkpoint = (dir_to_blob(checkpoint.parent)
                              if checkpoint is not None else None)

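            # Outside Ray's local mode, place the (possibly serialized)
            # checkpoint in the object store so the master node can fetch it
            # with ray.get(); in local mode the value is returned directly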
            if not is_ray_local_mode():
                checkpoint = ray.put(checkpoint)

            return RemoteTrainResult(
                metadata=train_output.metadata(),
                labels=train_output.labels,
                checkpoint_name=checkpoint_name,
                checkpoint_id=checkpoint,
                model_params=model_params,
                ip_address=worker_ip,
            )
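
Continuing the illustrative sketch from Example #1, a caller might round-trip the result back into `predict`; the unwrapping below mirrors the `is_ray_local_mode()` check inside the function, and all names and data are assumptions:

        train_task = ray.remote(train)  # assumes train isn't already wrapped

        train_ref = train_task.remote(
            X_train=["good service", "terrible food", "great value"],
            y_train=["pos", "neg", "pos"],
            X_valid=["decent experience"],
            y_valid=["pos"],
            train_batch_size=32,
            valid_batch_size=32,
            num_train_epochs=1,
            model_cls=MajorityClassifier,
            model_params={},
            master_ip="127.0.0.1",
        )
        train_result = ray.get(train_ref)

        # checkpoint_id may itself be an object store reference; unwrap it
        # before handing the checkpoint bytes to predict()
        checkpoint = train_result.checkpoint_id
        if not is_ray_local_mode():
            checkpoint = ray.get(checkpoint)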