def predict(
    X_test: List[str],
    test_batch_size: int,
    model_cls: Any,
    model_params: Dict[str, Any],
    labels: List[str],
    checkpoint: Optional[Union[bytes, Path]],
    checkpoint_name: Optional[str],
    master_ip: str,
    gobbli_dir: Optional[Path] = None,
    log_level: Union[int, str] = logging.WARNING,
    distributed: bool = False,
) -> pd.DataFrame:
    """
    Run prediction for a trained model on a (possibly remote) worker and return
    the predicted probability of each label for each test document.
    """
    logger = init_worker_env(gobbli_dir=gobbli_dir, log_level=log_level)
    use_gpu, nvidia_visible_devices = init_gpu_config()

    worker_ip = get_worker_ip()
    if not distributed and worker_ip != master_ip:
        raise RuntimeError(
            "Experiments must be started with distributed = True to run "
            "tasks on remote workers."
        )

    clf = model_cls(
        **model_params,
        use_gpu=use_gpu,
        nvidia_visible_devices=nvidia_visible_devices,
        logger=logger,
    )

    # This step isn't necessary in all cases if the build step just downloads
    # pretrained weights we weren't going to use anyway, but sometimes it's needed,
    # e.g. for BERT to download vocabulary files and config.
    clf.build()

    # Use the current working directory (CWD) as the base for the tempdir, under the
    # assumption that the CWD is included in any bind mounts/volumes the user may have
    # created if they're running this in a Docker container.  If it's not part of a
    # host mount, the files won't be mounted properly in the container.
    with tempfile.TemporaryDirectory(dir=".") as tempdir:
        tempdir_path = Path(tempdir)

        checkpoint_path: Optional[Path] = None
        if isinstance(checkpoint, bytes):
            if checkpoint_name is not None:
                blob_to_dir(checkpoint, tempdir_path)
                checkpoint_path = tempdir_path / checkpoint_name
        elif isinstance(checkpoint, Path):
            checkpoint_path = checkpoint
        elif checkpoint is None:
            pass
        else:
            raise TypeError(f"invalid checkpoint type: '{type(checkpoint)}'")

        predict_input = gobbli.io.PredictInput(
            X=X_test,
            labels=labels,
            checkpoint=checkpoint_path,
            predict_batch_size=test_batch_size,
        )
        predict_output = clf.predict(predict_input)

        return predict_output.y_pred_proba
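
# Usage sketch (not part of the original module): assuming the returned
# DataFrame has one probability column per label, hard predictions can be
# recovered with pandas' idxmax over the columns.  The model class, argument
# values, and checkpoint path below are hypothetical; get_worker_ip() is the
# same helper the worker uses, so the master/worker IP check passes in a
# single-machine run.
#
#     pred_proba = predict(
#         X_test=["good movie", "terrible plot"],
#         test_batch_size=32,
#         model_cls=SomeModel,                    # hypothetical gobbli model class
#         model_params={},
#         labels=["neg", "pos"],
#         checkpoint=Path("trained_model/checkpoint"),  # or raw bytes from dir_to_blob()
#         checkpoint_name=None,                   # only needed when checkpoint is a blob
#         master_ip=get_worker_ip(),
#     )
#     y_pred = pred_proba.idxmax(axis=1)
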
def train(
    X_train: Any,
    y_train: Any,
    X_valid: Any,
    y_valid: Any,
    train_batch_size: int,
    valid_batch_size: int,
    num_train_epochs: int,
    model_cls: Any,
    model_params: Dict[str, Any],
    master_ip: str,
    gobbli_dir: Optional[Path] = None,
    log_level: Union[int, str] = logging.WARNING,
    distributed: bool = False,
) -> RemoteTrainResult:
    """
    Train a model on a (possibly remote) worker and return metadata about the
    training run, including a reference to the resulting checkpoint.
    """
    logger = init_worker_env(gobbli_dir=gobbli_dir, log_level=log_level)
    use_gpu, nvidia_visible_devices = init_gpu_config()

    worker_ip = get_worker_ip()
    if not distributed and worker_ip != master_ip:
        raise RuntimeError(
            "Experiments must be started with distributed = True to run "
            "tasks on remote workers."
        )

    clf = model_cls(
        **model_params,
        use_gpu=use_gpu,
        nvidia_visible_devices=nvidia_visible_devices,
        logger=logger,
    )
    clf.build()

    train_input = gobbli.io.TrainInput(
        X_train=X_train,
        y_train=y_train,
        X_valid=X_valid,
        y_valid=y_valid,
        train_batch_size=train_batch_size,
        valid_batch_size=valid_batch_size,
        num_train_epochs=num_train_epochs,
    )
    train_output = clf.train(train_input)

    checkpoint = train_output.checkpoint
    checkpoint_name = getattr(checkpoint, "name", None)

    if distributed:
        # Copy weights into the object store, since we don't share a filesystem
        # with the master node
        checkpoint = (
            dir_to_blob(checkpoint.parent) if checkpoint is not None else None
        )

    if not is_ray_local_mode():
        checkpoint = ray.put(checkpoint)

    return RemoteTrainResult(
        metadata=train_output.metadata(),
        labels=train_output.labels,
        checkpoint_name=checkpoint_name,
        checkpoint_id=checkpoint,
        model_params=model_params,
        ip_address=worker_ip,
    )
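
# Usage sketch (not part of the original module): in a distributed experiment
# these worker functions would typically be wrapped with ray.remote and
# dispatched from the master process.  ray.remote / ray.get are standard Ray
# APIs; SomeModel and the argument values are hypothetical, and ray.init() is
# assumed to have been called already.
#
#     train_remote = ray.remote(train)
#     result_ref = train_remote.remote(
#         X_train, y_train, X_valid, y_valid,
#         train_batch_size=32,
#         valid_batch_size=32,
#         num_train_epochs=3,
#         model_cls=SomeModel,
#         model_params={},
#         master_ip=master_ip,
#         distributed=True,
#     )
#     train_result = ray.get(result_ref)  # RemoteTrainResult with metadata,
#                                         # labels, and a checkpoint reference
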