def run(
    self,
    dataset_split: Optional[
        Union[Tuple[float, float], Tuple[float, float, float]]
    ] = None,
    seed: int = 1,
    train_batch_size: int = 32,
    valid_batch_size: int = 32,
    test_batch_size: int = 32,
    num_train_epochs: int = 5,
) -> ClassificationExperimentResults:
    """
    Run the experiment.

    Args:
      dataset_split: A tuple describing the proportion of the dataset to be added to the
        train/validation/test splits.  If the experiment uses an explicit test set (passes
        :paramref:`BaseExperiment.params.test_dataset`), this should be a 2-tuple describing
        the train/validation split.  Otherwise, it should be a 3-tuple describing the
        train/validation/test split.  The tuple must sum to 1.
      seed: Random seed to be used for dataset splitting for reproducibility.
      train_batch_size: Number of observations per batch on the training dataset.
      valid_batch_size: Number of observations per batch on the validation dataset.
      test_batch_size: Number of observations per batch on the test dataset.
      num_train_epochs: Number of epochs to use for training.

    Returns:
      The results of the experiment.
    """
    _dataset_split = dataset_split

    # If the user didn't pass an explicit test set, create one using a split
    if self.X_test is None:
        if _dataset_split is None:
            _dataset_split = ClassificationExperiment._DEFAULT_TRAIN_VALID_TEST_SPLIT

        ClassificationExperiment._validate_split(_dataset_split, expected_len=3)

        # cast needed to satisfy mypy
        train_prop, valid_prop, test_prop = cast(
            Tuple[float, float, float], _dataset_split
        )
        train_valid_prop = train_prop + valid_prop

        X_train_valid, X_test, y_train_valid, y_test = train_test_split(
            self.X,
            self.y,
            train_size=train_valid_prop,
            test_size=test_prop,
            random_state=seed,
        )
    else:
        if _dataset_split is None:
            _dataset_split = ClassificationExperiment._DEFAULT_TRAIN_VALID_SPLIT

        ClassificationExperiment._validate_split(_dataset_split, expected_len=2)

        # cast needed to satisfy mypy
        train_prop, valid_prop = cast(Tuple[float, float], _dataset_split)
        train_valid_prop = 1

        X_train_valid, y_train_valid = self.X, self.y
        X_test, y_test = self.X_test, self.y_test

    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train_valid,
        y_train_valid,
        # Round to prevent floating point imprecision errors
        train_size=round(train_prop / train_valid_prop, 4),
        test_size=round(valid_prop / train_valid_prop, 4),
        random_state=seed,
    )

    if self.param_grid is not None:
        for param, values in self.param_grid.items():
            if isinstance(values, str):
                raise TypeError(
                    f"String detected in parameter grid values for parameter '{param}'. "
                    "This will be treated as a list of character parameter values, "
                    "which probably isn't what you want. If you're really sure, "
                    "convert the string to a list of characters and try again."
                )
    grid = ParameterGrid(self.param_grid)
    if len(grid) == 0:
        raise ValueError("empty parameter grid")

    # Transfer datasets to the Ray distributed object store
    # if not running in local mode
    # In local mode, this causes problems: https://github.com/ray-project/ray/issues/5379
    if is_ray_local_mode():
        dataset_ids = [X_train, y_train, X_valid, y_valid]
    else:
        dataset_ids = [ray.put(d) for d in (X_train, y_train, X_valid, y_valid)]

    # Return the checkpoint blob separately from the train result so it doesn't
    # have to be copied to the object store again when used by the predict function
    @ray.remote(num_cpus=self.task_num_cpus, num_gpus=self.task_num_gpus)
    def train(
        X_train: Any,
        y_train: Any,
        X_valid: Any,
        y_valid: Any,
        train_batch_size: int,
        valid_batch_size: int,
        num_train_epochs: int,
        model_cls: Any,
        model_params: Dict[str, Any],
        master_ip: str,
        gobbli_dir: Optional[Path] = None,
        log_level: Union[int, str] = logging.WARNING,
        distributed: bool = False,
    ) -> RemoteTrainResult:
        logger = init_worker_env(gobbli_dir=gobbli_dir, log_level=log_level)
        use_gpu, nvidia_visible_devices = init_gpu_config()

        worker_ip = get_worker_ip()
        if not distributed and worker_ip != master_ip:
            raise RuntimeError(
                "Experiments must be started with distributed = True to run "
                "tasks on remote workers."
            )

        clf = model_cls(
            **model_params,
            use_gpu=use_gpu,
            nvidia_visible_devices=nvidia_visible_devices,
            logger=logger,
        )

        clf.build()

        train_input = gobbli.io.TrainInput(
            X_train=X_train,
            y_train=y_train,
            X_valid=X_valid,
            y_valid=y_valid,
            train_batch_size=train_batch_size,
            valid_batch_size=valid_batch_size,
            num_train_epochs=num_train_epochs,
        )
        train_output = clf.train(train_input)

        checkpoint = train_output.checkpoint
        checkpoint_name = getattr(checkpoint, "name", None)

        if distributed:
            # Copy weights into the object store, since we don't share a filesystem
            # with the master node
            checkpoint = (
                dir_to_blob(checkpoint.parent) if checkpoint is not None else None
            )

        if not is_ray_local_mode():
            checkpoint = ray.put(checkpoint)

        return RemoteTrainResult(
            metadata=train_output.metadata(),
            labels=train_output.labels,
            checkpoint_name=checkpoint_name,
            checkpoint_id=checkpoint,
            model_params=model_params,
            ip_address=worker_ip,
        )

    @ray.remote(num_cpus=self.task_num_cpus, num_gpus=self.task_num_gpus)
    def predict(
        X_test: List[str],
        test_batch_size: int,
        model_cls: Any,
        model_params: Dict[str, Any],
        labels: List[str],
        checkpoint: Union[bytes, Path],
        checkpoint_name: Optional[str],
        master_ip: str,
        gobbli_dir: Optional[Path] = None,
        log_level: Union[int, str] = logging.WARNING,
        distributed: bool = False,
    ) -> pd.DataFrame:
        logger = init_worker_env(gobbli_dir=gobbli_dir, log_level=log_level)
        use_gpu, nvidia_visible_devices = init_gpu_config()

        worker_ip = get_worker_ip()
        if not distributed and worker_ip != master_ip:
            raise RuntimeError(
                "Experiments must be started with distributed = True to run "
                "tasks on remote workers."
            )

        clf = model_cls(
            **model_params,
            use_gpu=use_gpu,
            nvidia_visible_devices=nvidia_visible_devices,
            logger=logger,
        )

        # This step isn't necessary in all cases if the build step just downloads
        # pretrained weights we weren't going to use anyway, but sometimes it's needed
        # Ex. for BERT to download vocabulary files and config
        clf.build()

        # Use the current working directory (CWD) as the base for the tempdir, under the
        # assumption that the CWD is included in any bind mounts/volumes the user may have
        # created if they're running this in a Docker container
        # If it's not part of a host mount, the files won't be mounted properly in the container
        with tempfile.TemporaryDirectory(dir=".") as tempdir:
            tempdir_path = Path(tempdir)

            checkpoint_path = None  # type: Optional[Path]
            if isinstance(checkpoint, bytes):
                if checkpoint_name is not None:
                    blob_to_dir(checkpoint, tempdir_path)
                    checkpoint_path = tempdir_path / checkpoint_name
            elif isinstance(checkpoint, Path):
                checkpoint_path = checkpoint
            elif checkpoint is None:
                pass
            else:
                raise TypeError(f"invalid checkpoint type: '{type(checkpoint)}'")

            predict_input = gobbli.io.PredictInput(
                X=X_test,
                labels=labels,
                checkpoint=checkpoint_path,
                predict_batch_size=test_batch_size,
            )
            predict_output = clf.predict(predict_input)

            return predict_output.y_pred_proba

    # Record the IP address of the master node so workers can detect
    # whether they're remote and not running in distributed mode, at which
    # point they should raise an error
    master_ip = get_worker_ip()

    # Run training in parallel using the Ray cluster
    raw_results = ray.get(
        [
            train.remote(
                *dataset_ids,
                train_batch_size,
                valid_batch_size,
                num_train_epochs,
                self.model_cls,
                params,
                master_ip,
                self.worker_gobbli_dir,
                self.worker_log_level,
                self.distributed,
            )
            for params in grid
        ]
    )

    training_results = []  # type: List[Dict[str, Any]]
    best_valid_loss = math.inf
    best_result = None  # type: Optional[RemoteTrainResult]
    best_checkpoint_id = None  # type: Optional[ray.ObjectID]

    for train_results in raw_results:
        result = {
            **train_results.metadata,
            "node_ip_address": train_results.ip_address,
            "model_params": train_results.model_params,
        }

        if result["valid_loss"] < best_valid_loss:
            best_result = train_results
            best_checkpoint_id = train_results.checkpoint_id
            best_valid_loss = result["valid_loss"]

        training_results.append(result)

    if best_result is None:
        raise ValueError(
            "failed to find parameter combination with finite validation loss"
        )

    # Evaluate the best model on the test set
    if is_ray_local_mode():
        X_test_id = X_test
    else:
        X_test_id = ray.put(X_test)

    y_pred_proba = ray.get(
        predict.remote(
            X_test_id,
            test_batch_size,
            self.model_cls,
            best_result.model_params,
            best_result.labels,
            best_checkpoint_id,
            best_result.checkpoint_name,
            master_ip,
            self.worker_gobbli_dir,
            self.worker_log_level,
            self.distributed,
        )
    )

    best_checkpoint = best_checkpoint_id
    if not is_ray_local_mode():
        best_checkpoint = ray.get(best_checkpoint_id)

    return ClassificationExperimentResults(
        X=X_test,
        labels=best_result.labels,  # type: ignore
        y_true=y_test,
        y_pred_proba=y_pred_proba,
        training_results=training_results,
        best_model_checkpoint=cast(Union[bytes, Path], best_checkpoint),
        best_model_checkpoint_name=best_result.checkpoint_name,
    )
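
# ---------------------------------------------------------------------------
# Usage sketch (illustration only; not part of the module above). It shows one
# way run() might be invoked end to end. The ClassificationExperiment
# constructor keywords below (model_cls, X, y, param_grid, distributed) and the
# import paths are assumptions inferred from the attributes run() reads
# (self.model_cls, self.X, self.y, self.param_grid, self.distributed); check
# the real constructor signature before copying this. The run() keywords
# themselves match the signature defined above.

from gobbli.experiment import ClassificationExperiment  # import path assumed
from gobbli.model.majority import MajorityClassifier  # any gobbli model class; path assumed

# Toy dataset: a handful of labeled documents
X = ["great movie", "terrible film", "loved it", "hated it", "okay I guess", "really bad"]
y = ["pos", "neg", "pos", "neg", "pos", "neg"]

experiment = ClassificationExperiment(  # hypothetical construction
    model_cls=MajorityClassifier,
    X=X,
    y=y,
    param_grid={},  # ParameterGrid({}) yields a single (empty) parameter combination
    distributed=False,
)

# No explicit test set was provided, so run() carves train/valid/test splits out
# of X/y, trains one model per grid combination, picks the combination with the
# lowest validation loss, and scores it on the held-out test split.
results = experiment.run(
    dataset_split=(0.7, 0.1, 0.2),
    seed=1,
    train_batch_size=2,
    valid_batch_size=2,
    test_batch_size=2,
    num_train_epochs=1,
)

print(results.training_results)     # one metadata dict per grid combination
print(results.y_pred_proba.head())  # predicted probabilities on the test set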