def eval_fn(
    predictor_kwargs: Dict[str, Any] = None,
    model_ref: ObjectRef = None,  # noqa: F821
    training_set_metadata: Dict[str, Any] = None,
    features: Dict[str, Dict] = None,
    **kwargs,
):
    # Pin GPU before loading the model to prevent memory leaking onto other devices
    hvd = initialize_horovod()
    try:
        initialize_pytorch(horovod=hvd)

        eval_shard = RayDatasetShard(
            rt.get_dataset_shard("eval"),
            features,
            training_set_metadata,
        )

        model = ray.get(model_ref)
        device = get_torch_device()
        model = model.to(device)

        predictor = RemotePredictor(model=model, horovod=hvd, report_tqdm_to_ray=True, **predictor_kwargs)
        return predictor.batch_evaluation(eval_shard, **kwargs)
    finally:
        torch.cuda.empty_cache()
        hvd.shutdown()
def test_initialize_pytorch_with_gpu_int(mock_torch):
    mock_torch.cuda.is_available.return_value = True
    mock_torch.cuda.device_count.return_value = 4
    with clean_params():
        initialize_pytorch(gpus=1)
    mock_torch.cuda.set_device.assert_called_with(1)
    assert "CUDA_VISIBLE_DEVICES" not in os.environ
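# The initialize_pytorch tests in this section rely on a `clean_params` context
# manager and a `mock_torch` argument that are defined elsewhere in the test module.
# A minimal sketch of plausible implementations (an assumed reconstruction, not the
# project's verbatim scaffolding):
import contextlib
import os


@contextlib.contextmanager
def clean_params():
    # Scrub CUDA_VISIBLE_DEVICES (and, in the real fixture, any cached torch init
    # state) so each test observes only the effects of its own initialize_pytorch call.
    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
    yield


# `mock_torch` would be injected by patching the torch module that initialize_pytorch
# uses, e.g. decorating each test with @unittest.mock.patch("...torch_utils.torch").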
def tune_learning_rate_fn(
    dataset: RayDataset,
    config: Dict[str, Any],
    data_loader_kwargs: Dict[str, Any] = None,
    executable_kwargs: Dict[str, Any] = None,
    model: ECD = None,  # noqa: F821
    training_set_metadata: Dict[str, Any] = None,
    features: Dict[str, Dict] = None,
    **kwargs,
) -> float:
    # Pin GPU before loading the model to prevent memory leaking onto other devices
    hvd = initialize_horovod()
    try:
        initialize_pytorch(horovod=hvd)

        pipe = dataset.pipeline(shuffle=False, **data_loader_kwargs)
        train_shard = RayDatasetShard(
            pipe,
            features,
            training_set_metadata,
        )

        device = get_torch_device()
        model = model.to(device)

        trainer = RemoteTrainer(model=model, horovod=hvd, **executable_kwargs)
        return trainer.tune_learning_rate(config, train_shard, **kwargs)
    finally:
        torch.cuda.empty_cache()
        hvd.shutdown()
def test_initialize_pytorch_with_gpu_list(mock_torch):
    # For test purposes, these devices can be anything, we just need to be able to
    # uniquely identify them.
    mock_torch.cuda.is_available.return_value = True
    mock_torch.cuda.device_count.return_value = 4
    with clean_params():
        initialize_pytorch(gpus=[1, 2])
    assert os.environ["CUDA_VISIBLE_DEVICES"] == "1,2"
def __init__(self, model: ECD, gpus=None, gpu_memory_limit=None, allow_parallel_threads=True, **kwargs):
    horovod = initialize_horovod()
    initialize_pytorch(
        gpus=gpus, gpu_memory_limit=gpu_memory_limit, allow_parallel_threads=allow_parallel_threads, horovod=horovod
    )
    super().__init__(model, horovod=horovod, **kwargs)

    # Only return results from rank 0 to reduce network overhead
    self.batch_predict = return_first(self.batch_predict)
    self.batch_evaluation = return_first(self.batch_evaluation)
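# `return_first` is not defined in this section. A plausible shape (an assumed
# reconstruction, not the library's verbatim code): wrap a method so that only the
# coordinator (rank 0) returns its result and every other worker returns None,
# keeping redundant copies of predictions off the network.
def return_first(fn):
    def wrapped(*args, **kwargs):
        res = fn(*args, **kwargs)
        # `hvd` is assumed to be the worker's Horovod handle (e.g. the one returned
        # by initialize_horovod() above); how it is accessed here is an assumption.
        return res if hvd.rank() == 0 else None

    return wrapped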
def test_initialize_pytorch_with_horovod_explicit_gpus(mock_torch):
    mock_torch.cuda.is_available.return_value = True
    mock_torch.cuda.device_count.return_value = 4

    mock_hvd = Mock()
    mock_hvd.local_rank.return_value = 1
    mock_hvd.local_size.return_value = 4

    with clean_params():
        initialize_pytorch(gpus="-1", horovod=mock_hvd)
    assert os.environ["CUDA_VISIBLE_DEVICES"] == ""
def test_initialize_pytorch_with_horovod(mock_torch):
    mock_torch.cuda.is_available.return_value = True
    mock_torch.cuda.device_count.return_value = 4

    mock_hvd = Mock()
    mock_hvd.local_rank.return_value = 1
    mock_hvd.local_size.return_value = 4

    with clean_params():
        initialize_pytorch(horovod=mock_hvd)
    mock_torch.cuda.set_device.assert_called_with(1)
    assert "CUDA_VISIBLE_DEVICES" not in os.environ
def train_fn(
    executable_kwargs: Dict[str, Any] = None,
    model: "LudwigModel" = None,  # noqa: F821
    training_set_metadata: Dict[str, Any] = None,
    features: Dict[str, Dict] = None,
    **kwargs,
):
    # Pin GPU before loading the model to prevent memory leaking onto other devices
    hvd = initialize_horovod()
    initialize_pytorch(horovod=hvd)

    train_shard = RayDatasetShard(
        rt.get_dataset_shard("train"),
        features,
        training_set_metadata,
    )

    try:
        val_shard = rt.get_dataset_shard("val")
    except KeyError:
        val_shard = None

    if val_shard is not None:
        val_shard = RayDatasetShard(
            val_shard,
            features,
            training_set_metadata,
        )

    try:
        test_shard = rt.get_dataset_shard("test")
    except KeyError:
        test_shard = None

    if test_shard is not None:
        test_shard = RayDatasetShard(
            test_shard,
            features,
            training_set_metadata,
        )

    trainer = RemoteTrainer(model=model, **executable_kwargs)
    results = trainer.train(train_shard, val_shard, test_shard, **kwargs)

    # TODO(shreya): Figure out GPU memory leak
    # TODO(shreya): Check if placing model off GPU explicitly makes a difference
    # Clear CUDA memory, place model on CPU, return model to user
    # torch.cuda.empty_cache()
    # model.cpu()

    return results, trainer.validation_field, trainer.validation_metric
def test_initialize_pytorch_with_horovod_bad_local_rank(mock_torch, mock_warnings):
    """In this scenario, local_size 5 exceeds the number of visible GPUs (4)."""
    mock_torch.cuda.is_available.return_value = True
    mock_torch.cuda.device_count.return_value = 4

    mock_hvd = Mock()
    mock_hvd.local_rank.return_value = 1
    mock_hvd.local_size.return_value = 5

    with clean_params():
        initialize_pytorch(horovod=mock_hvd)
    assert os.environ["CUDA_VISIBLE_DEVICES"] == ""
    mock_warnings.warn.assert_called()
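# The three Horovod tests above pin down initialize_pytorch's device-pinning rule.
# A minimal sketch of that rule (an assumed reconstruction of the GPU logic, not the
# library's verbatim code): with a Horovod handle and no explicit `gpus` argument,
# each worker pins torch to its local rank; if local_size exceeds the visible GPU
# count, GPUs are disabled with a warning.
import os
import warnings


def _pin_gpu_for_horovod(torch_module, horovod) -> None:
    if horovod is None or not torch_module.cuda.is_available():
        return
    if horovod.local_size() <= torch_module.cuda.device_count():
        # One GPU per local worker: pin to the local rank's device.
        torch_module.cuda.set_device(horovod.local_rank())
    else:
        warnings.warn("Horovod local_size exceeds the number of visible GPUs; disabling GPUs")
        os.environ["CUDA_VISIBLE_DEVICES"] = ""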
def test_initialize_pytorch_only_once(mock_torch):
    mock_torch.cuda.is_available.return_value = True
    mock_torch.cuda.device_count.return_value = 4
    with clean_params():
        # During first time initialization, set pytorch parallelism
        initialize_pytorch(allow_parallel_threads=False)
        mock_torch.set_num_threads.assert_called_once()
        mock_torch.set_num_interop_threads.assert_called_once()

        # Reset call counts on all threading calls
        mock_torch.reset_mock()

        # In the second call to initialization, avoid calling these methods again,
        # as pytorch will raise an exception
        initialize_pytorch(allow_parallel_threads=False)
        mock_torch.set_num_threads.assert_not_called()
        mock_torch.set_num_interop_threads.assert_not_called()

        # No GPUs were specified, so this should not have been called even once
        mock_torch.cuda.memory.set_per_process_memory_fraction.assert_not_called()
def legacy_train_fn(
    trainer: RemoteTrainer = None,
    remote_model: "LudwigModel" = None,  # noqa: F821
    training_set_metadata: Dict[str, Any] = None,
    features: Dict[str, Dict] = None,
    train_shards: List[DatasetPipeline] = None,
    val_shards: List[DatasetPipeline] = None,
    test_shards: List[DatasetPipeline] = None,
    **kwargs,
):
    # Pin GPU before loading the model to prevent memory leaking onto other devices
    hvd = initialize_horovod()
    initialize_pytorch(horovod=hvd)

    train_shard = RayDatasetShard(
        train_shards[hvd.rank()],
        features,
        training_set_metadata,
    )

    val_shard = val_shards[hvd.rank()] if val_shards else None
    if val_shard is not None:
        val_shard = RayDatasetShard(
            val_shard,
            features,
            training_set_metadata,
        )

    test_shard = test_shards[hvd.rank()] if test_shards else None
    if test_shard is not None:
        test_shard = RayDatasetShard(
            test_shard,
            features,
            training_set_metadata,
        )

    results = trainer.train(train_shard, val_shard, test_shard, **kwargs)
    return results
def initialize_pytorch(self, *args, **kwargs):
    initialize_pytorch(*args, horovod=self._horovod, **kwargs)
def memory_tune_config(config, dataset, model_category, row_count):
    fits_in_memory = False
    tried_reduce_seq_len = False
    raw_config = merge_with_defaults(config)
    training_set_metadata = get_trainingset_metadata(raw_config, dataset)
    modified_hyperparam_search_space = copy.deepcopy(raw_config[HYPEROPT]["parameters"])
    current_param_values = {}
    param_list = []
    model_type = get_model_type(raw_config)
    if model_type in RANKED_MODIFIABLE_PARAM_LIST:
        params_to_modify = RANKED_MODIFIABLE_PARAM_LIST[model_type]
        if len(params_to_modify.keys()) > 0:
            param_list = list(params_to_modify.keys())
            max_memory = _get_machine_memory()
            initialize_pytorch()

    while param_list:
        # Compute memory utilization for the current candidate config
        current_param_values = get_new_params(current_param_values, modified_hyperparam_search_space, params_to_modify)
        temp_config = sub_new_params(raw_config, current_param_values)
        mem_use = compute_memory_usage(temp_config, training_set_metadata, model_category)
        if mem_use > max_memory and model_category == TEXT and not tried_reduce_seq_len:
            tried_reduce_seq_len = True
            if reduce_text_feature_max_length(config, training_set_metadata):
                reduce_text_feature_max_length(temp_config, training_set_metadata)
                mem_use = compute_memory_usage(temp_config, training_set_metadata, model_category)
        logging.info(f"Checking model estimated mem use {mem_use} against memory size {max_memory}")
        if mem_use <= max_memory:
            fits_in_memory = True
            break

        # Check if we have exhausted tuning of the current param
        # (i.e. we can no longer reduce the param value)
        param, min_value = param_list[0], params_to_modify[param_list[0]]
        if param in modified_hyperparam_search_space.keys():
            param_space = modified_hyperparam_search_space[param]["space"]
            if param_space == "choice":
                if (
                    len(modified_hyperparam_search_space[param]["categories"]) >= 2
                    and modified_hyperparam_search_space[param]["categories"][-2] >= min_value
                ):
                    modified_hyperparam_search_space[param]["categories"] = modified_hyperparam_search_space[param][
                        "categories"
                    ][:-1]
                else:
                    param_list.pop(0)  # exhausted reduction of this parameter
            else:
                # Reduce the upper bound by 10% of the range
                upper_bound, lower_bound = (
                    modified_hyperparam_search_space[param]["upper"],
                    modified_hyperparam_search_space[param]["lower"],
                )
                reduction_val = (upper_bound - lower_bound) * 0.1
                new_upper_bound = upper_bound - reduction_val
                if new_upper_bound > lower_bound and new_upper_bound > min_value:
                    modified_hyperparam_search_space[param]["upper"] = new_upper_bound
                else:
                    param_list.pop(0)  # exhausted reduction of this parameter
        else:
            param_list.pop(0)  # param not in hyperopt search space

    if model_category == TEXT and row_count > AUTOML_LARGE_TEXT_DATASET:
        if "checkpoints_per_epoch" not in config[TRAINER] and "steps_per_checkpoint" not in config[TRAINER]:
            # Decrease latency to get model accuracy signal
            checkpoints_per_epoch = max(2, math.floor(row_count / AUTOML_MAX_ROWS_PER_CHECKPOINT))
            config[TRAINER]["checkpoints_per_epoch"] = checkpoints_per_epoch
        if "evaluate_training_set" not in config[TRAINER]:
            # Reduce overhead for increased evaluation frequency
            config[TRAINER]["evaluate_training_set"] = False
        if not fits_in_memory:
            # Switch to smaller pre-trained model encoder for large datasets.
            _update_text_encoder(config["input_features"], AUTOML_DEFAULT_TEXT_ENCODER, AUTOML_SMALLER_TEXT_ENCODER)

    modified_config = copy.deepcopy(config)
    modified_config[HYPEROPT]["parameters"] = modified_hyperparam_search_space
    modified_config[HYPEROPT]["executor"]["num_samples"] = _update_num_samples(
        modified_config[HYPEROPT]["executor"]["num_samples"], modified_hyperparam_search_space
    )
    return modified_config, fits_in_memory
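# The numeric branch above shrinks a parameter's search space by 10% of its range per
# iteration. A standalone, runnable sketch of that rule (the dict layout mirrors the
# hyperopt space entries above; `shrink_upper_bound` itself is a hypothetical helper):
def shrink_upper_bound(space: dict, min_value: float) -> bool:
    """Reduce a numeric space's upper bound by 10% of its range.

    Returns False once the bound can no longer be reduced.
    """
    upper_bound, lower_bound = space["upper"], space["lower"]
    new_upper_bound = upper_bound - (upper_bound - lower_bound) * 0.1
    if new_upper_bound > lower_bound and new_upper_bound > min_value:
        space["upper"] = new_upper_bound
        return True
    return False


# Example: a range of [0, 100] is first reduced to [0, 90], then [0, 81], and so on,
# until the new upper bound would cross min_value or the lower bound.
space = {"space": "number", "lower": 0.0, "upper": 100.0}
assert shrink_upper_bound(space, min_value=10.0) and space["upper"] == 90.0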
def test_initialize_pytorch_without_gpu(mock_torch):
    mock_torch.cuda.is_available.return_value = True
    mock_torch.cuda.device_count.return_value = 4
    with clean_params():
        initialize_pytorch(gpus=-1)
    assert os.environ["CUDA_VISIBLE_DEVICES"] == ""
def train_fn(
    executable_kwargs: Dict[str, Any] = None,
    model_ref: ObjectRef = None,  # noqa: F821
    training_set_metadata: Dict[str, Any] = None,
    features: Dict[str, Dict] = None,
    **kwargs,
):
    # Pin GPU before loading the model to prevent memory leaking onto other devices
    hvd = initialize_horovod()
    try:
        initialize_pytorch(horovod=hvd)

        train_shard = RayDatasetShard(
            rt.get_dataset_shard("train"),
            features,
            training_set_metadata,
        )

        try:
            val_shard = rt.get_dataset_shard("val")
        except KeyError:
            val_shard = None

        if val_shard is not None:
            val_shard = RayDatasetShard(
                val_shard,
                features,
                training_set_metadata,
            )

        try:
            test_shard = rt.get_dataset_shard("test")
        except KeyError:
            test_shard = None

        if test_shard is not None:
            test_shard = RayDatasetShard(
                test_shard,
                features,
                training_set_metadata,
            )

        model = ray.get(model_ref)
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = model.to(device)

        trainer = RemoteTrainer(model=model, horovod=hvd, **executable_kwargs)
        results = trainer.train(train_shard, val_shard, test_shard, **kwargs)

        if results is not None:
            # Only return the model state dict back to the head node.
            trained_model, *args = results
            results = (trained_model.cpu().state_dict(), *args)

        torch.cuda.empty_cache()

        train_results = results, trainer.validation_field, trainer.validation_metric
    finally:
        hvd.shutdown()

    return train_results
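# train_fn above ships the trained model back to the head node as a CPU state dict
# rather than as a live module. A minimal illustration of why that works (nn.Linear
# is a stand-in model; this is illustrative, not taken from the source):
import torch
from torch import nn

net = nn.Linear(4, 2)
state = net.cpu().state_dict()
# Every tensor in the state dict now lives in CPU memory, so serializing the result
# does not drag CUDA storage (or the full module object) across the network.
assert all(not t.is_cuda for t in state.values())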
def initialize_pytorch(self, **kwargs):
    # Make sure we don't claim any GPU resources on the head node
    initialize_pytorch(gpus=-1)
    self._pytorch_kwargs = kwargs
def memory_tune_config(config, dataset, model_category, row_count):
    fits_in_memory = False
    raw_config = merge_with_defaults(config)
    training_set_metadata = get_trainingset_metadata(raw_config, dataset)
    modified_hyperparam_search_space = copy.deepcopy(raw_config[HYPEROPT]["parameters"])
    current_param_values = {}
    param_list = []
    model_type = get_model_type(raw_config)
    if model_type in RANKED_MODIFIABLE_PARAM_LIST:
        params_to_modify = RANKED_MODIFIABLE_PARAM_LIST[model_type]
        if len(params_to_modify.keys()) > 0:
            param_list = list(params_to_modify.keys())
            max_memory = _get_machine_memory()
            initialize_pytorch()

    while param_list:
        # Compute memory utilization for the current candidate config
        current_param_values = get_new_params(current_param_values, modified_hyperparam_search_space, params_to_modify)
        temp_config = sub_new_params(raw_config, current_param_values)
        mem_use = compute_memory_usage(temp_config, training_set_metadata)
        logging.info(f"Checking model estimated mem use {mem_use} against memory size {max_memory}")
        if mem_use <= max_memory:
            fits_in_memory = True
            break

        # Check if we have exhausted tuning of the current param
        # (i.e. we can no longer reduce the param value)
        param, min_value = param_list[0], params_to_modify[param_list[0]]
        if param in modified_hyperparam_search_space.keys():
            param_space = modified_hyperparam_search_space[param]["space"]
            if param_space == "choice":
                if (
                    len(modified_hyperparam_search_space[param]["categories"]) >= 2
                    and modified_hyperparam_search_space[param]["categories"][-2] >= min_value
                ):
                    modified_hyperparam_search_space[param]["categories"] = modified_hyperparam_search_space[param][
                        "categories"
                    ][:-1]
                else:
                    param_list.pop(0)  # exhausted reduction of this parameter
            else:
                # Reduce the upper bound by 10% of the range
                upper_bound, lower_bound = (
                    modified_hyperparam_search_space[param]["upper"],
                    modified_hyperparam_search_space[param]["lower"],
                )
                reduction_val = (upper_bound - lower_bound) * 0.1
                new_upper_bound = upper_bound - reduction_val
                if new_upper_bound > lower_bound and new_upper_bound > min_value:
                    modified_hyperparam_search_space[param]["upper"] = new_upper_bound
                else:
                    param_list.pop(0)  # exhausted reduction of this parameter
        else:
            param_list.pop(0)  # param not in hyperopt search space

    if not fits_in_memory and model_category == TEXT:
        reduce_text_model_mem(config, training_set_metadata, row_count)

    modified_config = copy.deepcopy(config)
    modified_config[HYPEROPT]["parameters"] = modified_hyperparam_search_space
    return modified_config, fits_in_memory
def initialize_pytorch(self, *args, **kwargs):
    initialize_pytorch(*args, **kwargs)