def test_initialize_tensorflow_without_gpu(mock_tf_config):
    """Passing ``gpus=-1`` must hide every physical GPU from TensorFlow."""
    # The device entries only need to be distinguishable from one another.
    physical_gpus = ['gpu0', 'gpu1', 'gpu2', 'gpu3']
    mock_tf_config.list_physical_devices.return_value = physical_gpus

    with clean_params():
        initialize_tensorflow(gpus=-1)
        # An empty visible-device list means no GPU is exposed.
        mock_tf_config.set_visible_devices.assert_called_with([], 'GPU')
def test_initialize_tensorflow_only_once(mock_tf_config):
    """Threading parallelism must be configured on the first initialization only."""
    mock_tf_config.list_physical_devices.return_value = [
        'gpu0', 'gpu1', 'gpu2', 'gpu3'
    ]

    # Child mocks keep their identity across `reset_mock()`, so they can be
    # looked up once and reused for both rounds of assertions.
    threading_cfg = mock_tf_config.threading
    parallelism_setters = (
        threading_cfg.set_intra_op_parallelism_threads,
        threading_cfg.set_inter_op_parallelism_threads,
    )

    with clean_params():
        # First initialization: TensorFlow parallelism gets set exactly once.
        initialize_tensorflow()
        for setter in parallelism_setters:
            setter.assert_called_once()

        # Clear the recorded calls on every threading mock.
        threading_cfg.reset_mock()

        # Second initialization: these setters must be skipped, because
        # TensorFlow would raise an exception if they were called again.
        initialize_tensorflow()
        for setter in parallelism_setters:
            setter.assert_not_called()

        # No GPUs were requested, so device visibility is never touched.
        mock_tf_config.set_visible_devices.assert_not_called()
def test_initialize_tensorflow_with_gpu_list(mock_tf_config):
    """A list of GPU indices must select exactly those physical devices."""
    # The devices can be arbitrary values for this test; they only have to be
    # uniquely identifiable.
    physical_gpus = ['gpu0', 'gpu1', 'gpu2', 'gpu3']
    mock_tf_config.list_physical_devices.return_value = physical_gpus

    # Work out the expected selection before the call under test runs.
    expected_visible = [physical_gpus[index] for index in (1, 2)]

    with clean_params():
        initialize_tensorflow(gpus=[1, 2])
        mock_tf_config.set_visible_devices.assert_called_with(
            expected_visible, 'GPU')
def init_tensorflow_cpu(request):
    """Restrict the test-runner process to CPU-only TensorFlow.

    Intended to run once before any test so that the main pytest process
    never claims GPU resources. Without this, the main process would take all
    GPU memory and subprocesses that need GPUs (e.g., hyperopt) would crash
    with OOM errors.

    :param request: fixture request object; not used by this function.
    """
    # gpus=-1 asks for no GPUs at all.
    initialize_tensorflow(gpus=-1)
def __init__(self,
             model_definition,
             logging_level=logging.ERROR,
             use_horovod=None,
             gpus=None,
             gpu_memory_limit=None,
             allow_parallel_threads=True,
             random_seed=default_random_seed):
    """
    Build the model wrapper from a model definition and set up the runtime.

    :param model_definition: (dict, string) in-memory representation of
           the model definition, or string path to a model definition file.
           A path is parsed with `yaml.safe_load` and remembered in
           `self.model_definition_fp`; a dict is deep-copied and
           `self.model_definition_fp` is set to `None`.
    :param logging_level: Log level that will be sent to stderr.
    :param use_horovod: (bool) use Horovod for distributed training.
           Will be set automatically if `horovodrun` is used to launch
           the training script.
    :param gpus: (string, default: `None`) list of GPUs to use (it uses
           the same syntax of CUDA_VISIBLE_DEVICES). Forwarded unchanged to
           `initialize_tensorflow`.
    :param gpu_memory_limit: (int: default: `None`) maximum memory in MB to
           allocate per GPU device.
    :param allow_parallel_threads: (bool, default: `True`) allow TensorFlow
           to use multithreading parallelism to improve performance at the
           cost of determinism.
    :param random_seed: accepted but not applied here; the seeding call
           below is still commented out pending a refactoring decision.
    """
    # Tell apart an in-memory definition from a path to a definition file.
    if not isinstance(model_definition, str):
        # Work on a private copy so the caller's object is left untouched.
        raw_definition = copy.deepcopy(model_definition)
        self.model_definition_fp = None
    else:
        # A string is assumed to be a file path.
        with open(model_definition, 'r') as definition_file:
            raw_definition = yaml.safe_load(definition_file)
        self.model_definition_fp = model_definition

    # Fill in every setting the user left unspecified.
    self.model_definition = merge_with_defaults(raw_definition)

    # Horovod has to be configured before TensorFlow, which receives it.
    self._horovod = configure_horovod(use_horovod)

    # Logging verbosity.
    self.set_logging_level(logging_level)

    # TensorFlow devices, memory limit and threading.
    initialize_tensorflow(gpus, gpu_memory_limit, allow_parallel_threads,
                          self._horovod)

    # todo refactoring: decide where to put this,
    #  here or at the beginning of training.
    #  Either way make sure it is called before the model is initialized.
    # tf.random.set_seed(random_seed)

    # No model or training-set metadata exists until one is trained or loaded.
    self.model = None
    self.training_set_metadata = None

    # State used by online training.
    self._online_trainer = None
def test_initialize_tensorflow_with_horovod_explicit_gpus(mock_tf_config):
    """An explicit ``gpus='-1'`` must hide all GPUs even when Horovod is given."""
    mock_tf_config.list_physical_devices.return_value = [
        'gpu0', 'gpu1', 'gpu2', 'gpu3'
    ]

    # Horovod stand-in reporting local rank 1 out of 4 local processes.
    hvd_stub = Mock()
    hvd_stub.local_size.return_value = 4
    hvd_stub.local_rank.return_value = 1

    with clean_params():
        initialize_tensorflow(gpus='-1', horovod=hvd_stub)
        mock_tf_config.set_visible_devices.assert_called_with([], 'GPU')
def __init__(self, gpus=None, gpu_memory_limit=None,
             allow_parallel_threads=True, **kwargs):
    """Initialize Horovod and TensorFlow, then the parent class.

    :param gpus: forwarded to `initialize_tensorflow`.
    :param gpu_memory_limit: forwarded to `initialize_tensorflow`.
    :param allow_parallel_threads: forwarded to `initialize_tensorflow`.
    :param kwargs: remaining keyword arguments, passed to the parent
           constructor together with the Horovod handle.
    """
    hvd = initialize_horovod()

    tensorflow_settings = {
        'gpus': gpus,
        'gpu_memory_limit': gpu_memory_limit,
        'allow_parallel_threads': allow_parallel_threads,
    }
    initialize_tensorflow(horovod=hvd, **tensorflow_settings)

    super().__init__(horovod=hvd, **kwargs)

    # Only return results from rank 0 to reduce network overhead
    for method_name in ('batch_predict', 'batch_evaluation'):
        setattr(self, method_name, return_first(getattr(self, method_name)))
def init_tensorflow_cpu(request):
    """Initialize tensorflow at the start of testing to only use CPUs.

    This fixture runs once before any tests, and ensures that the main
    process running the pytests does not claim any GPU resources. This is
    critical to avoid OOM errors when running subprocesses that need GPUs
    (e.g., hyperopt), as otherwise the main process will consume all the
    memory and cause the subprocesses to crash.

    Run most tests eagerly, as the cost of graph construction can easily
    increase runtime by an order of magnitude for small tests. Tests that
    execute in subprocesses, and tests in `test_graph_execution.py`, still
    run in graph mode.

    :param request: fixture request object; not used by this function.
    """
    import tensorflow as tf

    # `experimental_run_functions_eagerly` is deprecated in newer TensorFlow
    # releases in favour of `run_functions_eagerly`. Prefer the stable name
    # when this TensorFlow build provides it and fall back to the
    # experimental one otherwise, so older installations keep working.
    run_eagerly = getattr(tf.config, 'run_functions_eagerly', None)
    if run_eagerly is None:
        run_eagerly = tf.config.experimental_run_functions_eagerly
    run_eagerly(True)

    # gpus=-1 asks for no GPUs at all.
    initialize_tensorflow(gpus=-1)
def test_initialize_tensorflow_with_horovod_bad_local_rank(
        mock_tf_config, mock_warnings):
    """A local_size of 5 exceeds the four available GPU indices.

    The expected outcome is that no GPU is made visible and a warning is
    issued.
    """
    mock_tf_config.list_physical_devices.return_value = [
        'gpu0', 'gpu1', 'gpu2', 'gpu3'
    ]

    # Horovod stand-in: one more local process than there are GPUs.
    hvd_stub = Mock()
    hvd_stub.local_size.return_value = 5
    hvd_stub.local_rank.return_value = 1

    with clean_params():
        initialize_tensorflow(horovod=hvd_stub)
        mock_tf_config.set_visible_devices.assert_called_with([], 'GPU')
        mock_warnings.warn.assert_called()
def initialize_tensorflow(self, *args, **kwargs):
    """Initialize TensorFlow by delegating to the `initialize_tensorflow` function.

    All positional and keyword arguments are forwarded unchanged. The result
    of the delegated call is discarded, so this method returns `None`.
    """
    initialize_tensorflow(*args, **kwargs)
def initialize_tensorflow(self, **kwargs):
    """Initialize TensorFlow without GPUs here and keep the requested settings.

    The keyword arguments are not applied by this call; they are stored on
    `self._tensorflow_kwargs` (presumably for use when TensorFlow is
    initialized elsewhere -- confirm against the code that reads it).

    :param kwargs: TensorFlow initialization settings to remember.
    """
    # The head node must not claim any GPU resources, so initialize it with
    # no GPUs regardless of what the caller asked for.
    initialize_tensorflow(gpus=-1)

    # Remember the caller's settings only after the local initialization
    # succeeded.
    self._tensorflow_kwargs = kwargs
def initialize_tensorflow(self, *args, **kwargs):
    """Initialize TensorFlow via the `initialize_tensorflow` function, with Horovod.

    Positional and keyword arguments are forwarded unchanged, and
    `self._horovod` is always supplied as the `horovod` keyword argument.
    Callers must therefore not pass `horovod` themselves: a duplicate keyword
    raises `TypeError`. Returns `None`.
    """
    initialize_tensorflow(*args, horovod=self._horovod, **kwargs)