def test_trainer_regressor_train_valid_with_multiple_generator_inputs():
    """Check that ``BaseTrainer`` trains a model sampled for the ``linearRegMultiInputsGen`` problem.

    The problem's search space is built with seed 42, ``num_epochs`` is set
    to 2, a model is sampled, and its graph is written to
    ``trainer_keras_regressor_test.png``. The test passes when the value
    returned by ``trainer.train()`` differs from ``sys.float_info.max``
    (presumably the trainer's failure sentinel -- confirm against BaseTrainer).
    """
    import sys

    from deephyper.nas.trainer import BaseTrainer
    from tensorflow.keras.utils import plot_model

    # Only the multi-input generator problem is used here; an earlier import of
    # `Problem` from `linearReg.problem` was shadowed by this one and has been
    # dropped.
    from deephyper.benchmark.nas.linearRegMultiInputsGen.problem import Problem
    from deephyper.nas.run._util import get_search_space, load_config, setup_data

    config = Problem.space
    load_config(config)
    input_shape, output_shape = setup_data(config)
    search_space = get_search_space(config, input_shape, output_shape, 42)

    # Keep the run short: the test only checks that training completes.
    config["hyperparameters"]["num_epochs"] = 2
    model = search_space.sample()
    plot_model(model, to_file="trainer_keras_regressor_test.png", show_shapes=True)

    trainer = BaseTrainer(config=config, model=model)
    res = trainer.train()
    assert res != sys.float_info.max
def build_search_space(self, seed=None):
    """Create the search space of this problem from the data shapes inferred while loading data.

    Args:
        seed: Value forwarded to ``get_search_space`` as its ``seed`` keyword.
            Defaults to ``None``.

    Returns:
        KSearchSpace: A search space instance.
    """
    problem_config = self.space
    # setup_data is asked not to write back into the config; its third
    # return value is not needed here.
    in_shape, out_shape, _ = setup_data(problem_config, add_to_config=False)
    return get_search_space(problem_config, in_shape, out_shape, seed=seed)
def run_distributed_base_trainer(config):
    """Sample, train and score one architecture under ``tf.distribute.MirroredStrategy``.

    Steps, in order: enable GPU memory growth (best effort), create the
    mirrored strategy, seed NumPy and TensorFlow, call ``load_config``,
    optionally scale batch size / learning rate by the number of replicas,
    build the search space, sample the model inside the strategy scope,
    create the configured callbacks and the ``BaseTrainer``, train, save the
    history and compute the objective.

    Args:
        config (dict): Evaluation configuration. Read here: ``"seed"``
            (optional), ``"arch_seq"``, ``"objective"``, ``"log_dir"``
            (optional), ``"id"`` (only when a ``ModelCheckpoint`` callback is
            configured) and the hyperparameters entry (``batch_size``,
            ``learning_rate``, and optionally ``lsr_batch_size``,
            ``lsr_learning_rate``, ``warmup_lr``, ``warmup_epochs``,
            ``callbacks``, ``patience_<callback name>``). The batch size and
            learning rate entries are overwritten with the (possibly scaled)
            values.

    Returns:
        The value returned by ``compute_objective`` for the training history,
        or ``-1`` when the model could not be created. Values below ``-10``
        and NaN are replaced by ``-10``.
    """
    gpus = tf.config.list_physical_devices("GPU")
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except (ValueError, RuntimeError):
        # Invalid device or cannot modify virtual devices once initialized.
        # Memory growth is best effort, so report it and carry on.
        logger.info("error memory growth for GPU device")

    distributed_strategy = tf.distribute.MirroredStrategy()
    n_replicas = distributed_strategy.num_replicas_in_sync

    # A missing "seed" entry is treated like an explicit None (no seeding).
    seed = config.get("seed")
    if seed is not None:
        np.random.seed(seed)
        tf.random.set_seed(seed)

    load_config(config)

    # Scale batch size and learning rate according to the number of replicas.
    # The unscaled learning rate is kept for the warmup callback below.
    initial_lr = config[a.hyperparameters][a.learning_rate]

    batch_size = config[a.hyperparameters][a.batch_size]
    if config[a.hyperparameters].get("lsr_batch_size"):
        batch_size = batch_size * n_replicas

    learning_rate = config[a.hyperparameters][a.learning_rate]
    if config[a.hyperparameters].get("lsr_learning_rate"):
        learning_rate = learning_rate * n_replicas

    logger.info(
        f"Scaled: 'batch_size' from {config[a.hyperparameters][a.batch_size]} to {batch_size} "
    )
    logger.info(
        f"Scaled: 'learning_rate' from {config[a.hyperparameters][a.learning_rate]} to {learning_rate} "
    )
    config[a.hyperparameters][a.batch_size] = batch_size
    config[a.hyperparameters][a.learning_rate] = learning_rate

    input_shape, output_shape = setup_data(config)

    search_space = get_search_space(config, input_shape, output_shape, seed=seed)

    model_created = False
    with distributed_strategy.scope():
        try:
            model = search_space.sample(config["arch_seq"])
            model_created = True
        except Exception:
            # Any failure while building the network is turned into a
            # penalised result below; interpreter-exit signals such as
            # KeyboardInterrupt are no longer swallowed.
            logger.info("Error: Model creation failed...")
            logger.info(traceback.format_exc())
        else:
            # Setup callbacks
            callbacks = []
            cb_requires_valid = False  # Callbacks requires validation data
            callbacks_config = config["hyperparameters"].get("callbacks")
            if callbacks_config is not None:
                for cb_name, cb_conf in callbacks_config.items():
                    if cb_name not in default_callbacks_config:
                        logger.error(f"'{cb_name}' is not an accepted callback!")
                        continue

                    # NOTE: this updates the shared defaults in place, so the
                    # overrides remain visible to later calls in this process.
                    cb_kwargs = default_callbacks_config[cb_name]
                    cb_kwargs.update(cb_conf)

                    # Special dynamic parameters for callbacks
                    if cb_name == "ModelCheckpoint":
                        cb_kwargs["filepath"] = f'best_model_{config["id"]}.h5'

                    # replace patience hyperparameter
                    if "patience" in cb_kwargs:
                        patience = config["hyperparameters"].get(
                            f"patience_{cb_name}"
                        )
                        if patience is not None:
                            cb_kwargs["patience"] = patience

                    # Import and create corresponding callback
                    Callback = import_callback(cb_name)
                    callbacks.append(Callback(**cb_kwargs))

                    if cb_name in ["EarlyStopping"]:
                        # Read the monitor from the merged settings so that a
                        # config which does not override "monitor" no longer
                        # raises KeyError; "val_loss" is the Keras default.
                        monitor = cb_kwargs.get("monitor", "val_loss")
                        cb_requires_valid = "val" in monitor.split("_")

            # WarmupLR
            if config[a.hyperparameters].get("warmup_lr"):
                warmup_epochs = config[a.hyperparameters].get("warmup_epochs", 5)
                callbacks.append(
                    LearningRateWarmupCallback(
                        n_replicas=n_replicas,
                        warmup_epochs=warmup_epochs,
                        verbose=0,
                        initial_lr=initial_lr,
                    )
                )

            trainer = BaseTrainer(config=config, model=model)
            trainer.callbacks.extend(callbacks)

            last_only, with_pred = preproc_trainer(config)
            # A callback monitoring a validation metric needs validation at
            # every epoch, not only the last one.
            last_only = last_only and not cb_requires_valid

    if model_created:
        history = trainer.train(with_pred=with_pred, last_only=last_only)

        # save history
        save_history(config.get("log_dir", None), history, config)

        result = compute_objective(config["objective"], history)
    else:
        # penalising actions if model cannot be created
        result = -1

    # Clamp very low and NaN objectives to the same floor.
    if result < -10 or np.isnan(result):
        result = -10
    return result
def run_horovod(config: dict) -> float:
    """Sample, train and score one architecture with Horovod data parallelism.

    Steps, in order: initialise Horovod, configure TensorFlow threading from
    ``OMP_NUM_THREADS``, restrict ``CUDA_VISIBLE_DEVICES`` to one entry per
    rank, seed NumPy and TensorFlow, call ``load_config``, scale batch size
    and learning rate by ``hvd.size()``, build the search space, sample the
    model, create the Horovod and user-configured callbacks, train with
    ``HorovodTrainer``, save the history on rank 0 and compute the objective.

    Args:
        config: Evaluation configuration. Read here: ``"seed"`` (optional),
            ``"arch_seq"``, ``"objective"``, ``"log_dir"`` (optional) and the
            hyperparameters entry (batch size, learning rate, callbacks). The
            batch size and learning rate entries are overwritten with the
            scaled values.

    Returns:
        The value returned by ``compute_objective`` for the training history,
        or ``-1`` when the model could not be created. Values below ``-10``
        and NaN are replaced by ``-10``.
    """
    hvd.init()

    # Threading configuration
    omp_num_threads = os.environ.get("OMP_NUM_THREADS")
    if omp_num_threads is not None:
        logger.debug(f"OMP_NUM_THREADS is {omp_num_threads}")
        tf.config.threading.set_intra_op_parallelism_threads(int(omp_num_threads))
        tf.config.threading.set_inter_op_parallelism_threads(2)

    visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
    if visible_devices is not None:
        devices = visible_devices.split(",")
        # NOTE(review): indexed by the global rank; this raises IndexError
        # when there are more ranks than listed devices -- confirm whether
        # hvd.local_rank() was intended for multi-node runs.
        os.environ["CUDA_VISIBLE_DEVICES"] = devices[hvd.rank()]

    # A missing "seed" entry is treated like an explicit None (no seeding).
    seed = config.get("seed")
    if seed is not None:
        np.random.seed(seed)
        tf.random.set_seed(seed)

    load_config(config)

    # Scale batch size and learning rate according to the number of ranks.
    # The unscaled learning rate is kept for the warmup callback below.
    initial_lr = config[a.hyperparameters][a.learning_rate]
    batch_size = config[a.hyperparameters][a.batch_size] * hvd.size()
    learning_rate = config[a.hyperparameters][a.learning_rate] * hvd.size()
    logger.info(
        f"Scaled: 'batch_size' from {config[a.hyperparameters][a.batch_size]} to {batch_size} "
    )
    logger.info(
        f"Scaled: 'learning_rate' from {config[a.hyperparameters][a.learning_rate]} to {learning_rate} "
    )
    config[a.hyperparameters][a.batch_size] = batch_size
    config[a.hyperparameters][a.learning_rate] = learning_rate

    input_shape, output_shape = setup_data(config)

    search_space = get_search_space(config, input_shape, output_shape, seed=seed)

    # Create the model
    model_created = False
    try:
        model = search_space.sample(config["arch_seq"])
        model_created = True
    except Exception:
        # Any failure while building the network is turned into a penalised
        # result below; interpreter-exit signals such as KeyboardInterrupt
        # are no longer swallowed.
        logger.info("Error: Model creation failed...")
        logger.info(traceback.format_exc())

    if model_created:
        # Setup callbacks only
        callbacks = [
            # Horovod: broadcast initial variable states from rank 0 to all other processes.
            # This is necessary to ensure consistent initialization of all workers when
            # training is started with random weights or restored from a checkpoint.
            hvd.callbacks.BroadcastGlobalVariablesCallback(0),
            # Horovod: average metrics among workers at the end of every epoch.
            #
            # Note: This callback must be in the list before the ReduceLROnPlateau,
            # TensorBoard or other metrics-based callbacks.
            hvd.callbacks.MetricAverageCallback(),
            # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
            # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
            # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
            #! initial_lr argument is not available in horovod==0.19.0
            hvd.callbacks.LearningRateWarmupCallback(
                warmup_epochs=5, verbose=0, initial_lr=initial_lr
            ),
        ]

        cb_requires_valid = False  # Callbacks requires validation data
        callbacks_config = config[a.hyperparameters].get(a.callbacks, {})
        if callbacks_config is not None:
            for cb_name, cb_conf in callbacks_config.items():
                if cb_name not in default_callbacks_config:
                    logger.error(f"'{cb_name}' is not an accepted callback!")
                    continue

                # Callbacks listed in hvd_root_cb are only created on rank 0.
                if cb_name in hvd_root_cb and hvd.rank() != 0:
                    continue

                # NOTE: this updates the shared defaults in place, so the
                # overrides remain visible to later calls in this process.
                cb_kwargs = default_callbacks_config[cb_name]
                cb_kwargs.update(cb_conf)

                # Import and create corresponding callback
                Callback = import_callback(cb_name)
                callbacks.append(Callback(**cb_kwargs))

                if cb_name in ["EarlyStopping"]:
                    # Read the monitor from the merged settings so that a
                    # config which does not override "monitor" no longer
                    # raises KeyError; "val_loss" is the Keras default.
                    monitor = cb_kwargs.get("monitor", "val_loss")
                    cb_requires_valid = "val" in monitor.split("_")

        trainer = HorovodTrainer(config=config, model=model)

        trainer.callbacks.extend(callbacks)

        last_only, with_pred = preproc_trainer(config)
        # A callback monitoring a validation metric needs validation at every
        # epoch, not only the last one.
        last_only = last_only and not cb_requires_valid

        history = trainer.train(with_pred=with_pred, last_only=last_only)

        # save history (rank 0 only, so ranks do not write the same file)
        if hvd.rank() == 0:
            save_history(config.get("log_dir", None), history, config)

        result = compute_objective(config["objective"], history)
    else:
        # penalising actions if model cannot be created
        result = -1

    # Clamp very low and NaN objectives to the same floor; `nan < -10` is
    # False, so NaN needs its own check.
    if result < -10 or np.isnan(result):
        result = -10
    return result
def run_base_trainer(config):
    """Sample, train and score one architecture with ``BaseTrainer``.

    Steps, in order: clear the Keras session, create a ``HistorySaver``
    writing under ``<log_dir>/save``, enable GPU memory growth (best effort),
    configure TensorFlow threading from ``OMP_NUM_THREADS`` when no GPU is
    visible, seed NumPy and TensorFlow, call ``load_config``, build the search
    space, sample the model, create the configured callbacks, train, write
    the history and compute the objective.

    Args:
        config (dict): Evaluation configuration. Read here: ``"log_dir"``
            (set to ``"."`` in place when missing or ``None``), ``"seed"``
            (optional), ``"arch_seq"``, ``"objective"`` and
            ``"hyperparameters"`` (optionally ``callbacks`` and
            ``patience_<callback name>``).

    Returns:
        The value returned by ``compute_objective`` for the training history,
        or ``-inf`` when the model could not be created or the objective is
        NaN.
    """
    tf.keras.backend.clear_session()
    # tf.config.optimizer.set_jit(True)

    # setup history saver
    if config.get("log_dir") is None:
        config["log_dir"] = "."

    save_dir = os.path.join(config["log_dir"], "save")
    saver = HistorySaver(config, save_dir)
    saver.write_config()
    saver.write_model(None)

    # GPU Configuration if available
    physical_devices = tf.config.list_physical_devices("GPU")
    try:
        for device in physical_devices:
            tf.config.experimental.set_memory_growth(device, True)
    except (ValueError, RuntimeError):
        # Invalid device or cannot modify virtual devices once initialized.
        logger.info("error memory growth for GPU device")

    # Threading configuration (CPU-only runs)
    omp_num_threads = os.environ.get("OMP_NUM_THREADS")
    if not physical_devices and omp_num_threads is not None:
        logger.info(f"OMP_NUM_THREADS is {omp_num_threads}")
        num_intra = int(omp_num_threads)
        try:
            tf.config.threading.set_intra_op_parallelism_threads(num_intra)
            tf.config.threading.set_inter_op_parallelism_threads(2)
        except RuntimeError:  # Session already initialized
            pass
        tf.config.set_soft_device_placement(True)

    seed = config.get("seed")
    if seed is not None:
        np.random.seed(seed)
        tf.random.set_seed(seed)

    load_config(config)

    input_shape, output_shape = setup_data(config)

    search_space = get_search_space(config, input_shape, output_shape, seed=seed)

    model_created = False
    try:
        model = search_space.sample(config["arch_seq"])
        model_created = True
    except Exception:
        # Any failure while building the network is turned into a penalised
        # result below; interpreter-exit signals such as KeyboardInterrupt
        # are no longer swallowed.
        logger.info("Error: Model creation failed...")
        logger.info(traceback.format_exc())

    if model_created:
        # Setup callbacks
        callbacks = []
        cb_requires_valid = False  # Callbacks requires validation data
        callbacks_config = config["hyperparameters"].get("callbacks")
        if callbacks_config is not None:
            for cb_name, cb_conf in callbacks_config.items():
                if cb_name not in default_callbacks_config:
                    logger.error(f"'{cb_name}' is not an accepted callback!")
                    continue

                # NOTE: this updates the shared defaults in place, so the
                # overrides remain visible to later calls in this process.
                cb_kwargs = default_callbacks_config[cb_name]
                cb_kwargs.update(cb_conf)

                # Special dynamic parameters for callbacks
                if cb_name == "ModelCheckpoint":
                    cb_kwargs["filepath"] = saver.model_path

                # replace patience hyperparameter
                if "patience" in cb_kwargs:
                    patience = config["hyperparameters"].get(f"patience_{cb_name}")
                    if patience is not None:
                        cb_kwargs["patience"] = patience

                # Import and create corresponding callback
                Callback = import_callback(cb_name)
                callbacks.append(Callback(**cb_kwargs))

                if cb_name in ["EarlyStopping"]:
                    # Read the monitor from the merged settings so that a
                    # config which does not override "monitor" no longer
                    # raises KeyError; "val_loss" is the Keras default.
                    monitor = cb_kwargs.get("monitor", "val_loss")
                    cb_requires_valid = "val" in monitor.split("_")

        trainer = BaseTrainer(config=config, model=model)
        trainer.callbacks.extend(callbacks)

        last_only, with_pred = preproc_trainer(config)
        # A callback monitoring a validation metric needs validation at every
        # epoch, not only the last one.
        last_only = last_only and not cb_requires_valid

        history = trainer.train(with_pred=with_pred, last_only=last_only)

        # save history
        saver.write_history(history)

        result = compute_objective(config["objective"], history)
    else:
        # penalising actions if model cannot be created
        logger.info("Model could not be created returning -Inf!")
        result = -float("inf")

    if np.isnan(result):
        logger.info("Computed objective is NaN returning -Inf instead!")
        result = -float("inf")

    return result