def trainable(config, name_fmt, envname, trainingconfig, evaluate_mean_n):
    # Parse arguments
    trial_dir = Path(tune.get_trial_dir()) if tune.get_trial_dir() is not None else Path.cwd()
    adv_force = config["adv_force"]
    name = name_fmt.format(adv_force=adv_force)
    cmd_args = [
        '--name', name,
        '--env', envname,
        '--log',
        '--trainingconfig', str(trainingconfig),
        '--root', str(trial_dir),
        '--monitor-dir', str(monitor_dir_name(envname, adv_force))
    ]
    # Add adversarial force
    cmd_args += ['--adv_force', str(adv_force)]
    args = parse_args(cmd_args)
    logging.info(f'Running {name=} with {args=}')

    def evaluate(prot, ts):
        # reward = get_mean_reward_last_n_steps(evaluate_mean_n, args.monitor_dir)
        # logging.info(f'{name} {reward=:.2f} {ts=}')
        # tune.report(reward=reward)
        robustness = eval_robustness(args, prot, envname, trainingconfig, name)
        logging.info(f'{name} {robustness=:.2f} {ts=}')
        tune.report(robustness=robustness)

    run(args, evaluate_fn=evaluate)

def train_submodel_diff(config):
    import os
    import pandas as pd
    from keras.models import Sequential
    from keras.layers import Dense, LSTM, Embedding
    from keras.callbacks import ModelCheckpoint
    from keras.optimizers import Adam
    from ray import tune
    from ray.tune.integration.keras import TuneReporterCallback
    import utils.definition_network as dn

    x_train, y_train, x_valid, y_valid, num_words, embedding_matrix = \
        config["exp_sets"].pp_data.load_data()
    # Membership test: `== (A or B)` only compares against A.
    trainable_emb = config["exp_sets"].pp_data.use_embedding in (
        dn.UseEmbedding.RAND, dn.UseEmbedding.NON_STATIC)

    layers_model = [Embedding(config["exp_sets"].pp_data.vocabulary_size,
                              config["exp_sets"].pp_data.embedding_size,
                              trainable=trainable_emb,
                              name=config["name"] + '_rt_emb_1')]
    # Stacked LSTMs: all but the last return sequences.
    for id_hl in range(config["hidden_layers"] - 1):
        layers_model.append(LSTM(config["lstm_units"],
                                 kernel_initializer='lecun_uniform',
                                 activation='tanh',
                                 dropout=config["dropout_lstm"],
                                 recurrent_dropout=config["dropout_lstm"],
                                 return_sequences=True,
                                 name=config["name"] + '_rt_lstm_' + str(id_hl)))
    # Name the final layer from the config, so this also works when the loop
    # above runs zero times (hidden_layers == 1 left id_hl undefined before).
    layers_model.append(LSTM(config["lstm_units"],
                             kernel_initializer='lecun_uniform',
                             activation='tanh',
                             dropout=config["dropout_lstm"],
                             recurrent_dropout=config["dropout_lstm"],
                             name=config["name"] + '_rt_lstm_' + str(config["hidden_layers"] - 1)))
    layers_model.append(Dense(3, activation='sigmoid', name=config["name"] + '_rt_dense_1'))

    model = Sequential(layers_model)
    model.compile(loss="binary_crossentropy",
                  optimizer=Adam(lr=config["lr"]),
                  metrics=["accuracy"])
    history = model.fit(x_train, y_train,
                        batch_size=config["batch_size"],
                        epochs=config["epochs"],
                        verbose=0,
                        validation_data=(x_valid, y_valid),
                        callbacks=[
                            TuneReporterCallback(freq="epoch"),
                            # os.path.join is robust to a missing trailing
                            # separator in the trial dir path.
                            ModelCheckpoint(os.path.join(tune.get_trial_dir(), 'train_model.h5'),
                                            monitor='val_acc',
                                            mode='max',
                                            save_best_only=True,
                                            save_weights_only=False,
                                            verbose=0)
                        ])
    hist_df = pd.DataFrame(history.history)
    with open(os.path.join(tune.get_trial_dir(), 'history_train_model.csv'), mode='w') as file:
        hist_df.to_csv(file)

def tune_train_once(config,
                    checkpoint_dir=None,
                    args: argparse.Namespace = None,
                    model_class: type = None,
                    build_method=None,
                    task_info: TaskInfo = None,
                    model_kwargs: dict = None,
                    resume: str = None,
                    **kwargs):
    if resume is None:
        resume = 'all'
    args_vars = vars(args)
    args_vars.update(config)

    pl.seed_everything(args.seed)
    logger = [
        loggers.CSVLogger(save_dir=tune.get_trial_dir(), name="", version="."),
        loggers.TensorBoardLogger(save_dir=tune.get_trial_dir(),
                                  name="",
                                  version=".",
                                  default_hp_metric=False)
    ]
    trainer_args = dict(
        logger=logger,
        progress_bar_refresh_rate=0,
        callbacks=[
            TuneReportCheckpointCallback(
                metrics={
                    f'tune_{task_info.metric_name}': f'val_{task_info.metric_name}'
                },
                filename="tune.ckpt",
                on="validation_end")
        ])
    if checkpoint_dir and resume == 'all':
        trainer_args['resume_from_checkpoint'] = os.path.join(checkpoint_dir, "tune.ckpt")

    # fix slurm trainer
    os.environ["SLURM_JOB_NAME"] = "bash"

    model = model_class(args, **model_kwargs)
    build_method(model, task_info)
    trainer: Trainer = Trainer.from_argparse_args(args, **trainer_args)
    if checkpoint_dir and resume == 'model':
        ckpt = pl_load(os.path.join(checkpoint_dir, "tune.ckpt"),
                       map_location=lambda storage, loc: storage)
        model = model._load_model_state(ckpt)
        trainer.current_epoch = ckpt["epoch"]
    trainer.fit(model)

def train_tune(hparams, rdm):
    model = get_model(hparams)
    logger = TensorBoardLogger(save_dir=tune.get_trial_dir(),
                               name="",
                               version=".",
                               default_hp_metric=False)
    logger.log_hyperparams(hparams, {
        'train_acc': 0,
        'train_f1': 0,
        'train_loss': 0,
        'valid_acc': 0,
        'valid_f1': 0,
        'valid_loss': 0,
    })
    trainer = pl.Trainer(max_epochs=hparams['n_epochs'],
                         gpus=1,
                         logger=logger,
                         progress_bar_refresh_rate=0,
                         callbacks=[
                             TuneReportCallback(
                                 ['valid_acc', 'valid_f1', 'valid_loss'],
                                 on="validation_end")
                         ])
    trainer.fit(model, rdm)

def tune_main(hparams, num_epochs=15, num_gpus=0):
    print(hparams)
    mean, std, traindir, valdir, num_classes = choose_dataset('cifar10')
    # Hard-coded paths override whatever choose_dataset() returned.
    traindir = '/home/jovyan/work/cv_data/cifar10/train'
    valdir = '/home/jovyan/work/cv_data/cifar10/test'
    hparams['num_classes'] = num_classes
    train_logger.info('Training Directory: {0}'.format(traindir))

    model = LightningModel(hparams)
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=num_gpus,
        # distributed_backend=hparams.distributed_backend,
        precision=32,
        # early_stop_callback=early_stop_callback,
        logger=TensorBoardLogger(
            save_dir=tune.get_trial_dir(), name="", version="."),
        progress_bar_refresh_rate=0,
        callbacks=[
            TuneReportCallback({
                "loss": "val_loss_epoch",
                "accuracy": "val_acc_epoch"
            }, on="validation_end")
        ])
    normal_pipe = BasicPipe(hparams, traindir, valdir, mean, std)
    trainer.fit(model, normal_pipe)

def experiment(config):
    iterations = config.pop("train-iterations")
    train_agent = ppo.PPOTrainer(config=config, env="CartPole-v0")
    checkpoint = None
    train_results = {}

    # Train
    for i in range(iterations):
        train_results = train_agent.train()
        if i % 2 == 0 or i == iterations - 1:
            checkpoint = train_agent.save(tune.get_trial_dir())
        tune.report(**train_results)
    train_agent.stop()

    # Manual Eval
    config["num_workers"] = 0
    eval_agent = ppo.PPOTrainer(config=config, env="CartPole-v0")
    eval_agent.restore(checkpoint)
    env = eval_agent.workers.local_worker().env

    obs = env.reset()
    done = False
    eval_results = {"eval_reward": 0, "eval_eps_length": 0}
    while not done:
        action = eval_agent.compute_action(obs)
        next_obs, reward, done, info = env.step(action)
        eval_results["eval_reward"] += reward
        eval_results["eval_eps_length"] += 1
        obs = next_obs  # advance the observation for the next action

    results = {**train_results, **eval_results}
    tune.report(**results)

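# A minimal driver sketch for the experiment() trainable above, following the
# Ray Tune "custom experiment" pattern. The iteration count and ray.init()
# arguments are illustrative assumptions, not taken from the source.
import ray
from ray import tune
from ray.rllib.agents import ppo

if __name__ == "__main__":
    ray.init(num_cpus=3)
    config = ppo.DEFAULT_CONFIG.copy()
    config["train-iterations"] = 10  # popped by experiment() before PPO sees it
    tune.run(
        experiment,
        config=config,
        resources_per_trial=ppo.PPOTrainer.default_resource_request(config))
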
def train_mnist_tune_checkpoint(config,
                                checkpoint_dir=None,
                                num_epochs=10,
                                num_gpus=0,
                                data_dir="~/data"):
    data_dir = os.path.expanduser(data_dir)
    kwargs = {
        "max_epochs": num_epochs,
        # If fractional GPUs passed in, convert to int.
        "gpus": math.ceil(num_gpus),
        "logger": TensorBoardLogger(
            save_dir=tune.get_trial_dir(), name="", version="."),
        "progress_bar_refresh_rate": 0,
        "callbacks": [
            TuneReportCheckpointCallback(
                metrics={
                    "loss": "ptl/val_loss",
                    "mean_accuracy": "ptl/val_accuracy"
                },
                filename="checkpoint",
                on="validation_end")
        ]
    }

    if checkpoint_dir:
        kwargs["resume_from_checkpoint"] = os.path.join(checkpoint_dir, "checkpoint")

    model = LightningMNISTClassifier(config=config, data_dir=data_dir)
    trainer = pl.Trainer(**kwargs)
    trainer.fit(model)

def train_mnist_tune_checkpoint(config,
                                checkpoint_dir=None,
                                data_dir=None,
                                num_epochs=10,
                                num_gpus=0):
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=num_gpus,
        logger=TensorBoardLogger(
            save_dir=tune.get_trial_dir(), name="", version="."),
        progress_bar_refresh_rate=0,
        callbacks=[CheckpointCallback(), TuneReportCallback()])
    if checkpoint_dir:
        # Currently, this leads to errors:
        # model = LightningMNISTClassifier.load_from_checkpoint(
        #     os.path.join(checkpoint, "checkpoint"))
        # Workaround:
        ckpt = pl_load(
            os.path.join(checkpoint_dir, "checkpoint"),
            map_location=lambda storage, loc: storage)
        model = LightningMNISTClassifier._load_model_state(ckpt, config=config)
        trainer.current_epoch = ckpt["epoch"]
    else:
        model = LightningMNISTClassifier(config=config, data_dir=data_dir)

    trainer.fit(model)

def report(progress_tracker):
    # The progress tracker's metrics are nested dictionaries of TrainerMetrics:
    # feature_name -> metric_name -> List[TrainerMetric], with one entry per
    # training checkpoint, according to steps_per_checkpoint. We reduce the
    # dictionary of TrainerMetrics to a simple list of floats for interfacing
    # with Ray Tune.
    train_stats = {
        TRAINING: metric_utils.reduce_trainer_metrics_dict(
            progress_tracker.train_metrics),
        VALIDATION: metric_utils.reduce_trainer_metrics_dict(
            progress_tracker.validation_metrics),
        TEST: metric_utils.reduce_trainer_metrics_dict(
            progress_tracker.test_metrics),
    }
    metric_score = tune_executor.get_metric_score(train_stats)
    tune.report(
        parameters=json.dumps(config, cls=NumpyEncoder),
        metric_score=metric_score,
        training_stats=json.dumps(train_stats, cls=NumpyEncoder),
        eval_stats="{}",
        trial_id=tune.get_trial_id(),
        trial_dir=tune.get_trial_dir(),
    )

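# Illustration of the reduction described in report() above -- a sketch, not
# metric_utils' actual implementation. TrainerMetric is assumed here to be an
# (epoch, step, value) record; the reduction keeps one float per checkpoint.
from typing import Dict, List, NamedTuple

class TrainerMetric(NamedTuple):
    epoch: int
    step: int
    value: float

def reduce_trainer_metrics_dict(
        metrics: Dict[str, Dict[str, List[TrainerMetric]]]
) -> Dict[str, Dict[str, List[float]]]:
    # feature_name -> metric_name -> List[TrainerMetric] becomes
    # feature_name -> metric_name -> List[float], which json.dumps can handle.
    return {
        feature_name: {
            metric_name: [m.value for m in trainer_metrics]
            for metric_name, trainer_metrics in feature_metrics.items()
        }
        for feature_name, feature_metrics in metrics.items()
    }
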
def on_epoch_end(self, trainer, progress_tracker, save_path):
    with tune.checkpoint_dir(step=progress_tracker.epoch) as checkpoint_dir:
        checkpoint_model = os.path.join(checkpoint_dir, 'model')
        # shutil.copytree(save_path, checkpoint_model)
        # Note: A previous implementation used shutil.copytree(),
        # however, this copying method is non-atomic.
        if not os.path.isdir(checkpoint_model):
            copy_id = uuid.uuid4()
            tmp_dst = "%s.%s.tmp" % (checkpoint_model, copy_id)
            shutil.copytree(save_path, tmp_dst)
            try:
                os.rename(tmp_dst, checkpoint_model)
            except Exception:
                shutil.rmtree(tmp_dst)

    train_stats = {
        TRAINING: progress_tracker.train_metrics,
        VALIDATION: progress_tracker.vali_metrics,
        TEST: progress_tracker.test_metrics,
    }
    metric_score = tune_executor.get_metric_score(train_stats, eval_stats=None)
    tune.report(
        parameters=json.dumps(config, cls=NumpyEncoder),
        metric_score=metric_score,
        training_stats=json.dumps(train_stats[TRAINING], cls=NumpyEncoder),
        eval_stats=json.dumps(train_stats[VALIDATION], cls=NumpyEncoder),
        trial_id=tune.get_trial_id(),
        trial_dir=tune.get_trial_dir()
    )

def train_t(config):
    seed = config.pop('seed')
    static_params = config.pop('static_params')
    torch.backends.cudnn.enabled = True
    if static_params['t_id'] == 0:
        torch.backends.cudnn.deterministic = True
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        np.random.seed(seed)
        random.seed(seed)
    else:
        torch.backends.cudnn.deterministic = False
    if 'PSSN' in tune.get_trial_name() or static_params['t_id'] == 0:
        torch.backends.cudnn.benchmark = False
    else:
        torch.backends.cudnn.benchmark = True

    if 'learner' in config:
        learner = config.pop('learner')
    else:
        learner_path = config.pop('learner_path')
        learner = torch.load(learner_path)

    rescaled, t, metrics, b_state_dict, stats = train_single_task(
        config=config, learner=learner, **static_params)

    learner_save_path = os.path.join(tune.get_trial_dir(), 'learner.pth')
    # raise ValueError(learner_save_path)
    torch.save(learner, learner_save_path)

def train_tune(config, epochs, resources, checkpoint_dir=None):
    # viz logger
    logger = TensorBoardLogger(save_dir=tune.get_trial_dir(), name=model_name)

    # metric reporter + checkpoint callback
    callback = TuneReportCheckpointCallback(metrics=pbt_config['metrics_to_report'])

    # search trainer object
    trainer = pl.Trainer(
        max_epochs=epochs,
        gpus=resources['gpu'],
        logger=logger,
        callbacks=[callback],
        progress_bar_refresh_rate=50,
        precision=16,
    )

    # checkpointing system
    if checkpoint_dir:
        model = network.load_from_checkpoint(
            os.path.join(checkpoint_dir, 'checkpoint'))
    else:
        model = network(config)

    # fits model/data module with current hyperparameter set
    data_module = dm(config)
    trainer.fit(model, datamodule=data_module)

def train_mnist_tune(config, data_dir=None, num_epochs=10, num_gpus=0):
    model = LightningMNISTClassifier(config, data_dir)
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=num_gpus,
        logger=TensorBoardLogger(
            save_dir=tune.get_trial_dir(), name="", version="."),
        progress_bar_refresh_rate=0,
        callbacks=[TuneReportCallback()])
    trainer.fit(model)

def _train_fn(self, config: Dict, checkpoint_dir=None, fast_dev_run=False, include_gpus=False):
    utils.hprint('Starting train function with config:')
    utils.print_dict(config)
    print()
    utils.set_pandas_disp(width=200)

    hp = self._model_param_class.from_dict(config)
    assert isinstance(hp, self._model_param_class)
    print(' hp:', hp)

    if checkpoint_dir:
        # see https://docs.ray.io/en/master/tune/user-guide.html#checkpointing
        raise NotImplementedError(f"Got checkpoint_dir in train_fn: {checkpoint_dir}")

    utils.hprint("About to create net in TuneRunner")
    net = hp.build()

    # import torch.autograd.profiler as profiler
    # with profiler.profile(record_shapes=True, use_cuda=True, profile_memory=True) as prof:
    #     net = self._factored_lightning_module_class.from_hp(hp=hp)
    # print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=1000))

    utils.set_seeds(hp.data.seed)

    # noinspection PyTypeChecker
    trainer = pl.Trainer(
        logger=logs_mod.get_pl_logger(hp=hp.exp, tune=tune),
        default_root_dir=tune.get_trial_dir(),
        callbacks=self.extra_pl_callbacks + self.get_pl_callbacks_for_tune(),
        max_epochs=hp.opt.num_epochs,
        gpus=hp.data.num_gpus if include_gpus else None,
        weights_summary='full',
        fast_dev_run=fast_dev_run,
        accumulate_grad_batches=1,
        profiler='simple',
        deterministic=True,
        log_every_n_steps=hp.logs.num_steps_per_metric_log,
        log_gpu_memory=hp.logs.log_gpu_memory,
    )
    utils.hprint("About to start tune_runner's trainer.fit...")
    fit_out = trainer.fit(net, datamodule=net.dm)
    utils.hprint('Done with tune_runner._train_fn')
    return fit_out

def clip_fine_tune(
    config,
    num_epochs,
    num_gpus,
    dataset: pa.Table,
    init_config: CLIPConfig,
    init_state_dict: dict,
    processor: CLIPProcessor,
):
    # Unset SLURM variables so Lightning's SLURM detection does not kick in
    # inside the Ray worker.
    if "SLURM_NTASKS" in os.environ:
        del os.environ["SLURM_NTASKS"]
    if "SLURM_JOB_NAME" in os.environ:
        del os.environ["SLURM_JOB_NAME"]

    bird_dataset = dataset
    data_mod = MultiModalDataModule(
        dataset=bird_dataset,
        processor=processor,
        test_size=config["test_size"],
        batch_size=config["batch_size"],
        val_batch_size=config["val_batch_size"],
        num_workers=config["num_workers"],
    )

    clip_model = CLIPModel(init_config)
    clip_model.load_state_dict(init_state_dict)
    model = CLIPFineTunedModel(clip_model, **config)

    tune_cbs = [TuneReportCheckpointCallback(["val_loss"], on="validation_end")]
    logger = TensorBoardLogger(save_dir=tune.get_trial_dir(), name="", version=".")
    trainer = pl.Trainer(
        logger=logger,
        num_sanity_val_steps=0,
        max_epochs=num_epochs,
        gpus=math.ceil(num_gpus),
        progress_bar_refresh_rate=0,
        log_every_n_steps=1,
        callbacks=[LearningRateMonitor(logging_interval="step")] + tune_cbs,
    )
    trainer.validate(model, data_mod)
    trainer.fit(model, data_mod)
    return trainer

def train_transformer(config, checkpoint_dir=None):
    data_args = DataTrainingArguments(
        task_name=config["task_name"], data_dir=config["data_dir"])
    tokenizer = AutoTokenizer.from_pretrained(config["model_name"])
    train_dataset = GlueDataset(
        data_args, tokenizer=tokenizer, mode="train", cache_dir=config["data_dir"])
    eval_dataset = GlueDataset(
        data_args, tokenizer=tokenizer, mode="dev", cache_dir=config["data_dir"])
    eval_dataset = eval_dataset[:len(eval_dataset) // 2]
    training_args = TrainingArguments(
        output_dir=tune.get_trial_dir(),
        learning_rate=config["learning_rate"],
        do_train=True,
        do_eval=True,
        evaluate_during_training=True,
        # Evaluate once per epoch.
        eval_steps=(len(train_dataset) // config["per_gpu_train_batch_size"]) + 1,
        # We explicitly set save to 0, and do saving in evaluate instead
        save_steps=0,
        num_train_epochs=config["num_epochs"],
        max_steps=config["max_steps"],
        per_device_train_batch_size=config["per_gpu_train_batch_size"],
        per_device_eval_batch_size=config["per_gpu_val_batch_size"],
        warmup_steps=0,
        weight_decay=config["weight_decay"],
        logging_dir="./logs",
    )

    # Arguments for W&B.
    name = tune.get_trial_name()
    wandb_args = {
        "project_name": "transformers_pbt",
        "watch": "false",  # Either set to "gradients", "false", or "all"
        "run_name": name,
    }

    tune_trainer = get_trainer(
        recover_checkpoint(checkpoint_dir, config["model_name"]),
        train_dataset,
        eval_dataset,
        config["task_name"],
        training_args,
        wandb_args=wandb_args)
    tune_trainer.train(recover_checkpoint(checkpoint_dir, config["model_name"]))

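# A sketch of how train_transformer() might be launched under population-based
# training (the W&B project above is named "transformers_pbt"). The config
# values, mutation ranges, and the reported metric name ("eval_acc") are all
# assumptions for illustration, not taken from the source.
from ray import tune
from ray.tune.schedulers import PopulationBasedTraining

pbt_scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="eval_acc",
    mode="max",
    perturbation_interval=1,
    hyperparam_mutations={
        "learning_rate": tune.uniform(1e-5, 5e-5),
        "weight_decay": tune.uniform(0.0, 0.3),
    })
tune.run(
    train_transformer,
    config={
        "task_name": "rte",                      # hypothetical GLUE task
        "data_dir": "./data/RTE",                # hypothetical path
        "model_name": "distilbert-base-uncased", # hypothetical model
        "learning_rate": 2e-5,
        "weight_decay": 0.0,
        "num_epochs": 3,
        "max_steps": -1,
        "per_gpu_train_batch_size": 16,
        "per_gpu_val_batch_size": 16,
    },
    scheduler=pbt_scheduler,
    resources_per_trial={"cpu": 1, "gpu": 1},
    num_samples=4,
    keep_checkpoints_num=1)
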
def experiment(config):
    # global unused_shared_step
    # global unused_own_step
    # global unsatisfied_shared_step
    # global unsatisfied_own_step
    iterations = 2
    train_agent = ppo.PPOTrainer(config=config, env=ContentCaching)  # "ContentCaching-v0"
    checkpoint = None
    train_results = {}

    # Train
    # iterations = 20
    for i in range(iterations):
        train_results = train_agent.train()
        if i % 2 == 0 or i == iterations - 1:
            checkpoint = train_agent.save(tune.get_trial_dir())
        tune.report(**train_results)
    train_agent.stop()

    # Manual Eval
    config["num_workers"] = 0
    eval_agent = ppo.PPOTrainer(config=config, env=ContentCaching)  # "ContentCaching-v0"
    eval_agent.restore(checkpoint)
    env = eval_agent.workers.local_worker().env

    obs = env.reset()
    done = False
    eval_results = {"eval_reward": 0, "eval_eps_length": 0}
    while not done:
        action = eval_agent.compute_action(obs)
        next_obs, reward, done, info = env.step(action)
        # unused_shared_step.append(info["unused_shared"])
        # unused_own_step.append(info["unused_own"])
        # unsatisfied_shared_step.append(info["unsatisfied_shared"])
        global unsatisfied_own_step
        unsatisfied_own_step = 99  # .append(info["unused_own"])
        print('info["unused_shared"] =', info["unused_shared"])
        eval_results["eval_reward"] += reward
        eval_results["eval_eps_length"] += 1
        obs = next_obs  # advance the observation for the next action

    results = {**train_results, **eval_results}
    tune.report(**results)

def report(progress_tracker):
    train_stats = {
        TRAINING: progress_tracker.train_metrics,
        VALIDATION: progress_tracker.vali_metrics,
        TEST: progress_tracker.test_metrics,
    }
    metric_score = tune_executor.get_metric_score(train_stats)
    tune.report(
        parameters=json.dumps(config, cls=NumpyEncoder),
        metric_score=metric_score,
        training_stats=json.dumps(train_stats, cls=NumpyEncoder),
        eval_stats="{}",
        trial_id=tune.get_trial_id(),
        trial_dir=tune.get_trial_dir(),
    )

def _setup_wandb(self):
    if self.is_world_master() and self.wandb_args is not None:
        wandb.init(
            project=self.wandb_args["project_name"],
            name=self.wandb_args["run_name"],
            id=self.wandb_args["run_name"],
            dir=tune.get_trial_dir(),
            config=vars(self.args),
            reinit=True,
            allow_val_change=True,
            resume=self.wandb_args["run_name"])
        # keep track of model topology and gradients, unsupported on TPU
        if not is_torch_tpu_available() and self.wandb_args["watch"] != "false":
            wandb.watch(
                self.model,
                log=self.wandb_args["watch"],
                log_freq=max(100, self.args.logging_steps))

def train_mnist_tune(config, data_dir=None, num_epochs=10, num_gpus=0):
    model = LightningMNISTClassifier(config, data_dir)
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        # If fractional GPUs passed in, convert to int.
        gpus=math.ceil(num_gpus),
        logger=TensorBoardLogger(
            save_dir=tune.get_trial_dir(), name="", version="."),
        progress_bar_refresh_rate=0,
        callbacks=[
            TuneReportCallback({
                "loss": "ptl/val_loss",
                "mean_accuracy": "ptl/val_accuracy"
            }, on="validation_end")
        ])
    trainer.fit(model)

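# One way the checkpoint-free trainable above is typically driven:
# tune.with_parameters() passes the constants, and an ASHA scheduler stops
# weak trials early. The search-space keys are assumptions about what
# LightningMNISTClassifier reads from its config.
from ray import tune
from ray.tune.schedulers import ASHAScheduler

analysis = tune.run(
    tune.with_parameters(
        train_mnist_tune, data_dir="~/data", num_epochs=10, num_gpus=0),
    metric="loss",
    mode="min",
    config={
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([32, 64, 128]),
    },
    num_samples=10,
    scheduler=ASHAScheduler(max_t=10, grace_period=1, reduction_factor=2),
    name="tune_mnist_asha")
print("Best hyperparameters:", analysis.best_config)
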
def trainWithTune(config, checkpoint_dir=None, datamodule=None, num_epochs=10, num_gpus=0):
    trainer = Trainer(
        max_epochs=num_epochs,
        # If fractional GPUs passed in, convert to int.
        gpus=math.ceil(num_gpus),
        logger=TensorBoardLogger(save_dir=tune.get_trial_dir(), name="", version="."),
        progress_bar_refresh_rate=0,
        callbacks=[
            TuneReportCheckpointCallback(
                metrics={
                    "loss": "val_loss",
                    "mean_accuracy": "val_acc",
                    "mean_iou": "val_iou",
                },
                filename="checkpoint",
                on="validation_end")
        ])
    if checkpoint_dir:
        # Currently, loading via MMETrainingModule.load_from_checkpoint(
        # os.path.join(checkpoint_dir, "checkpoint")) leads to errors.
        # Workaround:
        ckpt = pl_load(
            os.path.join(checkpoint_dir, "checkpoint"),
            map_location=lambda storage, loc: storage)
        model = MMETrainingModule._load_model_state(
            ckpt,
            lr=10**config['log_lr'],
            lrRatio=10**config['log_lrRatio'],
            decay=10**config['log_decay'],
            num_cls=NUM_CLS)
        trainer.current_epoch = ckpt["epoch"]
    else:
        model = MMETrainingModule(
            lr=10**config['log_lr'],
            lrRatio=10**config['log_lrRatio'],
            decay=10**config['log_decay'],
            num_cls=NUM_CLS)

    trainer.fit(model, datamodule=datamodule)

def _train_fn(self, config: Dict, checkpoint_dir=None, fast_dev_run=False, include_gpus=False):
    utils.hprint('Starting train function with config:')
    utils.print_dict(config)
    del config['tune']

    hp = self._param_class.from_dict(config)
    assert isinstance(hp, self._param_class)

    if checkpoint_dir:
        # see https://docs.ray.io/en/master/tune/user-guide.html#checkpointing
        raise NotImplementedError(f"Got checkpoint_dir in train_fn: {checkpoint_dir}")

    net = self._factored_lightning_module_class.from_hp(hp=hp)
    utils.set_seeds(hp.data.seed)

    # noinspection PyTypeChecker
    trainer = pl.Trainer(
        logger=torch_mod.get_pl_logger(hp=hp.exp, tune=tune, offline_mode=fast_dev_run),
        default_root_dir=tune.get_trial_dir(),
        callbacks=self.extra_pl_callbacks + self.get_tune_callbacks(),
        max_epochs=hp.opt.num_epochs,
        gpus=hp.data.num_gpus if include_gpus else None,
        weights_summary='full',
        fast_dev_run=fast_dev_run,
        accumulate_grad_batches=1,
        profiler='simple',
        deterministic=True,
        log_every_n_steps=hp.metrics.num_steps_per_metric_log,
    )
    fit_out = trainer.fit(net, datamodule=net.dm)
    utils.print_dict(config)
    utils.hprint('Done with tune_runner._train_fn')
    return fit_out

def train_mnist_tune(tuning_config, data_dir=None, num_epochs=10, num_gpus=0):
    # Only Training
    model = LightningMNISTClassifier(tuning_config, data_dir)

    # ===========================================================================
    # Callbacks
    # ===========================================================================
    from pytorch_lightning.callbacks import ModelCheckpoint
    from pytorch_lightning.callbacks import EarlyStopping

    early_stop_cb = EarlyStopping(
        monitor='ptl/val_loss', patience=5, verbose=True, mode='min')
    ckpt_cb = ModelCheckpoint(
        tune.get_trial_dir() + '/checkpoints',
        save_top_k=5,
        verbose=True,
        monitor='ptl/val_loss',
        mode='min',
        save_last=True,
        filename='model_{epoch:03d}-{step}')
    tune_rp_cb = TuneReportCallback(
        {
            "val_loss": "ptl/val_loss",
            "val_accuracy": "ptl/val_accuracy"
        },
        on="validation_end")

    # ===========================================================================
    # Trainer
    # Note: Must leave the logger at its default and disable hp_metric below.
    # ===========================================================================
    trainer = pl.Trainer(
        progress_bar_refresh_rate=0,  # 0 disables progress printing
        max_epochs=num_epochs,
        # If fractional GPUs passed in, convert to int.
        gpus=math.ceil(num_gpus),
        callbacks=[ckpt_cb, tune_rp_cb, early_stop_cb])
    trainer.logger._default_hp_metric = False  # hp_metric must be False
    trainer.fit(model)

def worker_function(inner_ex_config, config):
    """
    Combines the experiment config and the auto-generated Ray config, and runs
    an iteration of inner_ex on that combined config.

    :param inner_ex_config: The current values of the inner experiment config,
        including any modifications we might have made in a macro_experiment
        config update
    :param config: Config generated by Ray Tune
    :return:
    """
    from inner_experiment import inner_ex

    # Run inner_ex by combining the "base" config and the Ray experiment config
    inner_ex_dict = dict(inner_ex_config)
    merged_config = update(inner_ex_dict, config)

    # This will create an observer in the Tune trial directory, meaning that
    # inner experiment configs will be saved at <trial.log_dir>/1
    observer = FileStorageObserver.create(tune.get_trial_dir())
    inner_ex.observers.append(observer)
    ret_val = inner_ex.run(config_updates=merged_config)
    tune.report(accuracy=ret_val.result)

def train(config, batch_size, num_epochs=20, num_gpus=0):
    training = dl.loader(55000, batch_size, 0)
    validation = dl.loader(8250, 1, 55000)
    cae = ContractiveAutoEncoder(
        training_dataloader=training, val_dataloader=validation, config=config)
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=num_gpus,
        auto_select_gpus=bool(num_gpus),
        logger=TensorBoardLogger(save_dir=tune.get_trial_dir(), name="", version='.'),
        stochastic_weight_avg=True,
        benchmark=True,
        callbacks=[
            TuneReportCheckpointCallback(
                {"loss": "val_loss"}, filename="checkpoint", on="validation_end")
        ])
    trainer.fit(cae)

def train_libmultilabel_tune(config, datasets, classes, word_dict):
    """The training function for Ray Tune.

    Args:
        config (AttributeDict): Config of the experiment.
        datasets (dict): A dictionary of datasets.
        classes (list): List of class names.
        word_dict (torchtext.vocab.Vocab): A vocab object that maps tokens to indices.
    """
    set_seed(seed=config.seed)
    config.run_name = tune.get_trial_dir()
    logging.info(f'Run name: {config.run_name}')
    config.checkpoint_dir = os.path.join(config.result_dir, config.run_name)
    config.log_path = os.path.join(config.checkpoint_dir, 'logs.json')

    trainer = TorchTrainer(
        config=config,
        datasets=datasets,
        classes=classes,
        word_dict=word_dict,
        search_params=True,
        save_checkpoints=False)
    trainer.train()

def run_parameterised_experiment(config):
    # Hyperparameters
    trial_dir = tune.get_trial_dir()
    problem, method, other_config = config["main_params"]
    n_workers = config["n_workers"]

    experiment = CartpoleExperiment()
    experiment.nn_path = other_config["folder"]  # nn_paths_cartpole[other_config["nn_path"]]
    experiment.tau = other_config["tau"]
    if other_config["template"] == 2:  # octagon
        experiment.analysis_template = Experiment.octagon(experiment.env_input_size)
    elif other_config["template"] == 0:  # box
        experiment.analysis_template = Experiment.box(experiment.env_input_size)
    else:  # standard
        _, template = experiment.get_template(1)
        experiment.analysis_template = template
    experiment.n_workers = n_workers
    experiment.show_progressbar = False
    experiment.show_progress_plot = False
    # experiment.use_rounding = False
    experiment.save_dir = trial_dir
    experiment.update_progress_fn = update_progress

    elapsed_seconds, safe, max_t = experiment.run_experiment()

    # Encode the verification outcome: 0 = unknown, 1 = safe, -1 = unsafe.
    if safe is None:
        safe_value = 0
    elif safe:
        safe_value = 1
    else:
        safe_value = -1
    tune.report(elapsed_seconds=elapsed_seconds, safe=safe_value, max_t=max_t, done=True)

def train_mnist_tune_checkpoint(config,
                                checkpoint_dir=None,
                                data_dir=None,
                                num_epochs=10,
                                num_gpus=0):
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        # If fractional GPUs passed in, convert to int.
        gpus=math.ceil(num_gpus),
        logger=TensorBoardLogger(
            save_dir=tune.get_trial_dir(), name="", version="."),
        progress_bar_refresh_rate=0,
        callbacks=[
            TuneReportCheckpointCallback(
                metrics={
                    "loss": "ptl/val_loss",
                    "mean_accuracy": "ptl/val_accuracy"
                },
                filename="checkpoint",
                on="validation_end")
        ])
    if checkpoint_dir:
        # Currently, this leads to errors:
        # model = LightningMNISTClassifier.load_from_checkpoint(
        #     os.path.join(checkpoint, "checkpoint"))
        # Workaround:
        ckpt = pl_load(
            os.path.join(checkpoint_dir, "checkpoint"),
            map_location=lambda storage, loc: storage)
        model = LightningMNISTClassifier._load_model_state(
            ckpt, config=config, data_dir=data_dir)
        trainer.current_epoch = ckpt["epoch"]
    else:
        model = LightningMNISTClassifier(config=config, data_dir=data_dir)

    trainer.fit(model)

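# The checkpoint-aware variant above only pays off with a scheduler that
# restores from checkpoints, e.g. population-based training. A sketch: the
# mutation space and fixed config values are assumptions consistent with the
# config keys used elsewhere in this file.
from ray import tune
from ray.tune.schedulers import PopulationBasedTraining

scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="loss",
    mode="min",
    perturbation_interval=4,
    hyperparam_mutations={
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": [32, 64, 128],
    })
tune.run(
    tune.with_parameters(
        train_mnist_tune_checkpoint, data_dir="~/data", num_epochs=10, num_gpus=0),
    config={
        "layer_1_size": 64,
        "layer_2_size": 128,
        "lr": 1e-3,
        "batch_size": 64,
    },
    num_samples=4,
    scheduler=scheduler,
    keep_checkpoints_num=2,
    name="tune_mnist_pbt")
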
def _run_experiment(self, config, checkpoint_dir, hyperopt_dict, decode_ctx, is_using_ray_backend=False):
    for gpu_id in ray.get_gpu_ids():
        # Previous trial may not have freed its memory yet, so wait to avoid OOM
        wait_for_gpu(gpu_id)

    # Some config values may be JSON encoded as strings, so decode them here
    config = RayTuneSampler.decode_values(config, decode_ctx)

    trial_id = tune.get_trial_id()
    modified_config = substitute_parameters(copy.deepcopy(hyperopt_dict["config"]), config)

    trial_dir = Path(tune.get_trial_dir())
    trial_location = ray.util.get_node_ip_address()

    hyperopt_dict["config"] = modified_config
    hyperopt_dict["experiment_name"] = f'{hyperopt_dict["experiment_name"]}_{trial_id}'
    hyperopt_dict["output_directory"] = str(trial_dir)

    tune_executor = self
    if is_using_ray_backend:
        ray_queue = RayQueue(actor_options={"num_cpus": 0})
    else:
        ray_queue = None

    def checkpoint(progress_tracker, save_path):
        with tune.checkpoint_dir(step=progress_tracker.epoch) as checkpoint_dir:
            checkpoint_model = os.path.join(checkpoint_dir, "model")
            # shutil.copytree(save_path, checkpoint_model)
            # Note: A previous implementation used shutil.copytree(),
            # however, this copying method is non-atomic.
            if not os.path.isdir(checkpoint_model):
                copy_id = uuid.uuid4()
                tmp_dst = f"{checkpoint_model}.{copy_id}.tmp"
                assert os.path.exists(save_path)
                shutil.copytree(save_path, tmp_dst)
                try:
                    os.rename(tmp_dst, checkpoint_model)
                except Exception:
                    shutil.rmtree(tmp_dst)

    def report(progress_tracker):
        train_stats = {
            TRAINING: progress_tracker.train_metrics,
            VALIDATION: progress_tracker.vali_metrics,
            TEST: progress_tracker.test_metrics,
        }
        metric_score = tune_executor.get_metric_score(train_stats)
        tune.report(
            parameters=json.dumps(config, cls=NumpyEncoder),
            metric_score=metric_score,
            training_stats=json.dumps(train_stats, cls=NumpyEncoder),
            eval_stats="{}",
            trial_id=tune.get_trial_id(),
            trial_dir=tune.get_trial_dir(),
        )

    class RayTuneReportCallback(Callback):
        def _get_sync_client_and_remote_checkpoint_dir(self) -> Optional[Tuple["CommandBasedClient", str]]:
            # sync client has to be recreated to avoid issues with serialization
            return tune_executor._get_sync_client_and_remote_checkpoint_dir(trial_dir)

        def on_trainer_train_setup(self, trainer, save_path, is_coordinator):
            if is_using_ray_backend and checkpoint_dir and trial_location != ray.util.get_node_ip_address():
                save_path = Path(save_path)
                for path in trial_dir.glob("checkpoint*"):
                    if path not in (save_path.parent, checkpoint_dir):
                        shutil.rmtree(path, ignore_errors=True)
                sync_info = self._get_sync_client_and_remote_checkpoint_dir()
                if sync_info is not None:
                    sync_client, remote_checkpoint_dir = sync_info
                    sync_client.sync_down(remote_checkpoint_dir, str(trial_dir.absolute()))
                    sync_client.wait()

        def on_epoch_end(self, trainer, progress_tracker, save_path):
            if is_using_ray_backend:
                save_path = Path(save_path)
                if trial_location != ray.util.get_node_ip_address():
                    sync_info = self._get_sync_client_and_remote_checkpoint_dir()
                    if sync_info is not None:
                        sync_client, remote_checkpoint_dir = sync_info
                        sync_client.sync_up(str(save_path.parent.parent.absolute()), remote_checkpoint_dir)
                        sync_client.wait()
                ray_queue.put((progress_tracker, str(save_path)))
                return
            checkpoint(progress_tracker, save_path)
            report(progress_tracker)

    callbacks = hyperopt_dict.get("callbacks") or []
    hyperopt_dict["callbacks"] = callbacks + [RayTuneReportCallback()]

    # set tune resources
    if is_using_ray_backend:
        resources = tune.get_trial_resources()
        # check if we are using at least 1 gpu per trial
        use_gpu = bool(self._gpu_resources_per_trial_non_none)
        # get the resources assigned to the current trial
        current_resources = resources.required_resources["GPU" if use_gpu else "CPU"]

        hvd_kwargs = {
            "num_workers": int(current_resources),
            "use_gpu": use_gpu,
        }
        hyperopt_dict["backend"].set_distributed_kwargs(**hvd_kwargs)
        logger.debug(f"Trial horovod kwargs: {hvd_kwargs}")

    stats = []

    def _run():
        train_stats, eval_stats = run_experiment(
            **hyperopt_dict,
            model_resume_path=checkpoint_dir,
            parameters=config,
        )
        stats.append((train_stats, eval_stats))

    sync_info = self._get_sync_client_and_remote_checkpoint_dir(trial_dir)
    if is_using_ray_backend and sync_info is not None:
        # We have to pull the results to the trial actor
        # from worker actors, as the Tune session is running
        # only on the trial actor
        thread = threading.Thread(target=_run)
        thread.daemon = True
        thread.start()

        sync_client, remote_checkpoint_dir = sync_info

        def check_queue():
            qsize = ray_queue.qsize()
            if qsize:
                results = ray_queue.get_nowait_batch(qsize)
                sync_client.sync_down(remote_checkpoint_dir, str(trial_dir.absolute()))
                sync_client.wait()
                for progress_tracker, save_path in results:
                    checkpoint(progress_tracker, str(trial_dir.joinpath(Path(save_path))))
                    report(progress_tracker)

        while thread.is_alive():
            thread.join(timeout=0)
            check_queue()
            time.sleep(0.1)
        thread.join()
        check_queue()
    else:
        # remove threading overhead
        _run()

    if not stats:
        raise RuntimeError("Experiment did not complete.")
    train_stats, eval_stats = stats.pop()

    metric_score = self.get_metric_score(train_stats)
    tune.report(
        parameters=json.dumps(config, cls=NumpyEncoder),
        metric_score=metric_score,
        training_stats=json.dumps(train_stats, cls=NumpyEncoder),
        eval_stats=json.dumps(eval_stats, cls=NumpyEncoder),
        trial_id=tune.get_trial_id(),
        trial_dir=tune.get_trial_dir(),
    )

def train_submodel_diff(config):
    import os
    import pandas as pd
    from keras.models import Sequential
    from keras.layers import Dense, Dropout
    from keras.layers import Conv1D, MaxPooling1D, LSTM
    from keras.layers import Embedding
    from keras.callbacks import ModelCheckpoint
    from keras.optimizers import adadelta
    from ray import tune
    from ray.tune.integration.keras import TuneReporterCallback
    import utils.definition_network as dn

    x_train, y_train, x_valid, y_valid, num_words, embedding_matrix = \
        config["exp_sets"].pp_data.load_data()
    # Membership test: `== (A or B)` only compares against A.
    trainable_emb = config["exp_sets"].pp_data.use_embedding in (
        dn.UseEmbedding.RAND, dn.UseEmbedding.NON_STATIC)

    model = Sequential([
        Embedding(config["exp_sets"].pp_data.vocabulary_size,
                  config["exp_sets"].pp_data.embedding_size,
                  trainable=trainable_emb,
                  name=config["name"] + '_rt_emb_1'),
        Dropout(config["dropout"], name=config["name"] + '_rt_dropout_1'),
        Conv1D(filters=config["filters_by_layer"],
               kernel_size=config["kernels_size"],
               kernel_initializer='glorot_uniform',
               padding='valid',
               activation='relu',
               name=config["name"] + '_rt_conv_1'),
        MaxPooling1D(name=config["name"] + '_rt_max_pool_1'),
        LSTM(config["lstm_units"],
             kernel_initializer='glorot_uniform',
             activation='tanh',
             dropout=config["dropout_lstm"],
             recurrent_dropout=config["dropout_lstm"],
             return_sequences=True,
             name=config["name"] + '_rt_lstm_1'),
        LSTM(config["lstm_units"],
             kernel_initializer='glorot_uniform',
             activation='tanh',
             dropout=config["dropout_lstm"],
             recurrent_dropout=config["dropout_lstm"],
             return_sequences=True,
             name=config["name"] + '_rt_lstm_2'),
        LSTM(config["lstm_units"],
             kernel_initializer='glorot_uniform',
             activation='tanh',
             dropout=config["dropout_lstm"],
             recurrent_dropout=config["dropout_lstm"],
             name=config["name"] + '_rt_lstm_3'),
        Dense(3, activation='sigmoid', name=config["name"] + '_rt_dense_1')
    ])
    model.compile(loss="binary_crossentropy",
                  optimizer=adadelta(lr=config["lr"]),
                  metrics=["accuracy"])
    history = model.fit(x_train, y_train,
                        batch_size=config["batch_size"],
                        epochs=config["epochs"],
                        verbose=0,
                        validation_data=(x_valid, y_valid),
                        callbacks=[
                            TuneReporterCallback(freq="epoch"),
                            # os.path.join is robust to a missing trailing
                            # separator in the trial dir path.
                            ModelCheckpoint(os.path.join(tune.get_trial_dir(), 'train_model.h5'),
                                            monitor='val_acc',
                                            mode='max',
                                            save_best_only=True,
                                            save_weights_only=False,
                                            verbose=0)
                        ])
    hist_df = pd.DataFrame(history.history)
    with open(os.path.join(tune.get_trial_dir(), 'history_train_model.csv'), mode='w') as file:
        hist_df.to_csv(file)