def report(progress_tracker):
    # The progress tracker's metrics are nested dictionaries of TrainerMetrics:
    # feature_name -> metric_name -> List[TrainerMetric], with one entry per
    # training checkpoint, according to steps_per_checkpoint.
    # We reduce the dictionary of TrainerMetrics to a simple list of floats for
    # interfacing with Ray Tune.
    train_stats = {
        TRAINING: metric_utils.reduce_trainer_metrics_dict(progress_tracker.train_metrics),
        VALIDATION: metric_utils.reduce_trainer_metrics_dict(progress_tracker.validation_metrics),
        TEST: metric_utils.reduce_trainer_metrics_dict(progress_tracker.test_metrics),
    }

    metric_score = tune_executor.get_metric_score(train_stats)
    tune.report(
        parameters=json.dumps(config, cls=NumpyEncoder),
        metric_score=metric_score,
        training_stats=json.dumps(train_stats, cls=NumpyEncoder),
        eval_stats="{}",
        trial_id=tune.get_trial_id(),
        trial_dir=tune.get_trial_dir(),
    )
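
# A minimal sketch of the reduction described in the comment above: collapse the
# nested feature_name -> metric_name -> List[TrainerMetric] structure into plain
# lists of floats. The function name is hypothetical and it assumes each
# TrainerMetric exposes a `value` field; the real
# metric_utils.reduce_trainer_metrics_dict may differ.
def reduce_trainer_metrics_dict_sketch(metrics):
    return {
        feature_name: {
            metric_name: [m.value for m in trainer_metrics]
            for metric_name, trainer_metrics in feature_metrics.items()
        }
        for feature_name, feature_metrics in metrics.items()
    }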
def on_epoch_end(self, trainer, progress_tracker, save_path):
    with tune.checkpoint_dir(step=progress_tracker.epoch) as checkpoint_dir:
        checkpoint_model = os.path.join(checkpoint_dir, 'model')
        # Note: a previous implementation used a plain shutil.copytree(save_path, checkpoint_model),
        # but that copy is not atomic, so we copy to a temp dir and rename it into place.
        if not os.path.isdir(checkpoint_model):
            copy_id = uuid.uuid4()
            tmp_dst = "%s.%s.tmp" % (checkpoint_model, copy_id)
            shutil.copytree(save_path, tmp_dst)
            try:
                os.rename(tmp_dst, checkpoint_model)
            except Exception:
                shutil.rmtree(tmp_dst)

    train_stats = {
        TRAINING: progress_tracker.train_metrics,
        VALIDATION: progress_tracker.vali_metrics,
        TEST: progress_tracker.test_metrics,
    }

    metric_score = tune_executor.get_metric_score(train_stats, eval_stats=None)
    tune.report(
        parameters=json.dumps(config, cls=NumpyEncoder),
        metric_score=metric_score,
        training_stats=json.dumps(train_stats[TRAINING], cls=NumpyEncoder),
        eval_stats=json.dumps(train_stats[VALIDATION], cls=NumpyEncoder),
        trial_id=tune.get_trial_id(),
        trial_dir=tune.get_trial_dir(),
    )
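
# A standalone sketch of the atomic-copy pattern used in the checkpoint code above.
# `atomic_copytree` is a hypothetical helper name; the idea is to copy into a unique
# temp directory and rename it into place, so a partially copied checkpoint is never
# visible under the destination path.
import os
import shutil
import uuid


def atomic_copytree(src: str, dst: str) -> None:
    """Copy `src` to `dst` such that `dst` only ever appears fully populated."""
    if os.path.isdir(dst):
        return  # another writer already completed the copy
    tmp_dst = f"{dst}.{uuid.uuid4()}.tmp"
    shutil.copytree(src, tmp_dst)
    try:
        os.rename(tmp_dst, dst)  # atomic on POSIX when src and dst share a filesystem
    except OSError:
        shutil.rmtree(tmp_dst)  # another writer won the race; discard our temp copy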
def _run_experiment(self, config, checkpoint_dir, hyperopt_dict, decode_ctx):
    for gpu_id in ray.get_gpu_ids():
        # Previous trial may not have freed its memory yet, so wait to avoid OOM
        wait_for_gpu(gpu_id)

    # Some config values may be JSON encoded as strings, so decode them here
    config = RayTuneSampler.decode_values(config, decode_ctx)

    trial_id = tune.get_trial_id()
    modified_config = substitute_parameters(copy.deepcopy(hyperopt_dict["config"]), config)

    hyperopt_dict['config'] = modified_config
    hyperopt_dict['experiment_name'] = f'{hyperopt_dict["experiment_name"]}_{trial_id}'

    tune_executor = self

    class RayTuneReportCallback(Callback):
        def on_epoch_end(self, trainer, progress_tracker, save_path):
            if trainer.is_coordinator():
                with tune.checkpoint_dir(step=progress_tracker.epoch) as checkpoint_dir:
                    checkpoint_model = os.path.join(checkpoint_dir, 'model')
                    shutil.copytree(save_path, checkpoint_model)

                train_stats, eval_stats = progress_tracker.train_metrics, progress_tracker.vali_metrics
                stats = eval_stats or train_stats
                metric_score = tune_executor.get_metric_score_from_eval_stats(stats)[-1]
                tune.report(
                    parameters=json.dumps(config, cls=NumpyEncoder),
                    metric_score=metric_score,
                    training_stats=json.dumps(train_stats, cls=NumpyEncoder),
                    eval_stats=json.dumps(eval_stats, cls=NumpyEncoder),
                )

    train_stats, eval_stats = run_experiment(
        **hyperopt_dict,
        model_resume_path=checkpoint_dir,
        callbacks=[RayTuneReportCallback()],
    )

    metric_score = self.get_metric_score(train_stats, eval_stats)
    tune.report(
        parameters=json.dumps(config, cls=NumpyEncoder),
        metric_score=metric_score,
        training_stats=json.dumps(train_stats, cls=NumpyEncoder),
        eval_stats=json.dumps(eval_stats, cls=NumpyEncoder),
    )
def report(progress_tracker):
    train_stats = {
        TRAINING: progress_tracker.train_metrics,
        VALIDATION: progress_tracker.vali_metrics,
        TEST: progress_tracker.test_metrics,
    }

    metric_score = tune_executor.get_metric_score(train_stats)
    tune.report(
        parameters=json.dumps(config, cls=NumpyEncoder),
        metric_score=metric_score,
        training_stats=json.dumps(train_stats, cls=NumpyEncoder),
        eval_stats="{}",
        trial_id=tune.get_trial_id(),
        trial_dir=tune.get_trial_dir(),
    )
def _run_experiment(self, config, hyperopt_dict):
    trial_id = tune.get_trial_id()

    gpu_ids = ray.get_gpu_ids()
    if gpu_ids:
        gpus = ",".join(str(gpu_id) for gpu_id in gpu_ids)
    else:
        gpus = None

    modified_config = substitute_parameters(copy.deepcopy(hyperopt_dict["config"]), config)

    hyperopt_dict["config"] = modified_config
    hyperopt_dict["experiment_name"] = f'{hyperopt_dict["experiment_name"]}_{trial_id}'
    hyperopt_dict["gpus"] = gpus

    train_stats, eval_stats = run_experiment(**hyperopt_dict)

    metric_score = self.get_metric_score(train_stats, eval_stats)
    tune.report(
        parameters=str(config),
        metric_score=metric_score,
        training_stats=str(train_stats),
        eval_stats=str(eval_stats),
    )
def _run_experiment(self, config, checkpoint_dir, hyperopt_dict, decode_ctx, is_using_ray_backend=False):
    for gpu_id in ray.get_gpu_ids():
        # Previous trial may not have freed its memory yet, so wait to avoid OOM
        wait_for_gpu(gpu_id)

    # Some config values may be JSON encoded as strings, so decode them here
    config = RayTuneSampler.decode_values(config, decode_ctx)

    trial_id = tune.get_trial_id()
    modified_config = substitute_parameters(copy.deepcopy(hyperopt_dict["config"]), config)

    trial_dir = Path(tune.get_trial_dir())
    trial_location = ray.util.get_node_ip_address()

    hyperopt_dict["config"] = modified_config
    hyperopt_dict["experiment_name"] = f'{hyperopt_dict["experiment_name"]}_{trial_id}'
    hyperopt_dict["output_directory"] = str(trial_dir)

    tune_executor = self
    if is_using_ray_backend:
        ray_queue = RayQueue(actor_options={"num_cpus": 0})
    else:
        ray_queue = None

    def checkpoint(progress_tracker, save_path):
        with tune.checkpoint_dir(step=progress_tracker.epoch) as checkpoint_dir:
            checkpoint_model = os.path.join(checkpoint_dir, "model")
            # Note: a previous implementation used a plain shutil.copytree(save_path, checkpoint_model),
            # but that copy is not atomic, so we copy to a temp dir and rename it into place.
            if not os.path.isdir(checkpoint_model):
                copy_id = uuid.uuid4()
                tmp_dst = f"{checkpoint_model}.{copy_id}.tmp"
                assert os.path.exists(save_path)
                shutil.copytree(save_path, tmp_dst)
                try:
                    os.rename(tmp_dst, checkpoint_model)
                except Exception:
                    shutil.rmtree(tmp_dst)

    def report(progress_tracker):
        train_stats = {
            TRAINING: progress_tracker.train_metrics,
            VALIDATION: progress_tracker.vali_metrics,
            TEST: progress_tracker.test_metrics,
        }

        metric_score = tune_executor.get_metric_score(train_stats)
        tune.report(
            parameters=json.dumps(config, cls=NumpyEncoder),
            metric_score=metric_score,
            training_stats=json.dumps(train_stats, cls=NumpyEncoder),
            eval_stats="{}",
            trial_id=tune.get_trial_id(),
            trial_dir=tune.get_trial_dir(),
        )

    class RayTuneReportCallback(Callback):
        def _get_sync_client_and_remote_checkpoint_dir(self) -> Optional[Tuple["CommandBasedClient", str]]:
            # sync client has to be recreated to avoid issues with serialization
            return tune_executor._get_sync_client_and_remote_checkpoint_dir(trial_dir)

        def on_trainer_train_setup(self, trainer, save_path, is_coordinator):
            if is_using_ray_backend and checkpoint_dir and trial_location != ray.util.get_node_ip_address():
                save_path = Path(save_path)

                for path in trial_dir.glob("checkpoint*"):
                    if path not in (save_path.parent, checkpoint_dir):
                        shutil.rmtree(path, ignore_errors=True)

                sync_info = self._get_sync_client_and_remote_checkpoint_dir()
                if sync_info is not None:
                    sync_client, remote_checkpoint_dir = sync_info
                    sync_client.sync_down(remote_checkpoint_dir, str(trial_dir.absolute()))
                    sync_client.wait()

        def on_epoch_end(self, trainer, progress_tracker, save_path):
            if is_using_ray_backend:
                save_path = Path(save_path)
                if trial_location != ray.util.get_node_ip_address():
                    sync_info = self._get_sync_client_and_remote_checkpoint_dir()
                    if sync_info is not None:
                        sync_client, remote_checkpoint_dir = sync_info
                        sync_client.sync_up(str(save_path.parent.parent.absolute()), remote_checkpoint_dir)
                        sync_client.wait()
                ray_queue.put((progress_tracker, str(save_path)))
                return

            checkpoint(progress_tracker, save_path)
            report(progress_tracker)

    callbacks = hyperopt_dict.get("callbacks") or []
    hyperopt_dict["callbacks"] = callbacks + [RayTuneReportCallback()]

    # set tune resources
    if is_using_ray_backend:
        resources = tune.get_trial_resources()
        # check if we are using at least 1 gpu per trial
        use_gpu = bool(self._gpu_resources_per_trial_non_none)
        # get the resources assigned to the current trial
        current_resources = resources.required_resources["GPU" if use_gpu else "CPU"]

        hvd_kwargs = {
            "num_workers": int(current_resources),
            "use_gpu": use_gpu,
        }
        hyperopt_dict["backend"].set_distributed_kwargs(**hvd_kwargs)

        logger.debug(f"Trial horovod kwargs: {hvd_kwargs}")

    stats = []

    def _run():
        train_stats, eval_stats = run_experiment(
            **hyperopt_dict,
            model_resume_path=checkpoint_dir,
            parameters=config,
        )
        stats.append((train_stats, eval_stats))

    sync_info = self._get_sync_client_and_remote_checkpoint_dir(trial_dir)
    if is_using_ray_backend and sync_info is not None:
        # We have to pull the results to the trial actor
        # from worker actors, as the Tune session is running
        # only on the trial actor
        thread = threading.Thread(target=_run)
        thread.daemon = True
        thread.start()

        sync_client, remote_checkpoint_dir = sync_info

        def check_queue():
            qsize = ray_queue.qsize()
            if qsize:
                results = ray_queue.get_nowait_batch(qsize)
                sync_client.sync_down(remote_checkpoint_dir, str(trial_dir.absolute()))
                sync_client.wait()
                for progress_tracker, save_path in results:
                    checkpoint(progress_tracker, str(trial_dir.joinpath(Path(save_path))))
                    report(progress_tracker)

        while thread.is_alive():
            thread.join(timeout=0)
            check_queue()
            time.sleep(0.1)
        thread.join()
        check_queue()
    else:
        # remove threading overhead
        _run()

    if not stats:
        raise RuntimeError("Experiment did not complete.")
    train_stats, eval_stats = stats.pop()

    metric_score = self.get_metric_score(train_stats)
    tune.report(
        parameters=json.dumps(config, cls=NumpyEncoder),
        metric_score=metric_score,
        training_stats=json.dumps(train_stats, cls=NumpyEncoder),
        eval_stats=json.dumps(eval_stats, cls=NumpyEncoder),
        trial_id=tune.get_trial_id(),
        trial_dir=tune.get_trial_dir(),
    )
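
# A reduced sketch of the thread-plus-queue polling pattern used above: the training
# function runs in a daemon thread while the trial actor drains a Ray queue of
# reported results until the thread finishes. The function and parameter names here
# are illustrative, not part of the original code.
import threading
import time

from ray.util.queue import Queue as RayQueue


def poll_queue_until_done(ray_queue: RayQueue, run_fn, handle_result) -> None:
    thread = threading.Thread(target=run_fn, daemon=True)
    thread.start()

    def drain():
        qsize = ray_queue.qsize()
        if qsize:
            for item in ray_queue.get_nowait_batch(qsize):
                handle_result(item)

    while thread.is_alive():
        thread.join(timeout=0)
        drain()
        time.sleep(0.1)
    thread.join()
    drain()  # flush anything enqueued just before the worker exited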
def track_train(config):
    tune.report(name=tune.get_trial_name(), trial_id=tune.get_trial_id())
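
# A hedged usage sketch (assumes Ray Tune is installed): run the trivial trainable
# above for a couple of samples and inspect the reported values. The empty config
# and num_samples=2 are illustrative choices, not from the original code.
from ray import tune

analysis = tune.run(track_train, config={}, num_samples=2)
print(analysis.results_df)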
def _run_experiment(self, config, checkpoint_dir, hyperopt_dict, decode_ctx):
    for gpu_id in ray.get_gpu_ids():
        # Previous trial may not have freed its memory yet, so wait to avoid OOM
        wait_for_gpu(gpu_id)

    # Some config values may be JSON encoded as strings, so decode them here
    config = RayTuneSampler.decode_values(config, decode_ctx)

    trial_id = tune.get_trial_id()
    modified_config = substitute_parameters(copy.deepcopy(hyperopt_dict["config"]), config)

    hyperopt_dict['config'] = modified_config
    hyperopt_dict['experiment_name'] = f'{hyperopt_dict["experiment_name"]}_{trial_id}'

    tune_executor = self

    class RayTuneReportCallback(Callback):
        def on_epoch_end(self, trainer, progress_tracker, save_path):
            if trainer.is_coordinator():
                with tune.checkpoint_dir(step=progress_tracker.epoch) as checkpoint_dir:
                    checkpoint_model = os.path.join(checkpoint_dir, 'model')
                    # Note: a previous implementation used a plain shutil.copytree(save_path, checkpoint_model),
                    # but that copy is not atomic, so we copy to a temp dir and rename it into place.
                    if not os.path.isdir(checkpoint_model):
                        copy_id = uuid.uuid4()
                        tmp_dst = "%s.%s.tmp" % (checkpoint_model, copy_id)
                        shutil.copytree(save_path, tmp_dst)
                        try:
                            os.rename(tmp_dst, checkpoint_model)
                        except Exception:
                            shutil.rmtree(tmp_dst)

                train_stats = {
                    TRAINING: progress_tracker.train_metrics,
                    VALIDATION: progress_tracker.vali_metrics,
                    TEST: progress_tracker.test_metrics,
                }

                metric_score = tune_executor.get_metric_score(train_stats, eval_stats=None)
                tune.report(
                    parameters=json.dumps(config, cls=NumpyEncoder),
                    metric_score=metric_score,
                    training_stats=json.dumps(train_stats[TRAINING], cls=NumpyEncoder),
                    eval_stats=json.dumps(train_stats[VALIDATION], cls=NumpyEncoder),
                )

    train_stats, eval_stats = run_experiment(
        **hyperopt_dict,
        model_resume_path=checkpoint_dir,
        callbacks=[RayTuneReportCallback()],
    )

    metric_score = self.get_metric_score(train_stats, eval_stats)
    tune.report(
        parameters=json.dumps(config, cls=NumpyEncoder),
        metric_score=metric_score,
        training_stats=json.dumps(train_stats, cls=NumpyEncoder),
        eval_stats=json.dumps(eval_stats, cls=NumpyEncoder),
    )
def train(self, model_path: Optional[str] = None, trial: Union["optuna.Trial", Dict[str, Any]] = None):
    """
    Main training entry point.

    Args:
        model_path (:obj:`str`, `optional`):
            Local path to the model if the model to train has been instantiated from a local path. If present,
            training will resume from the optimizer/scheduler states loaded here.
        trial (:obj:`optuna.Trial` or :obj:`Dict[str, Any]`, `optional`):
            The trial run or the hyperparameter dictionary for hyperparameter search.
    """
    # This might change the seed so needs to run first.
    self._hp_search_setup(trial)

    # Model re-init
    if self.model_init is not None:
        # Seed must be set before instantiating the model when using model_init.
        set_seed(self.args.seed)
        model = self.model_init()
        self.model = model.to(self.args.device)

        # Reinitializes optimizer and scheduler
        self.optimizer, self.lr_scheduler = None, None

    # Data loader and number of training steps
    train_dataloader = self.get_train_dataloader()
    num_update_steps_per_epoch = len(train_dataloader) // self.args.gradient_accumulation_steps
    num_update_steps_per_epoch = max(num_update_steps_per_epoch, 1)
    if self.args.max_steps > 0:
        t_total = self.args.max_steps
        num_train_epochs = self.args.max_steps // num_update_steps_per_epoch + int(
            self.args.max_steps % num_update_steps_per_epoch > 0
        )
    else:
        t_total = int(num_update_steps_per_epoch * self.args.num_train_epochs)
        num_train_epochs = self.args.num_train_epochs
    self.args.max_steps = t_total

    self.create_optimizer_and_scheduler(num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if (
        model_path is not None
        and os.path.isfile(os.path.join(model_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(model_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        self.optimizer.load_state_dict(
            torch.load(os.path.join(model_path, "optimizer.pt"), map_location=self.args.device)
        )
        self.lr_scheduler.load_state_dict(torch.load(os.path.join(model_path, "scheduler.pt")))

    model = self.model
    if self.args.fp16 and _use_apex:
        if not is_apex_available():
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, self.optimizer = amp.initialize(model, self.optimizer, opt_level=self.args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if self.args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if self.args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[self.args.local_rank],
            output_device=self.args.local_rank,
            find_unused_parameters=True,
        )

    if self.tb_writer is not None:
        self.tb_writer.add_text("args", self.args.to_json_string())
        self.tb_writer.add_hparams(self.args.to_sanitized_dict(), metric_dict={})

    # Train!
    if is_torch_tpu_available():
        total_train_batch_size = self.args.train_batch_size * xm.xrt_world_size()
    else:
        total_train_batch_size = (
            self.args.train_batch_size
            * self.args.gradient_accumulation_steps
            * (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1)
        )
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", self.num_examples(train_dataloader))
    logger.info("  Num Epochs = %d", num_train_epochs)
    logger.info("  Instantaneous batch size per device = %d", self.args.per_device_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d", total_train_batch_size)
    logger.info("  Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    self.global_step = 0
    self.epoch = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if model_path is not None:
        # set global_step to global_step of last saved checkpoint from model path
        try:
            self.global_step = int(model_path.split("-")[-1].split(os.path.sep)[0])
            epochs_trained = self.global_step // num_update_steps_per_epoch
            steps_trained_in_current_epoch = self.global_step % num_update_steps_per_epoch

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", self.global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            self.global_step = 0
            logger.info("  Starting fine-tuning.")

    tr_loss_sum = 0.0
    loss_sum = defaultdict(float)
    best = {self.best_metric: None}
    model.zero_grad()
    disable_tqdm = self.args.disable_tqdm or not self.is_local_process_zero()
    train_pbar = trange(epochs_trained, int(np.ceil(num_train_epochs)), desc="Epoch", disable=disable_tqdm)
    for epoch in range(epochs_trained, int(np.ceil(num_train_epochs))):
        if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler):
            train_dataloader.sampler.set_epoch(epoch)

        if is_torch_tpu_available():
            parallel_loader = pl.ParallelLoader(train_dataloader, [self.args.device]).per_device_loader(
                self.args.device
            )
            epoch_iterator = parallel_loader
        else:
            epoch_iterator = train_dataloader

        # Reset the past mems state at the beginning of each epoch if necessary.
        if self.args.past_index >= 0:
            self._past = None

        epoch_pbar = tqdm(epoch_iterator, desc="Iteration", disable=disable_tqdm)
        for step, inputs in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                epoch_pbar.update(1)
                continue

            model.train()
            inputs = self._prepare_inputs(inputs)

            inputs["output_attentions"] = self.length_drop_args.length_config is not None
            layer_config = sample_layer_configuration(
                model.config.num_hidden_layers,
                layer_dropout_prob=self.length_drop_args.layer_dropout_prob,
                layer_dropout=0,
            )
            inputs["layer_config"] = layer_config
            inputs["length_config"] = self.length_drop_args.length_config

            outputs = model(**inputs)
            # Save past state if it exists
            if self.args.past_index >= 0:
                self._past = outputs[self.args.past_index]

            task_loss = self.div_loss(outputs[0])
            if self.length_drop_args.length_adaptive:
                loss_sum["full"] += task_loss.item()

            loss = task_loss
            if self.length_drop_args.length_adaptive:
                loss = loss / (self.length_drop_args.num_sandwich + 2)

            tr_loss_sum += loss.item()
            if self.args.fp16 and _use_native_amp:
                self.scaler.scale(loss).backward()
            elif self.args.fp16 and _use_apex:
                with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # inplace distillation
            if self.length_drop_args.length_adaptive:
                logits = outputs[1].detach()

                for i in range(self.length_drop_args.num_sandwich + 1):
                    inputs["output_attentions"] = True

                    layer_config = sample_layer_configuration(
                        model.config.num_hidden_layers,
                        layer_dropout_prob=self.length_drop_args.layer_dropout_prob,
                        layer_dropout=(self.length_drop_args.layer_dropout_bound if i == 0 else None),
                        layer_dropout_bound=self.length_drop_args.layer_dropout_bound,
                    )
                    inputs["layer_config"] = layer_config

                    length_config = sample_length_configuration(
                        self.args.max_seq_length,
                        model.config.num_hidden_layers,
                        layer_config,
                        length_drop_ratio=(self.length_drop_args.length_drop_ratio_bound if i == 0 else None),
                        length_drop_ratio_bound=self.length_drop_args.length_drop_ratio_bound,
                    )
                    inputs["length_config"] = length_config

                    outputs_sub = model(**inputs)
                    task_loss_sub = self.div_loss(outputs_sub[0])
                    if i == 0:
                        loss_sum["smallest"] += task_loss_sub.item()
                        loss_sum["sub"] += 0
                    else:
                        loss_sum["sub"] += task_loss_sub.item() / self.length_drop_args.num_sandwich

                    logits_sub = outputs_sub[1]
                    loss_fct = KLDivLoss(reduction="batchmean")
                    kl_loss = loss_fct(F.log_softmax(logits, -1), F.softmax(logits_sub, -1))
                    loss = self.div_loss(kl_loss)
                    loss_sum["kl"] += loss.item() / (self.length_drop_args.num_sandwich + 1)
                    loss = loss / (self.length_drop_args.num_sandwich + 2)

                    tr_loss_sum += loss.item()
                    if self.args.fp16 and _use_native_amp:
                        self.scaler.scale(loss).backward()
                    elif self.args.fp16 and _use_apex:
                        with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss.backward()

            if (step + 1) % self.args.gradient_accumulation_steps == 0 or (
                # last step in epoch but step is always smaller than gradient_accumulation_steps
                (step + 1) == len(epoch_iterator) <= self.args.gradient_accumulation_steps
            ):
                if self.args.fp16 and _use_native_amp:
                    self.scaler.unscale_(self.optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm)
                elif self.args.fp16 and _use_apex:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm)

                if is_torch_tpu_available():
                    xm.optimizer_step(self.optimizer)
                elif self.args.fp16 and _use_native_amp:
                    self.scaler.step(self.optimizer)
                    self.scaler.update()
                else:
                    self.optimizer.step()

                self.lr_scheduler.step()
                model.zero_grad()
                self.global_step += 1
                self.epoch = epoch + (step + 1) / len(epoch_iterator)

                if (self.args.logging_steps > 0 and self.global_step % self.args.logging_steps == 0) or (
                    self.global_step == 1 and self.args.logging_first_step
                ):
                    # backward compatibility for pytorch schedulers
                    lr = (
                        self.lr_scheduler.get_last_lr()[0]
                        if version.parse(torch.__version__) >= version.parse("1.4")
                        else self.lr_scheduler.get_lr()[0]
                    )
                    loss = tr_loss_sum / self.args.logging_steps
                    tr_loss_sum = 0.0
                    logs = {"lr": lr, "loss": loss}
                    log_str = f"[{self.global_step:5d}] lr {lr:g} | loss {loss:2.3f}"

                    for key, value in loss_sum.items():
                        value /= self.args.logging_steps
                        loss_sum[key] = 0.0
                        logs[f"{key}_loss"] = value
                        log_str += f" | {key}_loss {value:2.3f}"

                    self.log(logs, "train")
                    logger.info(log_str)

                '''
                if (
                    self.args.evaluation_strategy == EvaluationStrategy.STEPS
                    and self.global_step % self.args.eval_steps == 0
                ):
                    results = self.evaluate()
                    self._report_to_hp_search(trial, epoch, results)
                '''

                if self.args.save_steps > 0 and self.global_step % self.args.save_steps == 0:
                    # In all cases (even distributed/parallel), self.model is always a reference
                    # to the model we want to save.
                    if hasattr(model, "module"):
                        assert (
                            model.module is self.model
                        ), f"Module {model.module} should be a reference to self.model"
                    else:
                        assert model is self.model, f"Model {model} should be a reference to self.model"

                    if self.args.evaluate_during_training:
                        results = self.evaluate()
                        results = {k[5:]: v for k, v in results.items() if k.startswith("eval_")}
                        self.log(results, "dev")
                        msg = " | ".join([f"{k} {v:.3f}" for k, v in results.items()])
                        logger.info(f"  [{self.global_step:5d}] {msg}")

                    # Save model checkpoint
                    if self.args.save_only_best:
                        output_dirs = []
                    else:
                        checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.global_step}"
                        if self.hp_search_backend is not None and trial is not None:
                            run_id = (
                                trial.number
                                if self.hp_search_backend == HPSearchBackend.OPTUNA
                                else tune.get_trial_id()
                            )
                            checkpoint_folder += f"-run-{run_id}"
                        output_dirs = [os.path.join(self.args.output_dir, checkpoint_folder)]

                    if self.args.evaluate_during_training:
                        if best[self.best_metric] is None or results[self.best_metric] > best[self.best_metric]:
                            logger.info("Congratulations, best model so far!")
                            output_dirs.append(os.path.join(self.args.output_dir, "checkpoint-best"))
                            best = results

                    for output_dir in output_dirs:
                        self.save_model(output_dir)

                        if self.is_world_master() and self.tokenizer is not None:
                            self.tokenizer.save_pretrained(output_dir)

                        if self.is_world_process_zero():
                            self._rotate_checkpoints(use_mtime=True)

                        '''
                        if is_torch_tpu_available():
                            xm.rendezvous("saving_optimizer_states")
                            xm.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                            xm.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                        elif self.is_world_process_zero():
                            torch.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                            torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                        '''

            epoch_pbar.update(1)
            if 0 < self.args.max_steps <= self.global_step:
                break
        epoch_pbar.close()
        train_pbar.update(1)

        '''
        if self.args.evaluation_strategy == EvaluationStrategy.EPOCH:
            results = self.evaluate()
            self._report_to_hp_search(trial, epoch, results)
        '''

        if self.args.tpu_metrics_debug or self.args.debug:
            if is_torch_tpu_available():
                # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
                xm.master_print(met.metrics_report())
            else:
                logger.warning(
                    "You enabled PyTorch/XLA debug metrics but you don't have a TPU "
                    "configured. Check your training configuration if this is unexpected."
                )
        if 0 < self.args.max_steps <= self.global_step:
            break

    train_pbar.close()
    if self.tb_writer:
        self.tb_writer.close()
    if self.args.past_index and hasattr(self, "_past"):
        # Clean the state at the end of training
        delattr(self, "_past")

    logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n")
    return self.global_step, best
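
# A small sketch isolating the inplace-distillation term from the sandwich loop above:
# a batch-mean KL divergence between the full model's detached logits and a sub-model's
# logits, with the same argument ordering used in that loop. The function name is
# hypothetical and only mirrors the snippet; it is not part of the original code.
import torch
import torch.nn.functional as F
from torch.nn import KLDivLoss


def inplace_distillation_loss(full_logits: torch.Tensor, sub_logits: torch.Tensor) -> torch.Tensor:
    loss_fct = KLDivLoss(reduction="batchmean")
    return loss_fct(F.log_softmax(full_logits.detach(), -1), F.softmax(sub_logits, -1))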
def run_dir(self):
    # Resolve the output directory for this run (a per-trial subdirectory during
    # hyperparameter search, the plain output_dir otherwise).
    if hasattr(self, "_trial"):
        trial = self._trial
    else:
        trial = None
    if self.hp_search_backend is not None and trial is not None:
        run_id = trial.number if self.hp_search_backend == HPSearchBackend.OPTUNA else tune.get_trial_id()
        run_name = self.hp_name(trial) if self.hp_name is not None else f"run-{run_id}"
        run_dir = Path(self.args.output_dir) / run_name
    else:
        run_dir = Path(self.args.output_dir)
    return run_dir
def train(self, model_path: Optional[str] = None, trial: Union["optuna.Trial", Dict[str, Any]] = None):
    """
    Main training entry point.

    Args:
        model_path (:obj:`str`, `optional`):
            Local path to the model if the model to train has been instantiated from a local path. If present,
            training will resume from the optimizer/scheduler states loaded here.
        trial (:obj:`optuna.Trial` or :obj:`Dict[str, Any]`, `optional`):
            The trial run or the hyperparameter dictionary for hyperparameter search.
    """
    # This might change the seed so needs to run first.
    self._hp_search_setup(trial)

    # Model re-init
    if self.model_init is not None:
        # Seed must be set before instantiating the model when using model_init.
        set_seed(self.args.seed)
        model = self.model_init()
        self.model = model.to(self.args.device)

        # Reinitializes optimizer and scheduler
        self.optimizer, self.lr_scheduler = None, None

    # Data loader and number of training steps
    train_dataloader = self.get_train_dataloader()
    if self.args.max_steps > 0:
        t_total = self.args.max_steps
        num_train_epochs = (
            self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1
        )
    else:
        t_total = int(len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs)
        num_train_epochs = self.args.num_train_epochs
    self.args.max_steps = t_total

    self.create_optimizer_and_scheduler(num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if (
        model_path is not None
        and os.path.isfile(os.path.join(model_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(model_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        self.optimizer.load_state_dict(
            torch.load(os.path.join(model_path, "optimizer.pt"), map_location=self.args.device)
        )
        self.lr_scheduler.load_state_dict(torch.load(os.path.join(model_path, "scheduler.pt")))

    model = self.model
    if self.args.fp16 and _use_apex:
        if not is_apex_available():
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, self.optimizer = amp.initialize(model, self.optimizer, opt_level=self.args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if self.args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if self.args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[self.args.local_rank],
            output_device=self.args.local_rank,
            find_unused_parameters=True,
        )

    if self.tb_writer is not None:
        self.tb_writer.add_text("args", self.args.to_json_string())
        self.tb_writer.add_hparams(self.args.to_sanitized_dict(), metric_dict={})

    # Train!
    if is_torch_tpu_available():
        total_train_batch_size = self.args.train_batch_size * xm.xrt_world_size()
    else:
        total_train_batch_size = (
            self.args.train_batch_size
            * self.args.gradient_accumulation_steps
            * (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1)
        )
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", self.num_examples(train_dataloader))
    logger.info("  Num Epochs = %d", num_train_epochs)
    logger.info("  Instantaneous batch size per device = %d", self.args.per_device_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d", total_train_batch_size)
    logger.info("  Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    self.global_step = 0
    self.epoch = 0
    self.total_flos = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if model_path is not None:
        # set global_step to global_step of last saved checkpoint from model path
        try:
            self.global_step = int(model_path.split("-")[-1].split(os.path.sep)[0])
            self.total_flos = getattr(model.config, "total_flos", 0)
            epochs_trained = self.global_step // (len(train_dataloader) // self.args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = self.global_step % (
                len(train_dataloader) // self.args.gradient_accumulation_steps
            )

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", self.global_step)
            logger.info("  Continuing training from %d non-embedding floating-point operations", self.total_flos)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            self.global_step = 0
            self.total_flos = 0
            logger.info("  Starting fine-tuning.")

    tr_loss = torch.tensor(0.0).to(self.args.device)
    logging_loss_scalar = 0.0
    model.zero_grad()
    disable_tqdm = self.args.disable_tqdm or not self.is_local_process_zero()
    train_pbar = trange(epochs_trained, int(np.ceil(num_train_epochs)), desc="Epoch", disable=disable_tqdm)
    for epoch in range(epochs_trained, int(np.ceil(num_train_epochs))):
        if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler):
            train_dataloader.sampler.set_epoch(epoch)

        if is_torch_tpu_available():
            parallel_loader = pl.ParallelLoader(train_dataloader, [self.args.device]).per_device_loader(
                self.args.device
            )
            epoch_iterator = parallel_loader
        else:
            epoch_iterator = train_dataloader

        # Reset the past mems state at the beginning of each epoch if necessary.
        if self.args.past_index >= 0:
            self._past = None

        epoch_pbar = tqdm(epoch_iterator, desc="Iteration", disable=disable_tqdm)

        if (self.reducing_heads or self.annealing) and t_total < self.cooldown_steps:
            logger.warning("It never cools down!!! total steps: {}".format(t_total))

        for step, inputs in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                epoch_pbar.update(1)
                continue

            if self.reducing_heads and self.global_step <= self.cooldown_steps:
                num_of_heads = int(
                    self.starting_num_of_heads
                    - self.global_step / self.cooldown_steps * (self.starting_num_of_heads - self.num_of_heads)
                )
            else:
                num_of_heads = self.num_of_heads
            # print("num of heads: {}".format(num_of_heads))

            if self.ste:
                model.apply_dropout(num_of_heads, ste=self.ste)
            else:
                if self.annealing and self.global_step <= self.cooldown_steps:
                    temperature = np.exp(
                        np.log(self.starting_temperature)
                        - self.global_step / self.cooldown_steps
                        * (np.log(self.starting_temperature) - np.log(self.temperature))
                    )
                else:
                    temperature = self.temperature
                # print("temperature: {}".format(temperature))
                model.apply_dropout(num_of_heads, temperature=temperature)

            tr_loss += self.training_step(model, inputs)
            self.total_flos += self.floating_point_ops(inputs)

            if (step + 1) % self.args.gradient_accumulation_steps == 0 or (
                # last step in epoch but step is always smaller than gradient_accumulation_steps
                len(epoch_iterator) <= self.args.gradient_accumulation_steps
                and (step + 1) == len(epoch_iterator)
            ):
                if self.args.fp16 and _use_native_amp:
                    self.scaler.unscale_(self.optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm)
                elif self.args.fp16 and _use_apex:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm)

                if is_torch_tpu_available():
                    xm.optimizer_step(self.optimizer)
                elif self.args.fp16 and _use_native_amp:
                    self.scaler.step(self.optimizer)
                    self.scaler.update()
                else:
                    self.optimizer.step()

                self.lr_scheduler.step()
                model.zero_grad()

                if self.intermediate_masks and (self.global_step % 1000 == 0 or self.global_step == t_total - 1):
                    torch.save(
                        model.get_masks(),
                        os.path.join(self.args.output_dir, "mask" + str(self.global_step) + ".pt"),
                    )

                self.global_step += 1
                self.epoch = epoch + (step + 1) / len(epoch_iterator)

                if (self.args.logging_steps > 0 and self.global_step % self.args.logging_steps == 0) or (
                    self.global_step == 1 and self.args.logging_first_step
                ):
                    logs: Dict[str, float] = {}
                    tr_loss_scalar = tr_loss.item()
                    logs["loss"] = (tr_loss_scalar - logging_loss_scalar) / self.args.logging_steps
                    # backward compatibility for pytorch schedulers
                    logs["learning_rate"] = (
                        self.lr_scheduler.get_last_lr()[0]
                        if version.parse(torch.__version__) >= version.parse("1.4")
                        else self.lr_scheduler.get_lr()[0]
                    )
                    logging_loss_scalar = tr_loss_scalar

                    self.log(logs)

                if self.args.evaluate_during_training and self.global_step % self.args.eval_steps == 1:
                    metrics = self.evaluate()
                    self._report_to_hp_search(trial, epoch, metrics)

                if self.args.save_steps > 0 and self.global_step % self.args.save_steps == 0:
                    # In all cases (even distributed/parallel), self.model is always a reference
                    # to the model we want to save.
                    if hasattr(model, "module"):
                        assert (
                            model.module is self.model
                        ), f"Module {model.module} should be a reference to self.model"
                    else:
                        assert model is self.model, f"Model {model} should be a reference to self.model"
                    # Save model checkpoint
                    checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.global_step}"

                    if self.hp_search_backend is not None and trial is not None:
                        run_id = (
                            trial.number
                            if self.hp_search_backend == HPSearchBackend.OPTUNA
                            else tune.get_trial_id()
                        )
                        checkpoint_folder += f"-run-{run_id}"
                    output_dir = os.path.join(self.args.output_dir, checkpoint_folder)

                    self.save_model(output_dir)

                    if self.is_world_process_zero():
                        self._rotate_checkpoints(use_mtime=True)

                    if is_torch_tpu_available():
                        xm.rendezvous("saving_optimizer_states")
                        xm.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                        xm.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    elif self.is_world_process_zero():
                        torch.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                        torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))

            epoch_pbar.update(1)
            if self.args.max_steps > 0 and self.global_step >= self.args.max_steps:
                break
        epoch_pbar.close()
        train_pbar.update(1)

        if self.args.tpu_metrics_debug or self.args.debug:
            if is_torch_tpu_available():
                # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
                xm.master_print(met.metrics_report())
            else:
                logger.warning(
                    "You enabled PyTorch/XLA debug metrics but you don't have a TPU "
                    "configured. Check your training configuration if this is unexpected."
                )
        if self.args.max_steps > 0 and self.global_step >= self.args.max_steps:
            break

    train_pbar.close()
    if self.tb_writer:
        self.tb_writer.close()
    if self.args.past_index and hasattr(self, "_past"):
        # Clean the state at the end of training
        delattr(self, "_past")

    logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n")
    return TrainOutput(self.global_step, tr_loss.item() / self.global_step)
def _save_checkpoint(self, model, trial, metrics=None):
    """
    Compared to the original implementation, we change the saving policy to only save the
    best-validation checkpoints.
    """
    # In all cases, including ddp/dp/deepspeed, self.model is always a reference to the model we
    # want to save.
    assert _model_unwrap(model) is self.model, "internal model should be a reference to self.model"

    # Determine the new best metric / best model checkpoint
    if metrics is not None and self.args.metric_for_best_model is not None:
        metric_to_check = self.args.metric_for_best_model
        if not metric_to_check.startswith("eval_"):
            metric_to_check = f"eval_{metric_to_check}"
        metric_value = metrics[metric_to_check]

        operator = np.greater if self.args.greater_is_better else np.less
        if (
            self.state.best_metric is None
            or self.state.best_model_checkpoint is None
            or operator(metric_value, self.state.best_metric)
        ):
            output_dir = self.args.output_dir
            self.state.best_metric = metric_value
            self.state.best_model_checkpoint = output_dir

            # Only save model when it is the best one
            self.save_model(output_dir)
            if self.deepspeed:
                self.deepspeed.save_checkpoint(output_dir)

            # Save optimizer and scheduler
            if self.sharded_dpp:
                self.optimizer.consolidate_state_dict()

            if is_torch_tpu_available():
                xm.rendezvous("saving_optimizer_states")
                xm.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                with warnings.catch_warnings(record=True) as caught_warnings:
                    xm.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    reissue_pt_warnings(caught_warnings)
            elif self.is_world_process_zero() and not self.deepspeed:
                # deepspeed.save_checkpoint above saves model/optim/sched
                torch.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                with warnings.catch_warnings(record=True) as caught_warnings:
                    torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                reissue_pt_warnings(caught_warnings)

            # Save the Trainer state
            if self.is_world_process_zero():
                self.state.save_to_json(os.path.join(output_dir, "trainer_state.json"))
    else:
        # Save model checkpoint
        checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"

        if self.hp_search_backend is not None and trial is not None:
            if self.hp_search_backend == HPSearchBackend.OPTUNA:
                run_id = trial.number
            else:
                from ray import tune

                run_id = tune.get_trial_id()
            run_name = self.hp_name(trial) if self.hp_name is not None else f"run-{run_id}"
            output_dir = os.path.join(self.args.output_dir, run_name, checkpoint_folder)
        else:
            output_dir = os.path.join(self.args.output_dir, checkpoint_folder)

        self.store_flos()
        self.save_model(output_dir)
        if self.deepspeed:
            self.deepspeed.save_checkpoint(output_dir)

        # Save optimizer and scheduler
        if self.sharded_dpp:
            self.optimizer.consolidate_state_dict()

        if is_torch_tpu_available():
            xm.rendezvous("saving_optimizer_states")
            xm.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
            with warnings.catch_warnings(record=True) as caught_warnings:
                xm.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                reissue_pt_warnings(caught_warnings)
        elif self.is_world_process_zero() and not self.deepspeed:
            # deepspeed.save_checkpoint above saves model/optim/sched
            torch.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
            with warnings.catch_warnings(record=True) as caught_warnings:
                torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
            reissue_pt_warnings(caught_warnings)

        # Save the Trainer state
        if self.is_world_process_zero():
            self.state.save_to_json(os.path.join(output_dir, "trainer_state.json"))

        # Maybe delete some older checkpoints.
        if self.is_world_process_zero():
            self._rotate_checkpoints(use_mtime=True)
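
# A sketch of the "is this the new best checkpoint?" test used above, with the
# comparison direction controlled by greater_is_better (np.greater vs. np.less).
# The helper name is illustrative, not part of the original code.
import numpy as np


def is_new_best(metric_value: float, best_metric, greater_is_better: bool) -> bool:
    operator = np.greater if greater_is_better else np.less
    return best_metric is None or bool(operator(metric_value, best_metric))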
def get_pl_logger(hp: ExperimentParams, tune=None):
    version = 'local' if tune is None else tune.get_trial_id()
    logger = MyLightningNeptuneLogger(hp=hp, version=version, offline_mode=hp.offline_mode)
    return logger
def _run_experiment(
    self,
    config,
    checkpoint_dir,
    hyperopt_dict,
    decode_ctx,
    features_eligible_for_shared_params,
    is_using_ray_backend=False,
):
    for gpu_id in ray.get_gpu_ids():
        # Previous trial may not have freed its memory yet, so wait to avoid OOM
        wait_for_gpu(gpu_id)

    # Some config values may be JSON encoded as strings, so decode them here
    config = self.decode_values(config, decode_ctx)

    # Remove mlflow injected config parameters: https://github.com/ludwig-ai/ludwig/issues/2288
    if "mlflow" in config:
        del config["mlflow"]

    trial_id = tune.get_trial_id()
    trial_dir = Path(tune.get_trial_dir())
    driver_trial_location = ray.util.get_node_ip_address()

    modified_config = substitute_parameters(
        copy.deepcopy(hyperopt_dict["config"]), config, features_eligible_for_shared_params
    )

    hyperopt_dict["config"] = modified_config
    hyperopt_dict["experiment_name"] = f'{hyperopt_dict["experiment_name"]}_{trial_id}'
    hyperopt_dict["output_directory"] = str(trial_dir)

    tune_executor = self
    if is_using_ray_backend:
        ray_queue = RayQueue(actor_options={"num_cpus": 0})
    else:
        ray_queue = None

    def report(progress_tracker):
        # The progress tracker's metrics are nested dictionaries of TrainerMetrics:
        # feature_name -> metric_name -> List[TrainerMetric], with one entry per
        # training checkpoint, according to steps_per_checkpoint.
        # We reduce the dictionary of TrainerMetrics to a simple list of floats for
        # interfacing with Ray Tune.
        train_stats = {
            TRAINING: metric_utils.reduce_trainer_metrics_dict(progress_tracker.train_metrics),
            VALIDATION: metric_utils.reduce_trainer_metrics_dict(progress_tracker.validation_metrics),
            TEST: metric_utils.reduce_trainer_metrics_dict(progress_tracker.test_metrics),
        }

        metric_score = tune_executor.get_metric_score(train_stats)
        tune.report(
            parameters=json.dumps(config, cls=NumpyEncoder),
            metric_score=metric_score,
            training_stats=json.dumps(train_stats, cls=NumpyEncoder),
            eval_stats="{}",
            trial_id=tune.get_trial_id(),
            trial_dir=tune.get_trial_dir(),
        )

    class RayTuneReportCallback(Callback):
        def __init__(self):
            super().__init__()
            self.last_steps = 0

        def _get_remote_checkpoint_dir(self) -> Optional[Union[str, Tuple[str, str]]]:
            # sync client has to be recreated to avoid issues with serialization
            return tune_executor._get_remote_checkpoint_dir(trial_dir)

        def _checkpoint_progress(self, trainer, progress_tracker, save_path) -> None:
            """Checkpoints the progress tracker."""
            if is_using_ray_backend:
                save_path = Path(save_path)
                remote_checkpoint_dir = self._get_remote_checkpoint_dir()
                if remote_checkpoint_dir is not None:
                    sync_client = tune_executor.sync_client
                    sync_client.sync_up(str(save_path.parent.parent.absolute()), remote_checkpoint_dir)
                    sync_client.wait_or_retry()
                ray_queue.put((progress_tracker, str(save_path)))
                return
            checkpoint(progress_tracker, save_path)

        def on_trainer_train_setup(self, trainer, save_path, is_coordinator):
            if is_using_ray_backend and checkpoint_dir and driver_trial_location != ray.util.get_node_ip_address():
                save_path = Path(save_path)

                for path in trial_dir.glob("checkpoint*"):
                    if path not in (save_path.parent, checkpoint_dir):
                        shutil.rmtree(path, ignore_errors=True)

                remote_checkpoint_dir = self._get_remote_checkpoint_dir()
                if remote_checkpoint_dir is not None:
                    sync_client = tune_executor.sync_client
                    sync_client.sync_down(remote_checkpoint_dir, str(trial_dir.absolute()))
                    sync_client.wait_or_retry()

        def on_eval_end(self, trainer, progress_tracker, save_path):
            progress_tracker.tune_checkpoint_num += 1
            self.last_steps = progress_tracker.steps
            self._checkpoint_progress(trainer, progress_tracker, save_path)
            if not is_using_ray_backend:
                report(progress_tracker)

        def on_trainer_train_teardown(self, trainer, progress_tracker, save_path, is_coordinator):
            if is_coordinator and progress_tracker.steps > self.last_steps:
                # Note: Calling tune.report in both on_eval_end() and here can cause multiprocessing issues
                # for some ray samplers if no steps have happened since the last eval.
                self._checkpoint_progress(trainer, progress_tracker, save_path)
                if not is_using_ray_backend:
                    report(progress_tracker)

    callbacks = hyperopt_dict.get("callbacks") or []
    hyperopt_dict["callbacks"] = callbacks + [RayTuneReportCallback()]

    # set tune resources
    if is_using_ray_backend:
        resources = tune.get_trial_resources()
        # check if we are using at least 1 gpu per trial
        use_gpu = bool(self._gpu_resources_per_trial_non_none)
        # get the resources assigned to the current trial
        num_gpus = resources.required_resources.get("GPU", 0)
        num_cpus = resources.required_resources.get("CPU", 1) if num_gpus == 0 else 0

        hvd_kwargs = {
            "num_workers": int(num_gpus) if use_gpu else 1,
            "use_gpu": use_gpu,
            "resources_per_worker": {
                "CPU": num_cpus,
                "GPU": 1 if use_gpu else 0,
            },
        }
        hyperopt_dict["backend"].set_distributed_kwargs(**hvd_kwargs)

        logger.debug(f"Trial horovod kwargs: {hvd_kwargs}")

    stats = []

    def _run():
        train_stats, eval_stats = run_experiment(
            **hyperopt_dict,
            model_resume_path=checkpoint_dir,
            parameters=config,
        )
        stats.append((train_stats, eval_stats))

    if is_using_ray_backend:
        # We have to pull the results to the trial actor
        # from worker actors, as the Tune session is running
        # only on the trial actor
        thread = threading.Thread(target=_run)
        thread.daemon = True
        thread.start()

        if self.sync_config is not None:
            remote_checkpoint_dir = self._get_remote_checkpoint_dir(trial_dir)

        def check_queue():
            qsize = ray_queue.qsize()
            if qsize:
                results = ray_queue.get_nowait_batch(qsize)
                if self.sync_client is not None:
                    self.sync_client.sync_down(remote_checkpoint_dir, str(trial_dir.absolute()))
                    self.sync_client.wait()
                for progress_tracker, save_path in results:
                    checkpoint(progress_tracker, str(trial_dir.joinpath(Path(save_path))))
                    report(progress_tracker)

        while thread.is_alive():
            thread.join(timeout=0)
            check_queue()
            time.sleep(0.1)
        thread.join()
        check_queue()
    else:
        # remove threading overhead
        _run()

    if not stats:
        raise RuntimeError("Experiment did not complete.")
    train_stats, eval_stats = stats.pop()

    metric_score = self.get_metric_score(train_stats)
    tune.report(
        parameters=json.dumps(config, cls=NumpyEncoder),
        metric_score=metric_score,
        training_stats=json.dumps(train_stats, cls=NumpyEncoder),
        eval_stats=json.dumps(eval_stats, cls=NumpyEncoder),
        trial_id=tune.get_trial_id(),
        trial_dir=tune.get_trial_dir(),
    )
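
# A sketch of the resource-to-Horovod translation used above as a pure function.
# `required_resources` is the dict form exposed by tune.get_trial_resources();
# the helper name and signature are illustrative, not from the original code.
def horovod_kwargs_from_resources(required_resources: dict, use_gpu: bool) -> dict:
    num_gpus = required_resources.get("GPU", 0)
    num_cpus = required_resources.get("CPU", 1) if num_gpus == 0 else 0
    return {
        "num_workers": int(num_gpus) if use_gpu else 1,
        "use_gpu": use_gpu,
        "resources_per_worker": {
            "CPU": num_cpus,
            "GPU": 1 if use_gpu else 0,
        },
    }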
def tune_train(args,
               model_class,
               task_info: TaskInfo,
               build_method=default_build_method,
               model_kwargs: dict = None,
               tune_config=None):
    if model_kwargs is None:
        model_kwargs = {}
    this_time = time.strftime("%m-%d_%H:%M:%S", time.localtime())
    experiment_name = f'{task_info.task_name}_{this_time}'

    if tune_config is None:
        config = {
            # 3e-4 for Small, 1e-4 for Base, 5e-5 for Large
            "lr": tune.loguniform(args.tune_min_lr, args.tune_max_lr),
            # -1 for disable, 0.8 for Base/Small, 0.9 for Large
            "layerwise_lr_decay_power": tune.choice([0.8, 0.9]),
            # lr scheduler
            "lr_scheduler": tune.choice([
                'linear_schedule_with_warmup',
                'polynomial_decay_schedule_with_warmup'
            ]),
        }
    else:
        config = tune_config

    if torch.cuda.is_available():
        resources_per_trial = {
            "cpu": args.tune_cpus_per_trial,
            "gpu": args.tune_gpus_per_trial
        }
    else:
        resources_per_trial = {"cpu": args.tune_cpus_per_trial}
    print("resources_per_trial", resources_per_trial)

    tune_dir = os.path.abspath('tune_lightning_logs')

    analysis = tune.run(
        tune.with_parameters(
            tune_train_once,
            args=args,
            task_info=task_info,
            model_class=model_class,
            build_method=build_method,
            model_kwargs=model_kwargs,
            resume=args.tune_resume,
            group=experiment_name,
            log_dir=tune_dir,
        ),
        mode="max",
        config=config,
        num_samples=args.tune_num_samples,
        metric=f'tune_{task_info.metric_name}',
        name=experiment_name,
        progress_reporter=CLIReporter(
            parameter_columns=list(config.keys()),
            metric_columns=[
                "loss", f'tune_{task_info.metric_name}', "training_iteration"
            ]),
        callbacks=[TBXLoggerCallback(), CSVLoggerCallback()],
        resources_per_trial=resources_per_trial,
        scheduler=ASHAScheduler(
            max_t=args.max_epochs + 1,  # for test
            grace_period=args.min_epochs),
        queue_trials=True,
        keep_checkpoints_num=args.tune_keep_checkpoints_num,
        checkpoint_score_attr=f'tune_{task_info.metric_name}',
        local_dir=tune_dir,
    )

    print("Best hyperparameters found were: ", analysis.best_config)
    print("Best checkpoint: ", analysis.best_checkpoint)

    args_vars = vars(args)
    args_vars.update(analysis.best_config)
    model = model_class.load_from_checkpoint(
        os.path.join(analysis.best_checkpoint, "tune.ckpt"),
        hparams=args,
        **model_kwargs)

    pl_loggers = [
        loggers.CSVLogger(save_dir=tune.get_trial_dir(), name="", version="."),
        loggers.TensorBoardLogger(save_dir=tune.get_trial_dir(),
                                  name="",
                                  version=".",
                                  default_hp_metric=False),
    ]
    try:
        import wandb
        pl_loggers.append(
            loggers.WandbLogger(save_dir=tune_dir,
                                project=args.project,
                                name=tune.get_trial_name(),
                                id=tune.get_trial_id(),
                                offline=args.offline,
                                group=experiment_name))
    except Exception:
        pass

    trainer: Trainer = Trainer.from_argparse_args(args, logger=pl_loggers)
    build_method(model, task_info)
    trainer.test(model)
def tune_train_once(config,
                    checkpoint_dir=None,
                    args: argparse.Namespace = None,
                    model_class: type = None,
                    build_method=None,
                    task_info: TaskInfo = None,
                    model_kwargs: dict = None,
                    resume: str = None,
                    group: str = None,
                    log_dir: str = None,
                    **kwargs):
    if resume is None:
        resume = 'all'
    args_vars = vars(args)
    args_vars.update(config)

    pl.seed_everything(args.seed)
    pl_loggers = [
        loggers.CSVLogger(save_dir=tune.get_trial_dir(), name="", version="."),
        loggers.TensorBoardLogger(save_dir=tune.get_trial_dir(),
                                  name="",
                                  version=".",
                                  default_hp_metric=False),
    ]
    try:
        import wandb
        pl_loggers.append(
            loggers.WandbLogger(save_dir=log_dir or 'tune_lightning_logs',
                                project=args.project,
                                name=tune.get_trial_name(),
                                id=tune.get_trial_id(),
                                offline=args.offline,
                                group=group))
    except Exception:
        pass

    trainer_args = dict(
        logger=pl_loggers,
        progress_bar_refresh_rate=0,
        callbacks=[
            TuneReportCheckpointCallback(
                metrics={
                    f'tune_{task_info.metric_name}':
                    f'{task_info.task_name}/val_{task_info.metric_name}'
                },
                filename="tune.ckpt",
                on="validation_end")
        ])
    if checkpoint_dir and resume == 'all':
        trainer_args['resume_from_checkpoint'] = os.path.join(checkpoint_dir, "tune.ckpt")

    # fix slurm trainer
    os.environ["SLURM_JOB_NAME"] = "bash"

    model = model_class(args, **model_kwargs)
    build_method(model, task_info)
    trainer: Trainer = Trainer.from_argparse_args(args, **trainer_args)
    if checkpoint_dir and resume == 'model':
        ckpt = pl_load(os.path.join(checkpoint_dir, "tune.ckpt"),
                       map_location=lambda storage, loc: storage)
        model = model._load_model_state(ckpt)
        trainer.current_epoch = ckpt["epoch"]

    trainer.fit(model)