def dump_errors(self, checkpoint_path, output_file="mistakes.txt", task_name=SNLITask.NAME):
    # Load the test split of the requested task
    if task_name == SNLITask.NAME:
        _, test_dataset, _ = DatasetHandler.load_SNLI_datasets()
    elif task_name == MNLITask.NAME:
        _, test_dataset, _ = DatasetHandler.load_MultiNLI_datasets()
    else:
        print("Unsupported task: " + str(task_name))
        sys.exit(1)

    # Take the predictions and labels of the last recorded evaluation
    evaluation_dict = load_model(checkpoint_path)["evaluation_dict"]
    task_evaluation = evaluation_dict[max(evaluation_dict.keys())][task_name]
    predictions = task_evaluation["predictions"]
    labels = task_evaluation["labels"]

    # Indices of all misclassified examples
    mistakes = [i for i, (p, l) in enumerate(zip(predictions.tolist(), labels.tolist())) if p != l]
    print("Number of mistakes: %i | %i (%4.2f%%)" % (len(mistakes), len(test_dataset.data_list),
                                                     len(mistakes) * 100.0 / len(test_dataset.data_list)))

    # Confusion counts for every (label, prediction) pair
    print("Confusions:")
    for l in set(labels.tolist()):
        for p in set(predictions.tolist()):
            if l == p:
                continue
            num_confusions = len([m for m in mistakes if predictions[m] == p and labels[m] == l])
            print("\t- Label %s, pred %s: %i" % (test_dataset.label_to_string(l),
                                                 test_dataset.label_to_string(p),
                                                 num_confusions))

    # Dump all misclassified examples to the output file
    file_text = ""
    for example_index in mistakes:
        file_text += "-" * 50 + "\n"
        file_text += "Label: %s, Prediction: %s\n" % (test_dataset.label_to_string(labels[example_index]),
                                                      test_dataset.label_to_string(predictions[example_index]))
        file_text += "Premise: " + " ".join(test_dataset.data_list[example_index].premise_words) + "\n"
        file_text += "Hypothesis: " + " ".join(test_dataset.data_list[example_index].hypothesis_words) + "\n"
    with open(output_file, "w") as f:
        f.write(file_text)

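# Usage sketch (illustrative, not part of the original code): dump the SNLI test-set
# mistakes of a finished run to a text file. `trainer` is a placeholder for an instance
# of the class that defines dump_errors, and the checkpoint path is hypothetical.
trainer.dump_errors("checkpoints/snli_run/", output_file="snli_mistakes.txt", task_name=SNLITask.NAME)
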
def evaluate_all_models(self, checkpoint_path):
    checkpoint_files = sorted(glob(os.path.join(checkpoint_path, "*.tar")))
    model_results = dict()
    for checkpoint_file in checkpoint_files:
        checkpoint_dict = load_model(checkpoint_file, model=self.model, tasks=self.tasks)
        epoch = checkpoint_dict["epoch"]
        model_results[epoch] = {
            "checkpoint_file": checkpoint_file,
            "train": self.eval(dataset=self.train_dataset),
            "val": self.eval(dataset=self.val_dataset),
            "test": self.eval(dataset=self.test_dataset),
        }
        print("Model at epoch %i achieved %4.2f%% on validation and %4.2f%% on test dataset"
              % (epoch, 100.0 * model_results[epoch]["val"], 100.0 * model_results[epoch]["test"]))

    best_acc = {
        "train": {"acc": 0, "epoch": 0},
        "val": {"acc": 0, "epoch": 0},
        "test": {"acc": 0, "epoch": 0},
    }
    for epoch, epoch_dict in model_results.items():
        for data in ["train", "val", "test"]:
            if epoch_dict[data] > best_acc[data]["acc"]:
                best_acc[data]["epoch"] = epoch
                best_acc[data]["acc"] = epoch_dict[data]

    print("Best train accuracy: %4.2f%% (epoch %i)" % (100.0 * best_acc["train"]["acc"], best_acc["train"]["epoch"]))
    print("Best validation accuracy: %4.2f%% (epoch %i)" % (100.0 * best_acc["val"]["acc"], best_acc["val"]["epoch"]))
    print("Best test accuracy: %4.2f%% (epoch %i)" % (100.0 * best_acc["test"]["acc"], best_acc["test"]["epoch"]))
    return model_results, best_acc

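# Usage sketch (illustrative): evaluate every saved checkpoint of a run and pick the one
# with the best validation accuracy. `trainer` and the checkpoint directory are placeholders.
model_results, best_acc = trainer.evaluate_all_models("checkpoints/snli_run/")
best_val_epoch = best_acc["val"]["epoch"]
best_checkpoint = model_results[best_val_epoch]["checkpoint_file"]
print("Checkpoint with best validation accuracy: %s" % best_checkpoint)
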
def create_model(checkpoint_path, model_type, model_params):
    # Build the encoder with pretrained word vectors and restore the checkpoint weights
    word2vec, word2id, wordvec_tensor = load_word2vec_from_file()
    model = MultiTaskEncoder(model_type, model_params, wordvec_tensor)
    _ = load_model(checkpoint_path, model=model)
    # Freeze all parameters; the returned model is only used for inference
    for param in model.parameters():
        param.requires_grad = False
    model.eval()
    return model

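# Usage sketch (illustrative): build a frozen, inference-only encoder from a checkpoint.
# The checkpoint path, model type, and parameter dictionary are placeholders and must
# match the configuration the checkpoint was trained with.
encoder = create_model(
    checkpoint_path="checkpoints/multitask_run/",
    model_type=0,  # placeholder model-type identifier
    model_params={"embed_word_dim": 300, "hidden_size": 2048},  # placeholder parameters
)
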
def visualize_tensorboard(self, checkpoint_path, optimizer_params=None, replace_old_files=False, additional_datasets=None):
    # Optionally remove old tensorboard event files before writing new ones
    if replace_old_files:
        for old_tf_file in sorted(glob(os.path.join(checkpoint_path, "events.out.tfevents.*"))):
            print("Removing " + old_tf_file + "...")
            os.remove(old_tf_file)

    writer = SummaryWriter(log_dir=checkpoint_path)
    # dummy_embeds, dummy_length, _ = self.train_dataset.get_batch(self.batch_size, loop_dataset=False, toTorch=True, bidirectional=self.model.is_bidirectional())
    # writer.add_graph(self.model, (dummy_embeds[0], dummy_length[0], dummy_embeds[1], dummy_length[1]))

    # Replay the recorded training loss and evaluation accuracy into tensorboard
    final_dict = load_model(checkpoint_path)
    for batch in range(len(final_dict["loss_avg_list"])):
        writer.add_scalar("train/loss", final_dict["loss_avg_list"][batch], batch * 50 + 1)
    for epoch in range(len(final_dict["eval_accuracies"])):
        writer.add_scalar("eval/accuracy", final_dict["eval_accuracies"][epoch], epoch + 1)

    # Reconstruct the learning rate schedule if the optimizer parameters are known
    if optimizer_params is not None:
        lr = optimizer_params["lr"]
        lr_decay_step = optimizer_params["lr_decay_step"]
        for epoch in range(len(final_dict["eval_accuracies"])):
            writer.add_scalar("train/learning_rate", lr, epoch + 1)
            if epoch in final_dict["lr_red_step"]:
                lr *= lr_decay_step

    # model_results, best_acc = self.evaluate_all_models(checkpoint_path)
    # for epoch, result_dict in model_results.items():
    #     for data in ["train", "val", "test"]:
    #         writer.add_scalar("eval/" + data + "_accuracy", result_dict[data], epoch+1)

    # Load the best checkpoint and add t-SNE embedding visualizations
    max_acc = max(final_dict["eval_accuracies"])
    best_epoch = final_dict["eval_accuracies"].index(max_acc) + 1
    load_model(os.path.join(checkpoint_path, "checkpoint_" + str(best_epoch).zfill(3) + ".tar"), model=self.model)
    visualize_tSNE(self.model, self.test_easy_dataset, writer, embedding_name="Test set easy", add_reduced_version=True)
    visualize_tSNE(self.model, self.test_hard_dataset, writer, embedding_name="Test set hard", add_reduced_version=True)
    if additional_datasets is not None:
        for dataset_name, dataset in additional_datasets.items():
            print("Adding embeddings for dataset " + str(dataset_name))
            visualize_tSNE(self.model, dataset, writer, embedding_name=dataset_name, add_reduced_version=True)
    writer.close()

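# Usage sketch (illustrative): regenerate tensorboard logs for a finished run. `trainer`
# and the checkpoint directory are placeholders; the optimizer parameters must match the
# values used during training for the reconstructed learning-rate curve to be meaningful.
trainer.visualize_tensorboard("checkpoints/snli_run/",
                              optimizer_params={"lr": 0.1, "lr_decay_step": 0.2},
                              replace_old_files=True)
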
def load_our_model(checkpoint_path):
    global OUR_MODEL
    if OUR_MODEL is None:
        args = load_args(checkpoint_path)
        print("-> Loading model...")
        model_params, _ = unsupervised_args_to_params(args)
        _, _, wordvec_tensor = load_word2vec_from_file()
        model = ModelUnsupervisedContextParaphrasingTemplate(model_params, wordvec_tensor)
        print(checkpoint_path)
        _ = load_model(checkpoint_path, model=model, load_best_model=True)
        model = model.to(get_device())
        model.eval()
        OUR_MODEL = model
    return OUR_MODEL

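# Usage sketch (illustrative): the checkpoint directory is a placeholder. Repeated calls
# return the same module because the loaded model is cached in the global OUR_MODEL.
model = load_our_model("checkpoints/paraphrase_run/")
model_again = load_our_model("checkpoints/paraphrase_run/")
assert model is model_again  # second call reuses the cached instance
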
def test_best_model(self, checkpoint_path, main_task=None, delete_others=False, run_standard_eval=True,
                    run_training_set=False, run_sent_eval=True, run_extra_eval=True, light_senteval=True,
                    final_eval_dict=None):
    if final_eval_dict is None:
        final_eval_dict = dict()
    if main_task is None:
        # Evaluate once per task, then continue with the first task as the main one
        for t in self.tasks:
            self.test_best_model(checkpoint_path=checkpoint_path, main_task=t, delete_others=delete_others,
                                 run_standard_eval=run_standard_eval, run_training_set=run_training_set,
                                 run_sent_eval=False, run_extra_eval=run_extra_eval, light_senteval=True,
                                 final_eval_dict=final_eval_dict)
        main_task = self.tasks[0]
    else:
        print("Evaluating with main task " + main_task.name)

    def iter_to_file(iteration):
        return os.path.join(checkpoint_path, "checkpoint_" + str(iteration).zfill(7) + ".tar")

    # Find the checkpoint with the best evaluation metric on the main task
    final_dict = load_model(checkpoint_path)
    best_acc, best_iter = -1, -1
    for eval_iter, eval_dict in final_dict["evaluation_dict"].items():
        if main_task.eval_metric(eval_dict[main_task.name]) > best_acc and os.path.isfile(iter_to_file(eval_iter)):
            best_iter = eval_iter
            best_acc = main_task.eval_metric(eval_dict[main_task.name])
    s = "Best iteration: %s with metric value %4.2f%% on task %s\n" % (str(best_iter), best_acc * 100.0, str(main_task.name))
    print(s)

    # Load the best checkpoint and freeze the model for evaluation
    best_checkpoint_path = iter_to_file(best_iter)
    load_model(best_checkpoint_path, model=self.model, tasks=self.tasks)
    for param in self.model.parameters():
        param.requires_grad = False
    self.model.eval()

    if run_standard_eval and (main_task.name not in final_eval_dict):
        acc_dict = {'train': dict(), 'val': dict(), 'test': dict()}
        if run_training_set:
            # For training, we evaluate on the very last checkpoint as we expect the best training performance there
            load_model(checkpoint_path, model=self.model, tasks=self.tasks)
            for t in self.tasks:
                t_acc, _ = t.eval(dataset=t.train_dataset)
                acc_dict['train'][t.name] = t_acc
            # Load best checkpoint again
            load_model(best_checkpoint_path, model=self.model, tasks=self.tasks)
        for t in self.tasks:
            val_acc, detailed_val_acc = t.eval(dataset=t.val_dataset)
            if t.name == main_task.name and abs(main_task.eval_metric(detailed_val_acc) - best_acc) > 0.0005:
                print("[!] ERROR: Found different accuracy than reported in the final state dict. Difference: %f"
                      % (100.0 * abs(main_task.eval_metric(detailed_val_acc) - best_acc)))
                return
            test_acc, detailed_acc = t.eval(dataset=t.test_dataset)
            acc_dict['val'][t.name] = val_acc
            acc_dict['test'][t.name] = test_acc
            acc_dict['test'][t.name + "_detailed"] = detailed_acc
        final_eval_dict[main_task.name] = acc_dict
        with open(os.path.join(checkpoint_path, "evaluation.pik"), "wb") as f:
            pickle.dump(final_eval_dict, f)

    # if run_extra_eval:
    #     test_easy_acc = self.eval(dataset=self.test_easy_dataset)
    #     test_hard_acc = self.eval(dataset=self.test_hard_dataset)
    #     s = "Test easy accuracy: %4.2f%%\nTest hard accuracy: %4.2f%%\n" % (test_easy_acc*100.0, test_hard_acc*100.0)
    #     with open(os.path.join(checkpoint_path, "extra_evaluation.txt"), "w") as f:
    #         f.write(s)

    if run_sent_eval:
        # Run the SentEval transfer-task suite on the frozen encoder
        self.model.eval()
        res = perform_SentEval(self.model, fast_eval=light_senteval)
        with open(os.path.join(checkpoint_path, "sent_eval.pik"), "wb") as f:
            pickle.dump(res, f)

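# Usage sketch (illustrative): evaluate the best checkpoint of a finished run on every
# task and additionally run SentEval. `trainer` and the checkpoint directory are placeholders.
trainer.test_best_model("checkpoints/multitask_run/", main_task=None,
                        run_training_set=False, run_sent_eval=True, light_senteval=True)
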
def train_model(self, max_iterations=1e6, loss_freq=50, eval_freq=2000, save_freq=1e5,
                max_gradient_norm=10.0, no_model_checkpoints=False):
    # Setup training parameters
    parameters_to_optimize = self._get_all_parameters()
    print("Trainable model parameters: " + str([name for name, p in self.model.named_parameters() if p.requires_grad]))

    # Resume from the most recent checkpoint if one exists
    checkpoint_dict = self.load_recent_model()
    start_iter = get_dict_val(checkpoint_dict, "iteration", 0)
    evaluation_dict = get_dict_val(checkpoint_dict, "evaluation_dict", dict())
    best_save_dict = get_dict_val(checkpoint_dict, "best_save_dict",
                                  {"file": None, "metric": -1, "detailed_metrics": None})
    best_save_iter = best_save_dict["file"]
    last_save = None if start_iter == 0 else self.get_checkpoint_filename(start_iter)
    if last_save is not None and not os.path.isfile(last_save):
        print("[!] WARNING: Could not find last checkpoint file specified as " + last_save)
        last_save = None
    writer = SummaryWriter(self.checkpoint_path)

    # Function for saving the model. Add to this dictionary any parameters that should be saved
    def save_train_model(iteration, only_weights=True):
        if no_model_checkpoints:
            return
        checkpoint_dict = {"iteration": iteration, "best_save_dict": best_save_dict}
        self.save_model(iteration, checkpoint_dict, save_optimizer=not only_weights)

    def export_weight_parameters(iteration):
        # Export weight distributions as histograms
        for name, param in self.model.named_parameters():
            if not param.requires_grad:
                continue
            writer.add_histogram(name, param.data.view(-1), global_step=iteration)

    time_per_step = np.zeros((2,), dtype=np.float32)
    train_losses, train_accs = [], []
    if start_iter == 0 and writer is not None:
        export_weight_parameters(0)

    # Try-catch so the model can still be saved if the user terminates training
    try:
        print("=" * 50 + "\nStarting training...\n" + "=" * 50)
        self.model.train()
        for index_iter in range(start_iter, int(max_iterations)):
            # Training step
            start_time = time.time()
            loss, acc = self.task.train_step(self.batch_size, iteration=index_iter)
            self.optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(parameters_to_optimize, max_gradient_norm)
            self.optimizer.step()
            self.lr_scheduler.step()
            end_time = time.time()
            time_per_step[0] += end_time - start_time
            time_per_step[1] += 1
            train_losses.append(loss.item())
            train_accs.append(acc.item())

            # Debug loss printing
            if (index_iter + 1) % loss_freq == 0:
                loss_avg = sum(train_losses) / len(train_losses)
                acc_avg = sum(train_accs) / len(train_accs)
                print("Training iteration %i|%i. Loss: %6.5f" % (index_iter + 1, max_iterations, loss_avg))
                writer.add_scalar("train/loss", loss_avg, index_iter + 1)
                writer.add_scalar("train/acc", acc_avg, index_iter + 1)
                writer.add_scalar("train/learning_rate", self.optimizer.param_groups[0]['lr'], index_iter + 1)
                writer.add_scalar("train/training_time", time_per_step[0] / max(1e-5, time_per_step[1]), index_iter + 1)
                self.task.add_summary(writer, index_iter + 1)
                time_per_step[:] = 0
                train_losses, train_accs = [], []

            # Evaluation
            if (index_iter + 1) % eval_freq == 0:
                self.model.eval()
                eval_BLEU, detailed_scores = self.task.eval(batch_size=self.batch_size)
                self.model.train()
                write_dict_to_tensorboard(writer, detailed_scores, base_name="eval", iteration=index_iter + 1)
                if (index_iter + 1) % (eval_freq * 5) == 0:
                    export_weight_parameters(index_iter + 1)

                # TODO: Test whether this is new best score or not
                if best_save_dict["metric"] < 0 or eval_BLEU > best_save_dict["metric"]:
                    best_save_iter = self.get_checkpoint_filename(index_iter + 1)
                    if not os.path.isfile(best_save_iter):
                        print("Saving model at iteration " + str(index_iter + 1))
                        save_train_model(index_iter + 1)
                        if best_save_dict["file"] is not None and os.path.isfile(best_save_dict["file"]):
                            os.remove(best_save_dict["file"])
                        if last_save is not None and os.path.isfile(last_save):
                            os.remove(last_save)
                        best_save_dict["file"] = best_save_iter
                        last_save = best_save_iter
                    best_save_dict["metric"] = eval_BLEU
                    best_save_dict["detailed_metrics"] = detailed_scores
                    self.task.export_best_results(self.checkpoint_path, index_iter + 1)
                evaluation_dict[index_iter + 1] = best_save_dict["metric"]

            # Saving at regular intervals
            if (index_iter + 1) % save_freq == 0 and not os.path.isfile(self.get_checkpoint_filename(index_iter + 1)):
                save_train_model(index_iter + 1)
                if last_save is not None and os.path.isfile(last_save) and last_save != best_save_iter:
                    os.remove(last_save)
                last_save = self.get_checkpoint_filename(index_iter + 1)

        # After training, reload the best checkpoint and verify its evaluation score
        eval_BLEU, detailed_scores = self.task.eval(batch_size=self.batch_size)
        print("Before reloading, the model achieved a score of %f" % eval_BLEU)
        if not no_model_checkpoints and best_save_iter is not None:
            load_model(best_save_iter, model=self.model, optimizer=self.optimizer, lr_scheduler=self.lr_scheduler)
            eval_BLEU, detailed_scores = self.task.eval(batch_size=self.batch_size)
            print("Best model achieved %s" % str(eval_BLEU))
            if eval_BLEU != best_save_dict["metric"]:
                print("[!] WARNING: new evaluation differs from saved one (%s vs %s)!"
                      % (str(eval_BLEU), str(best_save_dict["metric"])))
            self.task.finalize_summary(writer, max_iterations, self.checkpoint_path)
        else:
            print("Skipping finalizing the summary because %s..."
                  % ("no model checkpoints were saved" if no_model_checkpoints else "best_save_iter was None"))
    except KeyboardInterrupt:
        print("User keyboard interrupt detected. Saving model at step %i..." % (index_iter))
        save_train_model(index_iter + 1)
        if last_save is not None and os.path.isfile(last_save) and not any(
                val == last_save for val in best_save_dict.values()):
            os.remove(last_save)

    # Write the recorded evaluation results to a text file
    with open(os.path.join(self.checkpoint_path, "results.txt"), "w") as f:
        for eval_iter, eval_metric in evaluation_dict.items():
            f.write("Iteration %i: " % (eval_iter))
            f.write("BLEU: %s" % str(eval_metric))
            f.write("\n")
    writer.close()

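# Usage sketch (illustrative): run training with explicit logging, evaluation, and
# checkpointing frequencies. `trainer` is a placeholder instance of the class that
# defines train_model; the iteration counts and gradient norm are arbitrary values.
trainer.train_model(max_iterations=100000, loss_freq=50, eval_freq=2000,
                    save_freq=10000, max_gradient_norm=2.0)
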
def load_recent_model(self):
    checkpoint_dict = load_model(self.checkpoint_path, model=self.model,
                                 optimizer=self.optimizer, lr_scheduler=self.lr_scheduler)
    return checkpoint_dict

def load_recent_model(self):
    checkpoint_dict = load_model(self.checkpoint_path, model=self.model,
                                 optimizer=self.optimizer, lr_scheduler=self.lr_scheduler)
    if len(checkpoint_dict.keys()) > 0:
        # If checkpoint is not empty, load the task-specific heads as well
        for t in self.tasks:
            t.load_from_dict(checkpoint_dict)
    return checkpoint_dict