def write_dict_to_tensorboard(writer, val_dict, base_name, iteration):
    for name, val in val_dict.items():
        if isinstance(val, dict):
            write_dict_to_tensorboard(writer, val, base_name=base_name + "/" + name, iteration=iteration)
        elif isinstance(val, (list, np.ndarray)):
            continue
        elif isinstance(val, (int, float)):
            writer.add_scalar(base_name + "/" + name, val, iteration)
        else:
            if debug_level() == 0:
                print("Skipping output \"" + str(name) + "\" of value " + str(val) + " (%s)" % (val.__class__.__name__))
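# Hedged usage sketch (added for illustration, not part of the original module):
# assumes torch with TensorBoard support is installed; "runs/example" and the
# metrics dict are made-up placeholders.
def _demo_write_dict_to_tensorboard():
    from torch.utils.tensorboard import SummaryWriter

    writer = SummaryWriter("runs/example")
    # A nested dict is logged under "train/loss", "train/eval/accuracy", "train/eval/f1".
    metrics = {"loss": 0.42, "eval": {"accuracy": 0.81, "f1": 0.78}}
    write_dict_to_tensorboard(writer, metrics, base_name="train", iteration=100)
    writer.close()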
def eval(self, dataset=None, batch_size=64):
    # Default: if no dataset is specified, we use the validation dataset
    if dataset is None:
        assert self.val_dataset is not None, "[!] ERROR: Validation dataset not loaded. Please load the dataset beforehand for evaluation."
        dataset = self.val_dataset

    self.model.eval()

    # Prepare metrics
    number_batches = int(math.ceil(dataset.get_num_examples() * 1.0 / batch_size))
    eval_metrics = None
    eval_loss = []
    num_counter = []

    # Evaluation loop
    with torch.no_grad():
        for batch_ind in range(number_batches):
            if debug_level() == 0:
                print("Evaluation process: %4.2f%%" % (100.0 * batch_ind / number_batches), end="\r")
            # Evaluate single batch
            batch = dataset.get_batch(batch_size, loop_dataset=False, toTorch=True)
            batch_loss, additional_metrics = self._eval_batch(batch)
            if eval_metrics is None:
                eval_metrics = {metric_name: [metric_val.item()]
                                for metric_name, metric_val in additional_metrics.items()}
            else:
                for metric_name, metric_val in additional_metrics.items():
                    eval_metrics[metric_name].append(metric_val.item())
            eval_loss.append(batch_loss.item())
            num_counter.append(batch[0].size(0))

    # Weight each batch by its number of examples so the last (smaller) batch
    # does not distort the dataset-level means
    mean_loss = sum([n * l for n, l in zip(num_counter, eval_loss)]) / sum(num_counter)
    detailed_metrics = {
        metric_name: sum([n * l for n, l in zip(num_counter, metric_vals)]) / sum(num_counter)
        for metric_name, metric_vals in eval_metrics.items()
    }
    detailed_metrics["eval_loss"] = mean_loss

    self.model.train()

    return mean_loss, detailed_metrics
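# Hedged sketch (illustration only, not from the original repo): reproduces the
# size-weighted averaging used by eval() above on made-up batch statistics.
def _demo_weighted_eval_loss():
    num_counter = [64, 64, 23]          # examples per batch (last batch smaller)
    eval_loss = [0.52, 0.47, 0.61]      # mean loss reported for each batch
    # Weighting by batch size gives the exact mean over all examples, which a
    # plain average of per-batch losses would not.
    mean_loss = sum(n * l for n, l in zip(num_counter, eval_loss)) / sum(num_counter)
    return mean_loss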
def run_inference(model, input_file, output_file=None, batch_size=64, load_file=True):
    infer_dataset = create_dataset_from_file(input_file, load_file=load_file)
    num_batches = int(math.ceil(infer_dataset.get_num_examples() * 1.0 / batch_size))
    predictions = list()

    for batch_index in range(num_batches):
        if debug_level() == 0:
            print("Inference process: %4.2f%%" % (100.0 * batch_index / num_batches), end="\r")
        embeds, lengths, _ = infer_dataset.get_batch(batch_size, loop_dataset=False, toTorch=True)
        preds = model(words_s1=embeds[0], lengths_s1=lengths[0],
                      words_s2=embeds[1], lengths_s2=lengths[1],
                      applySoftmax=True)
        _, pred_labels = torch.max(preds, dim=-1)
        out = torch.squeeze(pred_labels).tolist()
        # A single-example batch squeezes to a scalar, so wrap it back into a list
        predictions += out if isinstance(out, list) else [out]

    out_s = ""
    for i in range(len(infer_dataset.data_list)):
        out_s += "=" * 100 + "\n"
        out_s += " Example %i\n" % (i + 1)
        out_s += "-" * 100 + "\n"
        out_s += " Premise: " + infer_dataset.data_list[i].get_premise() + "\n"
        out_s += " Hypothesis: " + infer_dataset.data_list[i].get_hypothesis() + "\n"
        out_s += " Prediction: " + NLIData.label_to_string(predictions[i]) + "\n"
        out_s += "=" * 100 + "\n\n"

    if output_file is not None:
        with open(output_file, "w") as f:
            f.write(out_s)
    print(out_s)
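# Hedged sketch (illustration only): shows why run_inference() re-wraps the
# squeezed predictions when the final batch holds a single example.
def _demo_prediction_unpacking():
    import torch

    logits = torch.tensor([[0.1, 0.7, 0.2]])          # batch of one example
    _, pred_labels = torch.max(logits, dim=-1)
    out = torch.squeeze(pred_labels).tolist()
    # For a single example, squeeze() yields a 0-d tensor and tolist() returns a
    # plain int, so the result has to be wrapped before extending the list.
    return out if isinstance(out, list) else [out]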
def eval(self, dataset=None, batch_size=64):
    # Default: if no dataset is specified, we use validation dataset
    if dataset is None:
        assert self.val_dataset is not None, "[!] ERROR: Validation dataset not loaded. Please load the dataset beforehand for evaluation."
        dataset = self.val_dataset

    self.model.eval()

    # Prepare metrics
    number_batches = int(math.ceil(dataset.get_num_examples() * 1.0 / batch_size))
    perplexity = []
    diversity_unigram, diversity_bigram = None, None

    # Evaluation loop
    for batch_ind in range(number_batches):
        if debug_level() == 0:
            print("Evaluation process: %4.2f%%" % (100.0 * batch_ind / number_batches), end="\r")
        # Evaluate single batch
        batch = dataset.get_batch(batch_size, loop_dataset=False, toTorch=True)
        batch_labels, perplexity_logits, generated_words, generated_lengths = self._eval_batch(batch)
        # Perplexity calculation
        perplexity += TaskTemplate._eval_preplexity(perplexity_logits, batch_labels).cpu().numpy().tolist()
        loc_div_uni, loc_div_bi = TaskTemplate._eval_diversity(generated_words, generated_lengths,
                                                               num_classes=perplexity_logits.shape[-1])
        if diversity_unigram is None or diversity_bigram is None:
            diversity_unigram, diversity_bigram = loc_div_uni, loc_div_bi
        else:
            diversity_unigram += loc_div_uni
            diversity_bigram += loc_div_bi

    diversity_unigram = diversity_unigram.cpu().numpy()
    diversity_bigram = diversity_bigram.cpu().numpy()

    # Metric output
    avg_perplexity = sum(perplexity) / len(perplexity)
    div_uni_probs = diversity_unigram / max(np.sum(diversity_unigram), 1e-5)
    div_bi_probs = diversity_bigram / max(np.sum(diversity_bigram), 1e-5)
    unigram_entropy = -(div_uni_probs * np.log(np.maximum(div_uni_probs, 1e-10))).sum()
    bigram_entropy = -(div_bi_probs * np.log(np.maximum(div_bi_probs, 1e-10))).sum()
    unigram_variety = int(np.sum(diversity_unigram > 0))
    bigram_variety = int(np.sum(diversity_bigram > 0))

    detailed_metrics = {
        "perplexity": avg_perplexity,
        "unigram_entropy": unigram_entropy,
        "bigram_entropy": bigram_entropy,
        "unigram_variety": unigram_variety,
        "bigram_variety": bigram_variety
    }

    self.model.train()

    return avg_perplexity, detailed_metrics
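# Hedged sketch (illustration only): the diversity metrics above reduce n-gram
# count vectors to an entropy and a variety count; this reproduces that step on
# made-up unigram counts with numpy.
def _demo_diversity_metrics():
    import numpy as np

    diversity_unigram = np.array([120.0, 40.0, 0.0, 15.0])   # per-token counts
    probs = diversity_unigram / max(np.sum(diversity_unigram), 1e-5)
    entropy = -(probs * np.log(np.maximum(probs, 1e-10))).sum()
    variety = int(np.sum(diversity_unigram > 0))              # distinct tokens used
    return entropy, variety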
def export_best_results(self, checkpoint_path):
    self.model.eval()

    # Prepare metrics
    batch_size = 64
    number_batches = int(math.ceil(self.val_dataset.get_num_examples() * 1.0 / batch_size))
    number_batches = min(5, number_batches)
    true_positive_sents = list()
    false_positive_sents = list()
    true_negative_sents = list()
    false_negative_sents = list()

    # Evaluation loop
    with torch.no_grad():
        for batch_ind in range(number_batches):
            if debug_level() == 0:
                print("Evaluation process: %4.2f%%" % (100.0 * batch_ind / number_batches), end="\r")
            # Evaluate single batch
            batch = self.val_dataset.get_batch(batch_size, loop_dataset=False, toTorch=True)
            discriminator_predictions, labels, _, _ = self.model(
                _input=batch,
                use_VAE=self.use_VAE,
                use_semantic_specific_attn=self.use_semantic_specific_attn)
            positive_predictions = (discriminator_predictions > 0.5).float().cpu().numpy()
            labels = labels.cpu().numpy()
            batch = tuple([tensor.cpu().numpy() for tensor in batch])
            (par_1_words, par_1_lengths, par_2_words, par_2_lengths,
             par_1_slots, par_1_slot_lengths, par_2_slots, par_2_slot_lengths,
             contexts_1_words, contexts_1_lengths, contexts_2_words, contexts_2_lengths) = batch

            reconstructed_sents_1 = reconstruct_sentences(par_1_words, par_1_lengths,
                                                          slot_vals=par_1_slots,
                                                          slot_lengths=par_1_slot_lengths)
            reconstructed_sents_2 = reconstruct_sentences(par_2_words, par_2_lengths,
                                                          slot_vals=par_2_slots,
                                                          slot_lengths=par_2_slot_lengths)
            reconstructed_contexts_1 = reconstruct_sentences(contexts_1_words, contexts_1_lengths)
            reconstructed_contexts_2 = reconstruct_sentences(contexts_2_words, contexts_2_lengths)

            loc_batch_size = par_1_words.shape[0]
            for b in range(positive_predictions.shape[0]):
                semantic_sents = reconstructed_sents_1[b % loc_batch_size] \
                    if b < loc_batch_size or b >= loc_batch_size * 3 \
                    else reconstructed_sents_2[b % loc_batch_size]
                context_sents = reconstructed_contexts_1[b % loc_batch_size] \
                    if b < loc_batch_size * 2 \
                    else reconstructed_contexts_2[b % loc_batch_size]
                s = "\n" + "=" * 100 + "\n" + context_sents + "\n" + "-" * 100 + \
                    "\nResponse: " + semantic_sents + "\n" + "=" * 100 + "\n"
                if positive_predictions[b] == 1 and labels[b] == 1:
                    true_positive_sents.append(s)
                elif positive_predictions[b] == 1 and labels[b] == 0:
                    false_positive_sents.append(s)
                elif positive_predictions[b] == 0 and labels[b] == 1:
                    false_negative_sents.append(s)
                elif positive_predictions[b] == 0 and labels[b] == 0:
                    true_negative_sents.append(s)
                else:
                    print("[!] ERROR: Something went wrong. Prediction is not any of TP, FP, FN, and TN...")
                    sys.exit(1)

    for sents, filename in zip([true_positive_sents, false_positive_sents,
                                false_negative_sents, true_negative_sents],
                               ["true_positives", "false_positives",
                                "false_negatives", "true_negatives"]):
        sents = list(set(sents))
        with open(os.path.join(checkpoint_path, "%s_%s.txt" % (self.name, filename)), "w") as f:
            f.write("\n".join(sents))

    self.model.train()
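# Hedged sketch (illustration only): export_best_results() routes each example
# into one of the four confusion-matrix buckets; this shows the same routing on
# made-up predictions and labels.
def _demo_confusion_buckets():
    import numpy as np

    positive_predictions = np.array([1, 1, 0, 0])
    labels = np.array([1, 0, 1, 0])
    buckets = {"TP": [], "FP": [], "FN": [], "TN": []}
    for b in range(positive_predictions.shape[0]):
        if positive_predictions[b] == 1 and labels[b] == 1:
            buckets["TP"].append(b)
        elif positive_predictions[b] == 1 and labels[b] == 0:
            buckets["FP"].append(b)
        elif positive_predictions[b] == 0 and labels[b] == 1:
            buckets["FN"].append(b)
        else:
            buckets["TN"].append(b)
    return buckets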
def eval(self, dataset=None, batch_size=64):
    # Default: if no dataset is specified, we use validation dataset
    if dataset is None:
        assert self.val_dataset is not None, "[!] ERROR: Validation dataset not loaded. Please load the dataset beforehand for evaluation."
        dataset = self.val_dataset

    self.model.eval()
    self.classifier.eval()

    # Prepare metrics
    number_batches = int(math.ceil(dataset.get_num_examples() * 1.0 / batch_size))
    label_list = []
    preds_list = []

    # Evaluation loop
    for batch_ind in range(number_batches):
        if debug_level() == 0:
            print("Evaluation process: %4.2f%%" % (100.0 * batch_ind / number_batches), end="\r")
        # Evaluate single batch
        batch = dataset.get_batch(batch_size, loop_dataset=False, toTorch=True)
        pred_labels, batch_labels = self._eval_batch(batch)
        preds_list += torch.squeeze(pred_labels).tolist()
        label_list += torch.squeeze(batch_labels).tolist()

    # Metric output: drop examples with negative (ignore/padding) labels
    preds_list = np.array(preds_list)
    label_list = np.array(label_list)
    preds_list = preds_list[label_list >= 0]
    label_list = label_list[label_list >= 0]
    accuracy = np.sum(preds_list == label_list) * 1.0 / preds_list.shape[0]
    detailed_acc = {
        "accuracy": accuracy,
        "predictions": preds_list,
        "labels": label_list,
        "class_scores": dict()
    }

    print("-" * 75)
    print("Evaluation accuracy: %4.2f%%" % (accuracy * 100.0))
    print("Accuracy per class: ")
    for c in list(set(label_list)):
        TP = np.sum(np.logical_and(preds_list == c, label_list == c))
        FP = np.sum(np.logical_and(preds_list == c, label_list != c))
        FN = np.sum(np.logical_and(preds_list != c, label_list == c))
        recall = TP * 1.0 / max(1e-5, TP + FN)
        precision = TP * 1.0 / max(1e-5, TP + FP)
        F1_score = 2.0 * TP / max(1e-5, 2 * TP + FP + FN)
        print("\t- Class %s: Recall=%4.2f%%, Precision=%4.2f%%, F1 score=%4.2f%%"
              % (dataset.label_to_string(c), recall * 100.0, precision * 100.0, F1_score * 100.0))
        detailed_acc["class_scores"][dataset.label_to_string(c)] = {
            "recall": recall,
            "precision": precision,
            "f1": F1_score
        }
    print("-" * 75)

    self.model.train()
    self.classifier.train()

    return accuracy, detailed_acc
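# Hedged sketch (illustration only): the per-class scores printed by eval()
# above come from one-vs-rest counts; this reproduces them for a single class
# on made-up predictions.
def _demo_per_class_scores(c=1):
    import numpy as np

    preds_list = np.array([1, 0, 1, 2, 1, 0])
    label_list = np.array([1, 0, 2, 2, 1, 1])
    TP = np.sum(np.logical_and(preds_list == c, label_list == c))
    FP = np.sum(np.logical_and(preds_list == c, label_list != c))
    FN = np.sum(np.logical_and(preds_list != c, label_list == c))
    recall = TP * 1.0 / max(1e-5, TP + FN)
    precision = TP * 1.0 / max(1e-5, TP + FP)
    # F1 = 2*TP / (2*TP + FP + FN), equivalent to the harmonic mean form
    f1 = 2.0 * TP / max(1e-5, 2 * TP + FP + FN)
    return recall, precision, f1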