def collect_story_predictions(
        completed_trackers,  # type: List[DialogueStateTracker]
        agent,  # type: Agent
        fail_on_prediction_errors=False,  # type: bool
        use_e2e=False  # type: bool
):
    # type: (...) -> Tuple[StoryEvalution, int]
    """Test the stories from a file, running them through the stored model."""

    story_eval_store = EvaluationStore()
    failed = []
    correct_dialogues = []
    num_stories = len(completed_trackers)

    logger.info("Evaluating {} stories\n"
                "Progress:".format(num_stories))

    action_list = []

    for tracker in tqdm(completed_trackers):
        tracker_results, predicted_tracker, tracker_actions = \
            _predict_tracker_actions(tracker, agent,
                                     fail_on_prediction_errors, use_e2e)

        story_eval_store.merge_store(tracker_results)

        action_list.extend(tracker_actions)

        if tracker_results.has_prediction_target_mismatch():
            # there is at least one wrong prediction
            failed.append(predicted_tracker)
            correct_dialogues.append(0)
        else:
            correct_dialogues.append(1)

    logger.info("Finished collecting predictions.")

    report, precision, f1, accuracy = get_evaluation_metrics(
        [1] * len(completed_trackers), correct_dialogues)

    in_training_data_fraction = _in_training_data_fraction(action_list)

    log_evaluation_table([1] * len(completed_trackers),
                         "END-TO-END" if use_e2e else "CONVERSATION",
                         report, precision, f1, accuracy,
                         in_training_data_fraction,
                         include_report=False)

    return (StoryEvalution(
                evaluation_store=story_eval_store,
                failed_stories=failed,
                action_list=action_list,
                in_training_data_fraction=in_training_data_fraction),
            num_stories)
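# A minimal usage sketch for the function above. It is illustrative only:
# the model path "models/dialogue", the stories file "data/test_stories.md"
# and the rasa_core.agent import location are assumptions, not taken from
# this file. It shows how the (StoryEvalution, num_stories) tuple returned
# above might be unpacked and inspected.
def _example_collect_story_predictions():
    from rasa_core.agent import Agent

    agent = Agent.load("models/dialogue")  # assumed model path
    trackers = _generate_trackers("data/test_stories.md", agent,
                                  max_stories=None, use_e2e=False)

    story_evaluation, num_stories = collect_story_predictions(
        trackers, agent, fail_on_prediction_errors=False, use_e2e=False)

    print("Evaluated {} stories, {} with at least one wrong "
          "prediction".format(num_stories,
                              len(story_evaluation.failed_stories)))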
def test_entity():
    # load model
    interpreter = Interpreter.load(model_location)
    duckling_extractors = {"ner_duckling", "ner_duckling_http"}

    # create dictionary of entity results
    entity_results = defaultdict(lambda: defaultdict(list))

    # get extractors of the interpreter
    extractors = evaluate.get_entity_extractors(interpreter)

    # get entity predictions and tokens
    entity_predictions, tokens = evaluate.get_entity_predictions(
        interpreter, testing_data)

    # create classification report
    if duckling_extractors.intersection(extractors):
        entity_predictions = evaluate.remove_duckling_entities(
            entity_predictions)
        extractors = evaluate.remove_duckling_extractors(extractors)

    if not extractors:
        return entity_results

    # get entity targets
    entity_targets = evaluate.get_entity_targets(testing_data)

    # get aligned predictions
    aligned_predictions = evaluate.align_all_entity_predictions(
        entity_targets, entity_predictions, tokens, extractors)

    merged_targets = evaluate.merge_labels(aligned_predictions)
    merged_targets = evaluate.substitute_labels(merged_targets,
                                                "O", "no_entity")

    for extractor in extractors:
        merged_predictions = evaluate.merge_labels(aligned_predictions,
                                                   extractor)
        merged_predictions = evaluate.substitute_labels(
            merged_predictions, "O", "no_entity")

        report, precision, f1, accuracy = evaluate.get_evaluation_metrics(
            merged_targets, merged_predictions)

        entity_results[extractor]["Accuracy"].append(accuracy)
        entity_results[extractor]["F1-score"].append(f1)
        entity_results[extractor]["Precision"].append(precision)

    print("entity_results: {}\n".format(entity_results),
          "Classification report: \n{}".format(report))
def test_intent():
    # load model
    interpreter = Interpreter.load(model_location)

    # get true targets of the testing data
    targets = evaluate.get_intent_targets(testing_data)

    # get predictions of the testing data
    predictions = evaluate.get_intent_predictions(interpreter, testing_data)

    # create a confusion matrix and summary statistics for intent predictions
    evaluate.evaluate_intents(targets, predictions)

    # generate classification report, precision, f1 score and accuracy
    report, precision, f1, accuracy = evaluate.get_evaluation_metrics(
        targets, predictions)

    print("F1-Score: {}\n".format(f1),
          "Precision: {}\n".format(precision),
          "Accuracy: {}\n".format(accuracy),
          "Classification report: \n{}".format(report))
def log_evaluation_table(golds, predictions, name,
                         include_report=True):  # pragma: no cover
    """Log the sklearn evaluation metrics."""

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UndefinedMetricWarning)
        report, precision, f1, accuracy = get_evaluation_metrics(
            golds, predictions)

    logger.info("Evaluation Results on {} level:".format(name))
    logger.info("\tCorrect: {} / {}".format(int(len(golds) * accuracy),
                                            len(golds)))
    logger.info("\tF1-Score: {:.3f}".format(f1))
    logger.info("\tPrecision: {:.3f}".format(precision))
    logger.info("\tAccuracy: {:.3f}".format(accuracy))

    if include_report:
        logger.info("\tClassification report: \n{}".format(report))
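# A minimal sketch of calling log_evaluation_table directly; the label lists
# below are toy data invented for illustration.
def _example_log_evaluation_table():
    golds = ["greet", "goodbye", "greet", "affirm"]
    predictions = ["greet", "goodbye", "affirm", "affirm"]

    # logs "Correct: 3 / 4" plus F1, precision and accuracy for this toy data
    log_evaluation_table(golds, predictions, "INTENT", include_report=True)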
def run_story_evaluation(resource_name, agent,
                         max_stories=None,
                         out_file_stories=None,
                         out_file_plot=None,
                         fail_on_prediction_errors=False,
                         use_e2e=False):
    """Run the evaluation of the stories, optionally plotting the results."""

    completed_trackers = _generate_trackers(resource_name, agent,
                                            max_stories, use_e2e)

    story_evaluation, _ = collect_story_predictions(completed_trackers,
                                                    agent,
                                                    fail_on_prediction_errors,
                                                    use_e2e)

    evaluation_store = story_evaluation.evaluation_store

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UndefinedMetricWarning)
        report, precision, f1, accuracy = get_evaluation_metrics(
            evaluation_store.serialise_targets(),
            evaluation_store.serialise_predictions())

    if out_file_plot:
        plot_story_evaluation(evaluation_store.action_targets,
                              evaluation_store.action_predictions,
                              report, precision, f1, accuracy,
                              story_evaluation.in_training_data_fraction,
                              out_file_plot)

    log_failed_stories(story_evaluation.failed_stories, out_file_stories)

    return {
        "report": report,
        "precision": precision,
        "f1": f1,
        "accuracy": accuracy,
        "actions": story_evaluation.action_list,
        "in_training_data_fraction":
            story_evaluation.in_training_data_fraction,
        "is_end_to_end_evaluation": use_e2e
    }
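# A minimal end-to-end sketch, assuming a trained dialogue model under
# "models/dialogue" and test stories in "data/test_stories.md"; both paths
# and the output file names are examples, not fixed conventions.
def _example_run_story_evaluation():
    from rasa_core.agent import Agent

    agent = Agent.load("models/dialogue")
    results = run_story_evaluation("data/test_stories.md",
                                   agent,
                                   out_file_stories="failed_stories.md",
                                   out_file_plot="story_confmat.pdf")

    print("Action-level accuracy: {:.3f}".format(results["accuracy"]))
    print("In-training-data fraction: {:.3f}".format(
        results["in_training_data_fraction"]))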
def evaluate(self, data, project=None, model=None):
    # type: (Text, Optional[Text], Optional[Text]) -> Dict[Text, Any]
    """Perform a model evaluation."""

    project = project or RasaNLUModelConfig.DEFAULT_PROJECT_NAME
    model = model or None
    file_name = utils.create_temporary_file(data, "_training_data")
    test_data = load_data(file_name)

    if project not in self.project_store:
        raise InvalidProjectError("Project {} could not "
                                  "be found".format(project))

    preds_json = self.parse_training_examples(test_data.intent_examples,
                                               project,
                                               model)

    predictions = [
        {"text": e.text,
         "intent": e.data.get("intent"),
         "predicted": p.get("intent", {}).get("name"),
         "confidence": p.get("intent", {}).get("confidence")}
        for e, p in zip(test_data.intent_examples, preds_json)
    ]

    y_true = [e.data.get("intent") for e in test_data.intent_examples]
    y_true = clean_intent_labels(y_true)

    y_pred = [p.get("intent", {}).get("name") for p in preds_json]
    y_pred = clean_intent_labels(y_pred)

    report, precision, f1, accuracy = get_evaluation_metrics(y_true, y_pred)

    return {
        "intent_evaluation": {
            "report": report,
            "predictions": predictions,
            "precision": precision,
            "f1_score": f1,
            "accuracy": accuracy}
    }
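# Hypothetical call sketch for the evaluate() method above; `DataRouter`
# as the owning class, its constructor arguments, and the markdown payload
# are assumptions about the surrounding module, not shown in this excerpt.
def _example_evaluate_endpoint():
    from rasa_nlu.data_router import DataRouter

    router = DataRouter(project_dir="projects")  # assumed project layout
    test_md = (
        "## intent:greet\n"
        "- hello there\n"
        "- good morning\n"
        "\n"
        "## intent:goodbye\n"
        "- bye\n"
        "- see you later\n"
    )

    result = router.evaluate(test_md, project="default")
    print(result["intent_evaluation"]["accuracy"])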
def evaluate_intents(intent_results,
                     report_folder,
                     successes_filename,
                     errors_filename,
                     confmat_filename,
                     intent_hist_filename):  # pragma: no cover
    """Creates a confusion matrix and summary statistics for intent predictions.

    Log samples which could not be classified correctly and save them to file.
    Creates a confidence histogram which is saved to file.
    Wrong and correct prediction confidences will be plotted in separate bars
    of the same histogram plot.
    Only considers those examples with a set intent. Others are filtered out.
    Returns a dictionary containing the evaluation result.
    """

    # remove empty intent targets
    num_examples = len(intent_results)
    intent_results = remove_empty_intent_examples(intent_results)

    logger.info("Intent Evaluation: Only considering those "
                "{} examples that have a defined intent out "
                "of {} examples".format(len(intent_results), num_examples))

    targets, predictions = _targets_predictions_from(intent_results)

    if report_folder:
        report, precision, f1, accuracy = get_evaluation_metrics(
            targets, predictions, output_dict=True)

        report_filename = os.path.join(report_folder, 'intent_report.json')

        save_json(report, report_filename)
        logger.info("Classification report saved to {}."
                    .format(report_filename))
    else:
        report, precision, f1, accuracy = get_evaluation_metrics(targets,
                                                                 predictions)
        log_evaluation_table(report, precision, f1, accuracy)

    if successes_filename:
        # save classified samples to file for debugging
        collect_nlu_successes(intent_results, successes_filename)

    if errors_filename:
        # log and save misclassified samples to file for debugging
        collect_nlu_errors(intent_results, errors_filename)

    if confmat_filename:
        from sklearn.metrics import confusion_matrix
        from sklearn.utils.multiclass import unique_labels
        import matplotlib.pyplot as plt

        cnf_matrix = confusion_matrix(targets, predictions)
        labels = unique_labels(targets, predictions)
        plot_confusion_matrix(cnf_matrix, classes=labels,
                              title='Intent Confusion matrix',
                              out=confmat_filename)
        plt.show()

        plot_intent_confidences(intent_results,
                                intent_hist_filename)
        plt.show()

    predictions = [
        {
            "text": res.message,
            "intent": res.target,
            "entities": res.target_entities,
            "predicted_entities": res.entities_prediction,
            "predicted": res.prediction,
            "confidence": res.confidence
        } for res in intent_results
    ]

    return {
        "predictions": predictions,
        "report": report,
        "precision": precision,
        "f1_score": f1,
        "accuracy": accuracy
    }
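# Illustrative only: the shape of `intent_results` below is inferred from the
# attributes accessed above (target, prediction, message, confidence,
# target_entities, entities_prediction); the real result type is defined
# elsewhere in the module, and the two toy results are made up.
def _example_evaluate_intents():
    import collections

    IntentEvaluationResult = collections.namedtuple(
        "IntentEvaluationResult",
        ["target", "prediction", "message", "confidence",
         "target_entities", "entities_prediction"])

    intent_results = [
        IntentEvaluationResult("greet", "greet", "hello there", 0.95, [], []),
        IntentEvaluationResult("goodbye", "greet", "see you", 0.40, [], []),
    ]

    # with all file arguments set to None, only the metrics are computed
    # and logged; nothing is written to disk
    summary = evaluate_intents(intent_results,
                               report_folder=None,
                               successes_filename=None,
                               errors_filename=None,
                               confmat_filename=None,
                               intent_hist_filename=None)
    print("accuracy: {:.2f}".format(summary["accuracy"]))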