def evaluate_entities(entity_results, extractors):  # pragma: no cover
    """Creates summary statistics for each entity extractor.

    Logs precision, recall, and F1 per entity type for each extractor.
    """
    aligned_predictions = align_all_entity_predictions(entity_results, extractors)

    merged_targets = merge_labels(aligned_predictions)
    merged_targets = substitute_labels(merged_targets, "O", "no_entity")

    result = {}
    for extractor in extractors:
        merged_predictions = merge_labels(aligned_predictions, extractor)
        merged_predictions = substitute_labels(merged_predictions, "O", "no_entity")
        report, precision, f1, accuracy = get_evaluation_metrics(
            merged_targets,
            merged_predictions,
            output_dict=True,
            exclude_label="no_entity",
        )
        # Store the metrics per extractor instead of overwriting the dict
        # on every iteration.
        result[extractor] = {
            "report": report,
            "precision": precision,
            "f1_score": f1,
            "accuracy": accuracy,
        }

    return result
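# A minimal sketch of the substitute_labels helper used above, assuming it simply
# renames one label throughout a flat list of labels (here "O" -> "no_entity");
# the real helper may differ in signature or validation.
def substitute_labels_sketch(labels, old, new):
    """Replaces every occurrence of `old` with `new` in a list of labels."""
    return [new if label == old else label for label in labels]


# e.g. substitute_labels_sketch(["O", "city", "O"], "O", "no_entity")
#      -> ["no_entity", "city", "no_entity"]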
def collect_story_predictions(
    completed_trackers: List["DialogueStateTracker"],
    agent: "Agent",
    fail_on_prediction_errors: bool = False,
    use_e2e: bool = False,
) -> Tuple["StoryEvaluation", int]:
    """Test the stories from a file, running them through the stored model."""
    from rasa.nlu.test import get_evaluation_metrics
    from tqdm import tqdm

    story_eval_store = EvaluationStore()
    failed = []
    correct_dialogues = []
    num_stories = len(completed_trackers)

    logger.info("Evaluating {} stories\nProgress:".format(num_stories))

    action_list = []

    for tracker in tqdm(completed_trackers):
        tracker_results, predicted_tracker, tracker_actions = _predict_tracker_actions(
            tracker, agent, fail_on_prediction_errors, use_e2e
        )

        story_eval_store.merge_store(tracker_results)
        action_list.extend(tracker_actions)

        if tracker_results.has_prediction_target_mismatch():
            # there is at least one wrong prediction
            failed.append(predicted_tracker)
            correct_dialogues.append(0)
        else:
            correct_dialogues.append(1)

    logger.info("Finished collecting predictions.")
    with warnings.catch_warnings():
        from sklearn.exceptions import UndefinedMetricWarning

        warnings.simplefilter("ignore", UndefinedMetricWarning)
        report, precision, f1, accuracy = get_evaluation_metrics(
            [1] * len(completed_trackers), correct_dialogues
        )

    in_training_data_fraction = _in_training_data_fraction(action_list)

    log_evaluation_table(
        [1] * len(completed_trackers),
        "END-TO-END" if use_e2e else "CONVERSATION",
        report,
        precision,
        f1,
        accuracy,
        in_training_data_fraction,
        include_report=False,
    )

    return (
        StoryEvaluation(
            evaluation_store=story_eval_store,
            failed_stories=failed,
            action_list=action_list,
            in_training_data_fraction=in_training_data_fraction,
        ),
        num_stories,
    )
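# Note on the metric call above: every story is expected to be fully correct, so the
# target vector is all ones while correct_dialogues holds 1 for each story whose
# predictions all matched and 0 otherwise. A tiny illustration of the resulting
# story-level accuracy (hypothetical outcomes; the real numbers come from
# get_evaluation_metrics):
example_correct_dialogues = [1, 0, 1, 1]
example_story_accuracy = sum(example_correct_dialogues) / len(example_correct_dialogues)  # 0.75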
def test_get_evaluation_metrics(
    targets, predictions, expected_precision, expected_fscore, expected_accuracy
):
    report, precision, f1, accuracy = get_evaluation_metrics(
        targets, predictions, True, exclude_label=NO_ENTITY
    )

    assert f1 == expected_fscore
    assert precision == expected_precision
    assert accuracy == expected_accuracy
    assert NO_ENTITY not in report
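# Illustrative invocation of the call under test (hypothetical labels; assumes
# NO_ENTITY == "no_entity" as elsewhere in this module). With identical targets
# and predictions the weighted precision, F1, and accuracy should all be 1.0,
# and the excluded label should not appear in the returned report:
example_report, example_precision, example_f1, example_accuracy = get_evaluation_metrics(
    ["no_entity", "city", "city"],
    ["no_entity", "city", "city"],
    True,
    exclude_label=NO_ENTITY,
)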
async def test(
    stories: Text,
    agent: "Agent",
    max_stories: Optional[int] = None,
    out_directory: Optional[Text] = None,
    fail_on_prediction_errors: bool = False,
    e2e: bool = False,
    disable_plotting: bool = False,
):
    """Run the evaluation of the stories, optionally plot the results."""
    from rasa.nlu.test import get_evaluation_metrics

    completed_trackers = await _generate_trackers(stories, agent, max_stories, e2e)

    story_evaluation, _ = collect_story_predictions(
        completed_trackers, agent, fail_on_prediction_errors, e2e
    )

    evaluation_store = story_evaluation.evaluation_store

    with warnings.catch_warnings():
        from sklearn.exceptions import UndefinedMetricWarning

        warnings.simplefilter("ignore", UndefinedMetricWarning)

        targets, predictions = evaluation_store.serialise()
        report, precision, f1, accuracy = get_evaluation_metrics(targets, predictions)

    if out_directory:
        plot_story_evaluation(
            evaluation_store.action_targets,
            evaluation_store.action_predictions,
            report,
            precision,
            f1,
            accuracy,
            story_evaluation.in_training_data_fraction,
            out_directory,
            disable_plotting,
        )

    log_failed_stories(story_evaluation.failed_stories, out_directory)

    return {
        "report": report,
        "precision": precision,
        "f1": f1,
        "accuracy": accuracy,
        "actions": story_evaluation.action_list,
        "in_training_data_fraction": story_evaluation.in_training_data_fraction,
        "is_end_to_end_evaluation": e2e,
    }
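# A minimal, hypothetical driver for the coroutine above (the story file path and
# output directory are placeholders; `agent` is assumed to be an already loaded Agent):
import asyncio


def run_story_evaluation_example(agent):
    results = asyncio.run(
        test("data/test_stories.md", agent, out_directory="results", e2e=False)
    )
    print(results["f1"], results["accuracy"], results["in_training_data_fraction"])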
def evaluate_intents(intent_results):  # pragma: no cover
    """Creates a confusion matrix and summary statistics for intent predictions.

    Logs samples which could not be classified correctly and saves them to file.
    Creates a confidence histogram which is saved to file.
    Wrong and correct prediction confidences will be plotted in separate bars
    of the same histogram plot. Only considers those examples with a set intent.
    Others are filtered out. Returns a dictionary containing the evaluation result.
    """
    # remove empty intent targets
    intent_results = remove_empty_intent_examples(intent_results)

    target_intents, predicted_intents = _targets_predictions_from(
        intent_results, "intent_target", "intent_prediction"
    )

    report, precision, f1, accuracy = get_evaluation_metrics(
        target_intents, predicted_intents, output_dict=True
    )

    log = collect_nlu_errors(intent_results) + collect_nlu_successes(intent_results)

    predictions = [
        {
            "text": res.message,
            "intent": res.intent_target,
            "predicted": res.intent_prediction,
            "confidence": res.confidence,
        }
        for res in intent_results
    ]

    return {
        "predictions": predictions,
        "report": report,
        "precision": precision,
        "f1_score": f1,
        "accuracy": accuracy,
        "log": log,
    }
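# For context, a rough sketch of what get_evaluation_metrics is assumed to compute,
# wrapping sklearn's classification metrics with weighted averaging; the actual
# implementation also cleans labels and supports exclude_label, so treat this as an
# approximation rather than the library's API:
def get_evaluation_metrics_sketch(targets, predictions, output_dict=False):
    from sklearn import metrics

    report = metrics.classification_report(targets, predictions, output_dict=output_dict)
    precision = metrics.precision_score(targets, predictions, average="weighted")
    f1 = metrics.f1_score(targets, predictions, average="weighted")
    accuracy = metrics.accuracy_score(targets, predictions)
    return report, precision, f1, accuracy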