def test_process_results_overall(seed: int):
    """Checks the overall counters after processing randomly generated sessions.

    Verifies that, per marker, the number of sessions where the marker applied
    at least once and the concatenated "preceding user turn numbers" match the
    data used to generate the sessions.
    """
    rng = np.random.default_rng(seed=seed)
    (
        per_session_results,
        preceding_user_turn_numbers_used_per_marker,
    ) = _generate_random_examples(num_markers=3, rng=rng)
    marker_names = sorted(preceding_user_turn_numbers_used_per_marker.keys())
    total_sessions = len(per_session_results)

    stats = MarkerStatistics()
    for session_idx, results in enumerate(per_session_results):
        stats.process(
            session_idx=session_idx,
            sender_id=str(rng.choice(100)),
            meta_data_on_relevant_events_per_marker=results,
        )

    assert stats.num_sessions == total_sessions
    for marker in marker_names:
        turn_number_lists = preceding_user_turn_numbers_used_per_marker[marker]
        # A marker applied in a session exactly when its list of turn numbers
        # for that session is non-empty.
        expected_applied = sum(1 for sub_list in turn_number_lists if sub_list)
        assert stats.count_if_applied_at_least_once[marker] == expected_applied
        # All "preceding user turn numbers" should be collected, concatenated
        # in session order.
        expected_numbers = [
            number for sub_list in turn_number_lists for number in sub_list
        ]
        assert stats.num_preceding_user_turns_collected[marker] == expected_numbers
def test_process_results_per_session(seed: int):
    """Checks per-session statistics and session identifiers recorded by `process`.

    Note: the original assertion `assert pytest.approx(actual, expected)` was a
    no-op — `pytest.approx(...)` returns a (truthy) approx object, so the assert
    always passed regardless of the values. The comparison must be written as
    `actual == pytest.approx(expected)`.
    """
    rng = np.random.default_rng(seed=seed)
    (
        per_session_results,
        preceding_user_turn_numbers_used_per_marker,
    ) = _generate_random_examples(num_markers=3, rng=rng)
    markers = sorted(preceding_user_turn_numbers_used_per_marker.keys())
    num_sessions = len(per_session_results)
    stats = MarkerStatistics()
    sender_ids = []
    session_indices = []
    for session_idx, results in enumerate(per_session_results):
        sender_id = str(rng.choice(100))
        # Deliberately override the enumeration index with a random one so we
        # can check that identifiers are recorded verbatim, not re-derived.
        session_idx = int(rng.choice(100))
        stats.process(
            session_idx=session_idx,
            sender_id=sender_id,
            meta_data_on_relevant_events_per_marker=results,
        )
        sender_ids.append(sender_id)
        session_indices.append(session_idx)
    assert stats.num_sessions == num_sessions
    for marker in markers:
        for idx in range(num_sessions):
            expected_stats = compute_statistics(
                preceding_user_turn_numbers_used_per_marker[marker][idx]
            )
            for stat_name, stat_value in expected_stats.items():
                # `pytest.approx` wraps the *expected* side of an `==`;
                # `nan_ok=True` covers statistics over sessions with no events.
                assert stats.session_results[marker][stat_name][
                    idx
                ] == pytest.approx(stat_value, nan_ok=True)
    for idx in range(num_sessions):
        assert stats.session_identifier[idx] == (
            sender_ids[idx],
            session_indices[idx],
        )
def test_per_session_statistics_to_csv(tmp_path: Path, seed: int):
    """Dumps per-session statistics to CSV and compares the file to expectations."""
    rng = np.random.default_rng(seed=seed)
    (
        per_session_results,
        preceding_user_turn_numbers_used_per_marker,
    ) = _generate_random_examples(
        num_markers=3, rng=rng, num_sessions_min=10, num_sessions_max=20
    )
    markers = sorted(preceding_user_turn_numbers_used_per_marker.keys())
    stats = MarkerStatistics()
    for session_idx, results in enumerate(per_session_results):
        stats.process(
            session_idx=session_idx,
            sender_id=str(rng.choice(100)),
            meta_data_on_relevant_events_per_marker=results,
        )

    tmp_file = tmp_path / "test.csv"
    stats.per_session_statistics_to_csv(path=tmp_file)

    with tmp_file.open(mode="r") as f:
        rows = list(csv.DictReader(f))
    # Index each CSV row by (sender, session, marker, statistic) for comparison.
    actual_information = {
        (row["sender_id"], row["session_idx"], row["marker"], row["statistic"]): row[
            "value"
        ]
        for row in rows
    }

    num_digits = 3
    expected_information = {}
    for marker_name in markers:
        for stat_name, values in stats.session_results[marker_name].items():
            statistic_name = MarkerStatistics._add_num_user_turns_str_to(stat_name)
            for (sender_id, session_idx), value in zip(
                stats.session_identifier, values
            ):
                # NaN cannot be rounded meaningfully; it is serialized as-is.
                rendered = (
                    str(value) if np.isnan(value) else str(round(value, num_digits))
                )
                key = (sender_id, str(session_idx), marker_name, statistic_name)
                expected_information[key] = rendered

    assert actual_information == expected_information
def evaluate_trackers(
    self,
    trackers: Iterator[Optional[DialogueStateTracker]],
    output_file: Path,
    session_stats_file: Optional[Path] = None,
    overall_stats_file: Optional[Path] = None,
) -> None:
    """Collect markers for each dialogue in each tracker loaded.

    Args:
        trackers: An iterator over the trackers from which we want to extract
            markers.
        output_file: Path to write out the extracted markers.
        session_stats_file: (Optional) Path to write out statistics about the
            extracted markers for each session separately.
        overall_stats_file: (Optional) Path to write out statistics about the
            markers extracted from all session data.
    Raises:
        `FileExistsError` if any of the specified files already exists
        `NotADirectoryError` if any of the specified files is supposed to be
            contained in a directory that does not exist
    """
    # Validate every requested output location up front, before the costly
    # sweep over the trackers.
    for path in (session_stats_file, overall_stats_file, output_file):
        if path is None:
            continue
        if path.is_file():
            raise FileExistsError(f"Expected that no file {path} already exists.")
        if not path.parent.is_dir():
            raise NotADirectoryError(f"Expected directory {path.parent} to exist.")

    # Evaluate every session of every tracker and persist the raw results.
    processed_trackers: Dict[Text, List[SessionEvaluation]] = {}
    for tracker in trackers:
        if tracker:
            processed_trackers[tracker.sender_id] = self.evaluate_events(
                tracker.events
            )

    processed_trackers_count = len(processed_trackers)
    telemetry.track_markers_extracted(processed_trackers_count)
    Marker._save_results(output_file, processed_trackers)

    # Compute and write statistics only if at least one stats file was requested.
    if session_stats_file or overall_stats_file:
        # Imported lazily so the stats machinery is only loaded when needed.
        from rasa.core.evaluation.marker_stats import MarkerStatistics

        stats = MarkerStatistics()
        for sender_id, tracker_result in processed_trackers.items():
            for session_idx, session_result in enumerate(tracker_result):
                stats.process(
                    sender_id=sender_id,
                    session_idx=session_idx,
                    meta_data_on_relevant_events_per_marker=session_result,
                )
        telemetry.track_markers_stats_computed(processed_trackers_count)

        if overall_stats_file:
            stats.overall_statistic_to_csv(path=overall_stats_file)
        if session_stats_file:
            stats.per_session_statistics_to_csv(path=session_stats_file)
def test_overall_statistics_to_csv(tmp_path: Path, seed: int):
    """Dumps overall statistics to CSV and verifies the rows in file order."""
    rng = np.random.default_rng(seed=seed)
    (
        per_session_results,
        preceding_user_turn_numbers_used_per_marker,
    ) = _generate_random_examples(
        num_markers=3, rng=rng, num_sessions_min=10, num_sessions_max=20
    )
    markers = sorted(preceding_user_turn_numbers_used_per_marker.keys())
    num_sessions = len(per_session_results)
    stats = MarkerStatistics()
    for session_idx, results in enumerate(per_session_results):
        stats.process(
            session_idx=session_idx,
            sender_id=str(rng.choice(100)),
            meta_data_on_relevant_events_per_marker=results,
        )

    tmp_file = tmp_path / "test.csv"
    stats.overall_statistic_to_csv(path=tmp_file)

    with tmp_file.open(mode="r") as f:
        rows = list(csv.DictReader(f))

    # First row: the total session count, not tied to any particular marker.
    assert rows[0] == {
        "sender_id": "all",
        "session_idx": "nan",
        "marker": "-",
        "statistic": "total_number_of_sessions",
        "value": str(num_sessions),
    }

    num_digits = 3
    row_idx = 1
    # Next: per marker, the absolute count and the percentage of sessions
    # where the marker applied at least once.
    for marker_name in markers:
        applied_count = stats.count_if_applied_at_least_once[marker_name]
        assert rows[row_idx] == {
            "sender_id": "all",
            "session_idx": "nan",
            "marker": marker_name,
            "statistic": "number_of_sessions_where_marker_applied_at_least_once",
            "value": str(applied_count),
        }
        row_idx += 1
        percentage = round(applied_count / num_sessions * 100, num_digits)
        assert rows[row_idx] == {
            "sender_id": "all",
            "session_idx": "nan",
            "marker": marker_name,
            "statistic": "percentage_of_sessions_where_marker_applied_at_least_once",
            "value": str(percentage),
        }
        row_idx += 1
    # Finally: per marker, one row per descriptive statistic over all the
    # collected "preceding user turn numbers".
    for marker_name in markers:
        statistics = compute_statistics(
            stats.num_preceding_user_turns_collected[marker_name]
        )
        for stat_name, stat_value in statistics.items():
            assert rows[row_idx] == {
                "sender_id": "all",
                "session_idx": "nan",
                "marker": marker_name,
                "statistic": MarkerStatistics._add_num_user_turns_str_to(stat_name),
                "value": str(round(stat_value, num_digits)),
            }
            row_idx += 1