예제 #1
0
def test_compute_statistics_on_empty_set():
    """Check the statistics reported for an empty input list.

    The count must be zero, every other statistic must be NaN, and the set
    of reported keys must match the one produced for non-empty input.
    """
    empty_stats = compute_statistics([])
    assert empty_stats["count"] == 0

    # Everything apart from the count cannot be evaluated on empty input.
    non_count_values = (
        stat_value
        for stat_name, stat_value in empty_stats.items()
        if stat_name != "count"
    )
    assert all(np.isnan(stat_value) for stat_value in non_count_values)

    # The keys are the same as in the case where something can be evaluated.
    reference_stats = compute_statistics([1, 2])
    assert set(reference_stats.keys()) == set(empty_stats.keys())
예제 #2
0
def test_process_results_per_session(seed: int):
    """Check that ``MarkerStatistics.process`` records per-session results.

    Random per-session marker results are fed into a fresh
    ``MarkerStatistics`` one session at a time. Afterwards the test verifies
    that the number of processed sessions, the per-session statistics of every
    marker, and the stored (sender id, session index) identifiers match what
    was passed in.

    Args:
        seed: Seed for the random number generator that produces the examples
            and the session identifiers.
    """
    rng = np.random.default_rng(seed=seed)

    (
        per_session_results,
        preceding_user_turn_numbers_used_per_marker,
    ) = _generate_random_examples(num_markers=3, rng=rng)
    markers = sorted(preceding_user_turn_numbers_used_per_marker.keys())
    num_sessions = len(per_session_results)

    stats = MarkerStatistics()
    sender_ids = []
    session_indices = []
    # Sender ids and session indices are drawn at random rather than taken
    # from the loop position, so the identifiers stored by `process` can be
    # checked independently of the processing order.
    for results in per_session_results:
        sender_id = str(rng.choice(100))
        session_idx = int(rng.choice(100))
        stats.process(
            session_idx=session_idx,
            sender_id=sender_id,
            meta_data_on_relevant_events_per_marker=results,
        )
        sender_ids.append(sender_id)
        session_indices.append(session_idx)

    assert stats.num_sessions == num_sessions
    for marker in markers:
        for idx in range(num_sessions):
            expected_stats = compute_statistics(
                preceding_user_turn_numbers_used_per_marker[marker][idx])
            for stat_name, stat_value in expected_stats.items():
                actual_value = stats.session_results[marker][stat_name][idx]
                # `pytest.approx` only checks anything when it is compared
                # with `==`; a bare `assert pytest.approx(a, b)` is always
                # truthy. `nan_ok=True` lets NaN statistics (reported for
                # sessions without any marker occurrence) compare as equal.
                assert actual_value == pytest.approx(stat_value, nan_ok=True)
    for idx in range(num_sessions):
        assert stats.session_identifier[idx] == (sender_ids[idx],
                                                 session_indices[idx])
예제 #3
0
def test_compute_statistics_simple_check():
    """Check the statistics computed for the small sample ``[1, 2, 9, 0]``."""
    computed = compute_statistics([1, 2, 9, 0])
    expected_values = (
        ("count", 4),
        ("min", 0),
        ("max", 9),
        ("mean", 3),
        # This is no bug, it is a convention numpy follows: for an even
        # number of samples the median is the mean of the two middle values.
        ("median", 1.5),
    )
    for stat_name, expected in expected_values:
        assert computed[stat_name] == expected
예제 #4
0
def test_overall_statistics_to_csv(tmp_path: Path, seed: int):
    """Check the rows written by ``MarkerStatistics.overall_statistic_to_csv``.

    The expected layout of the CSV file is:

    1. one row with the total number of sessions,
    2. per marker (in sorted order), the number and then the percentage of
       sessions in which the marker applied at least once,
    3. per marker (in sorted order), one row for every statistic computed
       over the collected numbers of preceding user turns.

    Args:
        tmp_path: Temporary directory the CSV file is written to.
        seed: Seed for the random number generator that produces the examples.
    """
    rng = np.random.default_rng(seed=seed)
    per_session_results, turn_numbers_per_marker = _generate_random_examples(
        num_markers=3,
        rng=rng,
        num_sessions_min=10,
        num_sessions_max=20,
    )
    marker_names = sorted(turn_numbers_per_marker.keys())
    session_count = len(per_session_results)

    stats = MarkerStatistics()
    for position, session_results in enumerate(per_session_results):
        stats.process(
            session_idx=position,
            sender_id=str(rng.choice(100)),
            meta_data_on_relevant_events_per_marker=session_results,
        )

    csv_path = tmp_path / "test.csv"
    stats.overall_statistic_to_csv(path=csv_path)

    with csv_path.open(mode="r") as handle:
        rows = list(csv.DictReader(handle))

    def overall_row(marker: str, statistic: str, value) -> dict:
        # Overall rows are not tied to a single sender or session.
        return {
            "sender_id": "all",
            "session_idx": "nan",
            "marker": marker,
            "statistic": statistic,
            "value": str(value),
        }

    assert rows[0] == overall_row("-", "total_number_of_sessions",
                                  session_count)

    precision = 3
    cursor = 1
    for name in marker_names:
        applied_count = stats.count_if_applied_at_least_once[name]
        assert rows[cursor] == overall_row(
            name,
            "number_of_sessions_where_marker_applied_at_least_once",
            applied_count,
        )
        cursor += 1
        applied_percentage = round(
            stats.count_if_applied_at_least_once[name] / session_count * 100,
            precision,
        )
        assert rows[cursor] == overall_row(
            name,
            "percentage_of_sessions_where_marker_applied_at_least_once",
            applied_percentage,
        )
        cursor += 1

    for name in marker_names:
        turn_statistics = compute_statistics(
            stats.num_preceding_user_turns_collected[name])
        for stat_name, stat_value in turn_statistics.items():
            assert rows[cursor] == overall_row(
                name,
                MarkerStatistics._add_num_user_turns_str_to(stat_name),
                round(stat_value, precision),
            )
            cursor += 1