Пример #1
0
def test_suppression_doesnt_affect_later_calculations_on_the_same_data():
    """Reuse one frame for a suppressed and then an unsuppressed measure.

    The first calculation (suppression enabled) is expected to report NaN
    for the value; the second (suppression not requested) is expected to
    report 1.0 from the very same input frame.
    """
    shared_data = pandas.DataFrame({"fish": [2], "litres": [2]})

    suppressing = Measure(
        "ignored-id",
        numerator="fish",
        denominator="litres",
        small_number_suppression=True,
    )
    suppressed_row = calculate(suppressing, shared_data).iloc[0]
    assert numpy.isnan(suppressed_row["value"])

    plain = Measure("ignored-id", numerator="fish", denominator="litres")
    plain_row = calculate(plain, shared_data).iloc[0]
    assert plain_row["value"] == 1.0
Пример #2
0
def test_reports_suppression_of_small_values():
    """Expect a reporter message when a small numerator is suppressed."""
    suppressing = Measure(
        "ignored-id",
        numerator="fish",
        denominator="litres",
        small_number_suppression=True,
    )
    frame = pandas.DataFrame(
        {"fish": [1], "litres": [100]},
        index=["bowl"],
    )
    recorder = RecordingReporter()

    calculate(suppressing, frame, recorder)

    expected_message = "Suppressed small numbers in column fish"
    assert expected_message in recorder.msg
Пример #3
0
def test_suppresses_denominator_if_its_small_enough():
    """Expect NaN for both denominator and value when litres is 4."""
    suppressing = Measure(
        "ignored-id",
        numerator="fish",
        denominator="litres",
        small_number_suppression=True,
    )
    frame = pandas.DataFrame(
        {"fish": [0], "litres": [4]},
        index=["bag"],
    )

    outcome = calculate(suppressing, frame)

    # Both the raw denominator and the derived value must be blanked out.
    for column in ("litres", "value"):
        assert numpy.isnan(outcome.loc["bag"][column])
Пример #4
0
def test_suppresses_small_numbers_in_the_numerator():
    """Expect NaN for both numerator and value when fish is 1."""
    suppressing = Measure(
        "ignored-id",
        numerator="fish",
        denominator="litres",
        small_number_suppression=True,
    )
    frame = pandas.DataFrame(
        {"fish": [1], "litres": [100]},
        index=["bowl"],
    )

    outcome = calculate(suppressing, frame)

    # Both the raw numerator and the derived value must be blanked out.
    for column in ("fish", "value"):
        assert numpy.isnan(outcome.loc["bowl"][column])
Пример #5
0
def test_throws_away_unused_columns():
    """Expect columns not named by the measure to be absent from the result.

    Checked twice: once for an ungrouped measure ("clothing" must vanish)
    and once for a measure grouped by "colour" ("age" must vanish).
    """
    ungrouped = Measure("ignored-id", numerator="fish", denominator="litres")
    frame = pandas.DataFrame(
        {
            "fish": [10],
            "litres": [1],
            "colour": ["green"],
            "clothing": ["trousers"],
        }
    )
    first_row = calculate(ungrouped, frame).iloc[0]
    assert "clothing" not in first_row

    grouped = Measure(
        "ignored-id",
        numerator="fish",
        denominator="litres",
        group_by="colour",
    )
    frame = pandas.DataFrame(
        {
            "fish": [10],
            "litres": [1],
            "colour": ["green"],
            "age": [12],
        }
    )
    first_row = calculate(grouped, frame).iloc[0]
    assert "age" not in first_row
Пример #6
0
def test_calculates_quotients():
    """Expect each row's value to equal fish divided by litres."""
    ratio = Measure("ignored-id", numerator="fish", denominator="litres")
    containers = ["small bowl", "large bowl", "pond"]
    frame = pandas.DataFrame(
        {"fish": [10, 20, 50], "litres": [1, 2, 100]},
        index=containers,
    )

    outcome = calculate(ratio, frame)

    expected_values = {"small bowl": 10.0, "large bowl": 10.0, "pond": 0.5}
    for container, quotient in expected_values.items():
        assert outcome.loc[container]["value"] == quotient
Пример #7
0
def test_groups_into_multiple_buckets():
    """Expect one result row, with its own value, per distinct colour."""
    grouped = Measure(
        "ignored-id",
        numerator="fish",
        denominator="litres",
        group_by="colour",
    )
    frame = pandas.DataFrame(
        {
            "fish": [10, 10],
            "litres": [1, 2],
            "colour": ["gold", "pink"],
        }
    )

    # Index by the grouping column so each bucket can be looked up by name.
    by_colour = calculate(grouped, frame).set_index("colour")

    assert by_colour.loc["gold"]["value"] == 10.0
    assert by_colour.loc["pink"]["value"] == 5.0
Пример #8
0
def test_reports_suppression_of_extra_values():
    """Expect a reporter message about additional suppression in "fish"."""
    suppressing = Measure(
        "ignored-id",
        numerator="fish",
        denominator="litres",
        small_number_suppression=True,
    )
    frame = pandas.DataFrame(
        {"fish": [2, 10, 8], "litres": [10, 10, 10]},
        index=["a", "b", "c"],
    )
    recorder = RecordingReporter()

    calculate(suppressing, frame, recorder)

    expected_message = "Additional suppression in column fish"
    assert expected_message in recorder.msg
Пример #9
0
def test_doesnt_suppress_zero_values():
    """Expect a zero numerator to survive with suppression enabled.

    The "bowl" row has zero fish; its numerator and value are both
    expected to still be 0 in the result.
    """
    suppressing = Measure(
        "ignored-id",
        numerator="fish",
        denominator="litres",
        small_number_suppression=True,
    )
    frame = pandas.DataFrame(
        {"fish": [0, 1], "litres": [100, 100]},
        index=["bowl", "bag"],
    )

    outcome = calculate(suppressing, frame)

    for column in ("fish", "value"):
        assert outcome.loc["bowl"][column] == 0
Пример #10
0
def test_suppresses_all_equal_extra_values_to_reach_threshold():
    """Expect every numerator to be NaN for fish counts of 1, 10 and 10."""
    suppressing = Measure(
        "ignored-id",
        numerator="fish",
        denominator="litres",
        small_number_suppression=True,
    )
    labels = ["a", "b", "c"]
    frame = pandas.DataFrame(
        {"fish": [1, 10, 10], "litres": [10, 10, 10]},
        index=labels,
    )

    outcome = calculate(suppressing, frame)

    for label in ("a", "b", "c"):
        assert numpy.isnan(outcome.loc[label]["fish"])
Пример #11
0
def test_suppresses_all_small_values_even_if_total_is_way_over_threshold():
    """Expect every numerator to be NaN when all four fish counts are 2."""
    suppressing = Measure(
        "ignored-id",
        numerator="fish",
        denominator="litres",
        small_number_suppression=True,
    )
    labels = ["a", "b", "c", "d"]
    frame = pandas.DataFrame(
        {"fish": [2, 2, 2, 2], "litres": [10, 10, 10, 10]},
        index=labels,
    )

    outcome = calculate(suppressing, frame)

    for label in ("a", "b", "c", "d"):
        assert numpy.isnan(outcome.loc[label]["fish"])
Пример #12
0
def test_groups_data_together():
    """Expect rows sharing a colour to be summed before the value is taken."""
    grouped = Measure(
        "ignored-id",
        numerator="fish",
        denominator="litres",
        group_by="colour",
    )
    frame = pandas.DataFrame(
        {
            "fish": [10, 20],
            "litres": [1, 2],
            "colour": ["gold", "gold"],
        },
        index=["small bowl", "large bowl"],
    )

    # Index by the grouping column so the single bucket can be looked up.
    by_colour = calculate(grouped, frame).set_index("colour")

    expected_gold = [("fish", 30), ("litres", 3), ("value", 10.0)]
    for column, expected in expected_gold:
        assert by_colour.loc["gold"][column] == expected
Пример #13
0
def test_suppresses_small_numbers_after_grouping():
    """Expect suppression to be decided on grouped totals.

    "gold" and "bronze" each total 4 fish and are expected to have a NaN
    value; "pink" totals 6 and is expected to keep its value of 1.0.
    """
    suppressing = Measure(
        "ignored-id",
        numerator="fish",
        denominator="litres",
        group_by="colour",
        small_number_suppression=True,
    )
    counts = [2, 2, 2, 2, 3, 3]
    frame = pandas.DataFrame(
        {
            "fish": counts,
            "litres": list(counts),
            "colour": ["gold", "gold", "bronze", "bronze", "pink", "pink"],
        }
    )

    by_colour = calculate(suppressing, frame).set_index("colour")

    for suppressed_colour in ("gold", "bronze"):
        assert numpy.isnan(by_colour.loc[suppressed_colour]["value"])
    assert by_colour.loc["pink"]["value"] == 1.0
Пример #14
0
def test_suppresses_small_numbers_at_threshold_in_the_numerator():
    """Expect numerators equal to the threshold to be suppressed.

    Rows whose fish count equals ``measure.SMALL_NUMBER_THRESHOLD`` are
    expected to come back as NaN, while the row one above the threshold is
    expected to keep its value of 1.0.
    """
    threshold = measure.SMALL_NUMBER_THRESHOLD
    just_above = measure.SMALL_NUMBER_THRESHOLD + 1

    suppressing = Measure(
        "ignored-id",
        numerator="fish",
        denominator="litres",
        small_number_suppression=True,
    )
    frame = pandas.DataFrame(
        {
            "fish": [threshold, threshold, just_above],
            "litres": [100, 100, just_above],
        },
        index=["bowl", "box", "bag"],
    )

    outcome = calculate(suppressing, frame)

    for at_threshold in ("bowl", "box"):
        assert numpy.isnan(outcome.loc[at_threshold]["fish"])
    assert outcome.loc["bag"]["value"] == 1.0
Пример #15
0
def test_groups_by_multiple_columns():
    """Expect one row per (colour, nationality) pair, in a fixed order.

    The three expected rows are checked by position, so the test also pins
    the order in which the grouped rows come back.
    """
    grouped = Measure(
        "ignored-id",
        numerator="fish",
        denominator="litres",
        group_by=["colour", "nationality"],
    )
    frame = pandas.DataFrame(
        {
            "fish": [10, 20, 40, 80],
            "litres": [1, 1, 1, 1],
            "colour": ["gold", "gold", "gold", "pink"],
            "nationality": ["russian", "japanese", "russian", "french"],
        }
    )

    outcome = calculate(grouped, frame)

    # (colour, nationality, summed fish) for each expected row position.
    expected_rows = [
        ("gold", "japanese", 20),
        ("gold", "russian", 50),
        ("pink", "french", 80),
    ]
    for position, (colour, nationality, fish) in enumerate(expected_rows):
        row = outcome.iloc[position]
        assert row["colour"] == colour
        assert row["nationality"] == nationality
        assert row["fish"] == fish
def test_stats_logging_generate_measures(mock_load, _mock_list, _mock_check,
                                         logger, tmp_path):
    """Check the stats logs emitted while generating two measures.

    Writes a four-patient input CSV into ``tmp_path``, runs
    ``generate_measures`` against it, then verifies the initial, timing and
    memory log entries captured by ``logger``.

    Args:
        mock_load: Mock whose ``return_value`` is set to the measures list.
            NOTE(review): presumably injected by a patch decorator that is
            outside this view -- confirm.
        _mock_list: Unused here. NOTE(review): presumably another patch mock.
        _mock_check: Unused here. NOTE(review): presumably another patch mock.
        logger: Object exposing the captured log ``entries``.
        tmp_path: Directory used for the input file and as ``output_dir``.
    """
    import csv

    from cohortextractor.measure import Measure

    measures = [
        Measure(
            id="has_code",
            numerator="has_code",
            denominator="population",
        ),
        Measure(
            id="has_code_one_group",
            numerator="has_code",
            denominator="population",
            group_by="population",
        ),
    ]

    mock_load.return_value = measures

    # initial stats
    expected_initial_logs = [{"measures_count": 2}]

    # set up an expected input file
    input_filepath = tmp_path / "input_2020-01-01.csv"
    # newline="" is what the csv module requires for files handed to
    # csv.writer; without it, platforms that translate "\n" emit "\r\r\n".
    with open(input_filepath, "w", newline="") as file_to_write:
        writer = csv.writer(file_to_write)
        writer.writerow(["patient_id", "has_code"])
        writer.writerow([1, 1])
        writer.writerow([2, 1])
        writer.writerow([3, 1])
        writer.writerow([4, 0])

    generate_measures(output_dir=tmp_path)

    stats_logs = get_stats_logs(logger.entries)
    memory_logs = get_logs_by_key(stats_logs, "memory")

    measure_date = "2020-01-01"
    expected_timing_logs = [
        dict(
            description="generate_measures",
            input_file="all",
            study_definition="study_definition",
            timing="start",
            state="started",
        ),
        dict(
            description="generate_measures",
            date=measure_date,
            input_file=str(input_filepath),
            study_definition="study_definition",
            timing="start",
            state="started",
        ),
        dict(
            description="Load patient dataframe for measures",
            date=measure_date,
            input_file=str(input_filepath),
            timing="start",
            state="started",
        ),
        dict(
            description="Load patient dataframe for measures",
            date=measure_date,
            input_file=str(input_filepath),
            timing="stop",
            state="ok",
        ),
        dict(
            description="Calculate measure",
            measure_id="has_code",
            date=measure_date,
            timing="start",
            state="started",
        ),
        dict(
            description="Calculate measure",
            measure_id="has_code",
            date=measure_date,
            timing="stop",
            state="ok",
        ),
        dict(
            description="Calculate measure",
            measure_id="has_code_one_group",
            date=measure_date,
            timing="start",
            state="started",
        ),
        dict(
            description="Calculate measure",
            measure_id="has_code_one_group",
            date=measure_date,
            timing="stop",
            state="ok",
        ),
        dict(
            description="generate_measures",
            date=measure_date,
            input_file=str(input_filepath),
            study_definition="study_definition",
            timing="stop",
            state="ok",
        ),
        dict(
            description="generate_measures",
            input_file="all",
            study_definition="study_definition",
            timing="stop",
            state="ok",
        ),
    ]
    # The memory entries passed here were read back from the logger itself;
    # their contents are checked field by field in the loop below.
    assert_stats_logs(logger, expected_initial_logs + memory_logs,
                      expected_timing_logs)

    # (dataframe, date, measure_id) expected for each memory log, in order.
    expected_memory_logs = [
        ("patient_df", measure_date, "has_code"),
        ("measure_df", measure_date, "has_code"),
        ("measure_df", measure_date, "has_code_one_group"),
    ]
    for i, memory_log in enumerate(memory_logs):
        # Distinct names so the outer ``measure_date`` is not rebound.
        expected_df, expected_date, expected_measure_id = (
            expected_memory_logs[i])
        assert memory_log["dataframe"] == expected_df
        assert memory_log["date"] == expected_date
        assert memory_log["measure_id"] == expected_measure_id