def test_dataframe_to_file_csv(filename, gzipped, read, tmp_path):
    rows = [
        ("patient_id", "age", "sex", "bmi", "stp", "date_admitted", "date_died"),
        (1, 20, "M", 18.5, "STP1", "2018-08-01", "2020-05"),
        (2, 38, "F", None, "STP2", "2019-12-12", "2020-06"),
        (3, 65, "M", 0, "STP2", "", "2020-07"),
        (4, 42, "F", 17.8, "", "2020-04-10", "2020-08"),
        (5, 18, "M", 26.2, "STP3", "2020-06-20", ""),
    ]
    covariate_definitions = {
        "population": ("satisfying", {"column_type": "bool"}),
        "age": ("age_as_of", {"column_type": "int"}),
        "sex": ("sex", {"column_type": "str"}),
        "bmi": ("bmi", {"column_type": "float"}),
        "stp": ("practice_as_of", {"column_type": "str"}),
        "date_admitted": ("admitted_to_hospital", {"column_type": "date"}),
        "date_died": ("with_death_recorded_in_cpns", {"column_type": "date"}),
    }
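    # Each covariate maps a column name to a (query name, keyword-arguments) pair;
    # the "column_type" kwarg determines the pandas dtype that dataframe_from_rows
    # assigns (see the dtype assertions in test_dataframe_from_rows below).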

    df = dataframe_from_rows(covariate_definitions, iter(rows))

    path = tmp_path / filename
    dataframe_to_file(df, path)

    if gzipped:
        # Check the output is valid gzip by decompressing it
        gzip.open(path).read()

    # Note: ideally we'd compare for equality, but that's hard to do because the
    # types and data can change depending on the output format. Instead, as a
    # basic correctness check, we verify that the serialised data can be loaded
    # back.
    read(path)
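
# A note on the test above: its filename, gzipped and read arguments are supplied by
# parametrization in the original test module, which is not shown here. A minimal
# sketch of plausible parameter sets, assuming pandas readers for each output format
# (these values are illustrative, not the module's own fixtures):
#
#     @pytest.mark.parametrize(
#         "filename,gzipped,read",
#         [
#             ("data.csv", False, pandas.read_csv),
#             ("data.csv.gz", True, pandas.read_csv),  # read_csv infers gzip from ".gz"
#             ("data.feather", False, pandas.read_feather),
#             ("data.dta", False, pandas.read_stata),
#         ],
#     )
#     def test_dataframe_to_file_csv(filename, gzipped, read, tmp_path):
#         ...
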
def test_validate_dummy_data_valid(file_format, tmpdir):
    rows = zip(
        ["patient_id", "11", "22"],
        ["sex", "F", "M"],
        ["age", 40, 50],
        ["has_event", True, False],
        ["event_date_day", "2021-01-01", None],
        ["event_date_month", "2021-01", None],
        ["event_date_year", "2021", None],
        ["category_date", "2020-10-15", "2021-11-16"],
    )
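    # zip() transposes the column-oriented lists above into row tuples: the first
    # yielded row is the header ("patient_id", "sex", "age", ...), followed by one
    # tuple per patient.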
    path = Path(tmpdir) / f"dummy-data.{file_format}"
    if is_csv_filename(path):
        write_rows_to_csv(rows, path)
    else:
        df = dataframe_from_rows(covariate_definitions_2, rows)
        dataframe_to_file(df, path)
    validate_dummy_data(covariate_definitions_2, path)


def test_validate_dummy_data_invalid_binary(file_format, subtests, tmpdir):
    # Create some dummy data based on the covariate definitions at the top of the
    # module.
    rows = zip(
        ["patient_id", "11", "22"],
        ["sex", "F", "M"],
        ["age", 40, 50],
        ["has_event", True, False],
        ["event_date_day", "2021-02-03", None],
        ["event_date_month", "2021-02", None],
        ["event_date_year", "2021", None],
    )
    df = dataframe_from_rows(covariate_definitions, rows)
    path = Path(tmpdir) / f"dummy-data.{file_format}"
    dataframe_to_file(df, path)

    # This checks that the dummy data containing the rows above is valid for our study
    # definition, which ensures that the DummyDataValidationErrors caught below are
    # legit.
    validate_dummy_data(covariate_definitions, path)

    # Create some invalid dummy data by taking the valid dummy data created above and
    # swapping each pair of covariate definitions in turn.  The data doesn't change,
    # but the covariates do.  This works because each column of the dummy data has a
    # different validator.
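    # For example, swapping "age" and "sex" means the string values "F"/"M" are
    # checked by the integer validator (and 40/50 by the sex validator), so
    # validation fails with "Invalid value".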
    for key1 in df.columns:
        if key1 == "patient_id":
            continue

        for key2 in df.columns:
            if key2 == "patient_id":
                continue

            if key1 >= key2:
                continue

            with subtests.test(f"{key1} {key2}"):
                new_covariate_definitions = covariate_definitions.copy()
                new_covariate_definitions[key1] = covariate_definitions[key2]
                new_covariate_definitions[key2] = covariate_definitions[key1]
                with pytest.raises(DummyDataValidationError, match="Invalid value"):
                    validate_dummy_data(new_covariate_definitions, path)


def test_dataframe_from_rows():
    rows = [
        ("patient_id", "age", "sex", "bmi", "stp", "date_admitted", "date_died"),
        (1, 20, "M", 18.5, "STP1", "2018-08-01", "2020-05"),
        (2, 38, "F", None, "STP2", "2019-12-12", "2020-06"),
        (3, 65, "M", 0, "STP2", "", "2020-07"),
        (4, 42, "F", 17.8, "", "2020-04-10", "2020-08"),
        (5, 18, "M", 26.2, "STP3", "2020-06-20", ""),
        (6, 44, "M", 14.2, "STP3", "9999-12-31", "9999-12-31 00:00:00"),
    ]
    covariate_definitions = {
        "population": ("satisfying", {"column_type": "bool"}),
        "age": ("age_as_of", {"column_type": "int"}),
        "sex": ("sex", {"column_type": "str"}),
        "bmi": ("bmi", {"column_type": "float"}),
        "stp": ("practice_as_of", {"column_type": "str"}),
        "date_admitted": ("admitted_to_hospital", {"column_type": "date"}),
        "date_died": ("with_death_recorded_in_cpns", {"column_type": "date"}),
    }
    df = dataframe_from_rows(covariate_definitions, iter(rows))

    expected = [
        {
            "patient_id": 1,
            "age": 20,
            "sex": "M",
            "bmi": 18.5,
            "stp": "STP1",
            "date_admitted": Timestamp("2018-08-01 00:00:00"),
            "date_died": Timestamp("2020-05-01 00:00:00"),
        },
        {
            "patient_id": 2,
            "age": 38,
            "sex": "F",
            "bmi": None,
            "stp": "STP2",
            "date_admitted": Timestamp("2019-12-12 00:00:00"),
            "date_died": Timestamp("2020-06-01 00:00:00"),
        },
        {
            "patient_id": 3,
            "age": 65,
            "sex": "M",
            "bmi": 0.0,
            "stp": "STP2",
            "date_admitted": NaT,
            "date_died": Timestamp("2020-07-01 00:00:00"),
        },
        {
            "patient_id": 4,
            "age": 42,
            "sex": "F",
            "bmi": 17.8,
            "stp": None,
            "date_admitted": Timestamp("2020-04-10 00:00:00"),
            "date_died": Timestamp("2020-08-01 00:00:00"),
        },
        {
            "patient_id": 5,
            "age": 18,
            "sex": "M",
            "bmi": 26.2,
            "stp": "STP3",
            "date_admitted": Timestamp("2020-06-20 00:00:00"),
            "date_died": NaT,
        },
        {
            "patient_id": 6,
            "age": 44,
            "sex": "M",
            "bmi": 14.2,
            "stp": "STP3",
            # check both date strings are clamped to max
            "date_admitted": Timestamp.max,
            "date_died": Timestamp.max,
        },
    ]

    # Faff: we can't do equality checks with NaN values, so we convert them to
    # None first
    records = [
        {
            k: v if not (type(v) is float and math.isnan(v)) else None
            for (k, v) in record.items()
        }
        for record in df.to_dict("records")
    ]
    assert records == expected
    assert df.patient_id.dtype == int
    assert df.age.dtype == int
    assert type(df.sex.dtype) == pandas.CategoricalDtype
    assert df.bmi.dtype == float
    assert type(df.stp.dtype) == pandas.CategoricalDtype
    assert df.date_admitted.dtype == numpy.dtype("datetime64[ns]")
    assert df.date_died.dtype == numpy.dtype("datetime64[ns]")
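

# The "clamped to max" assertions above rely on out-of-range dates like "9999-12-31"
# being truncated to Timestamp.max rather than raising OutOfBoundsDatetime. A minimal
# sketch of that idea, assuming a simple parse-then-clamp strategy (parse_date_clamped
# is a hypothetical helper for illustration, not the dataframe_from_rows
# implementation):
def parse_date_clamped(value):
    """Parse a date string, clamping dates beyond pandas' supported range."""
    import pandas

    if not value:
        # Empty strings become missing values, matching the NaT entries above
        return pandas.NaT
    try:
        return pandas.Timestamp(value)
    except pandas.errors.OutOfBoundsDatetime:
        # datetime64[ns] tops out at 2262-04-11, so later dates are clamped
        return pandas.Timestamp.max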