def test_make_df_from_expectations_with_categories_expression_validation():
    """Expectation ratios must use only categories defined in categorised_as.

    The ratios below include "C", which is not a key of the category
    expression ("A"/"B"/""), so building the dummy dataframe should raise
    ValueError.
    """
    study = StudyDefinition(
        population=patients.all(),
        category=patients.categorised_as(
            {"A": "sex = 'F'", "B": "sex = 'M'", "": "DEFAULT"},
            sex=patients.sex(),
            return_expectations={
                "rate": "exponential_increase",
                "incidence": 0.2,
                # "C" is not a defined category -> should trigger validation
                "category": {"ratios": {"A": 0.3, "B": 0.6, "C": 0.1}},
                "date": {"earliest": "1900-01-01", "latest": "today"},
            },
        ),
    )
    with pytest.raises(ValueError):
        study.make_df_from_expectations(10000)
# 示例#2 (example #2 — extraction separator, kept as a comment)
    def test_script():
        """Generate dummy data, then report which DB driver modules loaded."""
        import sys
        from cohortextractor import StudyDefinition, patients

        study = StudyDefinition(
            population=patients.all(),
            sex=patients.sex(
                return_expectations={
                    "rate": "universal",
                    "date": {"earliest": "1900-01-01", "latest": "today"},
                    "category": {"ratios": {"M": 0.49, "F": 0.51}},
                }),
        )
        # Writing to /dev/null: we only care about the import side effects
        study.to_csv("/dev/null", expectations_population=10)
        loaded = lambda name: "yes" if name in sys.modules else "no"
        pyodbc = loaded("pyodbc")
        ctds = loaded("ctds")
        print(f"pyodbc: {pyodbc}, ctds: {ctds}")
def test_make_df_from_expectations_with_categories_expression():
    """Dummy data for a categorised_as variable should respect the ratios."""
    study = StudyDefinition(
        population=patients.all(),
        category=patients.categorised_as(
            {"A": "sex = 'F'", "B": "sex = 'M'", "": "DEFAULT"},
            sex=patients.sex(),
            return_expectations={
                "rate": "exponential_increase",
                "incidence": 0.2,
                "category": {"ratios": {"A": 0.3, "B": 0.7}},
                "date": {"earliest": "1900-01-01", "latest": "today"},
            },
        ),
    )
    df = study.make_df_from_expectations(10000)
    counts = df.category.value_counts()
    # "B" has the larger ratio (0.7 vs 0.3) so it should dominate
    assert counts["A"] < counts["B"]
# 示例#4 (example #4 — extraction separator, kept as a comment)
def test_unrecognised_database_url_raises_error(monkeypatch):
    """An unknown DATABASE_URL scheme should fail fast with ValueError."""
    monkeypatch.setenv("DATABASE_URL", "unknown-db://localhost")
    with pytest.raises(ValueError):
        StudyDefinition(
            population=patients.all(),
            sex=patients.sex(),
            age=patients.age_as_of("2020-01-01"),
        )
# 示例#5 (example #5 — extraction separator, kept as a comment)
def test_errors_are_triggered_without_database_url(monkeypatch):
    """Column-reference errors surface even with no DATABASE_URL configured."""
    monkeypatch.delenv("DATABASE_URL", raising=False)
    with pytest.raises(KeyError):
        StudyDefinition(
            population=patients.satisfying("no_such_column AND missing_column"),
            sex=patients.sex(),
            age=patients.age_as_of("2020-01-01"),
        )
# 示例#6 (example #6 — extraction separator, kept as a comment)
def test_export_data_without_database_url_raises_error(tmp_path, monkeypatch):
    """Exporting real data needs a DATABASE_URL; without one we expect RuntimeError."""
    monkeypatch.delenv("DATABASE_URL", raising=False)
    study = StudyDefinition(
        population=patients.all(),
        sex=patients.sex(),
        age=patients.age_as_of("2020-01-01"),
    )
    with pytest.raises(RuntimeError):
        study.to_file(tmp_path / "dummy_data.csv")
# 示例#7 (example #7 — extraction separator, kept as a comment)
def test_sex_dtype_generation():
    """The sex column should be read back as a pandas categorical dtype."""
    study = StudyDefinition(population=patients.all(), sex=patients.sex())
    args = _converters_to_names(study.pandas_csv_args)
    expected = {
        "dtype": {"sex": "category"},
        "converters": {},
        "date_col_for": {},
        "parse_dates": [],
    }
    assert args == expected
# 示例#8 (example #8 — extraction separator, kept as a comment)
def test_syntax_errors_in_expressions_are_raised():
    """Malformed logic ("AND AND") in a satisfying() expression raises ValueError."""
    with pytest.raises(ValueError):
        StudyDefinition(
            population=patients.all(),
            status=patients.satisfying(
                "age > 70 AND AND sex = 'M'",
                sex=patients.sex(),
                age=patients.age_as_of("2010-01-01"),
            ),
        )
# 示例#9 (example #9 — extraction separator, kept as a comment)
def test_column_name_clashes_produce_errors():
    """Defining `age` both at top level and inside satisfying() must error."""
    with pytest.raises(ValueError):
        StudyDefinition(
            population=patients.all(),
            age=patients.age_as_of("2020-01-01"),
            status=patients.satisfying(
                "age > 70 AND sex = 'M'",
                sex=patients.sex(),
                # clashes with the top-level `age` above
                age=patients.age_as_of("2010-01-01"),
            ),
        )
def test_make_df_from_expectations_doesnt_alter_defaults():
    """Per-variable expectation overrides must not leak into the shared defaults.

    `sex_altered` overrides incidence (0.1); `sex_default` relies on the
    default incidence of 1.0, so it should contain no nulls afterwards.
    """
    study = StudyDefinition(
        default_expectations={
            "rate": "exponential_increase",
            "incidence": 1.0,
            "date": {"earliest": "1900-01-01", "latest": "today"},
            "category": {"ratios": {"M": 0.5, "F": 0.5}},
        },
        population=patients.all(),
        sex_altered=patients.sex(
            return_expectations={
                "incidence": 0.1,
                "category": {"ratios": {"M": 0.5, "F": 0.5}},
            }
        ),
        sex_default=patients.sex(
            return_expectations={"category": {"ratios": {"M": 0.5, "F": 0.5}}}
        ),
    )
    # Just ensuring no exception is raised
    result = study.make_df_from_expectations(10000)
    assert len(result[pd.isnull(result.sex_default)]) == 0
# 示例#11 (example #11 — extraction separator, kept as a comment)
def test_make_df_no_categories_validation_when_no_categories_in_definition():
    """Category-ratio validation is skipped when the variable defines no categories."""
    study = StudyDefinition(
        population=patients.all(),
        sex=patients.sex(
            return_expectations={
                "rate": "universal",
                "date": {
                    "earliest": "1900-01-01",
                    "latest": "today",
                },
                "category": {
                    "ratios": {"M": 0.49, "F": 0.51},
                },
            }
        ),
    )
    # Just ensuring no exception is raised
    study.make_df_from_expectations(10000)
# 示例#12 (example #12 — extraction separator, kept as a comment)
def test_create_dummy_data_works_without_database_url(tmp_path, monkeypatch):
    """Dummy data generation needs no database connection at all."""
    monkeypatch.delenv("DATABASE_URL", raising=False)
    study = StudyDefinition(
        population=patients.all(),
        sex=patients.sex(
            return_expectations={
                "rate": "universal",
                "date": {"earliest": "1900-01-01", "latest": "today"},
                "category": {"ratios": {"M": 0.49, "F": 0.51}},
            }
        ),
        age=patients.age_as_of(
            "2020-01-01",
            return_expectations={
                "rate": "universal",
                "date": {"earliest": "1900-01-01", "latest": "2020-01-01"},
                "int": {"distribution": "population_ages"},
            },
        ),
    )
    filename = tmp_path / "dummy_data.csv"
    study.to_file(filename, expectations_population=10)
    with open(filename) as f:
        rows = list(csv.DictReader(f))
    # One row per requested dummy patient, with both variables present
    assert len(rows) == 10
    header = rows[0].keys()
    assert "sex" in header
    assert "age" in header
# 示例#13 (example #13 — extraction separator, kept as a comment)
def test_to_file_with_dummy_data_file(tmp_path, file_format):
    """Dummy data produced by the expectations framework round-trips.

    Generate dummy data, feed the file back in via `dummy_data_file`, and
    check the output is byte-identical to the input.
    """
    cl = codelist(["12345"], system="snomed")
    study = StudyDefinition(
        default_expectations={
            "date": {"earliest": "2020-01-01", "latest": "today"},
        },
        population=patients.all(),
        sex=patients.sex(
            return_expectations={
                "category": {"ratios": {"F": 0.5, "M": 0.5}},
                "rate": "universal",
            },
        ),
        age=patients.age_as_of(
            "2020-01-01",
            return_expectations={
                "int": {"distribution": "population_ages"},
                "rate": "universal",
            },
        ),
        has_event=patients.with_these_clinical_events(
            cl,
            returning="binary_flag",
            return_expectations={"rate": "uniform", "incidence": 0.5},
        ),
        # Same event returned at three different date granularities
        event_date_day=patients.with_these_clinical_events(
            cl,
            returning="date",
            date_format="YYYY-MM-DD",
            return_expectations={"rate": "uniform", "incidence": 0.5},
        ),
        event_date_month=patients.with_these_clinical_events(
            cl,
            returning="date",
            date_format="YYYY-MM",
            return_expectations={"rate": "uniform", "incidence": 0.5},
        ),
        event_date_year=patients.with_these_clinical_events(
            cl,
            returning="date",
            date_format="YYYY",
            return_expectations={"rate": "uniform", "incidence": 0.5},
        ),
    )

    # Generate dummy data using the expectations framework
    dummy_data_file = tmp_path / f"dummy-data.{file_format}"
    study.to_file(dummy_data_file, expectations_population=10)

    # Use this dummy data
    output_file = tmp_path / f"output.{file_format}"
    study.to_file(output_file, dummy_data_file=dummy_data_file)

    # Check results: input and output files must match byte for byte
    with open(dummy_data_file, "rb") as f:
        dummy_data = f.read()
    with open(output_file, "rb") as f:
        expected_output = f.read()
    assert dummy_data == expected_output
# 示例#14 (example #14 — extraction separator, kept as a comment)
     ),
 ),
 age=patients.age_as_of(
     index_date,
     return_expectations={
         "int": {
             "distribution": "population_ages"
         },
         "incidence": 1
     },
 ),
 sex=patients.sex(return_expectations={
     "category": {
         "ratios": {
             "M": 0.49,
             "F": 0.51
         }
     },
     "incidence": 1
 }),
 date_death=patients.died_from_any_cause(
     between=[index_date, end_date],
     returning="date_of_death",
     date_format="YYYY-MM-DD",
     return_expectations={
         "incidence": 0.2,
     },
 ),
 death_category=patients.categorised_as(
     {
         "covid-death": "died_covid",
def test_stats_logging_generate_cohort(
    mock_load,
    _mock_list,
    _mock_check,
    tmp_path,
    logger,
    output_format,
    write_to_file_log,
):
    """Check the stats and timing log records emitted by a generate_cohort run.

    NOTE(review): relies on mock/fixture wiring (mock_load, logger,
    output_format, write_to_file_log) defined elsewhere in this module —
    not visible in this chunk; confirm before moving the test.
    """
    # The mocked loader hands generate_cohort a minimal two-variable study
    mock_load.return_value = StudyDefinition(
        default_expectations={
            "date": {
                "earliest": "1900-01-01",
                "latest": "today"
            },
        },
        population=patients.all(),
        sex=patients.sex(),
    )
    # The query counter is a global at the module level, so it isn't reset between tests
    # Find the next position (without incrementing it); this is the start of the test's timing logs
    start_counter = timing_log_counter.next

    generate_cohort(
        output_dir=tmp_path,
        expectations_population=None,
        dummy_data_file=None,
        output_format=output_format,
    )

    # initial stats
    expected_initial_study_def_logs = [
        # these 3 are logged from StudyDefinition instantiation
        # patient_id, population, sex - all from patient table, but we make one temp # table per variable
        {
            "output_column_count": 3,
            "table_count": 2,
            "table_joins_count": 1
        },
        {
            "variable_count": 2
        },  # population, sex
        {
            "variables_using_codelist_count": 0
        },
        # index_date_count logged from generate_cohort
        {
            "index_date_count": 0
        },
    ]

    # Expected timing records, in emission order; start/stop pairs share a timing_id
    expected_timing_log_params = [
        # logging the start of overall timing for the cohort generation
        dict(
            description="generate_cohort",
            study_definition="study_definition",
            index_date="all",
            timing="start",
            state="started",
            timing_id=start_counter,
        ),
        dict(
            description="generate_cohort",
            study_definition="study_definition",
            timing="start",
            state="started",
            timing_id=start_counter + 1,
        ),
        # logs in tpp_backend during query execution
        *_sql_execute_timing_logs(
            description="Query for sex",
            sql="SELECT * INTO #sex",
            timing_id=start_counter + 2,
        ),
        *_sql_execute_timing_logs(
            description="Query for population",
            sql="SELECT * INTO #population",
            timing_id=start_counter + 3,
        ),
        # logs specifically from study.to_file
        *_sql_execute_timing_logs(
            description="Writing results into #final_output",
            sql="SELECT * INTO #final_output",
            timing_id=start_counter + 4,
        ),
        *_sql_execute_timing_logs(
            description=None,
            sql="CREATE INDEX ix_patient_id ON #final_output",
            timing_id=start_counter + 5,
        ),
        # results are fetched in batches for writing
        dict(
            description=f"{write_to_file_log} {tmp_path}/input.{output_format}",
            timing="start",
            state="started",
            timing_id=start_counter + 6,
        ),
        *_sql_execute_timing_logs(
            description=None,
            sql="SELECT TOP 32000 * FROM #final_output",
            timing_id=start_counter + 7,
        ),
        dict(
            description="Fetch batched results ",
            timing="start",
            state="started",
            timing_id=start_counter + 8,
        ),
        dict(
            description="Fetch batched results ",
            timing="stop",
            state="ok",
            timing_id=start_counter + 8,
        ),
        dict(
            description=f"{write_to_file_log} {tmp_path}/input.{output_format}",
            timing="stop",
            state="ok",
            timing_id=start_counter + 6,
        ),
        *_sql_execute_timing_logs(
            description="Deleting '#final_output'",
            sql="DROP TABLE #final_output",
            timing_id=start_counter + 9,
        ),
        # logging the overall timing for the cohort generation
        dict(
            description="generate_cohort",
            study_definition="study_definition",
            timing="stop",
            state="ok",
            timing_id=start_counter + 1,
        ),
        dict(
            description="generate_cohort",
            study_definition="study_definition",
            index_date="all",
            timing="stop",
            state="ok",
            timing_id=start_counter,
        ),
    ]

    assert_stats_logs(logger, expected_initial_study_def_logs,
                      expected_timing_log_params)
# 示例#16 (example #16 — extraction separator, kept as a comment)
def test_to_file_with_expectations_population(tmp_path, file_format):
    """Dummy data generated via expectations_population passes dummy-data validation."""
    cl = codelist([("12345", "foo"), ("67890", "bar")], system="snomed")
    study = StudyDefinition(
        default_expectations={
            "date": {"earliest": "2020-01-01", "latest": "today"},
        },
        population=patients.all(),
        sex=patients.sex(
            return_expectations={
                "category": {"ratios": {"F": 0.5, "M": 0.5}},
                "rate": "universal",
            },
        ),
        age=patients.age_as_of(
            "2020-01-01",
            return_expectations={
                "int": {"distribution": "population_ages"},
                "rate": "universal",
            },
        ),
        has_event=patients.with_these_clinical_events(
            cl,
            returning="binary_flag",
            return_expectations={"rate": "uniform", "incidence": 0.5},
        ),
        # Same event returned at three different date granularities
        event_date_day=patients.with_these_clinical_events(
            cl,
            returning="date",
            date_format="YYYY-MM-DD",
            return_expectations={"rate": "uniform", "incidence": 0.5},
        ),
        event_date_month=patients.with_these_clinical_events(
            cl,
            returning="date",
            date_format="YYYY-MM",
            return_expectations={"rate": "uniform", "incidence": 0.5},
        ),
        event_date_year=patients.with_these_clinical_events(
            cl,
            returning="date",
            date_format="YYYY",
            return_expectations={"rate": "uniform", "incidence": 0.5},
        ),
        incomplete_categories=patients.with_these_clinical_events(
            cl,
            returning="category",
            return_expectations={
                "category": {"ratios": {"foo": 0.5, "bar": 0.5}},
                # Half the values here should be null
                "incidence": 0.5,
            },
        ),
    )

    dummy_data_file = tmp_path / f"dummy-data.{file_format}"
    study.to_file(dummy_data_file, expectations_population=100)
    # We reuse validate_dummy_data to check that the data generated by the
    # expectations framework is valid.
    validate_dummy_data(study.covariate_definitions, dummy_data_file)
def test_stats_logging_generate_cohort_with_index_dates(
        mock_load, _mock_list, _mock_check, logger, tmp_path):
    """Check stats/timing logs when generate_cohort runs over an index-date range.

    One full query/write cycle is expected per index date (three months),
    each wrapped by its own start/stop timing pair, inside one overall pair.
    NOTE(review): relies on mock/fixture wiring defined elsewhere in this
    module — not visible in this chunk.
    """
    mock_load.return_value = StudyDefinition(
        default_expectations={
            "date": {
                "earliest": "1900-01-01",
                "latest": "today"
            },
        },
        population=patients.all(),
        sex=patients.sex(),
    )
    # The query counter is a global at the module level, so it isn't reset between tests
    # Find the next position (without incrementing it); this is the start of the test's timing logs
    start_counter = timing_log_counter.next

    generate_cohort(
        output_dir=tmp_path,
        expectations_population=None,
        dummy_data_file=None,
        index_date_range="2020-01-01 to 2020-03-01 by month",
    )

    # Index dates are processed newest-first
    expected_index_dates = ["2020-03-01", "2020-02-01", "2020-01-01"]

    # initial stats
    expected_initial_study_def_logs = [
        # these 3 are logged from StudyDefinition instantiation
        {
            "variable_count": 2
        },  # population, sex
        {
            "variables_using_codelist_count": 0
        },
        # index_date_count logged from generate_cohort
        {
            "index_date_count": 3
        },
        {
            "min_index_date": "2020-01-01",
            "max_index_date": "2020-03-01"
        },
        # output_column/table/joins_count is logged in tpp_backend on backend instantiation so it's repeated for each index date
        *[{
            "output_column_count": 3,
            "table_count": 2,
            "table_joins_count": 1
        }] * 4,
        *[{
            "resetting_backend_index_date": ix_date
        } for ix_date in expected_index_dates],
    ]

    expected_timing_log_params = [
        # logging the start of overall timing for the cohort generation
        dict(
            description="generate_cohort",
            study_definition="study_definition_test",
            index_date="all",
            timing="start",
            state="started",
            timing_id=start_counter,
        )
    ]

    # find the value of the next counter, the start of the timing logs for the first index date
    next_counter = start_counter + 1

    # One block of expected timing logs per index date; SQL is only logged in
    # full the first time (is_truncated thereafter)
    for i, index_date in enumerate(expected_index_dates, start=1):
        expected_timing_log_params.extend([
            dict(
                description="generate_cohort",
                study_definition="study_definition_test",
                timing="start",
                state="started",
                timing_id=next_counter,
            ),
            # logs in tpp_backend during query execution
            *_sql_execute_timing_logs(
                description="Query for sex",
                sql="SELECT * INTO #sex",
                is_truncated=i != 1,
                timing_id=next_counter + 1,
            ),
            *_sql_execute_timing_logs(
                description="Query for population",
                sql="SELECT * INTO #population",
                is_truncated=i != 1,
                timing_id=next_counter + 2,
            ),
            # logs specifically from study.to_file
            *_sql_execute_timing_logs(
                description="Writing results into #final_output",
                sql="SELECT * INTO #final_output",
                is_truncated=i != 1,
                timing_id=next_counter + 3,
            ),
            *_sql_execute_timing_logs(
                description=None,
                sql="CREATE INDEX ix_patient_id ON #final_output",
                timing_id=next_counter + 4,
            ),
            # results are fetched in batches for writing
            dict(
                description=
                f"write_rows_to_csv {tmp_path}/input_test_{index_date}.csv",
                timing="start",
                state="started",
                timing_id=next_counter + 5,
            ),
            *_sql_execute_timing_logs(
                description=None,
                sql="SELECT TOP 32000 * FROM #final_output",
                timing_id=next_counter + 6,
            ),
            dict(
                description="Fetch batched results ",
                timing="start",
                state="started",
                timing_id=next_counter + 7,
            ),
            dict(
                description="Fetch batched results ",
                timing="stop",
                state="ok",
                timing_id=next_counter + 7,
            ),
            dict(
                description=
                f"write_rows_to_csv {tmp_path}/input_test_{index_date}.csv",
                timing="stop",
                state="ok",
                timing_id=next_counter + 5,
            ),
            *_sql_execute_timing_logs(
                description="Deleting '#final_output'",
                sql="DROP TABLE #final_output",
                is_truncated=i != 1,
                timing_id=next_counter + 8,
            ),
            # logging the overall timing for the cohort generation
            dict(
                description="generate_cohort",
                study_definition="study_definition_test",
                timing="stop",
                state="ok",
                timing_id=next_counter,
            ),
        ])
        # set next counter to one more than the max for this index date
        next_counter += 8 + 1

    # add the log for the end of overall timing for the cohort generation; this should have the same
    # id as the first timing log
    expected_timing_log_params.append(
        dict(
            description="generate_cohort",
            study_definition="study_definition_test",
            index_date="all",
            timing="stop",
            state="ok",
            timing_id=start_counter,
        ))
    assert_stats_logs(
        logger,
        expected_initial_study_def_logs,
        expected_timing_log_params,
    )
# 示例#18 (example #18 — extraction separator, kept as a comment)
             "ratios": {
                 "16 - under 40": 0.25,
                 "40 - under 50": 0.15,
                 "50 - under 65": 0.10,
                 "65 - under 75": 0.25,
                 "75 plus": 0.25,
             }
         },
     },
 ),
 sex=patients.sex(
     return_expectations={
         "rate": "universal",
         "category": {
             "ratios": {
                 "M": 0.39,
                 "F": 0.41,
                 "I": 0.1,
                 "U": 0.1
             }
         },
     }),
 stp=patients.registered_practice_as_of(
     "index_date",
     returning="stp_code",
     return_expectations={
         "category": {
             "ratios": {
                 "STP1": 0.5,
                 "STP2": 0.5
             }
         },
# 示例#19 (example #19 — extraction separator, kept as a comment)
from cohortextractor.cohortextractor import SUPPORTED_FILE_FORMATS
from cohortextractor.csv_utils import is_csv_filename, write_rows_to_csv
from cohortextractor.pandas_utils import dataframe_from_rows, dataframe_to_file
from cohortextractor.validate_dummy_data import (
    DummyDataValidationError,
    validate_dummy_data,
)

# Single-code SNOMED codelist shared by the fixtures defined below
cl = codelist(["12345"], system="snomed")

column_definitions = dict(
    default_expectations={"date": {"earliest": "2020-01-01", "latest": "today"}},
    population=patients.all(),
    sex=patients.sex(
        return_expectations={
            "category": {"ratios": {"F": 0.5, "M": 0.5}},
            "rate": "universal",
        },
    ),
    age=patients.age_as_of(
        "2020-01-01",
        return_expectations={
            "int": {"distribution": "population_ages"},
            "rate": "universal",
        },
    ),
    has_event=patients.with_these_clinical_events(
        cl,
        returning="binary_flag",
        return_expectations={"rate": "uniform", "incidence": 0.5},
    ),
    event_date_day=patients.with_these_clinical_events(