Exemplo n.º 1
0
def test_make_df_from_expectations_with_mean_recorded_value():
    """Dummy data for a mean recorded value should average close to the requested mean."""
    expected_mean = 35
    expectations = {
        "rate": "exponential_increase",
        "date": {"earliest": "1900-01-01", "latest": "today"},
        "incidence": 0.6,
        "float": {"distribution": "normal", "mean": expected_mean, "stddev": 10},
    }
    study = StudyDefinition(
        population=patients.all(),
        drug_x=patients.mean_recorded_value(
            codelist(["X"], system="ctv3"),
            on_most_recent_day_of_measurement=True,
            return_expectations=expectations,
        ),
    )

    df = study.make_df_from_expectations(10000)

    # The observed mean is truncated to an int before comparing, and must
    # land within 5 of the requested mean.
    observed_mean = int(df["drug_x"].mean())
    assert abs(expected_mean - observed_mean) < 5
Exemplo n.º 2
0
def test_categorical_clinical_events_without_date_dtype_generation():
    """A category-returning event column should only contribute a "category" dtype."""
    codes = codelist([("X", "Y")], system="ctv3")
    codes.has_categories = True

    study = StudyDefinition(
        population=patients.all(),
        ethnicity=patients.with_these_clinical_events(
            codes,
            returning="category",
            find_last_match_in_period=True,
        ),
    )

    # No date column is requested, so no converters or date parsing expected.
    expected_csv_args = {
        "converters": {},
        "date_col_for": {},
        "dtype": {"ethnicity": "category"},
        "parse_dates": [],
    }
    assert _converters_to_names(study.pandas_csv_args) == expected_csv_args
Exemplo n.º 3
0
def test_mean_recorded_value_dtype_generation():
    """A mean recorded value with a date_of companion yields float dtype plus date parsing."""
    bp_codes = codelist(["X"], system="ctv3")
    variables = {
        "bp_sys": patients.mean_recorded_value(
            bp_codes,
            on_most_recent_day_of_measurement=True,
            on_or_before="2020-02-01",
        ),
        "bp_sys_date_measured": patients.date_of("bp_sys", date_format="YYYY-MM"),
    }
    study = StudyDefinition(population=patients.all(), **variables)

    expected_csv_args = {
        "converters": {"bp_sys_date_measured": "add_day_to_date"},
        "dtype": {"bp_sys": "float"},
        "date_col_for": {"bp_sys": "bp_sys_date_measured"},
        "parse_dates": ["bp_sys_date_measured"],
    }
    assert _converters_to_names(study.pandas_csv_args) == expected_csv_args
Exemplo n.º 4
0
def test_make_df_from_expectations_returning_date_using_defaults():
    """Dates generated purely from default expectations should reach back before 1960."""
    study = StudyDefinition(
        default_expectations={
            "date": {"earliest": "1900-01-01", "latest": "today"},
            "rate": "exponential_increase",
            "incidence": 0.2,
        },
        population=patients.all(),
        asthma_condition=patients.with_these_clinical_events(
            codelist(["X"], system="ctv3"),
            returning="date",
            find_first_match_in_period=True,
            date_format="YYYY-MM-DD",
        ),
    )
    population_size = 10000
    result = study.make_df_from_expectations(population_size)
    # Read the column by name instead of `frame.min()[0]`: an integer key on a
    # label-indexed Series is a deprecated positional lookup in pandas.
    earliest = result["asthma_condition"].dropna().min()
    # ISO-formatted date strings order the same way as the dates they encode.
    assert earliest < "1960-01-01"
def test_stats_logging_with_message_handle_exception(mock_regex, logger):
    """An exception raised while handling SQL Server messages must not break the study run."""
    # Make every regex match attempt blow up.
    mock_regex.match.side_effect = Exception("message error")

    StudyDefinition(
        population=patients.all(),
        event=patients.with_these_clinical_events(codelist(["A"], "snomed")),
    ).to_dicts()

    stats_entries = get_stats_logs(logger.entries)
    timing_entries = get_logs_by_key(stats_entries, "timing_id")
    sqlserver_entries = get_stats_logs(logger.entries, event="sqlserver-stats")

    # The study still ran, so the usual cohortextractor-stats timing logs exist.
    assert len(timing_entries) > 0

    # Every sqlserver-stats entry is just the logged error.
    for entry in sqlserver_entries:
        assert entry["description"] == "Exception in SQL server message handling"
        assert str(entry["exc_info"]) == "message error"
Exemplo n.º 6
0
def test_bmi_dtype_generation():
    """A BMI column with a date_of companion yields float dtype plus date parsing."""
    # The categorised codelist the original built here was never used by the
    # study definition, so it has been dropped.
    study = StudyDefinition(
        population=patients.all(),
        bmi=patients.most_recent_bmi(
            on_or_after="2010-02-01", minimum_age_at_measurement=16,
        ),
        bmi_date_measured=patients.date_of("bmi", date_format="YYYY-MM"),
    )

    result = _converters_to_names(study.pandas_csv_args)
    assert result == {
        "converters": {"bmi_date_measured": "add_day_to_date"},
        "dtype": {"bmi": "float"},
        "date_col_for": {"bmi": "bmi_date_measured"},
        "parse_dates": ["bmi_date_measured"],
    }
Exemplo n.º 7
0
def test_clinical_events_numeric_value_dtype_generation():
    """A numeric-value event column with a date_of companion yields float dtype plus date parsing."""
    creatinine_codes = codelist(["X"], system="ctv3")
    variables = {
        "creatinine": patients.with_these_clinical_events(
            creatinine_codes,
            find_last_match_in_period=True,
            on_or_before="2020-02-01",
            returning="numeric_value",
        ),
        "creatinine_date": patients.date_of("creatinine", date_format="YYYY-MM"),
    }
    study = StudyDefinition(population=patients.all(), **variables)

    expected_csv_args = {
        "converters": {"creatinine_date": "add_day_to_date"},
        "dtype": {"creatinine": "float"},
        "date_col_for": {"creatinine": "creatinine_date"},
        "parse_dates": ["creatinine_date"],
    }
    assert _converters_to_names(study.pandas_csv_args) == expected_csv_args
Exemplo n.º 8
0
def test_validate_category_expectations():
    """Expected category ratios must name categories known to the codelist or definitions."""
    categorised = codelist([("X", "Y")], system="ctv3")
    categorised.has_categories = True
    definitions = {"A": "sex = 'F'", "B": "sex = 'M'"}
    study = StudyDefinition(population=patients.all())

    def ratios_for(category):
        return {"category": {"ratios": {category: 1}}}

    # Each case: (category sources, category that must be rejected,
    # category that must be accepted).
    cases = [
        # validate against codelists
        ({"codelist": categorised}, "X", "Y"),
        # validate against definitions
        ({"category_definitions": definitions}, "X", "A"),
        # supplied category definitions override categories in codelists
        ({"codelist": categorised, "category_definitions": definitions}, "Y", "A"),
    ]

    for sources, rejected, accepted in cases:
        with pytest.raises(ValueError):
            study.validate_category_expectations(
                return_expectations=ratios_for(rejected), **sources
            )
        study.validate_category_expectations(
            return_expectations=ratios_for(accepted), **sources
        )
def test_clinical_events_with_date_dtype_generation():
    """A YYYY-MM event date column should be parsed as a date via add_day_to_date."""
    diabetes = patients.with_these_clinical_events(
        codelist(["X"], system="ctv3"),
        return_first_date_in_period=True,
        date_format="YYYY-MM",
    )
    study = StudyDefinition(population=patients.all(), diabetes=diabetes)

    expected_csv_args = {
        "converters": {"diabetes": "add_day_to_date"},
        "date_col_for": {},
        "dtype": {},
        "parse_dates": ["diabetes"],
    }
    assert _converters_to_names(study.pandas_csv_args) == expected_csv_args
Exemplo n.º 10
0
def test_make_df_from_expectations_with_date_filter():
    """Generated dates must not fall after the end of the `between` filter."""
    study = StudyDefinition(
        population=patients.all(),
        asthma_condition=patients.with_these_clinical_events(
            codelist(["X"], system="ctv3"),
            between=["2001-12-01", "2002-06-01"],
            returning="date",
            return_expectations={
                "rate": "exponential_increase",
                "incidence": 0.2,
                "date": {"earliest": "1900-01-01", "latest": "today"},
            },
            find_first_match_in_period=True,
            date_format="YYYY-MM-DD",
        ),
    )
    population_size = 10000
    result = study.make_df_from_expectations(population_size)
    # `Index == list` compares element-wise; it only yields a usable truth
    # value when there is exactly one column. Compare plain lists instead.
    assert list(result.columns) == ["asthma_condition"]
    # Read the column by name instead of `frame.max()[0]`: an integer key on a
    # label-indexed Series is a deprecated positional lookup in pandas.
    latest = result["asthma_condition"].dropna().max()
    assert latest <= "2002-06-01"
Exemplo n.º 11
0
def test_make_df_from_expectations_partial_default_overrides():
    """A per-variable "latest" date should override just that part of the defaults."""
    defaults = {
        "date": {"earliest": "1900-01-01", "latest": "today"},
        "rate": "exponential_increase",
        "incidence": 0.2,
    }
    # Only the upper date bound is overridden for this variable.
    latest_override = {"date": {"latest": "2000-01-01"}}

    study = StudyDefinition(
        default_expectations=defaults,
        population=patients.all(),
        asthma_condition=patients.with_these_clinical_events(
            codelist(["X"], system="ctv3"),
            returning="date",
            find_first_match_in_period=True,
            date_format="YYYY",
            return_expectations=latest_override,
        ),
    )

    df = study.make_df_from_expectations(10000)
    years = df.asthma_condition.astype("float")
    assert years.max() == 2000
Exemplo n.º 12
0
def test_make_df_from_expectations_with_categories_in_codelist_validation():
    """Expected categories that are absent from the codelist should raise ValueError."""
    # The codelist's only category is "Y"; the ratios below name "A" and "B".
    codes = codelist([("X", "Y")], system="ctv3")
    codes.has_categories = True
    mismatched_expectations = {
        "rate": "exponential_increase",
        "incidence": 0.2,
        "category": {"ratios": {"A": 0.3, "B": 0.7}},
        "date": {"earliest": "1900-01-01", "latest": "today"},
    }

    study = StudyDefinition(
        population=patients.all(),
        ethnicity=patients.with_these_clinical_events(
            codes,
            returning="category",
            return_expectations=mismatched_expectations,
            find_last_match_in_period=True,
        ),
    )

    with pytest.raises(ValueError):
        study.make_df_from_expectations(10000)
Exemplo n.º 13
0
def test_booleans_correctly_handled_in_dummy_data(tmp_path, file_format):
    """Binary-flag dummy data should contain both boolean values in every file format.

    The representation read back depends on the format: "0"/"1" strings for
    CSV, real booleans for feather, and 0/1 integers for Stata.
    """
    cl = codelist(["12345"], system="snomed")
    study = StudyDefinition(
        default_expectations={
            "date": {
                "earliest": "2020-01-01",
                "latest": "today"
            }
        },
        population=patients.all(),
        has_event=patients.with_these_clinical_events(
            cl,
            returning="binary_flag",
            return_expectations={
                "rate": "uniform",
                "incidence": 0.5
            },
        ),
    )

    filename = tmp_path / f"dummy-data.{file_format}"
    study.to_file(filename, expectations_population=100)

    if file_format in ("csv", "csv.gz"):
        df = pandas.read_csv(filename, dtype=str)
        bools = ("0", "1")
    elif file_format == "feather":
        df = pandas.read_feather(filename)
        bools = (True, False)
    elif file_format in ("dta", "dta.gz"):
        df = pandas.read_stata(filename)
        bools = (0, 1)
    else:
        # Raise explicitly: `assert False` is removed when Python runs with -O,
        # which would let an unhandled format fall through silently.
        raise AssertionError(f"Unhandled format: {file_format}")

    # Check we've got at least some of each value
    counts = df.has_event.value_counts()
    assert counts[bools[0]] > 10
    assert counts[bools[1]] > 10
Exemplo n.º 14
0
def test_make_df_from_expectations_with_categories():
    """Generated categories should follow the requested ratios (fewer "A" than "B")."""
    categorised_codelist = codelist([("1", "A"), ("2", "B")], system="ctv3")
    categorised_codelist.has_categories = True
    study = StudyDefinition(
        population=patients.all(),
        ethnicity=patients.with_these_clinical_events(
            categorised_codelist,
            returning="category",
            return_expectations={
                "rate": "exponential_increase",
                "incidence": 0.2,
                "category": {"ratios": {"A": 0.3, "B": 0.7}},
                "date": {"earliest": "1900-01-01", "latest": "today"},
            },
            find_last_match_in_period=True,
        ),
    )
    population_size = 10000
    result = study.make_df_from_expectations(population_size)
    # `Index == list` compares element-wise; it only yields a usable truth
    # value when there is exactly one column. Compare plain lists instead.
    assert list(result.columns) == ["ethnicity"]

    category_counts = result.reset_index().groupby("ethnicity").count()
    # `.iloc[0]` takes the first count column explicitly; a bare `[0]` on a
    # label-indexed Series is a deprecated positional lookup in pandas.
    count_a = category_counts.loc["A", :].iloc[0]
    count_b = category_counts.loc["B", :].iloc[0]
    assert count_a < count_b
Exemplo n.º 15
0
)

# Codelists loaded from CSV files under codelists/. `system` names the coding
# system and `column` the CSV column passed to codelist_from_csv.
chronic_cardiac_disease_codes = codelist_from_csv(
    "codelists/opensafely-chronic-cardiac-disease.csv",
    system="ctv3",
    column="CTV3ID")
chronic_liver_disease_codes = codelist_from_csv(
    "codelists/opensafely-chronic-liver-disease.csv",
    system="ctv3",
    column="CTV3ID")
salbutamol_codes = codelist_from_csv(
    "codelists/opensafely-asthma-inhaler-salbutamol-medication.csv",
    system="snomed",
    column="id",
)
# Single-code CTV3 codelists defined inline rather than loaded from CSV.
systolic_blood_pressure_codes = codelist(["2469."], system="ctv3")
diastolic_blood_pressure_codes = codelist(["246A."], system="ctv3")

# NOTE(review): this statement looks like two snippets spliced together. The
# trailing `column="CTV3ID"` keyword matches the codelist_from_csv calls
# around it rather than a StudyDefinition argument; confirm against the
# original file before relying on it.
study = StudyDefinition(
    index_date="2020-02-01",
    # Configure the expectations framework
    default_expectations={
        "date": {
            "earliest": "1900-01-01",
            "latest": "index_date"
        },
        "rate": "exponential_increase",
    },
    # This line defines the study population
    population=patients.registered_with_one_practice_between(
        "index_date - 1 year", "index_date"),
    column="CTV3ID",
)

# CTV3 codelists loaded from CSV files under codelists/, all passing the
# "CTV3ID" column to codelist_from_csv.
# NOTE(review): chronic_cardiac_disease_codes is also assigned earlier in this
# file with the same arguments; one of the two is probably redundant.
chronic_cardiac_disease_codes = codelist_from_csv(
    "codelists/opensafely-chronic-cardiac-disease.csv",
    system="ctv3",
    column="CTV3ID",
)

diabetes_codes = codelist_from_csv(
    "codelists/opensafely-diabetes.csv",
    system="ctv3",
    column="CTV3ID",
)

# Two inline CTV3 code sets for HbA1c, kept as separate "new" and "old" lists.
hba1c_new_codes = codelist(["XaPbt", "Xaeze", "Xaezd"], system="ctv3")
hba1c_old_codes = codelist(["X772q", "XaERo", "XaERp"], system="ctv3")

hypertension_codes = codelist_from_csv(
    "codelists/opensafely-hypertension.csv",
    system="ctv3",
    column="CTV3ID",
)

chronic_respiratory_disease_codes = codelist_from_csv(
    "codelists/opensafely-chronic-respiratory-disease.csv",
    system="ctv3",
    column="CTV3ID",
)

copd_codes = codelist_from_csv(
from cohortextractor import (
    codelist,
    codelist_from_csv,
)

# Inline ICD-10 codelists: the combined list holds both codes, and the
# "confirmed" and "suspected" lists hold one code each.
covid_codelist = codelist(["U071", "U072"], system="icd10")

confirmed_covid_codelist = codelist(["U071"], system="icd10")

suspected_covid_codelist = codelist(["U072"], system="icd10")

# Primary-care COVID identification codelists (CTV3), loaded from CSV.
covid_primary_care_positive_test=codelist_from_csv(
    "codelists/opensafely-covid-identification-in-primary-care-probable-covid-positive-test.csv",
    system="ctv3", 
    column="CTV3ID",
)

covid_primary_care_code=codelist_from_csv(
    "codelists/opensafely-covid-identification-in-primary-care-probable-covid-clinical-code.csv",
    system="ctv3", 
    column="CTV3ID",
)

# NOTE(review): the variable name spells "sequalae" while the CSV file name
# spells "sequelae"; renaming would affect callers, so it is left as is.
covid_primary_care_sequalae=codelist_from_csv(
    "codelists/opensafely-covid-identification-in-primary-care-probable-covid-sequelae.csv",
    system="ctv3",
    column="CTV3ID",
)

covid_primary_care_exposure = codelist_from_csv(
    "codelists/opensafely-covid-identification-in-primary-care-exposure-to-disease.csv", 
def test_vaccination_events_sql():
    """vaccination_events_sql should only return events for patients born in range.

    Three patients are stored; the expected rows all belong to the one whose
    date of birth lies inside `date_of_birth_range`, one row per matching
    vaccination, medication issue and coded event.
    """

    def infanrix_on(date):
        return [Vaccination(VaccinationName="Infanrix Hexa", VaccinationDate=date)]

    # This patient is too old and should be ignored
    too_old = Patient(
        DateOfBirth="2002-05-04",
        Vaccinations=infanrix_on("2002-06-01"),
    )
    # This patient is too young and should be ignored
    too_young = Patient(
        DateOfBirth="2019-10-04",
        Vaccinations=infanrix_on("2019-11-04"),
    )
    in_range = Patient(
        DateOfBirth="2018-10-28",
        Vaccinations=infanrix_on("2018-11-01"),
        MedicationIssues=[
            MedicationIssue(
                MedicationDictionary=MedicationDictionary(
                    DMD_ID="123", MultilexDrug_ID="123"
                ),
                ConsultationDate="2019-01-01",
            ),
        ],
        CodedEvents=[CodedEvent(CTV3Code="abc", ConsultationDate="2019-06-01")],
    )

    db = make_session()
    db.add_all([too_old, too_young, in_range])
    db.commit()

    query = vaccination_events_sql(
        date_of_birth_range=("2012-01-01", "2019-06-01"),
        tpp_vaccination_codelist=codelist(
            [("Infanrix Hexa", "dtap_hex")],
            system="tpp_vaccines",
        ),
        ctv3_codelist=codelist([("abc", "menb")], system="ctv3"),
        snomed_codelist=codelist([("123", "rotavirus")], system="snomed"),
    )

    # Rows come back ordered by patient ID, but the order within a patient is
    # arbitrary, so sort before comparing.
    observed = sorted(
        (row["date_given"], row["vaccine_name"]) for row in sql_to_dicts(query)
    )
    assert observed == [
        ("2018-11-01", "dtap_hex"),
        ("2019-01-01", "rotavirus"),
        ("2019-06-01", "menb"),
    ]
from cohortextractor import (codelist, codelist_from_csv)

# Vaccination doses
# NOTE(review): every call below passes a bare string to codelist(), whereas
# other codelist() calls in this file pass a list of codes. Confirm these
# "*_COD" names are meant to be passed this way (they look like placeholders).

# Any-product first and second dose codes.
first_dose_code = codelist("COVRX1_COD", system="snomed")
second_dose_code = codelist("COVRX2_COD", system="snomed")

# First/second dose codes per product prefix (az, pf, mo, nx, jn, gs, vl).
az_first_dose_code = codelist("AZD1RX_COD", system="snomed")
az_second_dose_code = codelist("AZD2RX_COD", system="snomed")

pf_first_dose_code = codelist("PFD1RX_COD", system="snomed")
pf_second_dose_code = codelist("PFD2RX_COD", system="snomed")

mo_first_dose_code = codelist("MOD1RX_COD", system="snomed")
mo_second_dose_code = codelist("MOD2RX_COD", system="snomed")

nx_first_dose_code = codelist("NXD1RX_COD", system="snomed")
nx_second_dose_code = codelist("NXD2RX_COD", system="snomed")

jn_first_dose_code = codelist("JND1RX_COD", system="snomed")
jn_second_dose_code = codelist("JND2RX_COD", system="snomed")

gs_first_dose_code = codelist("GSD1RX_COD", system="snomed")
gs_second_dose_code = codelist("GSD2RX_COD", system="snomed")

vl_first_dose_code = codelist("VLD1RX_COD", system="snomed")
vl_second_dose_code = codelist("VLD2RX_COD", system="snomed")

# Risk groups
chd_code = codelist("CHD_COV_COD", system="snomed")
resp_code = codelist("RESP_COV_COD", system="snomed")
ckd_code = codelist("CKD_COV_COD", system="snomed")
Exemplo n.º 20
0
def test_to_file_with_dummy_data_file(tmp_path, file_format):
    """Re-exporting from a dummy data file should reproduce that file byte-for-byte."""
    cl = codelist(["12345"], system="snomed")

    def half_incidence():
        # A fresh dict per variable, exactly as if each were written out inline.
        return {"rate": "uniform", "incidence": 0.5}

    def event_date(date_format):
        return patients.with_these_clinical_events(
            cl,
            returning="date",
            date_format=date_format,
            return_expectations=half_incidence(),
        )

    study = StudyDefinition(
        default_expectations={"date": {"earliest": "2020-01-01", "latest": "today"}},
        population=patients.all(),
        sex=patients.sex(
            return_expectations={
                "category": {"ratios": {"F": 0.5, "M": 0.5}},
                "rate": "universal",
            },
        ),
        age=patients.age_as_of(
            "2020-01-01",
            return_expectations={
                "int": {"distribution": "population_ages"},
                "rate": "universal",
            },
        ),
        has_event=patients.with_these_clinical_events(
            cl,
            returning="binary_flag",
            return_expectations=half_incidence(),
        ),
        event_date_day=event_date("YYYY-MM-DD"),
        event_date_month=event_date("YYYY-MM"),
        event_date_year=event_date("YYYY"),
    )

    # First export: dummy data generated by the expectations framework.
    generated_path = tmp_path / f"dummy-data.{file_format}"
    study.to_file(generated_path, expectations_population=10)

    # Second export: feed that file back in as the dummy data source.
    roundtrip_path = tmp_path / f"output.{file_format}"
    study.to_file(roundtrip_path, dummy_data_file=generated_path)

    # Both files must have identical contents.
    with open(generated_path, "rb") as generated, open(roundtrip_path, "rb") as roundtrip:
        generated_bytes = generated.read()
        roundtrip_bytes = roundtrip.read()

    assert generated_bytes == roundtrip_bytes
from cohortextractor import codelist, codelist_from_csv
# Outcome codelists. Names without a suffix use CTV3 codes; the "_hospital"
# variants (and aki_codes) use ICD-10 codes.
stroke = codelist_from_csv(
    "codelists/opensafely-incident-non-traumatic-stroke.csv",
    system="ctv3",
    column="CTV3ID",
)
stroke_hospital = codelist_from_csv(
    "codelists/opensafely-stroke-secondary-care.csv",
    system="icd10",
    column="icd")
# Inline ICD-10 list: N17 and its N170-N179 sub-codes listed here.
aki_codes = codelist(["N17", "N170", "N171", "N172", "N178", "N179"],
                     system="icd10")
# Note: this CSV uses a "CTV3Code" column, unlike the "CTV3ID" used elsewhere.
mi_codes = codelist_from_csv(
    "codelists/opensafely-myocardial-infarction-2.csv",
    system="ctv3",
    column="CTV3Code",
)
# Loaded from the shared cardiovascular secondary-care CSV, with the "mi"
# column passed as the category column.
mi_codes_hospital = codelist_from_csv(
    "codelists/opensafely-cardiovascular-secondary-care.csv",
    system="icd10",
    column="icd",
    category_column="mi",
)
heart_failure_codes = codelist_from_csv(
    "codelists/opensafely-heart-failure.csv",
    system="ctv3",
    column="CTV3ID",
)
heart_failure_codes_hospital = codelist_from_csv(
    "codelists/opensafely-cardiovascular-secondary-care.csv",
    system="icd10",
Exemplo n.º 22
0
 def study():
     """Build a study whose date columns feed several nested minimum_of variables."""

     def event_date(code, column):
         # Date of a matching CTV3 event; the date format for each column is
         # looked up in `inconsistent_date_formats`, defaulting to YYYY-MM-DD.
         return patients.with_these_clinical_events(
             codelist([code], system="ctv3"),
             returning="date",
             date_format=inconsistent_date_formats.get(column, "YYYY-MM-DD"),
         )

     expectations = {
         "rate": "exponential_increase",
         "incidence": 0.2,
         "date": {"earliest": "1900-01-01", "latest": "today"},
     }

     return StudyDefinition(
         default_expectations=expectations,
         population=patients.all(),
         date_1=event_date("A", "date_1"),
         # Mixes a reference to an existing column with an inline definition.
         first_min_date=patients.minimum_of(
             "date_1",
             date_2=event_date("B", "date_2"),
         ),
         # Both inputs defined inline.
         second_min_date=patients.minimum_of(
             date_3=event_date("Y", "date_3"),
             date_4=event_date("Z", "date_4"),
         ),
         third_min_date=patients.minimum_of(
             date_5=event_date("Y", "date_5"),
             date_6=event_date("Z", "date_6"),
         ),
         # Minimums taken over other minimum_of columns.
         min_of_second_and_third=patients.minimum_of(
             "second_min_date", "third_min_date"
         ),
         min_overall=patients.minimum_of(
             "min_of_second_and_third", "first_min_date"
         ),
         min_date_1_third_min=patients.minimum_of("date_1", "third_min_date"),
     )
Exemplo n.º 23
0
from cohortextractor import(
    codelist, 
    codelist_from_csv,
)

# Inline ICD-10 codelist holding both codes.
covid_codelist = codelist(["U071", "U072"], system = "icd10")


# https://codelists.opensafely.org/codelist/opensafely/severe-and-profound-learning-disability-flags/44ef542a/
# Note: this CSV's code column is named "code", not "CTV3ID" as used below.
severe_and_profound_learning_disability_codes = codelist_from_csv(
    "codelists/opensafely-severe-and-profound-learning-disability-flags-44ef542a.csv", 
    system = "ctv3", 
    column = "code",
)



# https://codelists.opensafely.org/codelist/opensafely/intellectual-disability-including-downs-syndrome/2020-08-27/
intellectual_disability_including_downs_syndrome_codes = codelist_from_csv(
    "codelists/opensafely-intellectual-disability-including-downs-syndrome-2020-08-27.csv",
    system="ctv3",
    column="CTV3ID",
)



def test_stats_logging_tpp_backend(logger):
    """Running a study on the TPP backend should emit the expected stats and timing logs."""
    # The query counter is a module-level global that is not reset between
    # tests, so read (without incrementing) the position where this test's
    # timing logs will start.
    first_timing_id = timing_log_counter.next

    StudyDefinition(
        population=patients.all(),
        event=patients.with_these_clinical_events(codelist(["A"], "snomed")),
    ).to_dicts()

    # Initial stats.
    initial_stats = [
        # Output columns include patient_id and the 2 variables defined in
        # the study definition.
        # Tables: Patient, plus the temp event table for the codelist.
        {"output_column_count": 3, "table_count": 2, "table_joins_count": 1},
        {"variable_count": 2},
        {"variables_using_codelist_count": 1},
        {"variable_using_codelist": "event", "codelist_size": 1},
    ]

    # Timing stats: statements logged in tpp_backend during query execution,
    # listed in order as (description, sql, extra keyword arguments).
    timed_statements = [
        ("Uploading codelist for event", "CREATE TABLE #tmp1_event_codelist", {}),
        (
            None,
            "INSERT INTO #tmp1_event_codelist (code, category) VALUES\n[truncated]",
            {"is_truncated": True},
        ),
        ("Query for event", "SELECT * INTO #event", {}),
        ("Query for population", "SELECT * INTO #population", {}),
        (
            "Join all columns for final output",
            "JOIN #event ON #event.patient_id = #population.patient_id",
            {},
        ),
    ]
    timing_params = []
    for offset, (description, sql, extra) in enumerate(timed_statements):
        # Timing ids count up by one per statement from the starting position.
        timing_params.extend(
            _sql_execute_timing_logs(
                description=description,
                sql=sql,
                timing_id=first_timing_id + offset,
                **extra,
            )
        )

    assert_stats_logs(logger, initial_stats, timing_params, downloaded=False)
Exemplo n.º 25
0
from pathlib import Path

import pytest

from cohortextractor import StudyDefinition, codelist, patients
from cohortextractor.cohortextractor import SUPPORTED_FILE_FORMATS
from cohortextractor.csv_utils import is_csv_filename, write_rows_to_csv
from cohortextractor.pandas_utils import dataframe_from_rows, dataframe_to_file
from cohortextractor.validate_dummy_data import (
    DummyDataValidationError,
    validate_dummy_data,
)

# Single-code SNOMED codelist defined at module level.
cl = codelist(["12345"], system="snomed")

column_definitions = dict(
    default_expectations={"date": {"earliest": "2020-01-01", "latest": "today"}},
    population=patients.all(),
    sex=patients.sex(
        return_expectations={
            "category": {"ratios": {"F": 0.5, "M": 0.5}},
            "rate": "universal",
        },
    ),
    age=patients.age_as_of(
        "2020-01-01",
        return_expectations={
            "int": {"distribution": "population_ages"},
            "rate": "universal",
        },
    ),
Exemplo n.º 26
0
def test_study_definition_dummy_data(tmp_path):
    """Dummy CSV output should have one row per patient and columns in schedule order."""
    schedule = [
        "dtap_hex_1",
        "menb_1",
        "rotavirus_1",
        "dtap_hex_2",
        "pcv_1",
        "rotavirus_2",
        "dtap_hex_3",
        "menb_2",
        "hib_menc_1",
        "pcv_2",
        "mmr_1",
        "menb_3",
        "dtap_ipv_1",
        "mmr_2",
    ]
    tpp_vaccines = [
        ("Infanrix Hexa", "dtap_hex"),
        ("Bexsero", "menb"),
        ("Rotarix", "rotavirus"),
        ("Prevenar", "pcv"),
        ("Prevenar - 13", "pcv"),
        ("Menitorix", "hib_menc"),
        ("Repevax", "dtap_ipv"),
        ("Boostrix-IPV", "dtap_ipv"),
        ("MMRvaxPRO", "mmr"),
        ("Priorix", "mmr"),
    ]

    study = VaccinationsStudyDefinition(
        start_date="2017-06-01",
        get_registered_practice_at_months=[12, 24, 60],
        tpp_vaccine_codelist=codelist(tpp_vaccines, system="tpp_vaccines"),
        ctv3_vaccine_codelist=codelist([("abc", "menb")], system="ctv3"),
        snomed_vaccine_codelist=codelist([("123", "rotavirus")], system="snomed"),
        event_washout_period=14,
        # Pass a copy so the expected header list below cannot be affected.
        vaccination_schedule=list(schedule),
    )

    csv_path = tmp_path / "dummy.csv"
    study.to_csv(csv_path, expectations_population=1000)
    with open(csv_path, newline="") as f:
        rows = list(csv.DictReader(f))

    assert len(rows) == 1000

    # Fixed leading columns, then one column per scheduled vaccination.
    leading_columns = [
        "patient_id",
        "date_of_birth",
        "practice_id_at_month_12",
        "practice_id_at_month_24",
        "practice_id_at_month_60",
    ]
    assert list(rows[0].keys()) == leading_columns + schedule
Exemplo n.º 27
0
def test_study_definition(tmp_path):
    """Check the CSV produced by ``VaccinationsStudyDefinition`` for a tiny database.

    Three patients are stored.  Patient 1 is expected to be missing from the
    output, patient 2 has one registration and no vaccine-related records,
    and patient 3 has two overlapping registrations plus one TPP vaccination,
    one medication issue and one coded event.  The rows read back from
    ``test.csv`` must equal the expected rows exactly.
    """
    db = make_session()

    # This patient is too old and should be ignored
    too_old = Patient(Patient_ID=1, DateOfBirth="2002-05-04")

    single_registration = RegistrationHistory(
        StartDate="2019-01-10",
        EndDate="9999-12-31",
        Organisation=Organisation(Organisation_ID=678),
    )
    no_vaccine_records = Patient(
        Patient_ID=2,
        DateOfBirth="2019-01-01",
        RegistrationHistory=[single_registration],
    )

    # Deliberately overlapping registration histories
    earlier_registration = RegistrationHistory(
        StartDate="2010-01-01",
        EndDate="2015-10-01",
        Organisation=Organisation(Organisation_ID=123),
    )
    later_registration = RegistrationHistory(
        StartDate="2015-04-01",
        EndDate="9999-12-31",
        Organisation=Organisation(Organisation_ID=345),
    )
    tpp_vaccination = Vaccination(
        VaccinationName="Infanrix Hexa",
        VaccinationDate="2018-11-01",
    )
    medication_issue = MedicationIssue(
        MedicationDictionary=MedicationDictionary(
            DMD_ID="123", MultilexDrug_ID="123"),
        ConsultationDate="2019-01-01",
    )
    coded_event = CodedEvent(CTV3Code="abc", ConsultationDate="2019-06-01")
    with_vaccine_records = Patient(
        Patient_ID=3,
        DateOfBirth="2018-10-28",
        RegistrationHistory=[earlier_registration, later_registration],
        Vaccinations=[tpp_vaccination],
        MedicationIssues=[medication_issue],
        CodedEvents=[coded_event],
    )

    db.add_all([too_old, no_vaccine_records, with_vaccine_records])
    db.commit()

    # TPP vaccine names paired with the vaccine category they count towards.
    tpp_name_to_category = [
        ("Infanrix Hexa", "dtap_hex"),
        ("Bexsero", "menb"),
        ("Rotarix", "rotavirus"),
        ("Prevenar", "pcv"),
        ("Prevenar - 13", "pcv"),
        ("Menitorix", "hib_menc"),
        ("Repevax", "dtap_ipv"),
        ("Boostrix-IPV", "dtap_ipv"),
        ("MMRvaxPRO", "mmr"),
        ("Priorix", "mmr"),
    ]
    # Dose columns, in the order they are scheduled.
    schedule = (
        "dtap_hex_1",
        "menb_1",
        "rotavirus_1",
        "dtap_hex_2",
        "pcv_1",
        "rotavirus_2",
        "dtap_hex_3",
        "menb_2",
        "hib_menc_1",
        "pcv_2",
        "mmr_1",
        "menb_3",
        "dtap_ipv_1",
        "mmr_2",
    )
    study = VaccinationsStudyDefinition(
        start_date="2017-06-01",
        get_registered_practice_at_months=[12, 24, 60],
        tpp_vaccine_codelist=codelist(tpp_name_to_category, system="tpp_vaccines"),
        ctv3_vaccine_codelist=codelist([("abc", "menb")], system="ctv3"),
        snomed_vaccine_codelist=codelist([("123", "rotavirus")], system="snomed"),
        event_washout_period=14,
        vaccination_schedule=list(schedule),
    )

    csv_path = tmp_path / "test.csv"
    study.to_csv(csv_path)
    with open(csv_path, newline="") as handle:
        rows = list(csv.DictReader(handle))

    # Every dose column is blank unless overridden below.
    no_doses = dict.fromkeys(schedule, "")
    expected_rows = [
        {
            "patient_id": "2",
            "date_of_birth": "2019-01-01",
            "practice_id_at_month_12": "678",
            "practice_id_at_month_24": "678",
            "practice_id_at_month_60": "678",
            **no_doses,
        },
        {
            "patient_id": "3",
            # Stored as 2018-10-28; the expected output carries the first of
            # the month (presumably month precision -- confirm against the
            # study definition).
            "date_of_birth": "2018-10-01",
            "practice_id_at_month_12": "345",
            "practice_id_at_month_24": "345",
            "practice_id_at_month_60": "345",
            **no_doses,
            # Dates of the TPP vaccination, the CTV3 coded event and the
            # medication issue (code "123") respectively.
            "dtap_hex_1": "2018-11-01",
            "menb_1": "2019-06-01",
            "rotavirus_1": "2019-01-01",
        },
    ]
    assert rows == expected_rows
Exemplo n.º 28
0
)

# Medication codelists: SNOMED system, codes read from the "id" column.
oad_med_codes = codelist_from_csv(
    "codelists/opensafely-antidiabetic-drugs.csv", system="snomed", column="id",
)

insulin_med_codes = codelist_from_csv(
    "codelists/opensafely-insulin-medication.csv", system="snomed", column="id",
)

# HbA1c codes, given inline as CTV3 codes in two separate lists.
hba1c_new_codes = codelist(
    ["XaPbt", "Xaeze", "Xaezd"],
    system="ctv3",
)
hba1c_old_codes = codelist(
    ["X772q", "XaERo", "XaERp"],
    system="ctv3",
)

# Cancer codelists: CTV3 system, codes read from the "CTV3ID" column.
lung_cancer_codes = codelist_from_csv(
    "codelists/opensafely-lung-cancer.csv",
    system="ctv3",
    column="CTV3ID",
)

haem_cancer_codes = codelist_from_csv(
    "codelists/opensafely-haematological-cancer.csv",
    system="ctv3",
    column="CTV3ID",
)

other_cancer_codes = codelist_from_csv(
    "codelists/opensafely-cancer-excluding-lung-and-haematological.csv",
    system="ctv3",
    column="CTV3ID",
)
Exemplo n.º 29
0
def test_to_file_with_expectations_population(tmp_path, file_format):
    """Write dummy data from expectations and check that it validates.

    The study mixes a categorical column, an integer column, a binary flag,
    three date columns (day, month and year formats) and a categorical
    column with 0.5 incidence.  It is written to ``dummy-data.<file_format>``
    with an expectations population of 100, and the resulting file is handed
    to ``validate_dummy_data`` together with the study's covariate
    definitions.
    """

    def uniform_half():
        # A new dict on every call, so no two columns share one object.
        return {"rate": "uniform", "incidence": 0.5}

    def event_date(date_format):
        # Date-returning column over the shared codelist in the given format.
        return patients.with_these_clinical_events(
            event_codes,
            returning="date",
            date_format=date_format,
            return_expectations=uniform_half(),
        )

    event_codes = codelist([("12345", "foo"), ("67890", "bar")], system="snomed")

    # Column definitions are built in the same order they are passed below.
    date_defaults = {"date": {"earliest": "2020-01-01", "latest": "today"}}
    everyone = patients.all()
    sex_column = patients.sex(
        return_expectations={
            "category": {"ratios": {"F": 0.5, "M": 0.5}},
            "rate": "universal",
        },
    )
    age_column = patients.age_as_of(
        "2020-01-01",
        return_expectations={
            "int": {"distribution": "population_ages"},
            "rate": "universal",
        },
    )
    has_event_column = patients.with_these_clinical_events(
        event_codes,
        returning="binary_flag",
        return_expectations=uniform_half(),
    )
    day_column = event_date("YYYY-MM-DD")
    month_column = event_date("YYYY-MM")
    year_column = event_date("YYYY")
    category_column = patients.with_these_clinical_events(
        event_codes,
        returning="category",
        return_expectations={
            "category": {"ratios": {"foo": 0.5, "bar": 0.5}},
            # Half the values here should be null
            "incidence": 0.5,
        },
    )

    study = StudyDefinition(
        default_expectations=date_defaults,
        population=everyone,
        sex=sex_column,
        age=age_column,
        has_event=has_event_column,
        event_date_day=day_column,
        event_date_month=month_column,
        event_date_year=year_column,
        incomplete_categories=category_column,
    )

    output_file = tmp_path / f"dummy-data.{file_format}"
    study.to_file(output_file, expectations_population=100)
    # We reuse validate_dummy_data to check that the data generated by the
    # expectations framework is valid.
    validate_dummy_data(study.covariate_definitions, output_file)
Exemplo n.º 30
0
from cohortextractor import StudyDefinition, patients, codelist

# Codelists referenced by the study definition below.
# NOTE(review): other snippets in this file pass system="snomed"; confirm
# that "snomedct" is the identifier intended here.
cardiac_disease_codes = codelist(
    ["56265001", "127337006"],
    system="snomedct",
)
# ICD-10 codes collected under the name ``covid_codes``.
covid_codes = codelist(
    ["U071", "U072"],
    system="icd10",
)

study = StudyDefinition(
    # Configure the expectations framework
    default_expectations={
        "date": {
            "earliest": "1900-01-01",
            "latest": "today"
        },
        "rate": "exponential_increase",
    },

    # Define the study population
    population=patients.registered_with_one_practice_between(
        "2019-02-01", "2020-02-01"),

    # Define input variables {
    age=patients.age_as_of(
        "2020-02-01",
        return_expectations={
            "rate": "universal",
            "int": {
                "distribution": "population_ages"
            },
        },
    ),
    sex=patients.sex(return_expectations={