def test_make_df_from_expectations_with_mean_recorded_value():
    """Dummy data for a mean recorded value should centre on the requested mean.

    The expectations ask for a normal distribution with mean 35, and the
    test requires the truncated mean of the generated column to be within
    5 of that value.
    """
    expectations = {
        "rate": "exponential_increase",
        "date": {"earliest": "1900-01-01", "latest": "today"},
        "incidence": 0.6,
        "float": {"distribution": "normal", "mean": 35, "stddev": 10},
    }
    study = StudyDefinition(
        population=patients.all(),
        drug_x=patients.mean_recorded_value(
            codelist(["X"], system="ctv3"),
            on_most_recent_day_of_measurement=True,
            return_expectations=expectations,
        ),
    )
    frame = study.make_df_from_expectations(10000)
    observed_mean = int(frame["drug_x"].mean())
    assert abs(35 - observed_mean) < 5
def test_categorical_clinical_events_without_date_dtype_generation():
    """A category-returning clinical events column gets the ``category`` dtype.

    No date column is requested, so the expected pandas CSV arguments
    contain no converters, no date links and no parsed dates.
    """
    ethnicity_codes = codelist([("X", "Y")], system="ctv3")
    ethnicity_codes.has_categories = True
    study = StudyDefinition(
        population=patients.all(),
        ethnicity=patients.with_these_clinical_events(
            ethnicity_codes,
            returning="category",
            find_last_match_in_period=True,
        ),
    )
    expected_csv_args = {
        "converters": {},
        "date_col_for": {},
        "dtype": {"ethnicity": "category"},
        "parse_dates": [],
    }
    assert _converters_to_names(study.pandas_csv_args) == expected_csv_args
def test_mean_recorded_value_dtype_generation():
    """A mean recorded value is a float column linked to its ``date_of`` column.

    The month-precision date column is expected to be listed in
    ``parse_dates`` and mapped to the ``add_day_to_date`` converter.
    """
    bp_codes = codelist(["X"], system="ctv3")
    study = StudyDefinition(
        population=patients.all(),
        bp_sys=patients.mean_recorded_value(
            bp_codes,
            on_most_recent_day_of_measurement=True,
            on_or_before="2020-02-01",
        ),
        bp_sys_date_measured=patients.date_of("bp_sys", date_format="YYYY-MM"),
    )
    expected_csv_args = {
        "converters": {"bp_sys_date_measured": "add_day_to_date"},
        "dtype": {"bp_sys": "float"},
        "date_col_for": {"bp_sys": "bp_sys_date_measured"},
        "parse_dates": ["bp_sys_date_measured"],
    }
    assert _converters_to_names(study.pandas_csv_args) == expected_csv_args
def test_make_df_from_expectations_returning_date_using_defaults():
    """Date columns fall back to the study-level ``default_expectations``.

    The variable itself declares no ``return_expectations``; the defaults
    allow dates from 1900 onwards, so with 10000 dummy patients the
    earliest generated date is required to fall before 1960.
    """
    study = StudyDefinition(
        default_expectations={
            "date": {"earliest": "1900-01-01", "latest": "today"},
            "rate": "exponential_increase",
            "incidence": 0.2,
        },
        population=patients.all(),
        asthma_condition=patients.with_these_clinical_events(
            codelist(["X"], system="ctv3"),
            returning="date",
            find_first_match_in_period=True,
            date_format="YYYY-MM-DD",
        ),
    )
    population_size = 10000
    result = study.make_df_from_expectations(population_size)
    recorded = result[~pd.isnull(result["asthma_condition"])]
    # ``DataFrame.min()`` returns a Series labelled by column name; select
    # the first entry positionally with ``.iloc`` because using an integer
    # key as a position on a label-indexed Series is deprecated in pandas.
    assert recorded.min().iloc[0] < "1960-01-01"
def test_stats_logging_with_message_handle_exception(mock_regex, logger):
    """A failure while handling SQL Server messages must not break the run.

    ``mock_regex.match`` is configured to raise, the study is executed, and
    the test then checks that the usual timing logs are still produced and
    that every ``sqlserver-stats`` entry is an error log for that exception.
    """
    mock_regex.match.side_effect = Exception("message error")
    study = StudyDefinition(
        population=patients.all(),
        event=patients.with_these_clinical_events(codelist(["A"], "snomed")),
    )
    study.to_dicts()

    timing_entries = get_logs_by_key(get_stats_logs(logger.entries), "timing_id")
    sqlserver_entries = get_stats_logs(logger.entries, event="sqlserver-stats")

    # The study ran to completion, so the normal cohortextractor-stats
    # timing logs are present
    assert len(timing_entries) > 0

    # The sqlserver-stats logs consist solely of the error entries
    for entry in sqlserver_entries:
        assert entry["description"] == "Exception in SQL server message handling"
        assert str(entry["exc_info"]) == "message error"
def test_bmi_dtype_generation():
    """A BMI column is a float linked to its ``date_of`` measurement column.

    The month-precision date column is expected to be listed in
    ``parse_dates`` and mapped to the ``add_day_to_date`` converter.
    """
    # The categorised codelist that used to be built here was never passed
    # to the study definition, so it has been dropped.
    study = StudyDefinition(
        population=patients.all(),
        bmi=patients.most_recent_bmi(
            on_or_after="2010-02-01",
            minimum_age_at_measurement=16,
        ),
        bmi_date_measured=patients.date_of("bmi", date_format="YYYY-MM"),
    )
    result = _converters_to_names(study.pandas_csv_args)
    assert result == {
        "converters": {"bmi_date_measured": "add_day_to_date"},
        "dtype": {"bmi": "float"},
        "date_col_for": {"bmi": "bmi_date_measured"},
        "parse_dates": ["bmi_date_measured"],
    }
def test_clinical_events_numeric_value_dtype_generation():
    """A numeric-value clinical events column is a float with a linked date.

    The month-precision ``date_of`` column is expected to be listed in
    ``parse_dates`` and mapped to the ``add_day_to_date`` converter.
    """
    creatinine_codes = codelist(["X"], system="ctv3")
    study = StudyDefinition(
        population=patients.all(),
        creatinine=patients.with_these_clinical_events(
            creatinine_codes,
            find_last_match_in_period=True,
            on_or_before="2020-02-01",
            returning="numeric_value",
        ),
        creatinine_date=patients.date_of("creatinine", date_format="YYYY-MM"),
    )
    expected_csv_args = {
        "converters": {"creatinine_date": "add_day_to_date"},
        "dtype": {"creatinine": "float"},
        "date_col_for": {"creatinine": "creatinine_date"},
        "parse_dates": ["creatinine_date"],
    }
    assert _converters_to_names(study.pandas_csv_args) == expected_csv_args
def test_validate_category_expectations():
    """Expected category ratios must name categories the variable can return.

    Three situations are exercised: validation against a categorised
    codelist, validation against explicit category definitions, and both
    supplied together (where the definitions are expected to win).
    """
    codes_with_categories = codelist([("X", "Y")], system="ctv3")
    codes_with_categories.has_categories = True
    definitions = {"A": "sex = 'F'", "B": "sex = 'M'"}
    validate = StudyDefinition(
        population=patients.all()
    ).validate_category_expectations

    def single_category(name):
        # Expectations naming exactly one category with ratio 1
        return {"category": {"ratios": {name: 1}}}

    # Against the codelist alone: "X" is rejected, "Y" is accepted
    with pytest.raises(ValueError):
        validate(
            codelist=codes_with_categories,
            return_expectations=single_category("X"),
        )
    validate(
        codelist=codes_with_categories,
        return_expectations=single_category("Y"),
    )

    # Against the category definitions alone: "X" is rejected, "A" is accepted
    with pytest.raises(ValueError):
        validate(
            category_definitions=definitions,
            return_expectations=single_category("X"),
        )
    validate(
        category_definitions=definitions,
        return_expectations=single_category("A"),
    )

    # With both supplied, the category definitions override the categories
    # in the codelist: "Y" is now rejected and "A" is accepted
    with pytest.raises(ValueError):
        validate(
            codelist=codes_with_categories,
            category_definitions=definitions,
            return_expectations=single_category("Y"),
        )
    validate(
        codelist=codes_with_categories,
        category_definitions=definitions,
        return_expectations=single_category("A"),
    )
def test_clinical_events_with_date_dtype_generation():
    """A date-returning clinical events column is parsed as a date.

    The column uses month precision, so it is expected to go through the
    ``add_day_to_date`` converter and to need no dtype or date link.
    """
    diabetes_codes = codelist(["X"], system="ctv3")
    study = StudyDefinition(
        population=patients.all(),
        diabetes=patients.with_these_clinical_events(
            diabetes_codes,
            return_first_date_in_period=True,
            date_format="YYYY-MM",
        ),
    )
    expected_csv_args = {
        "converters": {"diabetes": "add_day_to_date"},
        "date_col_for": {},
        "dtype": {},
        "parse_dates": ["diabetes"],
    }
    assert _converters_to_names(study.pandas_csv_args) == expected_csv_args
def test_make_df_from_expectations_with_date_filter():
    """A ``between`` filter bounds the dates generated for dummy data.

    The expectations allow dates up to today, but the variable is limited
    to 2001-12-01..2002-06-01, so the latest generated date must not exceed
    the upper bound of that window.
    """
    study = StudyDefinition(
        population=patients.all(),
        asthma_condition=patients.with_these_clinical_events(
            codelist(["X"], system="ctv3"),
            between=["2001-12-01", "2002-06-01"],
            returning="date",
            return_expectations={
                "rate": "exponential_increase",
                "incidence": 0.2,
                "date": {"earliest": "1900-01-01", "latest": "today"},
            },
            find_first_match_in_period=True,
            date_format="YYYY-MM-DD",
        ),
    )
    population_size = 10000
    result = study.make_df_from_expectations(population_size)
    # Compare plain lists: ``Index == list`` is element-wise, so asserting
    # on it only works by accident for one column and raises rather than
    # failing cleanly when the lengths differ.
    assert list(result.columns) == ["asthma_condition"]
    recorded = result[~pd.isnull(result["asthma_condition"])]
    # ``.iloc[0]`` instead of ``[0]``: integer keys used as positions on a
    # label-indexed Series are deprecated in pandas.
    assert recorded.max().iloc[0] <= "2002-06-01"
def test_make_df_from_expectations_partial_default_overrides():
    """Per-variable expectations override only the keys they specify.

    The variable overrides just the latest date (2000-01-01) and relies on
    the study defaults for everything else; with year-precision output the
    largest generated value must therefore be 2000.
    """
    defaults = {
        "date": {"earliest": "1900-01-01", "latest": "today"},
        "rate": "exponential_increase",
        "incidence": 0.2,
    }
    study = StudyDefinition(
        default_expectations=defaults,
        population=patients.all(),
        asthma_condition=patients.with_these_clinical_events(
            codelist(["X"], system="ctv3"),
            returning="date",
            find_first_match_in_period=True,
            date_format="YYYY",
            return_expectations={"date": {"latest": "2000-01-01"}},
        ),
    )
    frame = study.make_df_from_expectations(10000)
    latest_year = frame.asthma_condition.astype("float").max()
    assert latest_year == 2000
def test_make_df_from_expectations_with_categories_in_codelist_validation():
    """Expected categories that the codelist does not define are rejected.

    The codelist is built from the single pair ``("X", "Y")`` while the
    expectations name categories "A" and "B", so generating dummy data is
    required to raise ``ValueError``.
    """
    mismatched_codes = codelist([("X", "Y")], system="ctv3")
    mismatched_codes.has_categories = True
    expectations = {
        "rate": "exponential_increase",
        "incidence": 0.2,
        "category": {"ratios": {"A": 0.3, "B": 0.7}},
        "date": {"earliest": "1900-01-01", "latest": "today"},
    }
    study = StudyDefinition(
        population=patients.all(),
        ethnicity=patients.with_these_clinical_events(
            mismatched_codes,
            returning="category",
            return_expectations=expectations,
            find_last_match_in_period=True,
        ),
    )
    with pytest.raises(ValueError):
        study.make_df_from_expectations(10000)
def test_booleans_correctly_handled_in_dummy_data(tmp_path, file_format):
    """Binary flags in dummy data round-trip through every file format.

    A study with one 50%-incidence binary flag is written to
    ``dummy-data.<file_format>`` and read back with the matching pandas
    reader. Each format represents booleans differently, so the pair of
    values to look for is chosen per format; both must occur more than 10
    times among the 100 generated patients.
    """
    cl = codelist(["12345"], system="snomed")
    study = StudyDefinition(
        default_expectations={"date": {"earliest": "2020-01-01", "latest": "today"}},
        population=patients.all(),
        has_event=patients.with_these_clinical_events(
            cl,
            returning="binary_flag",
            return_expectations={"rate": "uniform", "incidence": 0.5},
        ),
    )

    filename = tmp_path / f"dummy-data.{file_format}"
    study.to_file(filename, expectations_population=100)

    if file_format in ("csv", "csv.gz"):
        df = pandas.read_csv(filename, dtype=str)
        bools = ("0", "1")
    elif file_format == "feather":
        df = pandas.read_feather(filename)
        bools = (True, False)
    elif file_format in ("dta", "dta.gz"):
        df = pandas.read_stata(filename)
        bools = (0, 1)
    else:
        # Raise explicitly instead of ``assert False``: assert statements are
        # removed when Python runs with -O, which would let an unhandled
        # format fall through to a confusing NameError below.
        raise AssertionError(f"Unhandled format: {file_format}")

    # Check we've got at least some of each value
    counts = df.has_event.value_counts()
    assert counts[bools[0]] > 10
    assert counts[bools[1]] > 10
def test_make_df_from_expectations_with_categories():
    """Generated categories follow the requested ratios.

    The expectations weight category "B" (0.7) more heavily than "A" (0.3),
    so "A" must appear fewer times than "B" in the generated column.
    """
    categorised_codelist = codelist([("1", "A"), ("2", "B")], system="ctv3")
    categorised_codelist.has_categories = True
    study = StudyDefinition(
        population=patients.all(),
        ethnicity=patients.with_these_clinical_events(
            categorised_codelist,
            returning="category",
            return_expectations={
                "rate": "exponential_increase",
                "incidence": 0.2,
                "category": {"ratios": {"A": 0.3, "B": 0.7}},
                "date": {"earliest": "1900-01-01", "latest": "today"},
            },
            find_last_match_in_period=True,
        ),
    )
    population_size = 10000
    result = study.make_df_from_expectations(population_size)
    # Compare plain lists: ``Index == list`` is element-wise, so asserting
    # on it only works by accident for one column and raises rather than
    # failing cleanly when the lengths differ.
    assert list(result.columns) == ["ethnicity"]
    category_counts = result.reset_index().groupby("ethnicity").count()
    # ``.iloc[0]`` instead of ``[0]``: integer keys used as positions on a
    # label-indexed Series are deprecated in pandas.
    count_a = category_counts.loc["A", :].iloc[0]
    count_b = category_counts.loc["B", :].iloc[0]
    assert count_a < count_b
) chronic_cardiac_disease_codes = codelist_from_csv( "codelists/opensafely-chronic-cardiac-disease.csv", system="ctv3", column="CTV3ID") chronic_liver_disease_codes = codelist_from_csv( "codelists/opensafely-chronic-liver-disease.csv", system="ctv3", column="CTV3ID") salbutamol_codes = codelist_from_csv( "codelists/opensafely-asthma-inhaler-salbutamol-medication.csv", system="snomed", column="id", ) systolic_blood_pressure_codes = codelist(["2469."], system="ctv3") diastolic_blood_pressure_codes = codelist(["246A."], system="ctv3") study = StudyDefinition( index_date="2020-02-01", # Configure the expectations framework default_expectations={ "date": { "earliest": "1900-01-01", "latest": "index_date" }, "rate": "exponential_increase", }, # This line defines the study population population=patients.registered_with_one_practice_between( "index_date - 1 year", "index_date"),
column="CTV3ID", ) chronic_cardiac_disease_codes = codelist_from_csv( "codelists/opensafely-chronic-cardiac-disease.csv", system="ctv3", column="CTV3ID", ) diabetes_codes = codelist_from_csv( "codelists/opensafely-diabetes.csv", system="ctv3", column="CTV3ID", ) hba1c_new_codes = codelist(["XaPbt", "Xaeze", "Xaezd"], system="ctv3") hba1c_old_codes = codelist(["X772q", "XaERo", "XaERp"], system="ctv3") hypertension_codes = codelist_from_csv( "codelists/opensafely-hypertension.csv", system="ctv3", column="CTV3ID", ) chronic_respiratory_disease_codes = codelist_from_csv( "codelists/opensafely-chronic-respiratory-disease.csv", system="ctv3", column="CTV3ID", ) copd_codes = codelist_from_csv(
from cohortextractor import ( codelist, codelist_from_csv, ) covid_codelist = codelist(["U071", "U072"], system="icd10") confirmed_covid_codelist = codelist(["U071"], system="icd10") suspected_covid_codelist = codelist(["U072"], system="icd10") covid_primary_care_positive_test=codelist_from_csv( "codelists/opensafely-covid-identification-in-primary-care-probable-covid-positive-test.csv", system="ctv3", column="CTV3ID", ) covid_primary_care_code=codelist_from_csv( "codelists/opensafely-covid-identification-in-primary-care-probable-covid-clinical-code.csv", system="ctv3", column="CTV3ID", ) covid_primary_care_sequalae=codelist_from_csv( "codelists/opensafely-covid-identification-in-primary-care-probable-covid-sequelae.csv", system="ctv3", column="CTV3ID", ) covid_primary_care_exposure = codelist_from_csv( "codelists/opensafely-covid-identification-in-primary-care-exposure-to-disease.csv",
def test_vaccination_events_sql():
    """Vaccination events are gathered from all three sources for in-range patients.

    Three patients are stored; only the one born inside the requested
    date-of-birth range should contribute rows. That patient has one event
    in each source (vaccination record, medication issue, coded event), and
    each event is expected to be reported under its codelist category.
    """
    session = make_session()

    # Born before the date-of-birth range, so should be ignored
    too_old = Patient(
        DateOfBirth="2002-05-04",
        Vaccinations=[
            Vaccination(
                VaccinationName="Infanrix Hexa",
                VaccinationDate="2002-06-01",
            )
        ],
    )
    # Born after the date-of-birth range, so should be ignored
    too_young = Patient(
        DateOfBirth="2019-10-04",
        Vaccinations=[
            Vaccination(
                VaccinationName="Infanrix Hexa",
                VaccinationDate="2019-11-04",
            )
        ],
    )
    in_range = Patient(
        DateOfBirth="2018-10-28",
        Vaccinations=[
            Vaccination(
                VaccinationName="Infanrix Hexa",
                VaccinationDate="2018-11-01",
            )
        ],
        MedicationIssues=[
            MedicationIssue(
                MedicationDictionary=MedicationDictionary(
                    DMD_ID="123", MultilexDrug_ID="123"
                ),
                ConsultationDate="2019-01-01",
            ),
        ],
        CodedEvents=[CodedEvent(CTV3Code="abc", ConsultationDate="2019-06-01")],
    )
    session.add_all([too_old, too_young, in_range])
    session.commit()

    sql = vaccination_events_sql(
        date_of_birth_range=("2012-01-01", "2019-06-01"),
        tpp_vaccination_codelist=codelist(
            [("Infanrix Hexa", "dtap_hex")],
            system="tpp_vaccines",
        ),
        ctv3_codelist=codelist([("abc", "menb")], system="ctv3"),
        snomed_codelist=codelist([("123", "rotavirus")], system="snomed"),
    )
    rows = sql_to_dicts(sql)

    # Results are ordered by patient ID, but the order within one patient's
    # results is arbitrary, so sort before comparing.
    observed = sorted((row["date_given"], row["vaccine_name"]) for row in rows)
    assert observed == [
        ("2018-11-01", "dtap_hex"),
        ("2019-01-01", "rotavirus"),
        ("2019-06-01", "menb"),
    ]
from cohortextractor import (codelist, codelist_from_csv) # Vaccination doses first_dose_code = codelist("COVRX1_COD", system="snomed") second_dose_code = codelist("COVRX2_COD", system="snomed") az_first_dose_code = codelist("AZD1RX_COD", system="snomed") az_second_dose_code = codelist("AZD2RX_COD", system="snomed") pf_first_dose_code = codelist("PFD1RX_COD", system="snomed") pf_second_dose_code = codelist("PFD2RX_COD", system="snomed") mo_first_dose_code = codelist("MOD1RX_COD", system="snomed") mo_second_dose_code = codelist("MOD2RX_COD", system="snomed") nx_first_dose_code = codelist("NXD1RX_COD", system="snomed") nx_second_dose_code = codelist("NXD2RX_COD", system="snomed") jn_first_dose_code = codelist("JND1RX_COD", system="snomed") jn_second_dose_code = codelist("JND2RX_COD", system="snomed") gs_first_dose_code = codelist("GSD1RX_COD", system="snomed") gs_second_dose_code = codelist("GSD2RX_COD", system="snomed") vl_first_dose_code = codelist("VLD1RX_COD", system="snomed") vl_second_dose_code = codelist("VLD2RX_COD", system="snomed") # Risk groups chd_code = codelist("CHD_COV_COD", system="snomed") resp_code = codelist("RESP_COV_COD", system="snomed") ckd_code = codelist("CKD_COV_COD", system="snomed")
def test_to_file_with_dummy_data_file(tmp_path, file_format):
    """Writing a study from a dummy data file reproduces that file exactly.

    Dummy data is first generated by the expectations framework, then the
    study is written again using that file as its ``dummy_data_file``. The
    two files are required to be byte-for-byte identical.
    """

    def half_incidence():
        # A fresh expectations dict per variable, as in a hand-written study
        return {"rate": "uniform", "incidence": 0.5}

    cl = codelist(["12345"], system="snomed")
    study = StudyDefinition(
        default_expectations={"date": {"earliest": "2020-01-01", "latest": "today"}},
        population=patients.all(),
        sex=patients.sex(
            return_expectations={
                "category": {"ratios": {"F": 0.5, "M": 0.5}},
                "rate": "universal",
            },
        ),
        age=patients.age_as_of(
            "2020-01-01",
            return_expectations={
                "int": {"distribution": "population_ages"},
                "rate": "universal",
            },
        ),
        has_event=patients.with_these_clinical_events(
            cl,
            returning="binary_flag",
            return_expectations=half_incidence(),
        ),
        event_date_day=patients.with_these_clinical_events(
            cl,
            returning="date",
            date_format="YYYY-MM-DD",
            return_expectations=half_incidence(),
        ),
        event_date_month=patients.with_these_clinical_events(
            cl,
            returning="date",
            date_format="YYYY-MM",
            return_expectations=half_incidence(),
        ),
        event_date_year=patients.with_these_clinical_events(
            cl,
            returning="date",
            date_format="YYYY",
            return_expectations=half_incidence(),
        ),
    )

    # Step 1: generate dummy data using the expectations framework
    generated_path = tmp_path / f"dummy-data.{file_format}"
    study.to_file(generated_path, expectations_population=10)

    # Step 2: feed that dummy data back in as the data source
    rewritten_path = tmp_path / f"output.{file_format}"
    study.to_file(rewritten_path, dummy_data_file=generated_path)

    # Step 3: the output must be a byte-identical copy of the input
    assert generated_path.read_bytes() == rewritten_path.read_bytes()
from cohortextractor import codelist, codelist_from_csv stroke = codelist_from_csv( "codelists/opensafely-incident-non-traumatic-stroke.csv", system="ctv3", column="CTV3ID", ) stroke_hospital = codelist_from_csv( "codelists/opensafely-stroke-secondary-care.csv", system="icd10", column="icd") aki_codes = codelist(["N17", "N170", "N171", "N172", "N178", "N179"], system="icd10") mi_codes = codelist_from_csv( "codelists/opensafely-myocardial-infarction-2.csv", system="ctv3", column="CTV3Code", ) mi_codes_hospital = codelist_from_csv( "codelists/opensafely-cardiovascular-secondary-care.csv", system="icd10", column="icd", category_column="mi", ) heart_failure_codes = codelist_from_csv( "codelists/opensafely-heart-failure.csv", system="ctv3", column="CTV3ID", ) heart_failure_codes_hospital = codelist_from_csv( "codelists/opensafely-cardiovascular-secondary-care.csv", system="icd10",
def study():
    """Build a study whose variables are chained ``minimum_of`` date columns.

    Six date variables (``date_1`` .. ``date_6``) are defined, some inline
    inside ``minimum_of`` calls, and then combined into further minimums.
    Each date variable takes its format from ``inconsistent_date_formats``
    (read from the enclosing scope), defaulting to "YYYY-MM-DD".
    """

    def event_date(variable_name, code):
        # Date of a clinical event for a single CTV3 code, formatted as
        # configured for this variable name.
        return patients.with_these_clinical_events(
            codelist([code], system="ctv3"),
            returning="date",
            date_format=inconsistent_date_formats.get(variable_name, "YYYY-MM-DD"),
        )

    return StudyDefinition(
        default_expectations={
            "rate": "exponential_increase",
            "incidence": 0.2,
            "date": {"earliest": "1900-01-01", "latest": "today"},
        },
        population=patients.all(),
        date_1=event_date("date_1", "A"),
        first_min_date=patients.minimum_of(
            "date_1",
            date_2=event_date("date_2", "B"),
        ),
        second_min_date=patients.minimum_of(
            date_3=event_date("date_3", "Y"),
            date_4=event_date("date_4", "Z"),
        ),
        third_min_date=patients.minimum_of(
            date_5=event_date("date_5", "Y"),
            date_6=event_date("date_6", "Z"),
        ),
        min_of_second_and_third=patients.minimum_of(
            "second_min_date", "third_min_date"
        ),
        min_overall=patients.minimum_of("min_of_second_and_third", "first_min_date"),
        min_date_1_third_min=patients.minimum_of("date_1", "third_min_date"),
    )
# Codelists used by the study: COVID ICD-10 codes plus two learning /
# intellectual disability codelists loaded from CSV files.
from cohortextractor import codelist, codelist_from_csv

covid_codelist = codelist(["U071", "U072"], system="icd10")

# https://codelists.opensafely.org/codelist/opensafely/severe-and-profound-learning-disability-flags/44ef542a/
severe_and_profound_learning_disability_codes = codelist_from_csv(
    "codelists/opensafely-severe-and-profound-learning-disability-flags-44ef542a.csv",
    system="ctv3",
    column="code",
)

# https://codelists.opensafely.org/codelist/opensafely/intellectual-disability-including-downs-syndrome/2020-08-27/
intellectual_disability_including_downs_syndrome_codes = codelist_from_csv(
    "codelists/opensafely-intellectual-disability-including-downs-syndrome-2020-08-27.csv",
    system="ctv3",
    column="CTV3ID",
)
def test_stats_logging_tpp_backend(logger):
    """Running a study logs the expected study-definition and timing stats.

    A one-variable study is executed and the captured logs are compared
    against the expected initial study statistics and the expected sequence
    of SQL timing logs, numbered from the current timing counter position.
    """
    # The query counter is a module-level global, so it is not reset between
    # tests. Read its next position (without incrementing it): that is where
    # this test's timing logs start.
    first_timing_id = timing_log_counter.next

    study = StudyDefinition(
        population=patients.all(),
        event=patients.with_these_clinical_events(codelist(["A"], "snomed")),
    )
    study.to_dicts()

    # Initial stats
    expected_study_stats = [
        # Output columns include patient_id and the 2 variables defined in
        # the study definition.
        # Tables: Patient, plus the temp event table for the codelist.
        {"output_column_count": 3, "table_count": 2, "table_joins_count": 1},
        {"variable_count": 2},
        {"variables_using_codelist_count": 1},
        {"variable_using_codelist": "event", "codelist_size": 1},
    ]

    # Timing stats: the queries logged by tpp_backend during execution, in
    # order; each one takes the next timing id.
    logged_queries = [
        {
            "description": "Uploading codelist for event",
            "sql": "CREATE TABLE #tmp1_event_codelist",
        },
        {
            "description": None,
            "sql": "INSERT INTO #tmp1_event_codelist (code, category) VALUES\n[truncated]",
            "is_truncated": True,
        },
        {
            "description": "Query for event",
            "sql": "SELECT * INTO #event",
        },
        {
            "description": "Query for population",
            "sql": "SELECT * INTO #population",
        },
        {
            "description": "Join all columns for final output",
            "sql": "JOIN #event ON #event.patient_id = #population.patient_id",
        },
    ]
    expected_timing_logs = []
    for offset, query in enumerate(logged_queries):
        expected_timing_logs.extend(
            _sql_execute_timing_logs(timing_id=first_timing_id + offset, **query)
        )

    assert_stats_logs(
        logger,
        expected_study_stats,
        expected_timing_logs,
        downloaded=False,
    )
from pathlib import Path import pytest from cohortextractor import StudyDefinition, codelist, patients from cohortextractor.cohortextractor import SUPPORTED_FILE_FORMATS from cohortextractor.csv_utils import is_csv_filename, write_rows_to_csv from cohortextractor.pandas_utils import dataframe_from_rows, dataframe_to_file from cohortextractor.validate_dummy_data import ( DummyDataValidationError, validate_dummy_data, ) cl = codelist(["12345"], system="snomed") column_definitions = dict( default_expectations={"date": {"earliest": "2020-01-01", "latest": "today"}}, population=patients.all(), sex=patients.sex( return_expectations={ "category": {"ratios": {"F": 0.5, "M": 0.5}}, "rate": "universal", }, ), age=patients.age_as_of( "2020-01-01", return_expectations={ "int": {"distribution": "population_ages"}, "rate": "universal", }, ),
def test_study_definition_dummy_data(tmp_path):
    """Dummy data for the vaccinations study has the expected size and columns.

    The study is written to CSV with an expectations population of 1000;
    the file must contain 1000 rows, and its header must list the fixed
    patient/practice columns followed by the vaccination schedule in order.
    """
    schedule = [
        "dtap_hex_1",
        "menb_1",
        "rotavirus_1",
        "dtap_hex_2",
        "pcv_1",
        "rotavirus_2",
        "dtap_hex_3",
        "menb_2",
        "hib_menc_1",
        "pcv_2",
        "mmr_1",
        "menb_3",
        "dtap_ipv_1",
        "mmr_2",
    ]
    study = VaccinationsStudyDefinition(
        start_date="2017-06-01",
        get_registered_practice_at_months=[12, 24, 60],
        tpp_vaccine_codelist=codelist(
            [
                ("Infanrix Hexa", "dtap_hex"),
                ("Bexsero", "menb"),
                ("Rotarix", "rotavirus"),
                ("Prevenar", "pcv"),
                ("Prevenar - 13", "pcv"),
                ("Menitorix", "hib_menc"),
                ("Repevax", "dtap_ipv"),
                ("Boostrix-IPV", "dtap_ipv"),
                ("MMRvaxPRO", "mmr"),
                ("Priorix", "mmr"),
            ],
            system="tpp_vaccines",
        ),
        ctv3_vaccine_codelist=codelist([("abc", "menb")], system="ctv3"),
        snomed_vaccine_codelist=codelist([("123", "rotavirus")], system="snomed"),
        event_washout_period=14,
        # Pass a copy so the expected header below is built from a list the
        # study definition cannot have touched.
        vaccination_schedule=list(schedule),
    )

    csv_path = tmp_path / "dummy.csv"
    study.to_csv(csv_path, expectations_population=1000)
    with open(csv_path, newline="") as f:
        rows = list(csv.DictReader(f))

    assert len(rows) == 1000
    fixed_columns = [
        "patient_id",
        "date_of_birth",
        "practice_id_at_month_12",
        "practice_id_at_month_24",
        "practice_id_at_month_60",
    ]
    assert list(rows[0].keys()) == fixed_columns + schedule
def test_study_definition(tmp_path):
    """The vaccinations study extracts the expected rows from the database.

    Three patients are stored. Patient 1 is expected to be excluded,
    patient 2 has a registration but no vaccination events, and patient 3
    has overlapping registrations plus one event in each of the three
    event sources. The CSV output is compared against the exact expected
    rows for patients 2 and 3.
    """
    session = make_session()

    # This patient is too old and should be ignored
    too_old = Patient(Patient_ID=1, DateOfBirth="2002-05-04")
    registered_only = Patient(
        Patient_ID=2,
        DateOfBirth="2019-01-01",
        RegistrationHistory=[
            RegistrationHistory(
                StartDate="2019-01-10",
                EndDate="9999-12-31",
                Organisation=Organisation(Organisation_ID=678),
            ),
        ],
    )
    vaccinated = Patient(
        Patient_ID=3,
        DateOfBirth="2018-10-28",
        RegistrationHistory=[
            RegistrationHistory(
                StartDate="2010-01-01",
                EndDate="2015-10-01",
                Organisation=Organisation(Organisation_ID=123),
            ),
            # Deliberately overlapping registration histories
            RegistrationHistory(
                StartDate="2015-04-01",
                EndDate="9999-12-31",
                Organisation=Organisation(Organisation_ID=345),
            ),
        ],
        Vaccinations=[
            Vaccination(
                VaccinationName="Infanrix Hexa",
                VaccinationDate="2018-11-01",
            )
        ],
        MedicationIssues=[
            MedicationIssue(
                MedicationDictionary=MedicationDictionary(
                    DMD_ID="123", MultilexDrug_ID="123"
                ),
                ConsultationDate="2019-01-01",
            ),
        ],
        CodedEvents=[CodedEvent(CTV3Code="abc", ConsultationDate="2019-06-01")],
    )
    session.add_all([too_old, registered_only, vaccinated])
    session.commit()

    schedule = [
        "dtap_hex_1",
        "menb_1",
        "rotavirus_1",
        "dtap_hex_2",
        "pcv_1",
        "rotavirus_2",
        "dtap_hex_3",
        "menb_2",
        "hib_menc_1",
        "pcv_2",
        "mmr_1",
        "menb_3",
        "dtap_ipv_1",
        "mmr_2",
    ]
    study = VaccinationsStudyDefinition(
        start_date="2017-06-01",
        get_registered_practice_at_months=[12, 24, 60],
        tpp_vaccine_codelist=codelist(
            [
                ("Infanrix Hexa", "dtap_hex"),
                ("Bexsero", "menb"),
                ("Rotarix", "rotavirus"),
                ("Prevenar", "pcv"),
                ("Prevenar - 13", "pcv"),
                ("Menitorix", "hib_menc"),
                ("Repevax", "dtap_ipv"),
                ("Boostrix-IPV", "dtap_ipv"),
                ("MMRvaxPRO", "mmr"),
                ("Priorix", "mmr"),
            ],
            system="tpp_vaccines",
        ),
        ctv3_vaccine_codelist=codelist([("abc", "menb")], system="ctv3"),
        snomed_vaccine_codelist=codelist([("123", "rotavirus")], system="snomed"),
        event_washout_period=14,
        # Pass a copy so the expected rows below are built from a list the
        # study definition cannot have touched.
        vaccination_schedule=list(schedule),
    )

    csv_path = tmp_path / "test.csv"
    study.to_csv(csv_path)
    with open(csv_path, newline="") as f:
        rows = list(csv.DictReader(f))

    # Every schedule column is blank unless overridden for a given patient
    no_vaccinations = dict.fromkeys(schedule, "")
    assert rows == [
        {
            "patient_id": "2",
            "date_of_birth": "2019-01-01",
            "practice_id_at_month_12": "678",
            "practice_id_at_month_24": "678",
            "practice_id_at_month_60": "678",
            **no_vaccinations,
        },
        {
            "patient_id": "3",
            "date_of_birth": "2018-10-01",
            "practice_id_at_month_12": "345",
            "practice_id_at_month_24": "345",
            "practice_id_at_month_60": "345",
            **no_vaccinations,
            "dtap_hex_1": "2018-11-01",
            "menb_1": "2019-06-01",
            "rotavirus_1": "2019-01-01",
        },
    ]
) oad_med_codes = codelist_from_csv( "codelists/opensafely-antidiabetic-drugs.csv", system="snomed", column="id" ) insulin_med_codes = codelist_from_csv( "codelists/opensafely-insulin-medication.csv", system="snomed", column="id" ) hba1c_new_codes = codelist(["XaPbt", "Xaeze", "Xaezd"], system="ctv3") hba1c_old_codes = codelist(["X772q", "XaERo", "XaERp"], system="ctv3") lung_cancer_codes = codelist_from_csv( "codelists/opensafely-lung-cancer.csv", system="ctv3", column="CTV3ID", ) haem_cancer_codes = codelist_from_csv( "codelists/opensafely-haematological-cancer.csv", system="ctv3", column="CTV3ID", ) other_cancer_codes = codelist_from_csv( "codelists/opensafely-cancer-excluding-lung-and-haematological.csv", system="ctv3", column="CTV3ID", )
def test_to_file_with_expectations_population(tmp_path, file_format):
    """Data generated by the expectations framework passes dummy-data validation.

    A study covering categorical, integer, binary, date (at three
    precisions) and partially-null category columns is written to
    ``dummy-data.<file_format>`` and then checked with
    ``validate_dummy_data``.
    """

    def half_incidence():
        # A fresh expectations dict per variable, as in a hand-written study
        return {"rate": "uniform", "incidence": 0.5}

    cl = codelist([("12345", "foo"), ("67890", "bar")], system="snomed")
    study = StudyDefinition(
        default_expectations={"date": {"earliest": "2020-01-01", "latest": "today"}},
        population=patients.all(),
        sex=patients.sex(
            return_expectations={
                "category": {"ratios": {"F": 0.5, "M": 0.5}},
                "rate": "universal",
            },
        ),
        age=patients.age_as_of(
            "2020-01-01",
            return_expectations={
                "int": {"distribution": "population_ages"},
                "rate": "universal",
            },
        ),
        has_event=patients.with_these_clinical_events(
            cl,
            returning="binary_flag",
            return_expectations=half_incidence(),
        ),
        event_date_day=patients.with_these_clinical_events(
            cl,
            returning="date",
            date_format="YYYY-MM-DD",
            return_expectations=half_incidence(),
        ),
        event_date_month=patients.with_these_clinical_events(
            cl,
            returning="date",
            date_format="YYYY-MM",
            return_expectations=half_incidence(),
        ),
        event_date_year=patients.with_these_clinical_events(
            cl,
            returning="date",
            date_format="YYYY",
            return_expectations=half_incidence(),
        ),
        incomplete_categories=patients.with_these_clinical_events(
            cl,
            returning="category",
            return_expectations={
                "category": {"ratios": {"foo": 0.5, "bar": 0.5}},
                # Half the values here should be null
                "incidence": 0.5,
            },
        ),
    )

    generated_path = tmp_path / f"dummy-data.{file_format}"
    study.to_file(generated_path, expectations_population=100)

    # Reuse validate_dummy_data to check that what the expectations framework
    # generated is valid for this study's covariates.
    validate_dummy_data(study.covariate_definitions, generated_path)
from cohortextractor import StudyDefinition, patients, codelist # Define some codelists cardiac_disease_codes = codelist(["56265001", "127337006"], system="snomedct") covid_codes = codelist(["U071", "U072"], system="icd10") study = StudyDefinition( # Configure the expectations framework default_expectations={ "date": { "earliest": "1900-01-01", "latest": "today" }, "rate": "exponential_increase", }, # Define the study population population=patients.registered_with_one_practice_between( "2019-02-01", "2020-02-01"), # Define input variables { age=patients.age_as_of( "2020-02-01", return_expectations={ "rate": "universal", "int": { "distribution": "population_ages" }, }, ), sex=patients.sex(return_expectations={