def test_make_df_from_expectations_with_categories_expression():
    """Check dummy categories from ``categorised_as`` reflect the given ratios."""
    category_expectations = {
        "rate": "exponential_increase",
        "incidence": 0.2,
        "category": {"ratios": {"A": 0.3, "B": 0.7}},
        "date": {"earliest": "1900-01-01", "latest": "today"},
    }
    study = StudyDefinition(
        population=patients.all(),
        category=patients.categorised_as(
            {"A": "sex = 'F'", "B": "sex = 'M'", "": "DEFAULT"},
            sex=patients.sex(),
            return_expectations=category_expectations,
        ),
    )

    df = study.make_df_from_expectations(10000)
    counts = df.category.value_counts()

    # "A" has the smaller ratio (0.3 vs 0.7), so it should occur less often.
    assert counts["A"] < counts["B"]
def test_make_df_from_expectations_with_using_dates_as_categories():
    """Check that date-formatted strings can be used as category labels."""
    defaults = {
        "date": {"earliest": "1900-01-01", "latest": "today"},
        "rate": "exponential_increase",
        "incidence": 0.2,
    }
    eligibility_expectations = {
        "category": {
            "ratios": {
                "2020-04-14": 0.25,
                "2020-06-16": 0.25,
                "2020-08-18": 0.5,
            }
        },
        "incidence": 1,
    }
    study = StudyDefinition(
        default_expectations=defaults,
        population=patients.all(),
        eligible_date=patients.categorised_as(
            {
                "2020-04-14": "age >= 80",
                "2020-06-16": "age >= 70 AND age < 80",
                "2020-08-18": "DEFAULT",
            },
            age=patients.age_as_of("2020-01-01"),
            return_expectations=eligibility_expectations,
        ),
    )

    df = study.make_df_from_expectations(100)

    # Every one of the three category labels should appear in the output.
    assert set(df.eligible_date) == {"2020-08-18", "2020-06-16", "2020-04-14"}
def test_recursive_definitions_produce_errors():
    """Two ``satisfying`` columns that refer to each other raise ValueError."""
    with pytest.raises(ValueError):
        # Everything is built inside the context manager so that an error
        # raised at any point of the definition is the one being checked.
        columns = {
            "population": patients.all(),
            "this": patients.satisfying("that = 1"),
            "that": patients.satisfying("this = 1"),
        }
        StudyDefinition(**columns)
def test_bmi_dtype_generation():
    """Check the pandas CSV arguments generated for a BMI column and its date.

    The study defines ``bmi`` plus a ``bmi_date_measured`` column derived from
    it with a ``YYYY-MM`` date format; the test pins the converter, dtype,
    date-column mapping and parse_dates entries produced for that pair.
    """
    # NOTE: an unused ``categorised_codelist`` local (never passed to the
    # study) was removed from this test; it had no effect on the assertion.
    study = StudyDefinition(
        population=patients.all(),
        bmi=patients.most_recent_bmi(
            on_or_after="2010-02-01",
            minimum_age_at_measurement=16,
        ),
        bmi_date_measured=patients.date_of("bmi", date_format="YYYY-MM"),
    )

    result = _converters_to_names(study.pandas_csv_args)

    assert result == {
        "converters": {"bmi_date_measured": "add_day_to_date"},
        "dtype": {"bmi": "float"},
        "date_col_for": {"bmi": "bmi_date_measured"},
        "parse_dates": ["bmi_date_measured"],
    }
def test_make_df_from_expectations_with_number_of_episodes():
    """Check dummy data can be generated for a ``number_of_episodes`` column.

    The resulting frame must contain exactly one column, ``episode_count``.
    """
    study = StudyDefinition(
        population=patients.all(),
        episode_count=patients.with_these_clinical_events(
            codelist(["A", "B", "C"], system="ctv3"),
            ignore_days_where_these_codes_occur=codelist(["D", "E"], system="ctv3"),
            returning="number_of_episodes",
            episode_defined_as="series of events each <= 14 days apart",
            return_expectations={
                "int": {"distribution": "normal", "mean": 4, "stddev": 2},
                "date": {"earliest": "1900-01-01", "latest": "today"},
                "incidence": 0.2,
            },
        ),
    )
    population_size = 10000

    result = study.make_df_from_expectations(population_size)

    # Compare as a plain list: ``Index == list`` is an elementwise comparison
    # whose truth value only happens to work for a single column and raises
    # (instead of failing the assertion cleanly) when the lengths differ.
    assert list(result.columns) == ["episode_count"]
def test_make_df_from_expectations_with_mean_recorded_value():
    """Check the non-zero ``mean_recorded_value`` dummy values average near 35."""
    value_expectations = {
        "rate": "exponential_increase",
        "date": {"earliest": "1900-01-01", "latest": "today"},
        "incidence": 0.6,
        "float": {"distribution": "normal", "mean": 35, "stddev": 10},
    }
    study = StudyDefinition(
        population=patients.all(),
        drug_x=patients.mean_recorded_value(
            codelist(["X"], system="ctv3"),
            on_most_recent_day_of_measurement=True,
            return_expectations=value_expectations,
        ),
    )

    df = study.make_df_from_expectations(10000)

    # Rows equal to 0.0 are excluded before averaging.
    recorded = df.loc[df["drug_x"] != 0.0, "drug_x"]
    assert abs(35 - int(recorded.mean())) < 5
def test_make_df_from_expectations_with_categories_expression_validation():
    """Expect ValueError when the ratios name a category ("C") not defined."""
    mismatched_expectations = {
        "rate": "exponential_increase",
        "incidence": 0.2,
        # "C" does not appear in the categorised_as mapping below.
        "category": {"ratios": {"A": 0.3, "B": 0.6, "C": 0.1}},
        "date": {"earliest": "1900-01-01", "latest": "today"},
    }
    study = StudyDefinition(
        population=patients.all(),
        category=patients.categorised_as(
            {"A": "sex = 'F'", "B": "sex = 'M'", "": "DEFAULT"},
            sex=patients.sex(),
            return_expectations=mismatched_expectations,
        ),
    )

    with pytest.raises(ValueError):
        study.make_df_from_expectations(10000)
def test_script():
    """Write a tiny expectations-based study to /dev/null and report drivers.

    After the export, prints whether the ``pyodbc`` and ``ctds`` modules are
    present in ``sys.modules``.
    """
    import sys

    from cohortextractor import StudyDefinition, patients

    sex_expectations = {
        "rate": "universal",
        "date": {"earliest": "1900-01-01", "latest": "today"},
        "category": {"ratios": {"M": 0.49, "F": 0.51}},
    }
    study = StudyDefinition(
        population=patients.all(),
        sex=patients.sex(return_expectations=sex_expectations),
    )
    study.to_csv("/dev/null", expectations_population=10)

    # "yes"/"no" flags for each driver module, in the order they are printed.
    pyodbc, ctds = (
        "yes" if module_name in sys.modules else "no"
        for module_name in ("pyodbc", "ctds")
    )
    print(f"pyodbc: {pyodbc}, ctds: {ctds}")
def test_make_df_from_expectations_with_distribution_and_date():
    """Check a BMI column and its derived date column are generated together."""
    bmi_expectations = {
        "rate": "exponential_increase",
        "incidence": 0.6,
        "float": {"distribution": "normal", "mean": 35, "stddev": 10},
        "date": {"earliest": "1900-01-01", "latest": "today"},
    }
    study = StudyDefinition(
        population=patients.all(),
        bmi=patients.most_recent_bmi(
            on_or_after="2010-02-01",
            minimum_age_at_measurement=16,
            return_expectations=bmi_expectations,
        ),
        bmi_date_measured=patients.date_of("bmi", date_format="YYYY-MM"),
    )

    df = study.make_df_from_expectations(10000)

    assert sorted(df.columns) == ["bmi", "bmi_date_measured"]

    # A zero BMI and a null measurement date must occur on the same rows.
    bmi_missing = df["bmi"] == 0.0
    date_missing = pd.isnull(df["bmi_date_measured"])
    assert (bmi_missing == date_missing).all()
def test_make_df_from_expectations_with_categories_in_codelist_validation():
    """Expect ValueError when ratio categories don't match the codelist's."""
    # The codelist's only category is "Y", while the ratios use "A" and "B".
    categorised_codelist = codelist([("X", "Y")], system="ctv3")
    categorised_codelist.has_categories = True
    study = StudyDefinition(
        population=patients.all(),
        ethnicity=patients.with_these_clinical_events(
            categorised_codelist,
            returning="category",
            return_expectations={
                "rate": "exponential_increase",
                "incidence": 0.2,
                "category": {"ratios": {"A": 0.3, "B": 0.7}},
                "date": {"earliest": "1900-01-01", "latest": "today"},
            },
            find_last_match_in_period=True,
        ),
    )

    with pytest.raises(ValueError):
        study.make_df_from_expectations(10000)
def test_stats_logging_with_error(logger):
    """Check a failing query is still timed and logged with state "error"."""
    study = StudyDefinition(
        population=patients.all(),
        event=patients.with_these_clinical_events(codelist(["A"], "snomed")),
    )

    # Replace the last query with one that is meant to fail.
    failing_sql = "SELECT Foo FROM Bar"
    study.backend.queries[-1] = failing_sql
    with pytest.raises(Exception) as excinfo:
        study.to_dicts()

    # The failure surfaces to the caller with the expected message.
    assert "Invalid object name 'Bar'" in str(excinfo.value)

    # Exactly one log entry records the failing SQL ...
    [sql_log] = [entry for entry in logger.entries if entry.get("sql") == failing_sql]
    # ... and exactly one "stop" entry shares its timing id.
    timing_id = sql_log["timing_id"]
    [end_log] = [
        entry
        for entry in logger.entries
        if entry.get("timing_id") == timing_id and entry.get("timing") == "stop"
    ]
    assert end_log["state"] == "error"
def test_clinical_events_numeric_value_dtype_generation():
    """Check CSV arguments for a ``numeric_value`` column and its date column."""
    test_codelist = codelist(["X"], system="ctv3")
    study = StudyDefinition(
        population=patients.all(),
        creatinine=patients.with_these_clinical_events(
            test_codelist,
            find_last_match_in_period=True,
            on_or_before="2020-02-01",
            returning="numeric_value",
        ),
        creatinine_date=patients.date_of("creatinine", date_format="YYYY-MM"),
    )
    expected = {
        "converters": {"creatinine_date": "add_day_to_date"},
        "dtype": {"creatinine": "float"},
        "date_col_for": {"creatinine": "creatinine_date"},
        "parse_dates": ["creatinine_date"],
    }

    csv_args = _converters_to_names(study.pandas_csv_args)

    assert csv_args == expected
def test_make_df_from_expectations_with_categories():
    """Check dummy data for a categorised codelist follows the given ratios.

    The frame must contain only the ``ethnicity`` column, and category "A"
    (ratio 0.3) must occur less often than category "B" (ratio 0.7).
    """
    categorised_codelist = codelist([("1", "A"), ("2", "B")], system="ctv3")
    categorised_codelist.has_categories = True
    study = StudyDefinition(
        population=patients.all(),
        ethnicity=patients.with_these_clinical_events(
            categorised_codelist,
            returning="category",
            return_expectations={
                "rate": "exponential_increase",
                "incidence": 0.2,
                "category": {"ratios": {"A": 0.3, "B": 0.7}},
                "date": {"earliest": "1900-01-01", "latest": "today"},
            },
            find_last_match_in_period=True,
        ),
    )
    population_size = 10000

    result = study.make_df_from_expectations(population_size)

    # Compare as a plain list rather than relying on the truth value of an
    # elementwise ``Index == list`` comparison.
    assert list(result.columns) == ["ethnicity"]

    category_counts = result.reset_index().groupby("ethnicity").count()
    # ``.iloc[0]`` selects the first count column explicitly by position;
    # plain ``[0]`` on a label-indexed Series is a deprecated positional
    # fallback in pandas.
    count_a = category_counts.loc["A", :].iloc[0]
    count_b = category_counts.loc["B", :].iloc[0]
    assert count_a < count_b
def test_categorical_clinical_events_with_date_dtype_generation():
    """Check CSV arguments for a category column with an associated date."""
    categorised_codelist = codelist([("X", "Y")], system="ctv3")
    categorised_codelist.has_categories = True
    study = StudyDefinition(
        population=patients.all(),
        ethnicity=patients.with_these_clinical_events(
            categorised_codelist,
            returning="category",
            find_last_match_in_period=True,
        ),
        # No date_format is given for the derived date column.
        ethnicity_date=patients.date_of("ethnicity"),
    )
    expected = {
        "converters": {"ethnicity_date": "add_month_and_day_to_date"},
        "date_col_for": {"ethnicity": "ethnicity_date"},
        "dtype": {"ethnicity": "category"},
        "parse_dates": ["ethnicity_date"],
    }

    csv_args = _converters_to_names(study.pandas_csv_args)

    assert csv_args == expected
def test_make_df_from_expectations_partial_default_overrides():
    """Check a column can override just the ``latest`` date of the defaults."""
    defaults = {
        "date": {"earliest": "1900-01-01", "latest": "today"},
        "rate": "exponential_increase",
        "incidence": 0.2,
    }
    study = StudyDefinition(
        default_expectations=defaults,
        population=patients.all(),
        asthma_condition=patients.with_these_clinical_events(
            codelist(["X"], system="ctv3"),
            returning="date",
            find_first_match_in_period=True,
            date_format="YYYY",
            # Only "latest" is overridden here.
            return_expectations={"date": {"latest": "2000-01-01"}},
        ),
    )

    df = study.make_df_from_expectations(10000)

    # The column uses the "YYYY" format, so the values are compared as numbers.
    latest_year = df.asthma_condition.astype("float").max()
    assert latest_year == 2000
def test_make_df_from_expectations_with_care_home_status():
    """Check dummy data for boolean and categorised care-home status columns."""
    bool_expectations = {
        "rate": "exponential_increase",
        "incidence": 0.3,
        "date": {"earliest": "1900-01-01", "latest": "2020-01-01"},
        "bool": True,
    }
    type_expectations = {
        "rate": "exponential_increase",
        "incidence": 0.2,
        "category": {"ratios": {"PN": 0.1, "PC": 0.2, "U": 0.7}},
        "date": {"earliest": "1900-01-01", "latest": "today"},
    }
    study = StudyDefinition(
        population=patients.all(),
        is_in_care_home=patients.care_home_status_as_of(
            "2020-01-01",
            return_expectations=bool_expectations,
        ),
        care_home_type=patients.care_home_status_as_of(
            "2020-01-01",
            categorised_as={
                "PN": "IsPotentialCareHome AND LocationRequiresNursing='Y'",
                "PC": "IsPotentialCareHome",
                "U": "DEFAULT",
            },
            return_expectations=type_expectations,
        ),
    )

    df = study.make_df_from_expectations(10000)
    counts = df.care_home_type.value_counts()

    # "PN" has ratio 0.1 and "U" has ratio 0.7, so "PN" should be rarer.
    assert counts["PN"] < counts["U"]
def test_make_df_from_expectations_doesnt_alter_date_defaults():
    """Check per-column overrides do not leak into columns using the defaults."""
    defaults = {
        "rate": "exponential_increase",
        "incidence": 1.0,
        "date": {"earliest": "1900-01-01", "latest": "today"},
        "category": {"ratios": {"M": 0.5, "F": 0.5}},
    }
    study = StudyDefinition(
        default_expectations=defaults,
        population=patients.all(),
        with_different_incidence=patients.with_these_clinical_events(
            codelist(["X"], system="ctv3"),
            returning="date",
            return_expectations={"incidence": 0.2},
            include_day=True,
        ),
        with_different_date=patients.with_these_clinical_events(
            codelist(["X"], system="ctv3"),
            returning="date",
            return_expectations={
                "date": {"earliest": "2015-01-01", "latest": "today"}
            },
            include_day=True,
        ),
        with_defaults=patients.with_these_clinical_events(
            codelist(["X"], system="ctv3"),
            returning="date",
            include_day=True,
        ),
    )

    df = study.make_df_from_expectations(10000)

    # Regression test: the untouched column must keep the default date range
    # (some dates before the other column's 2015 override) ...
    assert df.with_defaults.min() < "2015-01-01"
    # ... and the default incidence of 1.0 (no missing values).
    assert not pd.isnull(df.with_defaults).any()
def test_unrecognised_database_url_raises_error(monkeypatch):
    """Expect ValueError when DATABASE_URL uses an unrecognised scheme."""
    unknown_url = "unknown-db://localhost"
    monkeypatch.setenv("DATABASE_URL", unknown_url)

    with pytest.raises(ValueError):
        StudyDefinition(
            population=patients.all(),
            sex=patients.sex(),
            age=patients.age_as_of("2020-01-01"),
        )
def test_export_data_without_database_url_raises_error(tmp_path, monkeypatch):
    """Expect RuntimeError from ``to_file`` when DATABASE_URL is not set."""
    # raising=False: it is fine if the variable was not set to begin with.
    monkeypatch.delenv("DATABASE_URL", raising=False)
    study = StudyDefinition(
        population=patients.all(),
        sex=patients.sex(),
        age=patients.age_as_of("2020-01-01"),
    )
    output_path = tmp_path / "dummy_data.csv"

    with pytest.raises(RuntimeError):
        study.to_file(output_path)
def test_sex_dtype_generation():
    """Check the CSV arguments for a ``sex`` column use the category dtype."""
    study = StudyDefinition(
        population=patients.all(),
        sex=patients.sex(),
    )
    expected = {
        "dtype": {"sex": "category"},
        "converters": {},
        "date_col_for": {},
        "parse_dates": [],
    }

    csv_args = _converters_to_names(study.pandas_csv_args)

    assert csv_args == expected
def define_study():
    """Build a study with a single ``with_an_ons_cis_record`` column.

    ``returning``, ``date_filter_column`` and ``on_or_after`` are not defined
    in this function; they are looked up from the surrounding scope when it is
    called. The constructed study is discarded.
    """
    ons_cis_options = dict(
        returning=returning,
        date_filter_column=date_filter_column,
        on_or_after=on_or_after,
    )
    StudyDefinition(
        population=patients.all(),
        # by default returns last match in period, using visit date
        value=patients.with_an_ons_cis_record(**ons_cis_options),
    )
def test_syntax_errors_in_expressions_are_raised():
    """Expect ValueError for a malformed ``satisfying`` expression."""
    # Note the doubled "AND", which makes the expression invalid.
    malformed_expression = "age > 70 AND AND sex = 'M'"

    with pytest.raises(ValueError):
        StudyDefinition(
            population=patients.all(),
            status=patients.satisfying(
                malformed_expression,
                sex=patients.sex(),
                age=patients.age_as_of("2010-01-01"),
            ),
        )
def test_column_name_clashes_produce_errors():
    """Expect ValueError when a nested column reuses a top-level column name."""
    with pytest.raises(ValueError):
        StudyDefinition(
            population=patients.all(),
            # Top-level "age" ...
            age=patients.age_as_of("2020-01-01"),
            status=patients.satisfying(
                "age > 70 AND sex = 'M'",
                sex=patients.sex(),
                # ... clashes with this nested "age" using a different date.
                age=patients.age_as_of("2010-01-01"),
            ),
        )
def test_study_definition_initial_stats_logging(logger):
    """Check the stats log entries emitted when a study definition is built."""
    defaults = {
        "rate": "exponential_increase",
        "incidence": 0.2,
        "date": {"earliest": "1900-01-01", "latest": "today"},
    }
    StudyDefinition(
        default_expectations=defaults,
        population=patients.all(),
        event_date_1=patients.with_these_clinical_events(
            codelist(["A"], system="ctv3"),
            returning="date",
            date_format="YYYY-MM-DD",
        ),
        event_min_date=patients.minimum_of(
            "event_date_1",
            event_date_2=patients.with_these_clinical_events(
                codelist(["B", "C"], system="ctv3"),
                returning="date",
                date_format="YYYY-MM-DD",
            ),
        ),
    )

    expected_stats = [
        # Output columns include patient_id and the 4 variables defined in the
        # study definition, including event_date_2, which is defined as a
        # parameter to event_min_date.
        # Tables: Patient, plus a temp event table for each codelist.
        {"output_column_count": 5, "table_count": 3, "table_joins_count": 2},
        # variable_count is a count of the top-level variables defined in the
        # study definition (i.e. not event_date_2).
        {"variable_count": 4},
        # 2 variables use a codelist (event_date_1 and the nested event_date_2).
        {"variables_using_codelist_count": 2},
        # For each variable using a codelist, the codelist size is logged.
        {"variable_using_codelist": "event_date_1", "codelist_size": 1},
        {"variable_using_codelist": "event_date_2", "codelist_size": 2},
    ]
    assert get_stats_logs(logger.entries) == expected_stats
def test_apply_date_filters_from_definition():
    """Check ``between`` filtering with closed and open-ended bounds."""
    study = StudyDefinition(population=patients.all())
    series = np.arange(10)

    # (between argument, values expected to remain); None leaves a side open.
    cases = [
        ([5, 6], [5, 6]),
        ([5, None], [5, 6, 7, 8, 9]),
        ([None, 2], [0, 1, 2]),
    ]
    for between, expected in cases:
        filtered = study.apply_date_filters_from_definition(series, between=between)
        assert list(filtered) == expected
def test_make_df_from_binary_default_outcome():
    """Check the number of non-null ``died`` rows matches the 0.1 incidence."""
    death_expectations = {
        "date": {"earliest": "1900-01-01", "latest": "today"},
        "incidence": 0.1,
    }
    study = StudyDefinition(
        population=patients.all(),
        died=patients.died_from_any_cause(return_expectations=death_expectations),
    )
    population_size = 10000

    df = study.make_df_from_expectations(population_size)

    with_outcome = df[pd.notnull(df.died)]
    assert len(with_outcome) == 0.1 * population_size
def test_clinical_events_with_year_date_dtype_generation():
    """Check CSV arguments for a date-returning column with no date format."""
    test_codelist = codelist(["X"], system="ctv3")
    study = StudyDefinition(
        population=patients.all(),
        diabetes=patients.with_these_clinical_events(
            test_codelist,
            returning="date",
        ),
    )
    expected = {
        "converters": {"diabetes": "add_month_and_day_to_date"},
        "date_col_for": {},
        "dtype": {},
        "parse_dates": ["diabetes"],
    }

    csv_args = _converters_to_names(study.pandas_csv_args)

    assert csv_args == expected
def test_age_dtype_generation():
    """Check the CSV arguments for an ``age`` column use the Int64 dtype."""
    study = StudyDefinition(
        # The study population is defined here.
        population=patients.all(),
        age=patients.age_as_of("2020-02-01"),
    )
    expected = {
        "dtype": {"age": "Int64"},
        "parse_dates": [],
        "date_col_for": {},
        "converters": {},
    }

    csv_args = _converters_to_names(study.pandas_csv_args)

    assert csv_args == expected
def test_make_df_no_categories_validation_when_no_categories_in_definition():
    """Check ratios for ``sex`` are accepted without a category definition."""
    sex_expectations = {
        "rate": "universal",
        "date": {"earliest": "1900-01-01", "latest": "today"},
        "category": {"ratios": {"M": 0.49, "F": 0.51}},
    }
    study = StudyDefinition(
        population=patients.all(),
        sex=patients.sex(return_expectations=sex_expectations),
    )

    # No assertion on the result: the test passes if no exception is raised.
    study.make_df_from_expectations(10000)
def test_address_dtype_generation():
    """Check CSV arguments for a rural/urban classification address column."""
    study = StudyDefinition(
        # The study population is defined here.
        population=patients.all(),
        rural_urban=patients.address_as_of(
            "2020-02-01",
            returning="rural_urban_classification",
        ),
    )
    expected = {
        "dtype": {"rural_urban": "category"},
        "parse_dates": [],
        "date_col_for": {},
        "converters": {},
    }

    csv_args = _converters_to_names(study.pandas_csv_args)

    assert csv_args == expected