def test_create_instance(mock_datetime, patient_mapping, fhir_concept_map_identifier):
    """create_instance builds a complete FHIR Patient from a row of fetched values.

    Checks that meta tags (source/resource), column-sourced attributes and
    static inputs all end up at the right paths in the produced instance.
    """
    mock_datetime.now.return_value = mockdatetime()
    resource_mapping = patient_mapping

    identifier_attr = Attribute("identifier[0].value", columns=[SqlColumn("a", "b")])
    birth_date_attr = Attribute("birthDate", columns=[SqlColumn("a", "c")])
    marital_status_attr = Attribute(
        "maritalStatus.coding[0].code", columns=[SqlColumn("a", "d")]
    )
    practitioner_attr = Attribute(
        "generalPractitioner[0].type", static_inputs=["Practitioner"]
    )
    attributes = [
        identifier_attr,
        birth_date_attr,
        marital_status_attr,
        practitioner_attr,
    ]

    # Row keys are the attributes themselves; order intentionally differs
    # from `attributes` to show lookup is by attribute, not position.
    row = {
        marital_status_attr: "D",
        birth_date_attr: "2000-10-10",
        identifier_attr: "A",
    }

    actual = transform.create_instance(row, resource_mapping, attributes)

    assert actual == {
        "meta": {
            "lastUpdated": "now",
            "tag": [
                {
                    "system": ARKHN_CODE_SYSTEMS.source,
                    "code": patient_mapping["source"]["id"],
                },
                {
                    "system": ARKHN_CODE_SYSTEMS.resource,
                    "code": patient_mapping["id"],
                },
            ],
        },
        # The generated id is random, so compare it against itself.
        "id": actual["id"],
        "identifier": [{"value": "A"}],
        "resourceType": "Patient",
        "birthDate": "2000-10-10",
        "maritalStatus": {"coding": [{"code": "D"}]},
        "generalPractitioner": [{"type": "Practitioner"}],
    }
def test_create_static_instance(mock_datetime, fhir_concept_map_identifier):
    """create_static_instance builds an instance from static inputs only.

    No data row is involved: every attribute value comes from its
    `static_inputs`, and nested paths are materialized as nested dicts.
    """
    mock_datetime.now.return_value = mockdatetime()
    resource_mapping = {
        "id": "resource_id",
        "source": {"id": "source_id"},
        "definition": {
            "type": "instance_type",
            "kind": "resource",
            "derivation": "resource",
        },
    }

    static_value_attr = Attribute("identifier[0].value", static_inputs=["static"])
    static_system_attr = Attribute(
        "identifier[0].system", static_inputs=["identifier_sys"]
    )
    nested_attr = Attribute("path.to.attribute", static_inputs=["attribute_val"])
    attributes = [static_value_attr, static_system_attr, nested_attr]

    actual = transform.create_static_instance(resource_mapping, attributes)

    assert actual == {
        "meta": {
            "lastUpdated": "now",
            "tag": [
                {
                    "system": ARKHN_CODE_SYSTEMS.source,
                    "code": resource_mapping["source"]["id"],
                },
                {
                    "system": ARKHN_CODE_SYSTEMS.resource,
                    "code": resource_mapping["id"],
                },
            ],
        },
        # The generated id is random, so compare it against itself.
        "id": actual["id"],
        "identifier": [{"value": "static", "system": "identifier_sys"}],
        "path": {"to": {"attribute": "attribute_val"}},
        "resourceType": "instance_type",
    }
def test_merge_dataframe(_):
    """merge_dataframe collapses multi-column attributes into single columns.

    The `id` attribute is fed by two source columns plus a static input and a
    merging script; the result must hold the merged value, while the helper
    `pk` column is dropped from the output.
    """
    name_attr = Attribute("name", columns=[SqlColumn("PATIENTS", "NAME")])
    id_attr = Attribute(
        "id",
        columns=[SqlColumn("PATIENTS", "ID"), SqlColumn("PATIENTS", "ID2")],
        static_inputs=["unknown"],
        merging_script=MergingScript("merge"),
    )
    language_attr = Attribute("language", columns=[SqlColumn("ADMISSIONS", "LANGUAGE")])
    admid_attr = Attribute("admid", columns=[SqlColumn("ADMISSIONS", "ID")])

    # MultiIndex: (attribute, (qualified column name, table)).
    col_index = pd.MultiIndex.from_tuples(
        [
            (name_attr, ("PATIENTS_NAME", "PATIENTS")),
            (id_attr, ("PATIENTS_ID", "PATIENTS")),
            (id_attr, ("PATIENTS_ID2", "PATIENTS")),
            (language_attr, ("ADMISSIONS_LANGUAGE", "ADMISSIONS")),
            (admid_attr, ("ADMISSIONS_ID", "ADMISSIONS")),
            ("pk", ("PATIENTS_ID", "PATIENTS")),
        ]
    )
    df = pd.DataFrame(
        {
            col_index[0]: ["bob", "bob", "alice", "bob"],
            col_index[1]: ["id1", "id1", "id2", "id3"],
            col_index[2]: ["id21", "id21", "id22", "id23"],
            col_index[3]: ["lang1", "lang2", "lang3", "lang4"],
            col_index[4]: ["hadmid1", "hadmid2", "hadmid3", "hadmid4"],
            col_index[5]: ["id1", "id2", "id3", "id4"],
        },
    )
    attributes = [name_attr, id_attr, language_attr, admid_attr]
    primary_key_column = SqlColumn("PATIENTS", "ID")

    actual = transform.merge_dataframe(df, attributes, primary_key_column)

    expected = pd.DataFrame(
        {
            name_attr: ["bob", "bob", "alice", "bob"],
            # The mocked "merge" script concatenates its inputs.
            id_attr: ["id1id21merge", "id1id21merge", "id2id22merge", "id3id23merge"],
            language_attr: ["lang1", "lang2", "lang3", "lang4"],
            admid_attr: ["hadmid1", "hadmid2", "hadmid3", "hadmid4"],
        },
    )
    assert actual.equals(expected)
def test_fetch_values_from_dataframe():
    """fetch_values_from_dataframe returns the row value keyed by the attribute."""
    identifier_attr = Attribute("identifier[0].value", columns=[SqlColumn("a", "b")])
    birth_date_attr = Attribute("birthDate", columns=[SqlColumn("a", "c")])
    marital_status_attr = Attribute(
        "maritalStatus.coding[0].code", columns=[SqlColumn("a", "d")]
    )

    row = {
        marital_status_attr: "D",
        birth_date_attr: "2000-10-10",
        identifier_attr: "A",
    }

    # Only the value belonging to the requested attribute must come back.
    fetched = transform.fetch_values_from_dataframe(row, birth_date_attr)
    assert fetched == "2000-10-10"
def test_squash_rows():
    """squash_rows groups child-table rows (ADMISSIONS) under parent rows (PATIENTS).

    Rows sharing the same PATIENTS values are merged into one row whose
    ADMISSIONS cells become tuples of the grouped values.
    """
    name_attr = Attribute("name", columns=[SqlColumn("PATIENTS", "NAME")])
    id_attr = Attribute("id", columns=[SqlColumn("PATIENTS", "ID")])
    language_attr = Attribute("language", columns=[SqlColumn("ADMISSIONS", "LANGUAGE")])
    admid_attr = Attribute("admid", columns=[SqlColumn("ADMISSIONS", "ID")])

    col_index = pd.MultiIndex.from_tuples(
        [
            (name_attr, ("PATIENTS_NAME", "PATIENTS")),
            (id_attr, ("PATIENTS_ID", "PATIENTS")),
            (language_attr, ("ADMISSIONS_LANGUAGE", "ADMISSIONS")),
            (admid_attr, ("ADMISSIONS_ID", "ADMISSIONS")),
        ]
    )
    df = pd.DataFrame(
        {
            col_index[0]: ["bob", "bob", "alice", "bob"],
            col_index[1]: ["id1", "id1", "id2", "id3"],
            col_index[2]: ["lang1", "lang2", "lang3", "lang4"],
            col_index[3]: ["id1", "id2", "id3", "id4"],
        },
    )
    # PATIENTS is the parent table; ADMISSIONS rows get squashed beneath it.
    squash_rules = ["PATIENTS", [["ADMISSIONS", []]]]

    actual = transform.squash_rows(df, squash_rules)
    # Row order after squashing is unspecified: normalize before comparing.
    actual = actual.sort_values(by=col_index[1]).reset_index(drop=True)

    expected = pd.DataFrame(
        {
            col_index[0]: ["bob", "alice", "bob"],
            col_index[1]: ["id1", "id2", "id3"],
            col_index[2]: [("lang1", "lang2"), ("lang3",), ("lang4",)],
            col_index[3]: [("id1", "id2"), ("id3",), ("id4",)],
        },
    )
    expected = expected.sort_values(by=col_index[1]).reset_index(drop=True)

    assert actual.equals(expected)
def test_handle_array_attributes():
    """handle_array_attributes broadcasts scalars across tuple-valued attributes.

    A scalar value is repeated for each element of the tuple value; tuples of
    different lengths must raise an AssertionError.
    """
    first_attr = Attribute("attr1", columns=[SqlColumn("a", "b")])
    second_attr = Attribute("attr2", columns=[SqlColumn("a", "c")])
    attributes_in_array = {
        "path1": first_attr,
        "path2": second_attr,
    }

    # Tuple of three values zipped with a broadcast scalar.
    row = {
        first_attr: ("A1", "A2", "A3"),
        second_attr: "B",
    }
    result = transform.handle_array_attributes(attributes_in_array, row)
    assert result == [
        {"path1": "A1", "path2": "B"},
        {"path1": "A2", "path2": "B"},
        {"path1": "A3", "path2": "B"},
    ]

    # Two tuples of different lengths cannot be zipped together.
    row = {
        first_attr: ("A1", "A2", "A3"),
        second_attr: ("B1", "B2"),
    }
    with raises(AssertionError, match="mismatch in array lengths"):
        transform.handle_array_attributes(attributes_in_array, row)
def test_analyze_mapping(patient_mapping, fhir_concept_map_gender, fhir_concept_map_identifier):
    """analyze_mapping extracts attributes, columns and joins from a mapping.

    Verifies the full attribute list (order-sensitive), the set of SQL columns
    to fetch, and the set of joins required by the patient mapping fixture.
    """
    analyzer = Analyzer()
    analyzer.analyze_mapping(patient_mapping)

    # Order matters here: attributes must come out in mapping order.
    expected_attributes = [
        Attribute(
            "identifier[0].value",
            columns=[SqlColumn("patients", "row_id")],
            static_inputs=[],
            merging_script=None,
        ),
        Attribute(
            "deceasedBoolean",
            columns=[SqlColumn("patients", "expire_flag")],
            static_inputs=[],
            merging_script=None,
        ),
        Attribute(
            "generalPractitioner[0].identifier.value",
            columns=[SqlColumn("icustays", "hadm_id")],
            static_inputs=[],
            merging_script=None,
        ),
        Attribute(
            "birthDate",
            columns=[SqlColumn("patients", "dob")],
            static_inputs=[],
            merging_script=None,
        ),
        Attribute(
            "deceasedDateTime",
            columns=[SqlColumn("patients", "dod")],
            static_inputs=[],
            merging_script=None,
        ),
        Attribute(
            "gender",
            columns=[SqlColumn("patients", "gender")],
            static_inputs=["unknown"],
            merging_script=MergingScript("select_first_not_empty"),
        ),
        Attribute(
            "maritalStatus.coding[0].code",
            columns=[SqlColumn("admissions", "marital_status")],
            static_inputs=[],
            merging_script=None,
        ),
        Attribute(
            "generalPractitioner[0].type",
            columns=[],
            static_inputs=["Practitioner"],
            merging_script=None,
        ),
    ]
    assert analyzer.analysis.attributes == expected_attributes

    assert analyzer.analysis.columns == {
        SqlColumn("patients", "row_id"),
        SqlColumn("patients", "gender"),
        SqlColumn("patients", "dob"),
        SqlColumn("patients", "dod"),
        SqlColumn("patients", "expire_flag"),
        SqlColumn("admissions", "marital_status"),
        SqlColumn("icustays", "hadm_id"),
    }

    assert analyzer.analysis.joins == {
        SqlJoin(SqlColumn("patients", "subject_id"), SqlColumn("admissions", "subject_id")),
        SqlJoin(SqlColumn("patients", "subject_id"), SqlColumn("icustays", "subject_id")),
    }
def test_clean_data(_, fhir_concept_map_code, fhir_concept_map_gender):
    """clean_dataframe applies cleaning scripts and concept maps per column.

    Also checks that the output gains the attribute-keyed MultiIndex and a
    trailing `pk` column mirroring the primary key.
    """
    df = pd.DataFrame(
        {
            "PATIENTS_NAME": ["alice", "bob", "charlie"],
            "PATIENTS_ID": ["id1", "id2", "id3"],
            "PATIENTS_ID2": ["id21", "id22", "id23"],
            "ADMISSIONS_LANGUAGE": ["M", "F", "F"],
            "ADMISSIONS_ID": ["ABC", "DEF", "GHI"],
        },
    )

    # Cleaning script only.
    name_attr = Attribute(
        "name",
        columns=[SqlColumn("PATIENTS", "NAME", cleaning_script=CleaningScript("clean1"))],
    )
    # Two raw columns, no cleaning.
    id_attr = Attribute(
        "id",
        columns=[SqlColumn("PATIENTS", "ID"), SqlColumn("PATIENTS", "ID2")],
        static_inputs=["null"],
    )
    # Concept map only.
    language_attr = Attribute(
        "language",
        columns=[SqlColumn("ADMISSIONS", "LANGUAGE", concept_map=ConceptMap("id_cm_gender"))],
        static_inputs=["val"],
    )
    # Cleaning script followed by concept map.
    code_attr = Attribute(
        "code",
        columns=[
            SqlColumn(
                "ADMISSIONS",
                "ID",
                cleaning_script=CleaningScript("clean2"),
                concept_map=ConceptMap("id_cm_code"),
            )
        ],
    )
    attributes = [name_attr, id_attr, language_attr, code_attr]
    primary_key_column = SqlColumn("PATIENTS", "ID")

    cleaned_df = transform.clean_dataframe(df, attributes, primary_key_column)

    col_index = pd.MultiIndex.from_tuples(
        [
            (name_attr, ("PATIENTS_NAME", "PATIENTS")),
            (id_attr, ("PATIENTS_ID", "PATIENTS")),
            (id_attr, ("PATIENTS_ID2", "PATIENTS")),
            (language_attr, ("ADMISSIONS_LANGUAGE", "ADMISSIONS")),
            (code_attr, ("ADMISSIONS_ID", "ADMISSIONS")),
            ("pk", ("PATIENTS_ID", "PATIENTS")),
        ]
    )
    expected = pd.DataFrame(
        {
            col_index[0]: ["alicecleaned", "bobcleaned", "charliecleaned"],
            col_index[1]: ["id1", "id2", "id3"],
            col_index[2]: ["id21", "id22", "id23"],
            col_index[3]: ["male", "female", "female"],
            col_index[4]: ["abc", "def", "ghi"],
            col_index[5]: ["id1", "id2", "id3"],
        },
    )
    assert cleaned_df.equals(expected)