def postprocess(dataset, destination, timestamp=None, flags=None):
    """Run the full post-import processing pipeline over a dataset.

    Reads the raw tables ('patients', 'assessments', 'tests', 'diet') from
    ``dataset``, and writes sorted / cleaned / derived tables into
    ``destination``.  Both arguments are group-like containers (``.keys()``,
    ``[name]`` access) — presumably h5py/ExeTera dataset groups; TODO confirm.

    ``timestamp`` is currently unused (the DataStore construction that took it
    is commented out below).  ``flags`` is a set of option strings; the only
    flag actually consulted is 'daily', which enables the daily-assessment
    aggregation steps.  All other stage toggles below are hard-wired on.
    """
    if flags is None:
        flags = set()
    do_daily_asmts = 'daily' in flags
    has_patients = 'patients' in dataset.keys()
    has_assessments = 'assessments' in dataset.keys()
    has_tests = 'tests' in dataset.keys()
    has_diet = 'diet' in dataset.keys()

    # NOTE(review): these predicates ignore their argument and always return
    # True, so every sort_*/process_* toggle below is effectively a constant.
    # They look like placeholders for future per-stage flag handling.
    sort_enabled = lambda x: True
    process_enabled = lambda x: True
    sort_patients = sort_enabled(flags) and True
    sort_assessments = sort_enabled(flags) and True
    sort_tests = sort_enabled(flags) and True
    sort_diet = sort_enabled(flags) and True
    make_assessment_patient_id_fkey = process_enabled(flags) and True
    year_from_age = process_enabled(flags) and True
    clean_weight_height_bmi = process_enabled(flags) and True
    health_worker_with_contact = process_enabled(flags) and True
    clean_temperatures = process_enabled(flags) and True
    check_symptoms = process_enabled(flags) and True
    # Daily-assessment stages are the only ones gated on the 'daily' flag.
    create_daily = process_enabled(flags) and do_daily_asmts
    make_patient_level_assessment_metrics = process_enabled(flags) and True
    make_patient_level_daily_assessment_metrics = process_enabled(
        flags) and do_daily_asmts
    make_new_test_level_metrics = process_enabled(flags) and True
    make_diet_level_metrics = True
    make_healthy_diet_index = True  # NOTE(review): set but never used below

    # ds = DataStore(timestamp=timestamp)
    s = Session()

    # patients ================================================================
    sorted_patients_src = None
    if has_patients:
        patients_src = dataset['patients']
        write_mode = 'write'
        # Only (re)build the patients table if it is not already present in
        # the destination; existing output is left untouched.
        if 'patients' not in destination.keys():
            patients_dest = s.get_or_create_group(destination, 'patients')
            sorted_patients_src = patients_dest

            # Patient sort
            # ============
            if sort_patients:
                # Drop duplicate patient rows (by 'id'), then sort by 'id'.
                duplicate_filter = \
                    persistence.filter_duplicate_fields(s.get(patients_src['id']).data[:])

                for k in patients_src.keys():
                    t0 = time.time()
                    r = s.get(patients_src[k])
                    w = r.create_like(patients_dest, k)
                    s.apply_filter(duplicate_filter, r, w)
                    print(f"'{k}' filtered in {time.time() - t0}s")

                # Report kept vs dropped row counts.
                print(np.count_nonzero(duplicate_filter == True),
                      np.count_nonzero(duplicate_filter == False))

                sort_keys = ('id', )
                # In-place sort: source and destination are the same group.
                s.sort_on(patients_dest, patients_dest, sort_keys,
                          write_mode='overwrite')

            # Patient processing
            # ==================
            if year_from_age:
                # NOTE(review): log message says "18 to 90" but the code
                # passes 16/90 and writes a '16_to_90_years' field.
                log("year of birth -> age; 18 to 90 filter")
                t0 = time.time()
                yobs = s.get(patients_dest['year_of_birth'])
                yob_filter = s.get(patients_dest['year_of_birth_valid'])
                age = s.create_numeric(patients_dest, 'age', 'uint32')
                age_filter = s.create_numeric(patients_dest, 'age_filter', 'bool')
                age_16_to_90 = s.create_numeric(patients_dest, '16_to_90_years', 'bool')
                print('year_of_birth:', patients_dest['year_of_birth'])
                for k in patients_dest['year_of_birth'].attrs.keys():
                    print(k, patients_dest['year_of_birth'].attrs[k])
                # Age computed relative to hard-coded reference year 2020.
                calculate_age_from_year_of_birth_v1(yobs, yob_filter, 16, 90,
                                                    age, age_filter,
                                                    age_16_to_90, 2020)
                log(f"completed in {time.time() - t0}")

                print('age_filter count:',
                      np.sum(patients_dest['age_filter']['values'][:]))
                print('16_to_90_years count:',
                      np.sum(patients_dest['16_to_90_years']['values'][:]))

            if clean_weight_height_bmi:
                # Standard plausibility ranges: 40-200 kg, 110-220 cm, 15-55 BMI.
                log("height / weight / bmi; standard range filters")
                t0 = time.time()
                weights_clean = s.create_numeric(patients_dest, 'weight_kg_clean', 'float32')
                weights_filter = s.create_numeric(patients_dest, '40_to_200_kg', 'bool')
                heights_clean = s.create_numeric(patients_dest, 'height_cm_clean', 'float32')
                heights_filter = s.create_numeric(patients_dest, '110_to_220_cm', 'bool')
                bmis_clean = s.create_numeric(patients_dest, 'bmi_clean', 'float32')
                bmis_filter = s.create_numeric(patients_dest, '15_to_55_bmi', 'bool')
                weight_height_bmi_v1(s, 40, 200, 110, 220, 15, 55,
                                     None, None, None, None,
                                     patients_dest['weight_kg'],
                                     patients_dest['weight_kg_valid'],
                                     patients_dest['height_cm'],
                                     patients_dest['height_cm_valid'],
                                     patients_dest['bmi'],
                                     patients_dest['bmi_valid'],
                                     weights_clean, weights_filter, None,
                                     heights_clean, heights_filter, None,
                                     bmis_clean, bmis_filter, None)
                log(f"completed in {time.time() - t0}")

            if health_worker_with_contact:
                with utils.Timer("health_worker_with_contact field"):
                    #writer = ds.get_categorical_writer(patients_dest, 'health_worker_with_contact', 'int8')
                    # Combine three source fields into a single derived
                    # 'health_worker_with_contact' field.
                    combined_hcw_with_contact_v1(
                        s, s.get(patients_dest['healthcare_professional']),
                        s.get(patients_dest['contact_health_worker']),
                        s.get(patients_dest['is_carer_for_community']),
                        patients_dest, 'health_worker_with_contact')

    # assessments =============================================================
    sorted_assessments_src = None
    if has_assessments:
        assessments_src = dataset['assessments']
        if 'assessments' not in destination.keys():
            assessments_dest = s.get_or_create_group(destination, 'assessments')
            sorted_assessments_src = assessments_dest

            if sort_assessments:
                sort_keys = ('patient_id', 'created_at')
                with utils.Timer("sorting assessments"):
                    s.sort_on(assessments_src, assessments_dest, sort_keys)

            if has_patients:
                if make_assessment_patient_id_fkey:
                    # Build an int64 index mapping each assessment row to the
                    # row of its patient in the (sorted) patients table.
                    print(
                        "creating 'assessment_patient_id_fkey' foreign key index for 'patient_id'"
                    )
                    t0 = time.time()
                    patient_ids = s.get(sorted_patients_src['id'])
                    assessment_patient_ids =\
                        s.get(sorted_assessments_src['patient_id'])
                    assessment_patient_id_fkey =\
                        s.create_numeric(assessments_dest,
                                         'assessment_patient_id_fkey', 'int64')
                    s.get_index(patient_ids.data[:],
                                assessment_patient_ids.data[:],
                                assessment_patient_id_fkey)
                    print(f"completed in {time.time() - t0}s")

            if clean_temperatures:
                # Normalise to Celsius and keep only 35.0-42.0 C inclusive.
                print("clean temperatures")
                t0 = time.time()
                temps = s.get(sorted_assessments_src['temperature'])
                temp_units = s.get(sorted_assessments_src['temperature_unit'])
                temps_valid = s.get(
                    sorted_assessments_src['temperature_valid'])
                dest_temps = temps.create_like(assessments_dest,
                                               'temperature_c_clean')
                dest_temps_valid = temps_valid.create_like(
                    assessments_dest, 'temperature_35_to_42_inclusive')
                dest_temps_modified = temps_valid.create_like(
                    assessments_dest, 'temperature_modified')
                validate_temperature_v1(s, 35.0, 42.0, temps, temp_units,
                                        temps_valid, dest_temps,
                                        dest_temps_valid, dest_temps_modified)
                print(f"temperature cleaning done in {time.time() - t0}")

            if check_symptoms:
                print('check inconsistent health_status')
                t0 = time.time()
                check_inconsistent_symptoms_v1(s, sorted_assessments_src,
                                               assessments_dest)
                print(time.time() - t0)

    # tests ===================================================================
    if has_tests:
        if sort_tests:
            tests_src = dataset['tests']
            tests_dest = s.get_or_create_group(destination, 'tests')
            sort_keys = ('patient_id', 'created_at')
            s.sort_on(tests_src, tests_dest, sort_keys)

    # diet ====================================================================
    if has_diet:
        diet_src = dataset['diet']
        if 'diet' not in destination.keys():
            diet_dest = s.get_or_create_group(destination, 'diet')
            sorted_diet_src = diet_dest  # NOTE(review): assigned but unused
            if sort_diet:
                sort_keys = ('patient_id', 'display_name', 'id')
                s.sort_on(diet_src, diet_dest, sort_keys)

    if has_assessments:
        if do_daily_asmts:
            daily_assessments_dest = s.get_or_create_group(
                destination, 'daily_assessments')

    # post process patients
    # TODO: need a transaction table
    # NOTE(review): these prints assume all three tables exist; they will
    # raise NameError/KeyError if 'patients'/'assessments'/'tests' are absent.
    print(patients_src.keys())
    print(dataset['assessments'].keys())
    print(dataset['tests'].keys())

    # write_mode = 'overwrite'
    write_mode = 'write'

    # Daily assessments
    # =================
    if has_assessments:
        if create_daily:
            # Collapse the per-assessment table to one row per
            # (patient, created_at_day) span.
            print("generate daily assessments")
            patient_ids = s.get(sorted_assessments_src['patient_id'])
            created_at_days = s.get(sorted_assessments_src['created_at_day'])
            raw_created_at_days = created_at_days.data[:]

            # Prefer the fkey from the source table if a previous run wrote
            # it there; otherwise use the one created above.
            if 'assessment_patient_id_fkey' in assessments_src.keys():
                patient_id_index = assessments_src[
                    'assessment_patient_id_fkey']
            else:
                patient_id_index = assessments_dest[
                    'assessment_patient_id_fkey']
            patient_id_indices = s.get(patient_id_index)
            raw_patient_id_indices = patient_id_indices.data[:]

            print("Calculating patient id index spans")
            t0 = time.time()
            patient_id_index_spans = s.get_spans(
                fields=(raw_patient_id_indices, raw_created_at_days))
            print(
                f"Calculated {len(patient_id_index_spans)-1} spans in {time.time() - t0}s"
            )

            print("Applying spans to 'health_status'")
            t0 = time.time()
            # Per-field aggregation overrides; None means "skip this field".
            # Fields not listed fall through to type-based defaults below.
            default_behavour_overrides = {
                'id': s.apply_spans_last,
                'patient_id': s.apply_spans_last,
                'patient_index': s.apply_spans_last,
                'created_at': s.apply_spans_last,
                'created_at_day': s.apply_spans_last,
                'updated_at': s.apply_spans_last,
                'updated_at_day': s.apply_spans_last,
                'version': s.apply_spans_max,
                'country_code': s.apply_spans_first,
                'date_test_occurred': None,
                'date_test_occurred_guess': None,
                'date_test_occurred_day': None,
                'date_test_occurred_set': None,
            }
            for k in sorted_assessments_src.keys():
                t1 = time.time()
                reader = s.get(sorted_assessments_src[k])
                if k in default_behavour_overrides:
                    apply_span_fn = default_behavour_overrides[k]
                    if apply_span_fn is not None:
                        apply_span_fn(
                            patient_id_index_spans, reader,
                            reader.create_like(daily_assessments_dest, k))
                        print(f" Field {k} aggregated in {time.time() - t1}s")
                    else:
                        print(f" Skipping field {k}")
                else:
                    # Type-based defaults: max for categorical/numeric,
                    # concatenation for indexed strings.
                    if isinstance(reader, fields.CategoricalField):
                        s.apply_spans_max(
                            patient_id_index_spans, reader,
                            reader.create_like(daily_assessments_dest, k))
                        print(f" Field {k} aggregated in {time.time() - t1}s")
                    elif isinstance(reader, rw.IndexedStringReader):
                        s.apply_spans_concat(
                            patient_id_index_spans, reader,
                            reader.create_like(daily_assessments_dest, k))
                        print(f" Field {k} aggregated in {time.time() - t1}s")
                    elif isinstance(reader, rw.NumericReader):
                        s.apply_spans_max(
                            patient_id_index_spans, reader,
                            reader.create_like(daily_assessments_dest, k))
                        print(f" Field {k} aggregated in {time.time() - t1}s")
                    else:
                        print(f" No function for {k}")
            print(f"apply_spans completed in {time.time() - t0}s")

    if has_patients and has_assessments:
        if make_patient_level_assessment_metrics:
            if 'assessment_patient_id_fkey' in assessments_dest:
                src = assessments_dest['assessment_patient_id_fkey']
            else:
                src = assessments_src['assessment_patient_id_fkey']
            assessment_patient_id_fkey = s.get(src)
            # generate spans from the assessment-space patient_id foreign key
            spans = s.get_spans(field=assessment_patient_id_fkey.data[:])

            ids = s.get(patients_dest['id'])

            print('calculate assessment counts per patient')
            t0 = time.time()
            writer = s.create_numeric(patients_dest, 'assessment_count',
                                      'uint32')
            aggregated_counts = s.apply_spans_count(spans)
            s.join(ids, assessment_patient_id_fkey, aggregated_counts, writer,
                   spans)
            print(
                f"calculated assessment counts per patient in {time.time() - t0}"
            )

            print('calculate first assessment days per patient')
            t0 = time.time()
            reader = s.get(sorted_assessments_src['created_at_day'])
            # 10-char fixed string: 'YYYY-MM-DD' presumably — TODO confirm.
            writer = s.create_fixed_string(patients_dest,
                                           'first_assessment_day', 10)
            aggregated_counts = s.apply_spans_first(spans, reader)
            s.join(ids, assessment_patient_id_fkey, aggregated_counts, writer,
                   spans)
            print(
                f"calculated first assessment days per patient in {time.time() - t0}"
            )

            print('calculate last assessment days per patient')
            t0 = time.time()
            reader = s.get(sorted_assessments_src['created_at_day'])
            writer = s.create_fixed_string(patients_dest,
                                           'last_assessment_day', 10)
            aggregated_counts = s.apply_spans_last(spans, reader)
            s.join(ids, assessment_patient_id_fkey, aggregated_counts, writer,
                   spans)
            print(
                f"calculated last assessment days per patient in {time.time() - t0}"
            )

            print('calculate maximum assessment test result per patient')
            t0 = time.time()
            reader = s.get(sorted_assessments_src['tested_covid_positive'])
            writer = reader.create_like(patients_dest,
                                        'max_assessment_test_result')
            max_result_value = s.apply_spans_max(spans, reader)
            s.join(ids, assessment_patient_id_fkey, max_result_value, writer,
                   spans)
            print(
                f"calculated maximum assessment test result in {time.time() - t0}"
            )

    if has_assessments and do_daily_asmts and make_patient_level_daily_assessment_metrics:
        print(
            "creating 'daily_assessment_patient_id_fkey' foreign key index for 'patient_id'"
        )
        t0 = time.time()
        patient_ids = s.get(sorted_patients_src['id'])
        daily_assessment_patient_ids =\
            s.get(daily_assessments_dest['patient_id'])
        daily_assessment_patient_id_fkey =\
            s.create_numeric(daily_assessments_dest,
                             'daily_assessment_patient_id_fkey', 'int64')
        s.get_index(patient_ids, daily_assessment_patient_ids,
                    daily_assessment_patient_id_fkey)
        print(f"completed in {time.time() - t0}s")

        spans = s.get_spans(field=s.get(
            daily_assessments_dest['daily_assessment_patient_id_fkey']))

        print('calculate daily assessment counts per patient')
        t0 = time.time()
        writer = s.create_numeric(patients_dest, 'daily_assessment_count',
                                  'uint32')
        aggregated_counts = s.apply_spans_count(spans)
        daily_assessment_patient_id_fkey =\
            s.get(daily_assessments_dest['daily_assessment_patient_id_fkey'])
        # NOTE(review): 'ids' is only assigned in the
        # make_patient_level_assessment_metrics branch above — this raises
        # NameError if that branch was skipped. Same applies below.
        s.join(ids, daily_assessment_patient_id_fkey, aggregated_counts,
               writer, spans)
        print(
            f"calculated daily assessment counts per patient in {time.time() - t0}"
        )

    if has_tests and make_new_test_level_metrics:
        print(
            "creating 'test_patient_id_fkey' foreign key index for 'patient_id'"
        )
        t0 = time.time()
        patient_ids = s.get(sorted_patients_src['id'])
        test_patient_ids = s.get(tests_dest['patient_id'])
        test_patient_id_fkey = s.create_numeric(tests_dest,
                                                'test_patient_id_fkey',
                                                'int64')
        s.get_index(patient_ids, test_patient_ids, test_patient_id_fkey)
        test_patient_id_fkey = s.get(tests_dest['test_patient_id_fkey'])
        spans = s.get_spans(field=test_patient_id_fkey)
        print(f"completed in {time.time() - t0}s")

        print('calculate test_counts per patient')
        t0 = time.time()
        writer = s.create_numeric(patients_dest, 'test_count', 'uint32')
        aggregated_counts = s.apply_spans_count(spans)
        s.join(ids, test_patient_id_fkey, aggregated_counts, writer, spans)
        print(f"calculated test counts per patient in {time.time() - t0}")

        print('calculate test_result per patient')
        t0 = time.time()
        test_results = s.get(tests_dest['result'])
        writer = test_results.create_like(patients_dest, 'max_test_result')
        aggregated_results = s.apply_spans_max(spans, test_results)
        s.join(ids, test_patient_id_fkey, aggregated_results, writer, spans)
        print(f"calculated max_test_result per patient in {time.time() - t0}")

    if has_diet and make_diet_level_metrics:
        with utils.Timer("Making patient-level diet questions count",
                         new_line=True):
            # Count diet rows per distinct patient_id, then merge that count
            # onto the patients table as 'diet_counts' (0 where no diet rows).
            d_pids_ = s.get(diet_dest['patient_id']).data[:]
            d_pid_spans = s.get_spans(d_pids_)
            d_distinct_pids = s.apply_spans_first(d_pid_spans, d_pids_)
            d_pid_counts = s.apply_spans_count(d_pid_spans)
            p_diet_counts = s.create_numeric(patients_dest, 'diet_counts',
                                             'int32')
            s.merge_left(left_on=s.get(patients_dest['id']).data[:],
                         right_on=d_distinct_pids,
                         right_fields=(d_pid_counts, ),
                         right_writers=(p_diet_counts, ))
# NOTE(review): script fragment — relies on names defined outside this view
# ('hf', 'tmp', 's', 'test_results', 'p_test_results'); presumably part of an
# exploratory analysis section of the same file. Verify scope before editing.

print("overall tests:", np.unique(test_results, return_counts=True))

# Indexed-string field: consecutive index entries delimit each row's text,
# so a row is non-empty when its end index exceeds its start index.
other = s.get(hf['assessments']['other_symptoms'])
cc = s.get(hf['assessments']['country_code']).data[:]
otherstart = other.indices[:-1]
otherend = other.indices[1:]
ofilter = otherend - otherstart > 0
print("ofilter:", ofilter.sum(), len(ofilter))
# Keep only GB assessments (country codes stored as bytes).
cfilter = cc == b"GB"
print("cfilter:", cfilter.sum(), len(cfilter))
# Combined filter: non-empty 'other_symptoms' AND country == GB.
filter_ = ofilter & cfilter
print("filter_:", filter_.sum(), len(filter_))

# Write the filtered 'other_symptoms' and 'patient_id' columns into a
# temporary 'filt_assessments' group.
filt_asmt = tmp.create_group('filt_assessments')
filt_other_symptoms = other.create_like(filt_asmt, 'other_symptoms')
s.apply_filter(filter_, other, filt_other_symptoms)
patient_id = s.get(hf['assessments']['patient_id'])
filt_patient_id = patient_id.create_like(filt_asmt, 'patient_id')
s.apply_filter(filter_, patient_id, filt_patient_id)
print('filtered symptoms len =', len(filt_other_symptoms.data))

with utils.Timer("merging test_results"):
    # Map patient-level test results onto the filtered assessment rows;
    # right_unique=True asserts patient ids are unique on the patients side.
    p_to_a = s.create_numeric(tmp, 'p_to_a', 'int64')
    a_test_results = s.create_numeric(tmp, 'a_test_results', 'int8')
    s.ordered_merge_left(left_on=s.get(
        tmp['filt_assessments']['patient_id']),
                         right_on=s.get(hf['patients']['id']),
                         left_field_sources=(p_test_results, ),
                         left_field_sinks=(a_test_results, ),
                         left_to_right_map=p_to_a,
                         right_unique=True)
# for i in range(len(p_ids_)): # if p_diet_counts_[i] != ddtest[p_ids_[i]]: # mismatches += 1 # print(mismatches) p_diet_counts_ = s.get(ptnts['diet_counts']).data[:] p_filter = p_diet_counts_ > 0 # a_diet_counts = s.merge_left(left_on=d_pids_, # right_on=p_ids_, # right_fields=(p_diet_counts_,))[0] print('patient-based') print(np.unique(s.get(ptnts['vs_none']).data[:], return_counts=True)) vs_none = np.where(s.get(ptnts['vs_none']).data[:] == 2, 1, 0) vs_none = s.apply_filter(p_filter, vs_none) print('vs_none:', np.unique(vs_none, return_counts=True)) vs_omega_3 = np.where(s.get(ptnts['vs_omega_3']).data[:] == 2, 1, 0) vs_omega_3 = s.apply_filter(p_filter, vs_omega_3) print('vs_omega_3:', np.unique(vs_omega_3, return_counts=True)) vs_multivitamins = np.where( s.get(ptnts['vs_multivitamins']).data[:] == 2, 1, 0) vs_multivitamins = s.apply_filter(p_filter, vs_multivitamins) print('vs_multivitamins:', np.unique(vs_multivitamins, return_counts=True)) vs_vitamin_c = np.where( s.get(ptnts['vs_vitamin_c']).data[:] == 2, 1, 0) vs_vitamin_c = s.apply_filter(p_filter, vs_vitamin_c) print('vs_vitamin_c:', np.unique(vs_vitamin_c, return_counts=True)) vs_vitamin_d = np.where( s.get(ptnts['vs_vitamin_d']).data[:] == 2, 1, 0)