def covid_test_date_v1(session: Session, test_table, dest_test_table, dest_field_name='test_date', dest_field_flags_name='test_date_valid'):
    """
    Infer the test date from 'date_taken_specific', 'date_taken_between_start'
    and 'date_taken_between_end' columns, and write both the inferred date and
    a validity flag to the destination dataframe.

    A date is considered valid when exactly one of the two encodings is set:
    either the exact date alone, or a between-range alone with end >= start.
    For range-only entries the midpoint of the range is used as the date.
    A timestamp of 0.0 is treated as 'not set'.

    :param session: The ExeTera session instance.
    :param test_table: The tests dataframe, which contains 'date_taken_specific',
        'date_taken_between_start' and 'date_taken_between_end' columns.
    :param dest_test_table: The destination dataframe to write the result to.
    :param dest_field_name: The name of the result date column.
    :param dest_field_flags_name: The name of the column that stores the flag
        indicating whether the date is validly set or inferred.
    """
    exact = session.get(test_table['date_taken_specific'])
    exact_ = exact.data[:]
    between_start_ = session.get(test_table['date_taken_between_start']).data[:]
    between_end_ = session.get(test_table['date_taken_between_end']).data[:]
    # valid if no exact date but a well-formed range (both ends set, end >= start) ...
    test_date_valid = (exact_ == 0.0) & (between_start_ != 0.0) & (between_end_ != 0.0) &\
                      (between_end_ >= between_start_)
    # ... or an exact date with no range at all
    test_date_valid = test_date_valid |\
                      ((exact_ != 0.0) & (between_start_ == 0.0) & (between_end_ == 0.0))
    # exact date where present, otherwise the midpoint of the range
    test_date = np.where(exact_ != 0.0, exact_,
                         between_start_ + (between_end_ - between_start_) / 2)
    exact.create_like(dest_test_table, dest_field_name).data.write(test_date)
    session.create_numeric(dest_test_table, dest_field_flags_name, 'bool').data.write(test_date_valid)
def generate_dataset(length, val_column_count):
    """Build the benchmarking HDF5 file: 'a_ids', 'b_ids' and the a_vals columns."""
    rng = np.random.RandomState(12345678)
    id_base = 0  # 1000000000
    mapping = [0, 1, 2, 1]
    session = Session()
    with h5py.File('/home/ben/covid/benchmarking.hdf5', 'w') as store:
        # primary ids
        with utils.Timer('creating a_ids'):
            ids_a = generate_a_ids(length, id_base)
            session.create_numeric(store, 'a_ids', 'int64').data.write(ids_a)
            del ids_a
        print('creating a_vals')
        # one value column per requested index
        for col in range(val_column_count):
            with utils.Timer("creating a_vals[{}]".format(col)):
                vals = generate_a_vals(length, 0, 100, rng)
                session.create_numeric(store, 'a_vals_{}'.format(col), 'int64').data.write(vals)
                del vals
        # foreign-key ids derived from the mapping
        with utils.Timer('creating b_ids'):
            ids_b = generate_b_ids(length, id_base, mapping)
            session.create_numeric(store, 'b_ids', 'int64').data.write(ids_b)
            del ids_b
def iterator_test_1(length):
    """Write a generated value column, then benchmark fast_sum over its iterator."""
    ids_a, vals_a, ids_b = generate_dataset_1(length)
    session = Session()
    with h5py.File('/home/ben/covid/benchmarking.hdf5', 'w') as store:
        written = session.create_numeric(store, 'a_vals', 'int32')
        written.data.write(vals_a)
        # re-fetch the field so iteration goes through a fresh handle
        reread = session.get(store['a_vals'])
        print(fast_sum(iter(ops.data_iterator(reread))))
def new_to_hdf5(vcount):
    """Import the saved .npy id and value arrays into the benchmarking HDF5 file."""
    session = Session()
    with h5py.File('/home/ben/covid/benchmarking.hdf5', 'w') as store:
        # id columns first
        for name in ('fk_ids', 'ids'):
            print('importing "{}"'.format(name))
            arr = np.load('/home/ben/covid/{}.npy'.format(name))
            session.create_numeric(store, name, 'int64').data.write(arr)
        # then the numbered value columns
        for idx in range(vcount):
            print('importing "right_data_{}"'.format(idx))
            arr = np.load('/home/ben/covid/right_data_{}.npy'.format(idx))
            session.create_numeric(store, 'right_data_{}'.format(idx), 'int32').data.write(arr)
def read_fields_from_hdf5(file_name, field_count):
    """
    Benchmark: time reading the first `field_count` of a fixed list of patient
    fields from an HDF5 dataset. The reads are performed purely for timing;
    their results are deliberately discarded.

    :param file_name: Path of the HDF5 dataset to read from.
    :param field_count: How many of the candidate fields to read (in order).
    """
    # candidate patient-table fields, read in this order
    fields = ('id', 'created_at', 'updated_at', 'version', 'country_code',
              'reported_by_another', 'same_household_as_reporter',
              'contact_additional_studies', 'year_of_birth', 'height_cm',
              'weight_kg', 'gender', 'race_other', 'ethnicity',
              'profile_attributes_updated_at', 'has_diabetes')
    print(len(fields))
    s = Session()
    with h5py.File(file_name, 'r') as hf:
        with utils.Timer("reading {} fields from dataset".format(field_count)):
            for f in range(field_count):
                field = s.get(hf['patients'][fields[f]])
                if isinstance(field, flds.IndexedStringField):
                    # indexed strings: force both the index and value arrays to load
                    indices = field.indices[:]
                    values = field.values[:]
                else:
                    # other field kinds: force the data array to load
                    data = field.data[:]
def new_hs_test(vcount):
    """
    Benchmark ordered_merge_left against merge_left, joining vcount
    'right_data_*' columns from 'fk_ids'/'ids' onto 'left_data_*' sink fields.
    Prints timings and sample slices for manual inspection.

    :param vcount: Number of value columns to merge.
    """
    s = Session()
    with h5py.File('/home/ben/covid/benchmarking.hdf5', 'r') as hf:
        with h5py.File('/home/ben/covid/benchmark_dest.hdf5', 'w') as dest:
            print(hf.keys())
            # left side: foreign keys; right side: primary ids
            a_ids_f = s.get(hf['fk_ids'])
            b_ids_f = s.get(hf['ids'])
            # destination sink fields, one per value column
            all_b_val_fields = list()
            for v in range(vcount):
                b_vals_f = s.create_numeric(dest, 'left_data_{}'.format(v), 'int32')
                all_b_val_fields.append(b_vals_f)
            a_to_b = s.create_numeric(dest, 'a_to_b', 'int64')
            # source value columns to merge in
            all_a_val_fields = list()
            for v in range(vcount):
                a_vals_f = s.get(hf['right_data_{}'.format(v)])
                all_a_val_fields.append(a_vals_f)
            print("running test")
            t0 = time.time()
            # s.ordered_left_merge(a_ids, b_ids, a_to_b, left_unique=True,
            #                      left_field_sources=(a_vals_f,), left_field_sinks=(b_vals_f,))
            print(a_ids_f.data[:100])
            print(b_ids_f.data[:100])
            print(all_a_val_fields[0].data[:100])
            s.ordered_merge_left(a_ids_f, b_ids_f, left_to_right_map=a_to_b,
                                 right_unique=True,
                                 right_field_sources=tuple(all_a_val_fields),
                                 left_field_sinks=tuple(all_b_val_fields))
            print(a_to_b.data[:100])
            # run the unordered merge_left path over the same inputs for comparison
            results = s.merge_left(a_ids_f, b_ids_f,
                                   right_fields=tuple(all_a_val_fields))
            elapsed = time.time() - t0
            print("total:", elapsed)
            print(all_b_val_fields[0].data[:100])
            print(results[0][:100])
def test_test_counts_per_patient_v1_positive_test(self):
    """Counts of test rows per patient id are merged onto the patient table."""
    buffer = BytesIO()
    with Session() as sess:
        patient_ids = np.asarray([b'b', b'c', b'd', b'f', b'h', b'i'])
        test_patient_ids = np.asarray([b'a', b'a', b'b', b'b', b'b', b'c', b'c',
                                       b'e', b'e', b'f', b'g', b'h', b'i', b'i'])
        dataset = sess.open_dataset(buffer, 'w', 'src')
        patients = dataset.create_group('patients')
        tests = dataset.create_group('tests')
        sess.create_fixed_string(patients, 'id', 1).data.write(patient_ids)
        sess.create_fixed_string(tests, 'patient_id', 1).data.write(test_patient_ids)
        alg.test_counts_per_patient_v1(sess, patients, tests, patients, 'counts')
        actual = sess.get(patients['counts']).data[:]
        self.assertListEqual([3, 2, 0, 1, 1, 2], actual.tolist())
def test_test_type_from_mechanism_v1_fields(self):
    """test_type_from_mechanism_v1 on Field inputs sets the six pcr/atb flag fields.

    Fix: ``np.bool`` is a removed deprecated alias (NumPy >= 1.24); the builtin
    ``bool`` is the correct dtype argument. The per-field assertions are also
    folded into a single data-driven loop.
    """
    bio = BytesIO()
    with h5py.File(bio, 'w') as hf:
        s = Session()
        # -1 entries route to the free-text path; 0..7 are mechanism codes
        t_mech = np.asarray([-1, 0, 1, 2, 3, 4, -1, -1, 5, 6, 7, -1])
        t_mech_f = s.create_numeric(hf, "t_mech", 'int8')
        t_mech_f.data.write(t_mech)
        t_mech_freetext = np.asarray(["bloodxyz", "", "", "", "", "", "swabxyz",
                                      "selfxyz", "", "", "", "fingerxyz"])
        t_mech_freetext_f = s.create_indexed_string(hf, "t_mech_freetext")
        t_mech_freetext_f.data.write(t_mech_freetext)
        pcr1 = s.create_numeric(hf, 'pcr1', 'bool')
        pcr2 = s.create_numeric(hf, 'pcr2', 'bool')
        pcr3 = s.create_numeric(hf, 'pcr3', 'bool')
        atb1 = s.create_numeric(hf, 'atb1', 'bool')
        atb2 = s.create_numeric(hf, 'atb2', 'bool')
        atb3 = s.create_numeric(hf, 'atb3', 'bool')
        test_type_from_mechanism_v1(s, t_mech_f, t_mech_freetext_f,
                                    pcr1, pcr2, pcr3, atb1, atb2, atb3)
        # expected flag pattern for each destination field
        expectations = (
            (pcr1, [0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]),
            (pcr2, [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]),
            (pcr3, [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]),
            (atb1, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0]),
            (atb2, [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
            (atb3, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]),
        )
        for field, expected in expectations:
            self.assertTrue(
                np.array_equal(field.data[:], np.asarray(expected, dtype=bool)))
def first_test_date_per_patient(session: Session, patient_table, test_table, test_date_name, dest_patient_table, dest_patient_name):
    """
    Write, for each patient, the 'created_at' value of their first test row.

    Both tables must already be ordered by their id columns; a ValueError is
    raised otherwise.

    :param session: The ExeTera session instance.
    :param patient_table: The patient dataframe, ordered by 'id'.
    :param test_table: The tests dataframe, ordered by 'patient_id'.
    :param test_date_name: Not used by this implementation; kept for
        interface compatibility.
    :param dest_patient_table: The destination dataframe to store the results.
    :param dest_patient_name: The name of the destination field.
    :raises ValueError: if either table is not ordered by its id column.
    """
    pid = 'id'
    pids = session.get(patient_table[pid])
    pids_ = pids.data[:]
    if not ops.is_ordered(pids.data[:]):
        raise ValueError("The patient table must be ordered by '{}'".format(pid))
    t_pid = 'patient_id'
    t_pids = session.get(test_table[t_pid])
    t_pids_ = t_pids.data[:]
    if not ops.is_ordered(t_pids_):
        raise ValueError("The test table must be ordered by '{}'".format(t_pid))
    # collapse the test data by patient_id spans
    cats = session.get(test_table['created_at'])
    spans_ = session.get_spans(t_pids_)
    s_t_pids_ = session.apply_spans_first(spans_, t_pids_)
    # NOTE(review): despite the name, 'counts_' holds the first 'created_at'
    # value of each span, not a count
    counts_ = session.apply_spans_first(spans_, cats)
    # merge the first test dates into the patient table
    dest = session.create_numeric(dest_patient_table, dest_patient_name, 'int32')
    session.ordered_merge_left(left_on=pids_, right_on=s_t_pids_,
                               right_field_sources=(counts_,), left_field_sinks=(dest,),
                               left_unique=True, right_unique=True)
def test_test_type_from_mechanism_v1_numpy(self):
    """test_type_from_mechanism_v1 on raw numpy inputs fills the six flag arrays.

    Fix: ``np.bool`` is a removed deprecated alias (NumPy >= 1.24); the builtin
    ``bool`` is used instead, and the six assertions are data-driven.
    """
    s = Session()
    # -1 entries route to the free-text path; 0..7 are mechanism codes
    t_mech = np.asarray([-1, 0, 1, 2, 3, 4, -1, -1, 5, 6, 7, -1])
    t_mech_freetext = np.asarray(["bloodxyz", "", "", "", "", "", "swabxyz",
                                  "selfxyz", "", "", "", "fingerxyz"])
    # six destination flag arrays, all initially False
    flags = [np.zeros(len(t_mech), dtype=bool) for _ in range(6)]
    pcr1, pcr2, pcr3, atb1, atb2, atb3 = flags
    test_type_from_mechanism_v1(s, t_mech, t_mech_freetext,
                                pcr1, pcr2, pcr3, atb1, atb2, atb3)
    expected = (
        [0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],  # pcr1
        [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],  # pcr2
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],  # pcr3
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0],  # atb1
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # atb2
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],  # atb3
    )
    for actual, exp in zip(flags, expected):
        self.assertTrue(np.array_equal(actual, np.asarray(exp, dtype=bool)))
def test_write_then_read_numeric(self):
    """Write 1e8 random ints to a NumericField, check the sum, then double in place."""
    from exetera.core.session import Session
    from exetera.core import fields
    from exetera.core.utils import Timer
    sess = Session()
    buffer = BytesIO()
    with h5py.File(buffer, 'w') as store:
        np.random.seed(12345678)
        raw = np.random.randint(low=0, high=1000000, size=100000000)
        fields.numeric_field_constructor(sess, store, 'a', 'int32')
        field = fields.NumericField(sess, store['a'], write_enabled=True)
        field.data.write(raw)
        # checksum of the values as written
        self.assertEqual(49997540637149, np.sum(field.data[:]))
        # in-place doubling must double the checksum
        field.data[:] = field.data[:] * 2
        self.assertEqual(99995081274298, np.sum(field.data[:]))
def test_write_then_read_categorical(self):
    """Write 1e8 random categorical codes and verify their sum on read-back."""
    from exetera.core.session import Session
    from exetera.core import fields
    from exetera.core.utils import Timer
    sess = Session()
    buffer = BytesIO()
    with h5py.File(buffer, 'w') as store:
        np.random.seed(12345678)
        codes = np.random.randint(low=0, high=3, size=100000000)
        fields.categorical_field_constructor(sess, store, 'a', 'int8',
                                             {'foo': 0, 'bar': 1, 'boo': 2})
        field = fields.CategoricalField(sess, store['a'], write_enabled=True)
        field.data.write(codes)
        self.assertEqual(99987985, np.sum(field.data[:]))
def test_concatenate_daily(self):
    """merge_daily_assessments_v1 collapses per-patient, per-day assessment rows."""
    # patient ids; repeated entries represent multiple assessments per patient
    ids = np.asarray([
        'a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'c', 'c',
        'c'
    ], dtype='S1')
    # assessment days; duplicate days within a patient should be merged
    days = np.asarray([
        '2020-05-06', '2020-05-06', '2020-05-07', '2020-06-02', '2020-06-02',
        '2020-08-01', '2020-08-20', '2020-09-05', '2020-04-10', '2020-04-11',
        '2020-04-11', '2020-04-11', '2020-04-11', '2020-04-11', '2020-04-11'
    ], dtype='S10')
    # free text including quotes and commas to exercise concatenation quoting
    idf = [
        'a', "'b'", 'what', 'some, information', 'x', '', 'foo', 'flop',
        "'dun'", "'mun'", "'race, track?'", '', "for, too", 'z', 'now!'
    ]
    nums = np.asarray(
        [5, 6, 3, 2, 1, 10, 230, 3, 5, -20, -4, 2, 6, 100, 40],
        dtype=np.int32)
    bio = BytesIO()
    with Session() as s:
        ds = s.open_dataset(bio, 'w', 'ds')
        src = ds.create_group('src')
        ids_f = s.create_fixed_string(src, 'patient_id', 1)
        ids_f.data.write(ids)
        days_f = s.create_fixed_string(src, 'created_at_day', 10)
        days_f.data.write(days)
        idf_f = s.create_indexed_string(src, 'idf')
        idf_f.data.write(idf)
        nums_f = s.create_numeric(src, 'nums', 'int32')
        nums_f.data.write(nums)
        dest = ds.create_group('dest')
        merge_daily_assessments_v1(s, src, dest)
        # manual inspection of the merged output
        print(dest.keys())
        print(s.get(dest['idf']).data[:])
def test_write_then_read_indexed_string(self):
    """Write indexed strings, append 'y' to each, and verify data/indices/values.

    Fixes: the list comprehension reused the name ``s``, shadowing the Session
    instance (renamed to ``v``); the ``np.unique`` result was misleadingly
    named ``total`` (renamed to ``uniques``).
    """
    from exetera.core.session import Session
    from exetera.core import fields
    from exetera.core.utils import Timer
    s = Session()
    bio = BytesIO()
    with h5py.File(bio, 'w') as hf:
        np.random.seed(12345678)
        values = np.random.randint(low=0, high=4, size=200000)
        # strings of 0..3 'x' characters
        svalues = [''.join(['x'] * v) for v in values]
        fields.indexed_string_field_constructor(s, hf, 'a', 8)
        a = fields.IndexedStringField(s, hf['a'], write_enabled=True)
        a.data.write(svalues)
        uniques = np.unique(a.data[:])
        self.assertListEqual(['', 'x', 'xx', 'xxx'], uniques.tolist())
        strs = a.data[:]
        # use 'v', not 's', so the Session variable is not shadowed
        strs = [v + 'y' for v in strs]
        a.data.clear()
        a.data.write(strs)
        self.assertListEqual([
            'xxxy', 'xxy', 'xxxy', 'y', 'xy', 'y', 'xxxy', 'xxxy', 'xy', 'y'
        ], strs[:10])
        self.assertListEqual([0, 4, 7, 11, 12, 14, 15, 19, 23, 25],
                             a.indices[:10].tolist())
        # raw utf-8 byte values: 120 == 'x', 121 == 'y'
        self.assertListEqual(
            [120, 120, 120, 121, 120, 120, 121, 120, 120, 120],
            a.values[:10].tolist())
        self.assertListEqual([
            'xxxy', 'xxy', 'xxxy', 'y', 'xy', 'y', 'xxxy', 'xxxy', 'xy', 'y'
        ], a.data[:10])
def test_write_then_read_fixed_string(self):
    """Write fixed strings, check the uniques, then append b'y' to every value in place."""
    from exetera.core.session import Session
    from exetera.core import fields
    from exetera.core.utils import Timer
    sess = Session()
    buffer = BytesIO()
    with h5py.File(buffer, 'w') as store:
        np.random.seed(12345678)
        lengths = np.random.randint(low=0, high=4, size=1000000)
        # byte strings of 0..3 'x' characters
        raw = [b''.join([b'x'] * n) for n in lengths]
        fields.fixed_string_field_constructor(sess, store, 'a', 8)
        field = fields.FixedStringField(sess, store['a'], write_enabled=True)
        field.data.write(raw)
        self.assertListEqual([b'', b'x', b'xx', b'xxx'],
                             np.unique(field.data[:]).tolist())
        # append b'y' to every entry in place
        field.data[:] = np.core.defchararray.add(field.data[:], b'y')
        self.assertListEqual([
            b'xxxy', b'xxy', b'xxxy', b'y', b'xy', b'y', b'xxxy', b'xxxy',
            b'xy', b'y'
        ], field.data[:10].tolist())
def test_covid_test_date_v1_positive_test(self):
    """covid_test_date_v1 writes 'test_date'/'test_date_valid' from exact or ranged dates."""
    bio = BytesIO()
    with Session() as s:
        # t_pids_ = np.asarray([b'a', b'a', b'b', b'b', b'b', b'c', b'c', b'e',
        #                       b'e', b'f', b'g', b'h', b'i', b'i'])
        # exact test dates; 0.0 means 'not set'
        t_dates_exact = np.asarray([
            0.0, dt(2020, 10, 12).timestamp(), dt(2020, 6, 2).timestamp(),
            0.0, dt(2021, 1, 30).timestamp(), 0.0,  # 5
            0.0, 0.0, dt(2020, 8, 10).timestamp(),  # 8
            dt(2020, 12, 1).timestamp(), dt(2020, 9, 2).timestamp(), 0.0  # 11
        ])
        # start of 'taken between' ranges; 0.0 means 'not set'
        t_dates_from = np.asarray([
            dt(2020, 5, 12).timestamp(), 0.0, 0.0,
            dt(2020, 9, 2).timestamp(), 0.0, 0.0,  # 5
            dt(2020, 7, 16).timestamp(), 0.0, dt(2021, 8, 8).timestamp(),  # 8
            0.0, dt(2020, 8, 10).timestamp(), dt(2020, 11, 4).timestamp(),  # 11
        ])
        # end of 'taken between' ranges
        # NOTE(review): entry 11 ends (2020-11-03) before it starts (2020-11-04),
        # which exercises the end >= start validity check
        t_dates_to = np.asarray([
            dt(2020, 5, 17).timestamp(), 0.0, 0.0,
            dt(2020, 9, 3).timestamp(), 0.0, 0.0,  # 5
            0.0, dt(2020, 6, 20).timestamp(), 0.0,  # 8
            dt(2020, 12, 19).timestamp(), dt(2020, 10, 5).timestamp(),
            dt(2020, 11, 3).timestamp()  # 11
        ])
        print(dt(2020, 11, 3).timestamp())
        src = s.open_dataset(bio, 'w', 'src')
        tests = src.create_group('tests')
        s.create_timestamp(tests, 'date_taken_specific').data.write(t_dates_exact)
        s.create_timestamp(
            tests, 'date_taken_between_start').data.write(t_dates_from)
        s.create_timestamp(tests, 'date_taken_between_end').data.write(t_dates_to)
        covid_test_date_v1(s, tests, tests)
        # manual inspection of the inferred dates and validity flags
        print(s.get(tests['test_date']).data[:])
        print(s.get(tests['test_date_valid']).data[:])
for i in range(len(starts)): substrs = text[starts[i]:ends[i]].tobytes().decode() # if ' - ' in substrs: # print(substrs) substrs = replace_multi_with_str("#!,\"(){}[].:;", substrs) substrs = [s_.strip() for s_ in substrs.split() if len(s_) > 0] for s in substrs: if s in words_to_check: total_count += 1 break print(total_count) with h5py.File('/home/ben/covid/ds_20200901_full.hdf5', 'r') as hf: with h5py.File('/home/ben/covid/ds_20200901_othersymp.hdf5', 'w') as tmp: s = Session() print([k for k in hf['patients'].keys() if 'result' in k]) old_test = s.get(hf['patients']['max_assessment_test_result']).data[:] new_test = s.get(hf['patients']['max_test_result']).data[:] test_results = np.where((old_test == 3) | (new_test == 4), 2, 0) test_results = np.where( (test_results == 0) & ((old_test == 2) | (new_test == 3)), 1, test_results) p_test_results = s.create_numeric(tmp, 'p_test_results', 'int8') p_test_results.data.write(test_results) print("overall tests:", np.unique(test_results, return_counts=True)) other = s.get(hf['assessments']['other_symptoms']) cc = s.get(hf['assessments']['country_code']).data[:] otherstart = other.indices[:-1]
# Journal two snapshots of a dataset against a schema and write the result.
import time
from exetera.core.load_schema import load_schema
from exetera.core.session import Session
from exetera.core.journal import journal_test_harness

# NOTE(review): 'time' appears unused in this script — confirm before removing.
schema_fn = '/home/ben/covid/covid_schema.json'
old_fn = '/home/ben/covid/ds_20200801_base.hdf5'  # earlier snapshot
new_fn = '/home/ben/covid/ds_20200901_base.hdf5'  # later snapshot
dest_fn = '/home/ben/covid/ds_journal.hdf5'       # journalled output
with open(schema_fn) as f:
    schema = load_schema(f)
journal_test_harness(Session(), schema, old_fn, new_fn, dest_fn)
def hs_test_1(length, val_column_count):
    """
    Benchmark ordered_merge_left against merge_left over the pre-generated
    'a_ids'/'b_ids' benchmarking dataset, merging val_column_count value
    columns. Prints timings and sample slices for manual inspection.

    :param length: Unused here; dataset generation is commented out below.
    :param val_column_count: Number of 'a_vals_*' columns to merge.
    """
    # rng = np.random.RandomState(12345678)
    # id_base = 1000000000
    # mapping = [0, 1, 2, 1]
    s = Session()
    with h5py.File('/home/ben/covid/benchmarking.hdf5', 'r') as hf:
        with h5py.File('/home/ben/covid/benchmark_dest.hdf5', 'w') as dest:
            # print('creating a_ids')
            # a_ids = generate_a_ids(length, id_base)
            # a_ids_f = s.create_numeric(hf, 'a_ids', 'int64')
            # a_ids_f.data.write(a_ids)
            # del a_ids
            #
            # print('creating a_vals')
            # # all_a_val_fields = list()
            # for v in range(val_column_count):
            #     a_vals = generate_a_vals(length, 0, 100, rng)
            #     a_vals_f = s.create_numeric(hf, 'a_vals_{}'.format(v), 'int64')
            #     a_vals_f.data.write(a_vals)
            #     # all_a_val_fields.append(a_vals_f)
            #     del a_vals
            #
            # print('creating b_ids')
            # b_ids = generate_b_ids(length, id_base, mapping)
            # b_ids_f = s.create_numeric(hf, 'b_ids', 'int64')
            # b_ids_f.data.write(b_ids)
            # del b_ids
            a_ids_f = s.get(hf['a_ids'])
            b_ids_f = s.get(hf['b_ids'])
            # destination sink fields, one per value column
            all_b_val_fields = list()
            for v in range(val_column_count):
                b_vals_f = s.create_numeric(dest, 'b_vals_{}'.format(v), 'int32')
                all_b_val_fields.append(b_vals_f)
            a_to_b = s.create_numeric(dest, 'a_to_b', 'int64')
            # source value columns to merge in
            all_a_val_fields = list()
            for v in range(val_column_count):
                a_vals_f = s.get(hf['a_vals_{}'.format(v)])
                all_a_val_fields.append(a_vals_f)
            print("running test")
            t0 = time.time()
            # s.ordered_left_merge(a_ids, b_ids, a_to_b, left_unique=True,
            #                      left_field_sources=(a_vals_f,), left_field_sinks=(b_vals_f,))
            print(b_ids_f.data[:100])
            print(a_ids_f.data[:100])
            s.ordered_merge_left(b_ids_f, a_ids_f,
                                 right_field_sources=tuple(all_a_val_fields),
                                 left_field_sinks=tuple(all_b_val_fields),
                                 left_to_right_map=a_to_b, right_unique=True)
            print(a_to_b.data[:100])
            # run the unordered merge_left path over the same inputs for comparison
            results = s.merge_left(b_ids_f, a_ids_f,
                                   right_fields=tuple(all_a_val_fields))
            elapsed = time.time() - t0
            print(elapsed)
            print(all_b_val_fields[0].data[:100])
            print(results[0][:100])
def postprocess(dataset, destination, timestamp=None, flags=None):
    """
    Run the full post-processing pipeline over a source dataset: de-duplicate
    and sort patients, sort and clean assessments/tests/diet tables, derive
    per-patient metrics, and (optionally, flag 'daily') generate collapsed
    daily assessments.

    :param dataset: Source group containing some of 'patients', 'assessments',
        'tests' and 'diet' tables.
    :param destination: Destination group the processed tables are written to.
    :param timestamp: Not used in this implementation — TODO confirm whether it
        can be removed.
    :param flags: Set of feature flags; 'daily' enables daily-assessment work.
    """
    if flags is None:
        flags = set()
    do_daily_asmts = 'daily' in flags
    has_patients = 'patients' in dataset.keys()
    has_assessments = 'assessments' in dataset.keys()
    has_tests = 'tests' in dataset.keys()
    has_diet = 'diet' in dataset.keys()
    # stage toggles; currently hard-enabled regardless of flags
    sort_enabled = lambda x: True
    process_enabled = lambda x: True
    sort_patients = sort_enabled(flags) and True
    sort_assessments = sort_enabled(flags) and True
    sort_tests = sort_enabled(flags) and True
    sort_diet = sort_enabled(flags) and True
    make_assessment_patient_id_fkey = process_enabled(flags) and True
    year_from_age = process_enabled(flags) and True
    clean_weight_height_bmi = process_enabled(flags) and True
    health_worker_with_contact = process_enabled(flags) and True
    clean_temperatures = process_enabled(flags) and True
    check_symptoms = process_enabled(flags) and True
    create_daily = process_enabled(flags) and do_daily_asmts
    make_patient_level_assessment_metrics = process_enabled(flags) and True
    make_patient_level_daily_assessment_metrics = process_enabled(
        flags) and do_daily_asmts
    make_new_test_level_metrics = process_enabled(flags) and True
    make_diet_level_metrics = True
    # NOTE(review): make_healthy_diet_index appears unused below — confirm
    make_healthy_diet_index = True
    # ds = DataStore(timestamp=timestamp)
    s = Session()

    # patients ================================================================
    sorted_patients_src = None
    if has_patients:
        patients_src = dataset['patients']
        write_mode = 'write'
        if 'patients' not in destination.keys():
            patients_dest = s.get_or_create_group(destination, 'patients')
            sorted_patients_src = patients_dest

            # Patient sort
            # ============
            if sort_patients:
                # drop rows with duplicate patient ids before sorting
                duplicate_filter = \
                    persistence.filter_duplicate_fields(s.get(patients_src['id']).data[:])
                for k in patients_src.keys():
                    t0 = time.time()
                    r = s.get(patients_src[k])
                    w = r.create_like(patients_dest, k)
                    s.apply_filter(duplicate_filter, r, w)
                    print(f"'{k}' filtered in {time.time() - t0}s")
                print(np.count_nonzero(duplicate_filter == True),
                      np.count_nonzero(duplicate_filter == False))
                sort_keys = ('id', )
                s.sort_on(patients_dest, patients_dest, sort_keys,
                          write_mode='overwrite')

            # Patient processing
            # ==================
            if year_from_age:
                log("year of birth -> age; 18 to 90 filter")
                t0 = time.time()
                yobs = s.get(patients_dest['year_of_birth'])
                yob_filter = s.get(patients_dest['year_of_birth_valid'])
                age = s.create_numeric(patients_dest, 'age', 'uint32')
                age_filter = s.create_numeric(patients_dest, 'age_filter', 'bool')
                age_16_to_90 = s.create_numeric(patients_dest, '16_to_90_years', 'bool')
                print('year_of_birth:', patients_dest['year_of_birth'])
                for k in patients_dest['year_of_birth'].attrs.keys():
                    print(k, patients_dest['year_of_birth'].attrs[k])
                calculate_age_from_year_of_birth_v1(
                    yobs, yob_filter, 16, 90, age, age_filter, age_16_to_90, 2020)
                log(f"completed in {time.time() - t0}")
                print('age_filter count:',
                      np.sum(patients_dest['age_filter']['values'][:]))
                print('16_to_90_years count:',
                      np.sum(patients_dest['16_to_90_years']['values'][:]))

            if clean_weight_height_bmi:
                log("height / weight / bmi; standard range filters")
                t0 = time.time()
                weights_clean = s.create_numeric(patients_dest, 'weight_kg_clean', 'float32')
                weights_filter = s.create_numeric(patients_dest, '40_to_200_kg', 'bool')
                heights_clean = s.create_numeric(patients_dest, 'height_cm_clean', 'float32')
                heights_filter = s.create_numeric(patients_dest, '110_to_220_cm', 'bool')
                bmis_clean = s.create_numeric(patients_dest, 'bmi_clean', 'float32')
                bmis_filter = s.create_numeric(patients_dest, '15_to_55_bmi', 'bool')
                weight_height_bmi_v1(s, 40, 200, 110, 220, 15, 55,
                                     None, None, None, None,
                                     patients_dest['weight_kg'],
                                     patients_dest['weight_kg_valid'],
                                     patients_dest['height_cm'],
                                     patients_dest['height_cm_valid'],
                                     patients_dest['bmi'],
                                     patients_dest['bmi_valid'],
                                     weights_clean, weights_filter, None,
                                     heights_clean, heights_filter, None,
                                     bmis_clean, bmis_filter, None)
                log(f"completed in {time.time() - t0}")

            if health_worker_with_contact:
                with utils.Timer("health_worker_with_contact field"):
                    # writer = ds.get_categorical_writer(patients_dest, 'health_worker_with_contact', 'int8')
                    combined_hcw_with_contact_v1(
                        s,
                        s.get(patients_dest['healthcare_professional']),
                        s.get(patients_dest['contact_health_worker']),
                        s.get(patients_dest['is_carer_for_community']),
                        patients_dest, 'health_worker_with_contact')

    # assessments =============================================================
    sorted_assessments_src = None
    if has_assessments:
        assessments_src = dataset['assessments']
        if 'assessments' not in destination.keys():
            assessments_dest = s.get_or_create_group(destination, 'assessments')
            sorted_assessments_src = assessments_dest

            if sort_assessments:
                sort_keys = ('patient_id', 'created_at')
                with utils.Timer("sorting assessments"):
                    s.sort_on(assessments_src, assessments_dest, sort_keys)

            if has_patients:
                if make_assessment_patient_id_fkey:
                    print(
                        "creating 'assessment_patient_id_fkey' foreign key index for 'patient_id'"
                    )
                    t0 = time.time()
                    patient_ids = s.get(sorted_patients_src['id'])
                    assessment_patient_ids =\
                        s.get(sorted_assessments_src['patient_id'])
                    assessment_patient_id_fkey =\
                        s.create_numeric(assessments_dest, 'assessment_patient_id_fkey', 'int64')
                    s.get_index(patient_ids.data[:], assessment_patient_ids.data[:],
                                assessment_patient_id_fkey)
                    print(f"completed in {time.time() - t0}s")

            if clean_temperatures:
                print("clean temperatures")
                t0 = time.time()
                temps = s.get(sorted_assessments_src['temperature'])
                temp_units = s.get(sorted_assessments_src['temperature_unit'])
                temps_valid = s.get(sorted_assessments_src['temperature_valid'])
                dest_temps = temps.create_like(assessments_dest, 'temperature_c_clean')
                dest_temps_valid = temps_valid.create_like(
                    assessments_dest, 'temperature_35_to_42_inclusive')
                dest_temps_modified = temps_valid.create_like(
                    assessments_dest, 'temperature_modified')
                validate_temperature_v1(s, 35.0, 42.0, temps, temp_units, temps_valid,
                                        dest_temps, dest_temps_valid,
                                        dest_temps_modified)
                print(f"temperature cleaning done in {time.time() - t0}")

            if check_symptoms:
                print('check inconsistent health_status')
                t0 = time.time()
                check_inconsistent_symptoms_v1(s, sorted_assessments_src,
                                               assessments_dest)
                print(time.time() - t0)

    # tests ===================================================================
    if has_tests:
        if sort_tests:
            tests_src = dataset['tests']
            tests_dest = s.get_or_create_group(destination, 'tests')
            sort_keys = ('patient_id', 'created_at')
            s.sort_on(tests_src, tests_dest, sort_keys)

    # diet ====================================================================
    if has_diet:
        diet_src = dataset['diet']
        if 'diet' not in destination.keys():
            diet_dest = s.get_or_create_group(destination, 'diet')
            sorted_diet_src = diet_dest
            if sort_diet:
                sort_keys = ('patient_id', 'display_name', 'id')
                s.sort_on(diet_src, diet_dest, sort_keys)

    if has_assessments:
        if do_daily_asmts:
            daily_assessments_dest = s.get_or_create_group(
                destination, 'daily_assessments')

    # post process patients
    # TODO: need an transaction table
    print(patients_src.keys())
    print(dataset['assessments'].keys())
    print(dataset['tests'].keys())

    # write_mode = 'overwrite'
    # NOTE(review): write_mode is assigned but never consumed below — confirm
    write_mode = 'write'

    # Daily assessments
    # =================
    if has_assessments:
        if create_daily:
            print("generate daily assessments")
            patient_ids = s.get(sorted_assessments_src['patient_id'])
            created_at_days = s.get(sorted_assessments_src['created_at_day'])
            raw_created_at_days = created_at_days.data[:]
            # prefer a pre-existing fkey in the source, else the one built above
            if 'assessment_patient_id_fkey' in assessments_src.keys():
                patient_id_index = assessments_src[
                    'assessment_patient_id_fkey']
            else:
                patient_id_index = assessments_dest[
                    'assessment_patient_id_fkey']
            patient_id_indices = s.get(patient_id_index)
            raw_patient_id_indices = patient_id_indices.data[:]

            print("Calculating patient id index spans")
            t0 = time.time()
            # spans over (patient, day) pairs — one span per patient-day
            patient_id_index_spans = s.get_spans(
                fields=(raw_patient_id_indices, raw_created_at_days))
            print(
                f"Calculated {len(patient_id_index_spans)-1} spans in {time.time() - t0}s"
            )

            print("Applying spans to 'health_status'")
            t0 = time.time()
            # per-field override of the default span-aggregation behaviour;
            # None means the field is skipped entirely
            default_behavour_overrides = {
                'id': s.apply_spans_last,
                'patient_id': s.apply_spans_last,
                'patient_index': s.apply_spans_last,
                'created_at': s.apply_spans_last,
                'created_at_day': s.apply_spans_last,
                'updated_at': s.apply_spans_last,
                'updated_at_day': s.apply_spans_last,
                'version': s.apply_spans_max,
                'country_code': s.apply_spans_first,
                'date_test_occurred': None,
                'date_test_occurred_guess': None,
                'date_test_occurred_day': None,
                'date_test_occurred_set': None,
            }
            for k in sorted_assessments_src.keys():
                t1 = time.time()
                reader = s.get(sorted_assessments_src[k])
                if k in default_behavour_overrides:
                    apply_span_fn = default_behavour_overrides[k]
                    if apply_span_fn is not None:
                        apply_span_fn(
                            patient_id_index_spans, reader,
                            reader.create_like(daily_assessments_dest, k))
                        print(f"  Field {k} aggregated in {time.time() - t1}s")
                    else:
                        print(f"  Skipping field {k}")
                else:
                    # default behaviour keyed on field/reader type
                    if isinstance(reader, fields.CategoricalField):
                        s.apply_spans_max(
                            patient_id_index_spans, reader,
                            reader.create_like(daily_assessments_dest, k))
                        print(f"  Field {k} aggregated in {time.time() - t1}s")
                    elif isinstance(reader, rw.IndexedStringReader):
                        s.apply_spans_concat(
                            patient_id_index_spans, reader,
                            reader.create_like(daily_assessments_dest, k))
                        print(f"  Field {k} aggregated in {time.time() - t1}s")
                    elif isinstance(reader, rw.NumericReader):
                        s.apply_spans_max(
                            patient_id_index_spans, reader,
                            reader.create_like(daily_assessments_dest, k))
                        print(f"  Field {k} aggregated in {time.time() - t1}s")
                    else:
                        print(f"  No function for {k}")
            print(f"apply_spans completed in {time.time() - t0}s")

    if has_patients and has_assessments:
        if make_patient_level_assessment_metrics:
            if 'assessment_patient_id_fkey' in assessments_dest:
                src = assessments_dest['assessment_patient_id_fkey']
            else:
                src = assessments_src['assessment_patient_id_fkey']
            assessment_patient_id_fkey = s.get(src)
            # generate spans from the assessment-space patient_id foreign key
            spans = s.get_spans(field=assessment_patient_id_fkey.data[:])
            ids = s.get(patients_dest['id'])

            print('calculate assessment counts per patient')
            t0 = time.time()
            writer = s.create_numeric(patients_dest, 'assessment_count', 'uint32')
            aggregated_counts = s.apply_spans_count(spans)
            s.join(ids, assessment_patient_id_fkey, aggregated_counts, writer, spans)
            print(
                f"calculated assessment counts per patient in {time.time() - t0}"
            )

            print('calculate first assessment days per patient')
            t0 = time.time()
            reader = s.get(sorted_assessments_src['created_at_day'])
            writer = s.create_fixed_string(patients_dest, 'first_assessment_day', 10)
            aggregated_counts = s.apply_spans_first(spans, reader)
            s.join(ids, assessment_patient_id_fkey, aggregated_counts, writer, spans)
            print(
                f"calculated first assessment days per patient in {time.time() - t0}"
            )

            print('calculate last assessment days per patient')
            t0 = time.time()
            reader = s.get(sorted_assessments_src['created_at_day'])
            writer = s.create_fixed_string(patients_dest, 'last_assessment_day', 10)
            aggregated_counts = s.apply_spans_last(spans, reader)
            s.join(ids, assessment_patient_id_fkey, aggregated_counts, writer, spans)
            print(
                f"calculated last assessment days per patient in {time.time() - t0}"
            )

            print('calculate maximum assessment test result per patient')
            t0 = time.time()
            reader = s.get(sorted_assessments_src['tested_covid_positive'])
            writer = reader.create_like(patients_dest, 'max_assessment_test_result')
            max_result_value = s.apply_spans_max(spans, reader)
            s.join(ids, assessment_patient_id_fkey, max_result_value, writer, spans)
            print(
                f"calculated maximum assessment test result in {time.time() - t0}"
            )

    if has_assessments and do_daily_asmts and make_patient_level_daily_assessment_metrics:
        print(
            "creating 'daily_assessment_patient_id_fkey' foreign key index for 'patient_id'"
        )
        t0 = time.time()
        patient_ids = s.get(sorted_patients_src['id'])
        daily_assessment_patient_ids =\
            s.get(daily_assessments_dest['patient_id'])
        daily_assessment_patient_id_fkey =\
            s.create_numeric(daily_assessments_dest,
                             'daily_assessment_patient_id_fkey', 'int64')
        s.get_index(patient_ids, daily_assessment_patient_ids,
                    daily_assessment_patient_id_fkey)
        print(f"completed in {time.time() - t0}s")
        spans = s.get_spans(field=s.get(
            daily_assessments_dest['daily_assessment_patient_id_fkey']))

        print('calculate daily assessment counts per patient')
        t0 = time.time()
        writer = s.create_numeric(patients_dest, 'daily_assessment_count', 'uint32')
        aggregated_counts = s.apply_spans_count(spans)
        daily_assessment_patient_id_fkey =\
            s.get(daily_assessments_dest['daily_assessment_patient_id_fkey'])
        s.join(ids, daily_assessment_patient_id_fkey, aggregated_counts, writer, spans)
        print(
            f"calculated daily assessment counts per patient in {time.time() - t0}"
        )

    if has_tests and make_new_test_level_metrics:
        print(
            "creating 'test_patient_id_fkey' foreign key index for 'patient_id'"
        )
        t0 = time.time()
        patient_ids = s.get(sorted_patients_src['id'])
        test_patient_ids = s.get(tests_dest['patient_id'])
        test_patient_id_fkey = s.create_numeric(tests_dest, 'test_patient_id_fkey',
                                                'int64')
        s.get_index(patient_ids, test_patient_ids, test_patient_id_fkey)
        test_patient_id_fkey = s.get(tests_dest['test_patient_id_fkey'])
        spans = s.get_spans(field=test_patient_id_fkey)
        print(f"completed in {time.time() - t0}s")

        print('calculate test_counts per patient')
        t0 = time.time()
        writer = s.create_numeric(patients_dest, 'test_count', 'uint32')
        aggregated_counts = s.apply_spans_count(spans)
        s.join(ids, test_patient_id_fkey, aggregated_counts, writer, spans)
        print(f"calculated test counts per patient in {time.time() - t0}")

        print('calculate test_result per patient')
        t0 = time.time()
        test_results = s.get(tests_dest['result'])
        writer = test_results.create_like(patients_dest, 'max_test_result')
        aggregated_results = s.apply_spans_max(spans, test_results)
        s.join(ids, test_patient_id_fkey, aggregated_results, writer, spans)
        print(f"calculated max_test_result per patient in {time.time() - t0}")

    if has_diet and make_diet_level_metrics:
        with utils.Timer("Making patient-level diet questions count",
                         new_line=True):
            d_pids_ = s.get(diet_dest['patient_id']).data[:]
            d_pid_spans = s.get_spans(d_pids_)
            d_distinct_pids = s.apply_spans_first(d_pid_spans, d_pids_)
            d_pid_counts = s.apply_spans_count(d_pid_spans)
            p_diet_counts = s.create_numeric(patients_dest, 'diet_counts', 'int32')
            s.merge_left(left_on=s.get(patients_dest['id']).data[:],
                         right_on=d_distinct_pids,
                         right_fields=(d_pid_counts, ),
                         right_writers=(p_diet_counts, ))
#pat_asymp = ds.apply_indices(spans_asymp, ds.get_reader(out_pos['patient_id'])) pat_asymp = out_pos['patient_id'].apply_index(spans_asymp) filt_asymp = prst.foreign_key_is_in_primary_key(pat_asymp.data[:], src_pat['id'].data[:]) out_pat_asymp = output.create_dataframe('patient_asymp') for k in list_interest: src_pat[k].create_like(out_pat_asymp, k) src_pat[k].apply_filter(filt_asymp, target=out_pat_asymp[k]) # reader = ds.get_reader(src_pat[k]) # writer = reader.get_writer(out_pat_asymp, k, ts) # ds.apply_filter(filt_asymp, reader, writer) # dict_pata = {} # for k in list_interest: # values = out_pat_asymp[k].data[:] # dict_pata[k] = values # # df_pata = pd.DataFrame.from_dict(dict_pata) # df_pata.to_csv(path + '/PositiveAsymp_PatDetails.csv') save_df_to_csv(out_pat_asymp, 'PositiveAsymp_PatDetails.csv') if __name__ == "__main__": source_file = '/home/jd21/data/post.h5' dst_file = '/home/jd21/data/May17_processed_mrslt.hdf5' with Session() as s: source = s.open_dataset(source_file, 'r', 'src') output = s.open_dataset(dst_file, 'w', 'out') merging_results(s, source, output)
self.values[3] |= 1 if o else 0 self.values[4] |= 0 if m else 1 self.values[5] |= 1 if m else 0 self.values[6] |= 0 if c else 1 self.values[7] |= 1 if c else 0 self.values[8] |= 0 if d else 1 self.values[9] |= 1 if d else 0 self.values[10] |= 0 if z else 1 self.values[11] |= 1 if z else 0 src_file = '/home/ben/covid/ds_20200929_full.hdf5' dest_file = '/home/ben/covid/ds_diet_tmp.hdf5' with h5py.File(src_file, 'r') as hf: with h5py.File(dest_file, 'w') as dest: s = Session() ptnts = hf['patients'] print(hf['diet'].keys()) diet = hf['diet'] p_ids_ = s.get(hf['patients']['id']).data[:] d_pids_ = s.get(hf['diet']['patient_id']).data[:] d_pid_spans = s.get_spans(d_pids_) d_distinct_pids = s.apply_spans_first(d_pid_spans, d_pids_) d_pid_counts = s.apply_spans_count(d_pid_spans) print(np.unique(d_pid_counts, return_counts=True)) p_diet_counts_new = s.create_numeric(dest, 'diet_counts_new', 'int32') dcs = s.merge_left(left_on=p_ids_, right_on=d_distinct_pids, right_fields=(d_pid_counts, ),