def covid_test_date_v1(session: Session, test_table, dest_test_table, dest_field_name='test_date', dest_field_flags_name='test_date_valid'):
    """
    Infer the test date from 'date_taken_specific', 'date_taken_between_start'
    and 'date_taken_between_end' columns, and write both the inferred date and
    a validity flag to the destination dataframe.

    A date is considered valid when exactly one of the two encodings is set:
    either the exact date alone, or a between-range alone with end >= start.
    For range-only entries the midpoint of the range is used as the date.
    A timestamp of 0.0 is treated as 'not set'.

    :param session: The ExeTera session instance.
    :param test_table: The tests dataframe, which contains 'date_taken_specific',
        'date_taken_between_start' and 'date_taken_between_end' columns.
    :param dest_test_table: The destination dataframe to write the result to.
    :param dest_field_name: The name of the result date column.
    :param dest_field_flags_name: The name of the column that stores the flag
        indicating whether the date is validly set or inferred.
    """
    exact = session.get(test_table['date_taken_specific'])
    exact_ = exact.data[:]
    between_start_ = session.get(test_table['date_taken_between_start']).data[:]
    between_end_ = session.get(test_table['date_taken_between_end']).data[:]
    # valid if no exact date but a well-formed range (both ends set, end >= start) ...
    test_date_valid = (exact_ == 0.0) & (between_start_ != 0.0) & (between_end_ != 0.0) &\
                      (between_end_ >= between_start_)
    # ... or an exact date with no range at all
    test_date_valid = test_date_valid |\
                      ((exact_ != 0.0) & (between_start_ == 0.0) & (between_end_ == 0.0))
    # exact date where present, otherwise the midpoint of the range
    test_date = np.where(exact_ != 0.0, exact_,
                         between_start_ + (between_end_ - between_start_) / 2)
    exact.create_like(dest_test_table, dest_field_name).data.write(test_date)
    session.create_numeric(dest_test_table, dest_field_flags_name, 'bool').data.write(test_date_valid)
def generate_dataset(length, val_column_count):
    """Build the benchmarking HDF5 file: 'a_ids', 'b_ids' and the a_vals columns."""
    rng = np.random.RandomState(12345678)
    id_base = 0  # 1000000000
    mapping = [0, 1, 2, 1]
    session = Session()
    with h5py.File('/home/ben/covid/benchmarking.hdf5', 'w') as store:
        # primary ids
        with utils.Timer('creating a_ids'):
            ids_a = generate_a_ids(length, id_base)
            session.create_numeric(store, 'a_ids', 'int64').data.write(ids_a)
            del ids_a
        print('creating a_vals')
        # one value column per requested index
        for col in range(val_column_count):
            with utils.Timer("creating a_vals[{}]".format(col)):
                vals = generate_a_vals(length, 0, 100, rng)
                session.create_numeric(store, 'a_vals_{}'.format(col), 'int64').data.write(vals)
                del vals
        # foreign-key ids derived from the mapping
        with utils.Timer('creating b_ids'):
            ids_b = generate_b_ids(length, id_base, mapping)
            session.create_numeric(store, 'b_ids', 'int64').data.write(ids_b)
            del ids_b
def iterator_test_1(length):
    """Write a generated value column, then benchmark fast_sum over its iterator."""
    ids_a, vals_a, ids_b = generate_dataset_1(length)
    session = Session()
    with h5py.File('/home/ben/covid/benchmarking.hdf5', 'w') as store:
        written = session.create_numeric(store, 'a_vals', 'int32')
        written.data.write(vals_a)
        # re-fetch the field so iteration goes through a fresh handle
        reread = session.get(store['a_vals'])
        print(fast_sum(iter(ops.data_iterator(reread))))
def new_to_hdf5(vcount):
    """Import the saved .npy id and value arrays into the benchmarking HDF5 file."""
    session = Session()
    with h5py.File('/home/ben/covid/benchmarking.hdf5', 'w') as store:
        # id columns first
        for name in ('fk_ids', 'ids'):
            print('importing "{}"'.format(name))
            arr = np.load('/home/ben/covid/{}.npy'.format(name))
            session.create_numeric(store, name, 'int64').data.write(arr)
        # then the numbered value columns
        for idx in range(vcount):
            print('importing "right_data_{}"'.format(idx))
            arr = np.load('/home/ben/covid/right_data_{}.npy'.format(idx))
            session.create_numeric(store, 'right_data_{}'.format(idx), 'int32').data.write(arr)
def read_fields_from_hdf5(file_name, field_count):
    """
    Benchmark: time reading the first `field_count` of a fixed list of patient
    fields from an HDF5 dataset. The reads are performed purely for timing;
    their results are deliberately discarded.

    :param file_name: Path of the HDF5 dataset to read from.
    :param field_count: How many of the candidate fields to read (in order).
    """
    # candidate patient-table fields, read in this order
    fields = ('id', 'created_at', 'updated_at', 'version', 'country_code',
              'reported_by_another', 'same_household_as_reporter',
              'contact_additional_studies', 'year_of_birth', 'height_cm',
              'weight_kg', 'gender', 'race_other', 'ethnicity',
              'profile_attributes_updated_at', 'has_diabetes')
    print(len(fields))
    s = Session()
    with h5py.File(file_name, 'r') as hf:
        with utils.Timer("reading {} fields from dataset".format(field_count)):
            for f in range(field_count):
                field = s.get(hf['patients'][fields[f]])
                if isinstance(field, flds.IndexedStringField):
                    # indexed strings: force both the index and value arrays to load
                    indices = field.indices[:]
                    values = field.values[:]
                else:
                    # other field kinds: force the data array to load
                    data = field.data[:]
def new_hs_test(vcount):
    """
    Benchmark ordered_merge_left against merge_left, joining vcount
    'right_data_*' columns from 'fk_ids'/'ids' onto 'left_data_*' sink fields.
    Prints timings and sample slices for manual inspection.

    :param vcount: Number of value columns to merge.
    """
    s = Session()
    with h5py.File('/home/ben/covid/benchmarking.hdf5', 'r') as hf:
        with h5py.File('/home/ben/covid/benchmark_dest.hdf5', 'w') as dest:
            print(hf.keys())
            # left side: foreign keys; right side: primary ids
            a_ids_f = s.get(hf['fk_ids'])
            b_ids_f = s.get(hf['ids'])
            # destination sink fields, one per value column
            all_b_val_fields = list()
            for v in range(vcount):
                b_vals_f = s.create_numeric(dest, 'left_data_{}'.format(v), 'int32')
                all_b_val_fields.append(b_vals_f)
            a_to_b = s.create_numeric(dest, 'a_to_b', 'int64')
            # source value columns to merge in
            all_a_val_fields = list()
            for v in range(vcount):
                a_vals_f = s.get(hf['right_data_{}'.format(v)])
                all_a_val_fields.append(a_vals_f)
            print("running test")
            t0 = time.time()
            # s.ordered_left_merge(a_ids, b_ids, a_to_b, left_unique=True,
            #                      left_field_sources=(a_vals_f,), left_field_sinks=(b_vals_f,))
            print(a_ids_f.data[:100])
            print(b_ids_f.data[:100])
            print(all_a_val_fields[0].data[:100])
            s.ordered_merge_left(a_ids_f, b_ids_f, left_to_right_map=a_to_b,
                                 right_unique=True,
                                 right_field_sources=tuple(all_a_val_fields),
                                 left_field_sinks=tuple(all_b_val_fields))
            print(a_to_b.data[:100])
            # run the unordered merge_left path over the same inputs for comparison
            results = s.merge_left(a_ids_f, b_ids_f,
                                   right_fields=tuple(all_a_val_fields))
            elapsed = time.time() - t0
            print("total:", elapsed)
            print(all_b_val_fields[0].data[:100])
            print(results[0][:100])
def test_test_counts_per_patient_v1_positive_test(self):
    """Counts of test rows per patient id are merged onto the patient table."""
    buffer = BytesIO()
    with Session() as sess:
        patient_ids = np.asarray([b'b', b'c', b'd', b'f', b'h', b'i'])
        test_patient_ids = np.asarray([b'a', b'a', b'b', b'b', b'b', b'c', b'c',
                                       b'e', b'e', b'f', b'g', b'h', b'i', b'i'])
        dataset = sess.open_dataset(buffer, 'w', 'src')
        patients = dataset.create_group('patients')
        tests = dataset.create_group('tests')
        sess.create_fixed_string(patients, 'id', 1).data.write(patient_ids)
        sess.create_fixed_string(tests, 'patient_id', 1).data.write(test_patient_ids)
        alg.test_counts_per_patient_v1(sess, patients, tests, patients, 'counts')
        actual = sess.get(patients['counts']).data[:]
        self.assertListEqual([3, 2, 0, 1, 1, 2], actual.tolist())
def test_test_type_from_mechanism_v1_fields(self):
    """test_type_from_mechanism_v1 on Field inputs sets the six pcr/atb flag fields.

    Fix: ``np.bool`` is a removed deprecated alias (NumPy >= 1.24); the builtin
    ``bool`` is the correct dtype argument. The per-field assertions are also
    folded into a single data-driven loop.
    """
    bio = BytesIO()
    with h5py.File(bio, 'w') as hf:
        s = Session()
        # -1 entries route to the free-text path; 0..7 are mechanism codes
        t_mech = np.asarray([-1, 0, 1, 2, 3, 4, -1, -1, 5, 6, 7, -1])
        t_mech_f = s.create_numeric(hf, "t_mech", 'int8')
        t_mech_f.data.write(t_mech)
        t_mech_freetext = np.asarray(["bloodxyz", "", "", "", "", "", "swabxyz",
                                      "selfxyz", "", "", "", "fingerxyz"])
        t_mech_freetext_f = s.create_indexed_string(hf, "t_mech_freetext")
        t_mech_freetext_f.data.write(t_mech_freetext)
        pcr1 = s.create_numeric(hf, 'pcr1', 'bool')
        pcr2 = s.create_numeric(hf, 'pcr2', 'bool')
        pcr3 = s.create_numeric(hf, 'pcr3', 'bool')
        atb1 = s.create_numeric(hf, 'atb1', 'bool')
        atb2 = s.create_numeric(hf, 'atb2', 'bool')
        atb3 = s.create_numeric(hf, 'atb3', 'bool')
        test_type_from_mechanism_v1(s, t_mech_f, t_mech_freetext_f,
                                    pcr1, pcr2, pcr3, atb1, atb2, atb3)
        # expected flag pattern for each destination field
        expectations = (
            (pcr1, [0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]),
            (pcr2, [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]),
            (pcr3, [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]),
            (atb1, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0]),
            (atb2, [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
            (atb3, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]),
        )
        for field, expected in expectations:
            self.assertTrue(
                np.array_equal(field.data[:], np.asarray(expected, dtype=bool)))
def first_test_date_per_patient(session: Session, patient_table, test_table, test_date_name, dest_patient_table, dest_patient_name):
    """
    Write, for each patient, the 'created_at' value of their first test row.

    Both tables must already be ordered by their id columns; a ValueError is
    raised otherwise.

    :param session: The ExeTera session instance.
    :param patient_table: The patient dataframe, ordered by 'id'.
    :param test_table: The tests dataframe, ordered by 'patient_id'.
    :param test_date_name: Not used by this implementation; kept for
        interface compatibility.
    :param dest_patient_table: The destination dataframe to store the results.
    :param dest_patient_name: The name of the destination field.
    :raises ValueError: if either table is not ordered by its id column.
    """
    pid = 'id'
    pids = session.get(patient_table[pid])
    pids_ = pids.data[:]
    if not ops.is_ordered(pids.data[:]):
        raise ValueError("The patient table must be ordered by '{}'".format(pid))
    t_pid = 'patient_id'
    t_pids = session.get(test_table[t_pid])
    t_pids_ = t_pids.data[:]
    if not ops.is_ordered(t_pids_):
        raise ValueError("The test table must be ordered by '{}'".format(t_pid))
    # collapse the test data by patient_id spans
    cats = session.get(test_table['created_at'])
    spans_ = session.get_spans(t_pids_)
    s_t_pids_ = session.apply_spans_first(spans_, t_pids_)
    # NOTE(review): despite the name, 'counts_' holds the first 'created_at'
    # value of each span, not a count
    counts_ = session.apply_spans_first(spans_, cats)
    # merge the first test dates into the patient table
    dest = session.create_numeric(dest_patient_table, dest_patient_name, 'int32')
    session.ordered_merge_left(left_on=pids_, right_on=s_t_pids_,
                               right_field_sources=(counts_,), left_field_sinks=(dest,),
                               left_unique=True, right_unique=True)
def test_test_type_from_mechanism_v1_numpy(self):
    """test_type_from_mechanism_v1 on raw numpy inputs fills the six flag arrays.

    Fix: ``np.bool`` is a removed deprecated alias (NumPy >= 1.24); the builtin
    ``bool`` is used instead, and the six assertions are data-driven.
    """
    s = Session()
    # -1 entries route to the free-text path; 0..7 are mechanism codes
    t_mech = np.asarray([-1, 0, 1, 2, 3, 4, -1, -1, 5, 6, 7, -1])
    t_mech_freetext = np.asarray(["bloodxyz", "", "", "", "", "", "swabxyz",
                                  "selfxyz", "", "", "", "fingerxyz"])
    # six destination flag arrays, all initially False
    flags = [np.zeros(len(t_mech), dtype=bool) for _ in range(6)]
    pcr1, pcr2, pcr3, atb1, atb2, atb3 = flags
    test_type_from_mechanism_v1(s, t_mech, t_mech_freetext,
                                pcr1, pcr2, pcr3, atb1, atb2, atb3)
    expected = (
        [0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],  # pcr1
        [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],  # pcr2
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],  # pcr3
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0],  # atb1
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # atb2
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],  # atb3
    )
    for actual, exp in zip(flags, expected):
        self.assertTrue(np.array_equal(actual, np.asarray(exp, dtype=bool)))
def test_write_then_read_numeric(self):
    """Write 1e8 random ints to a NumericField, check the sum, then double in place."""
    from exetera.core.session import Session
    from exetera.core import fields
    from exetera.core.utils import Timer
    sess = Session()
    buffer = BytesIO()
    with h5py.File(buffer, 'w') as store:
        np.random.seed(12345678)
        raw = np.random.randint(low=0, high=1000000, size=100000000)
        fields.numeric_field_constructor(sess, store, 'a', 'int32')
        field = fields.NumericField(sess, store['a'], write_enabled=True)
        field.data.write(raw)
        # checksum of the values as written
        self.assertEqual(49997540637149, np.sum(field.data[:]))
        # in-place doubling must double the checksum
        field.data[:] = field.data[:] * 2
        self.assertEqual(99995081274298, np.sum(field.data[:]))
def test_write_then_read_categorical(self):
    """Write 1e8 random categorical codes and verify their sum on read-back."""
    from exetera.core.session import Session
    from exetera.core import fields
    from exetera.core.utils import Timer
    sess = Session()
    buffer = BytesIO()
    with h5py.File(buffer, 'w') as store:
        np.random.seed(12345678)
        codes = np.random.randint(low=0, high=3, size=100000000)
        fields.categorical_field_constructor(sess, store, 'a', 'int8',
                                             {'foo': 0, 'bar': 1, 'boo': 2})
        field = fields.CategoricalField(sess, store['a'], write_enabled=True)
        field.data.write(codes)
        self.assertEqual(99987985, np.sum(field.data[:]))
def test_concatenate_daily(self):
    """merge_daily_assessments_v1 collapses per-patient, per-day assessment rows."""
    # patient ids; repeated entries represent multiple assessments per patient
    ids = np.asarray([
        'a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'c', 'c',
        'c'
    ], dtype='S1')
    # assessment days; duplicate days within a patient should be merged
    days = np.asarray([
        '2020-05-06', '2020-05-06', '2020-05-07', '2020-06-02', '2020-06-02',
        '2020-08-01', '2020-08-20', '2020-09-05', '2020-04-10', '2020-04-11',
        '2020-04-11', '2020-04-11', '2020-04-11', '2020-04-11', '2020-04-11'
    ], dtype='S10')
    # free text including quotes and commas to exercise concatenation quoting
    idf = [
        'a', "'b'", 'what', 'some, information', 'x', '', 'foo', 'flop',
        "'dun'", "'mun'", "'race, track?'", '', "for, too", 'z', 'now!'
    ]
    nums = np.asarray(
        [5, 6, 3, 2, 1, 10, 230, 3, 5, -20, -4, 2, 6, 100, 40],
        dtype=np.int32)
    bio = BytesIO()
    with Session() as s:
        ds = s.open_dataset(bio, 'w', 'ds')
        src = ds.create_group('src')
        ids_f = s.create_fixed_string(src, 'patient_id', 1)
        ids_f.data.write(ids)
        days_f = s.create_fixed_string(src, 'created_at_day', 10)
        days_f.data.write(days)
        idf_f = s.create_indexed_string(src, 'idf')
        idf_f.data.write(idf)
        nums_f = s.create_numeric(src, 'nums', 'int32')
        nums_f.data.write(nums)
        dest = ds.create_group('dest')
        merge_daily_assessments_v1(s, src, dest)
        # manual inspection of the merged output
        print(dest.keys())
        print(s.get(dest['idf']).data[:])
def test_write_then_read_indexed_string(self):
    """Write indexed strings, append 'y' to each, and verify data/indices/values.

    Fixes: the list comprehension reused the name ``s``, shadowing the Session
    instance (renamed to ``v``); the ``np.unique`` result was misleadingly
    named ``total`` (renamed to ``uniques``).
    """
    from exetera.core.session import Session
    from exetera.core import fields
    from exetera.core.utils import Timer
    s = Session()
    bio = BytesIO()
    with h5py.File(bio, 'w') as hf:
        np.random.seed(12345678)
        values = np.random.randint(low=0, high=4, size=200000)
        # strings of 0..3 'x' characters
        svalues = [''.join(['x'] * v) for v in values]
        fields.indexed_string_field_constructor(s, hf, 'a', 8)
        a = fields.IndexedStringField(s, hf['a'], write_enabled=True)
        a.data.write(svalues)
        uniques = np.unique(a.data[:])
        self.assertListEqual(['', 'x', 'xx', 'xxx'], uniques.tolist())
        strs = a.data[:]
        # use 'v', not 's', so the Session variable is not shadowed
        strs = [v + 'y' for v in strs]
        a.data.clear()
        a.data.write(strs)
        self.assertListEqual([
            'xxxy', 'xxy', 'xxxy', 'y', 'xy', 'y', 'xxxy', 'xxxy', 'xy', 'y'
        ], strs[:10])
        self.assertListEqual([0, 4, 7, 11, 12, 14, 15, 19, 23, 25],
                             a.indices[:10].tolist())
        # raw utf-8 byte values: 120 == 'x', 121 == 'y'
        self.assertListEqual(
            [120, 120, 120, 121, 120, 120, 121, 120, 120, 120],
            a.values[:10].tolist())
        self.assertListEqual([
            'xxxy', 'xxy', 'xxxy', 'y', 'xy', 'y', 'xxxy', 'xxxy', 'xy', 'y'
        ], a.data[:10])
def test_write_then_read_fixed_string(self):
    """Write fixed strings, check the uniques, then append b'y' to every value in place."""
    from exetera.core.session import Session
    from exetera.core import fields
    from exetera.core.utils import Timer
    sess = Session()
    buffer = BytesIO()
    with h5py.File(buffer, 'w') as store:
        np.random.seed(12345678)
        lengths = np.random.randint(low=0, high=4, size=1000000)
        # byte strings of 0..3 'x' characters
        raw = [b''.join([b'x'] * n) for n in lengths]
        fields.fixed_string_field_constructor(sess, store, 'a', 8)
        field = fields.FixedStringField(sess, store['a'], write_enabled=True)
        field.data.write(raw)
        self.assertListEqual([b'', b'x', b'xx', b'xxx'],
                             np.unique(field.data[:]).tolist())
        # append b'y' to every entry in place
        field.data[:] = np.core.defchararray.add(field.data[:], b'y')
        self.assertListEqual([
            b'xxxy', b'xxy', b'xxxy', b'y', b'xy', b'y', b'xxxy', b'xxxy',
            b'xy', b'y'
        ], field.data[:10].tolist())
def test_covid_test_date_v1_positive_test(self):
    """covid_test_date_v1 writes 'test_date'/'test_date_valid' from exact or ranged dates."""
    bio = BytesIO()
    with Session() as s:
        # t_pids_ = np.asarray([b'a', b'a', b'b', b'b', b'b', b'c', b'c', b'e',
        #                       b'e', b'f', b'g', b'h', b'i', b'i'])
        # exact test dates; 0.0 means 'not set'
        t_dates_exact = np.asarray([
            0.0, dt(2020, 10, 12).timestamp(), dt(2020, 6, 2).timestamp(),
            0.0, dt(2021, 1, 30).timestamp(), 0.0,  # 5
            0.0, 0.0, dt(2020, 8, 10).timestamp(),  # 8
            dt(2020, 12, 1).timestamp(), dt(2020, 9, 2).timestamp(), 0.0  # 11
        ])
        # start of 'taken between' ranges; 0.0 means 'not set'
        t_dates_from = np.asarray([
            dt(2020, 5, 12).timestamp(), 0.0, 0.0,
            dt(2020, 9, 2).timestamp(), 0.0, 0.0,  # 5
            dt(2020, 7, 16).timestamp(), 0.0, dt(2021, 8, 8).timestamp(),  # 8
            0.0, dt(2020, 8, 10).timestamp(), dt(2020, 11, 4).timestamp(),  # 11
        ])
        # end of 'taken between' ranges
        # NOTE(review): entry 11 ends (2020-11-03) before it starts (2020-11-04),
        # which exercises the end >= start validity check
        t_dates_to = np.asarray([
            dt(2020, 5, 17).timestamp(), 0.0, 0.0,
            dt(2020, 9, 3).timestamp(), 0.0, 0.0,  # 5
            0.0, dt(2020, 6, 20).timestamp(), 0.0,  # 8
            dt(2020, 12, 19).timestamp(), dt(2020, 10, 5).timestamp(),
            dt(2020, 11, 3).timestamp()  # 11
        ])
        print(dt(2020, 11, 3).timestamp())
        src = s.open_dataset(bio, 'w', 'src')
        tests = src.create_group('tests')
        s.create_timestamp(tests, 'date_taken_specific').data.write(t_dates_exact)
        s.create_timestamp(
            tests, 'date_taken_between_start').data.write(t_dates_from)
        s.create_timestamp(tests, 'date_taken_between_end').data.write(t_dates_to)
        covid_test_date_v1(s, tests, tests)
        # manual inspection of the inferred dates and validity flags
        print(s.get(tests['test_date']).data[:])
        print(s.get(tests['test_date_valid']).data[:])
for i in range(len(starts)): substrs = text[starts[i]:ends[i]].tobytes().decode() # if ' - ' in substrs: # print(substrs) substrs = replace_multi_with_str("#!,\"(){}[].:;", substrs) substrs = [s_.strip() for s_ in substrs.split() if len(s_) > 0] for s in substrs: if s in words_to_check: total_count += 1 break print(total_count) with h5py.File('/home/ben/covid/ds_20200901_full.hdf5', 'r') as hf: with h5py.File('/home/ben/covid/ds_20200901_othersymp.hdf5', 'w') as tmp: s = Session() print([k for k in hf['patients'].keys() if 'result' in k]) old_test = s.get(hf['patients']['max_assessment_test_result']).data[:] new_test = s.get(hf['patients']['max_test_result']).data[:] test_results = np.where((old_test == 3) | (new_test == 4), 2, 0) test_results = np.where( (test_results == 0) & ((old_test == 2) | (new_test == 3)), 1, test_results) p_test_results = s.create_numeric(tmp, 'p_test_results', 'int8') p_test_results.data.write(test_results) print("overall tests:", np.unique(test_results, return_counts=True)) other = s.get(hf['assessments']['other_symptoms']) cc = s.get(hf['assessments']['country_code']).data[:] otherstart = other.indices[:-1]
# Journal two snapshots of a dataset against a schema and write the result.
import time
from exetera.core.load_schema import load_schema
from exetera.core.session import Session
from exetera.core.journal import journal_test_harness

# NOTE(review): 'time' appears unused in this script — confirm before removing.
schema_fn = '/home/ben/covid/covid_schema.json'
old_fn = '/home/ben/covid/ds_20200801_base.hdf5'  # earlier snapshot
new_fn = '/home/ben/covid/ds_20200901_base.hdf5'  # later snapshot
dest_fn = '/home/ben/covid/ds_journal.hdf5'       # journalled output
with open(schema_fn) as f:
    schema = load_schema(f)
journal_test_harness(Session(), schema, old_fn, new_fn, dest_fn)
def hs_test_1(length, val_column_count):
    """
    Benchmark ordered_merge_left against merge_left over the pre-generated
    'a_ids'/'b_ids' benchmarking dataset, merging val_column_count value
    columns. Prints timings and sample slices for manual inspection.

    :param length: Unused here; dataset generation is commented out below.
    :param val_column_count: Number of 'a_vals_*' columns to merge.
    """
    # rng = np.random.RandomState(12345678)
    # id_base = 1000000000
    # mapping = [0, 1, 2, 1]
    s = Session()
    with h5py.File('/home/ben/covid/benchmarking.hdf5', 'r') as hf:
        with h5py.File('/home/ben/covid/benchmark_dest.hdf5', 'w') as dest:
            # print('creating a_ids')
            # a_ids = generate_a_ids(length, id_base)
            # a_ids_f = s.create_numeric(hf, 'a_ids', 'int64')
            # a_ids_f.data.write(a_ids)
            # del a_ids
            #
            # print('creating a_vals')
            # # all_a_val_fields = list()
            # for v in range(val_column_count):
            #     a_vals = generate_a_vals(length, 0, 100, rng)
            #     a_vals_f = s.create_numeric(hf, 'a_vals_{}'.format(v), 'int64')
            #     a_vals_f.data.write(a_vals)
            #     # all_a_val_fields.append(a_vals_f)
            #     del a_vals
            #
            # print('creating b_ids')
            # b_ids = generate_b_ids(length, id_base, mapping)
            # b_ids_f = s.create_numeric(hf, 'b_ids', 'int64')
            # b_ids_f.data.write(b_ids)
            # del b_ids
            a_ids_f = s.get(hf['a_ids'])
            b_ids_f = s.get(hf['b_ids'])
            # destination sink fields, one per value column
            all_b_val_fields = list()
            for v in range(val_column_count):
                b_vals_f = s.create_numeric(dest, 'b_vals_{}'.format(v), 'int32')
                all_b_val_fields.append(b_vals_f)
            a_to_b = s.create_numeric(dest, 'a_to_b', 'int64')
            # source value columns to merge in
            all_a_val_fields = list()
            for v in range(val_column_count):
                a_vals_f = s.get(hf['a_vals_{}'.format(v)])
                all_a_val_fields.append(a_vals_f)
            print("running test")
            t0 = time.time()
            # s.ordered_left_merge(a_ids, b_ids, a_to_b, left_unique=True,
            #                      left_field_sources=(a_vals_f,), left_field_sinks=(b_vals_f,))
            print(b_ids_f.data[:100])
            print(a_ids_f.data[:100])
            s.ordered_merge_left(b_ids_f, a_ids_f,
                                 right_field_sources=tuple(all_a_val_fields),
                                 left_field_sinks=tuple(all_b_val_fields),
                                 left_to_right_map=a_to_b, right_unique=True)
            print(a_to_b.data[:100])
            # run the unordered merge_left path over the same inputs for comparison
            results = s.merge_left(b_ids_f, a_ids_f,
                                   right_fields=tuple(all_a_val_fields))
            elapsed = time.time() - t0
            print(elapsed)
            print(all_b_val_fields[0].data[:100])
            print(results[0][:100])
def postprocess(dataset, destination, timestamp=None, flags=None):
    """
    Run the full post-processing pipeline over a source dataset: de-duplicate
    and sort patients, sort and clean assessments/tests/diet tables, derive
    per-patient metrics, and (optionally, flag 'daily') generate collapsed
    daily assessments.

    :param dataset: Source group containing some of 'patients', 'assessments',
        'tests' and 'diet' tables.
    :param destination: Destination group the processed tables are written to.
    :param timestamp: Not used in this implementation — TODO confirm whether it
        can be removed.
    :param flags: Set of feature flags; 'daily' enables daily-assessment work.
    """
    if flags is None:
        flags = set()
    do_daily_asmts = 'daily' in flags
    has_patients = 'patients' in dataset.keys()
    has_assessments = 'assessments' in dataset.keys()
    has_tests = 'tests' in dataset.keys()
    has_diet = 'diet' in dataset.keys()
    # stage toggles; currently hard-enabled regardless of flags
    sort_enabled = lambda x: True
    process_enabled = lambda x: True
    sort_patients = sort_enabled(flags) and True
    sort_assessments = sort_enabled(flags) and True
    sort_tests = sort_enabled(flags) and True
    sort_diet = sort_enabled(flags) and True
    make_assessment_patient_id_fkey = process_enabled(flags) and True
    year_from_age = process_enabled(flags) and True
    clean_weight_height_bmi = process_enabled(flags) and True
    health_worker_with_contact = process_enabled(flags) and True
    clean_temperatures = process_enabled(flags) and True
    check_symptoms = process_enabled(flags) and True
    create_daily = process_enabled(flags) and do_daily_asmts
    make_patient_level_assessment_metrics = process_enabled(flags) and True
    make_patient_level_daily_assessment_metrics = process_enabled(
        flags) and do_daily_asmts
    make_new_test_level_metrics = process_enabled(flags) and True
    make_diet_level_metrics = True
    # NOTE(review): make_healthy_diet_index appears unused below — confirm
    make_healthy_diet_index = True
    # ds = DataStore(timestamp=timestamp)
    s = Session()

    # patients ================================================================
    sorted_patients_src = None
    if has_patients:
        patients_src = dataset['patients']
        write_mode = 'write'
        if 'patients' not in destination.keys():
            patients_dest = s.get_or_create_group(destination, 'patients')
            sorted_patients_src = patients_dest

            # Patient sort
            # ============
            if sort_patients:
                # drop rows with duplicate patient ids before sorting
                duplicate_filter = \
                    persistence.filter_duplicate_fields(s.get(patients_src['id']).data[:])
                for k in patients_src.keys():
                    t0 = time.time()
                    r = s.get(patients_src[k])
                    w = r.create_like(patients_dest, k)
                    s.apply_filter(duplicate_filter, r, w)
                    print(f"'{k}' filtered in {time.time() - t0}s")
                print(np.count_nonzero(duplicate_filter == True),
                      np.count_nonzero(duplicate_filter == False))
                sort_keys = ('id', )
                s.sort_on(patients_dest, patients_dest, sort_keys,
                          write_mode='overwrite')

            # Patient processing
            # ==================
            if year_from_age:
                log("year of birth -> age; 18 to 90 filter")
                t0 = time.time()
                yobs = s.get(patients_dest['year_of_birth'])
                yob_filter = s.get(patients_dest['year_of_birth_valid'])
                age = s.create_numeric(patients_dest, 'age', 'uint32')
                age_filter = s.create_numeric(patients_dest, 'age_filter', 'bool')
                age_16_to_90 = s.create_numeric(patients_dest, '16_to_90_years', 'bool')
                print('year_of_birth:', patients_dest['year_of_birth'])
                for k in patients_dest['year_of_birth'].attrs.keys():
                    print(k, patients_dest['year_of_birth'].attrs[k])
                calculate_age_from_year_of_birth_v1(
                    yobs, yob_filter, 16, 90, age, age_filter, age_16_to_90, 2020)
                log(f"completed in {time.time() - t0}")
                print('age_filter count:',
                      np.sum(patients_dest['age_filter']['values'][:]))
                print('16_to_90_years count:',
                      np.sum(patients_dest['16_to_90_years']['values'][:]))

            if clean_weight_height_bmi:
                log("height / weight / bmi; standard range filters")
                t0 = time.time()
                weights_clean = s.create_numeric(patients_dest, 'weight_kg_clean', 'float32')
                weights_filter = s.create_numeric(patients_dest, '40_to_200_kg', 'bool')
                heights_clean = s.create_numeric(patients_dest, 'height_cm_clean', 'float32')
                heights_filter = s.create_numeric(patients_dest, '110_to_220_cm', 'bool')
                bmis_clean = s.create_numeric(patients_dest, 'bmi_clean', 'float32')
                bmis_filter = s.create_numeric(patients_dest, '15_to_55_bmi', 'bool')
                weight_height_bmi_v1(s, 40, 200, 110, 220, 15, 55,
                                     None, None, None, None,
                                     patients_dest['weight_kg'],
                                     patients_dest['weight_kg_valid'],
                                     patients_dest['height_cm'],
                                     patients_dest['height_cm_valid'],
                                     patients_dest['bmi'],
                                     patients_dest['bmi_valid'],
                                     weights_clean, weights_filter, None,
                                     heights_clean, heights_filter, None,
                                     bmis_clean, bmis_filter, None)
                log(f"completed in {time.time() - t0}")

            if health_worker_with_contact:
                with utils.Timer("health_worker_with_contact field"):
                    # writer = ds.get_categorical_writer(patients_dest, 'health_worker_with_contact', 'int8')
                    combined_hcw_with_contact_v1(
                        s,
                        s.get(patients_dest['healthcare_professional']),
                        s.get(patients_dest['contact_health_worker']),
                        s.get(patients_dest['is_carer_for_community']),
                        patients_dest, 'health_worker_with_contact')

    # assessments =============================================================
    sorted_assessments_src = None
    if has_assessments:
        assessments_src = dataset['assessments']
        if 'assessments' not in destination.keys():
            assessments_dest = s.get_or_create_group(destination, 'assessments')
            sorted_assessments_src = assessments_dest

            if sort_assessments:
                sort_keys = ('patient_id', 'created_at')
                with utils.Timer("sorting assessments"):
                    s.sort_on(assessments_src, assessments_dest, sort_keys)

            if has_patients:
                if make_assessment_patient_id_fkey:
                    print(
                        "creating 'assessment_patient_id_fkey' foreign key index for 'patient_id'"
                    )
                    t0 = time.time()
                    patient_ids = s.get(sorted_patients_src['id'])
                    assessment_patient_ids =\
                        s.get(sorted_assessments_src['patient_id'])
                    assessment_patient_id_fkey =\
                        s.create_numeric(assessments_dest, 'assessment_patient_id_fkey', 'int64')
                    s.get_index(patient_ids.data[:], assessment_patient_ids.data[:],
                                assessment_patient_id_fkey)
                    print(f"completed in {time.time() - t0}s")

            if clean_temperatures:
                print("clean temperatures")
                t0 = time.time()
                temps = s.get(sorted_assessments_src['temperature'])
                temp_units = s.get(sorted_assessments_src['temperature_unit'])
                temps_valid = s.get(sorted_assessments_src['temperature_valid'])
                dest_temps = temps.create_like(assessments_dest, 'temperature_c_clean')
                dest_temps_valid = temps_valid.create_like(
                    assessments_dest, 'temperature_35_to_42_inclusive')
                dest_temps_modified = temps_valid.create_like(
                    assessments_dest, 'temperature_modified')
                validate_temperature_v1(s, 35.0, 42.0, temps, temp_units, temps_valid,
                                        dest_temps, dest_temps_valid,
                                        dest_temps_modified)
                print(f"temperature cleaning done in {time.time() - t0}")

            if check_symptoms:
                print('check inconsistent health_status')
                t0 = time.time()
                check_inconsistent_symptoms_v1(s, sorted_assessments_src,
                                               assessments_dest)
                print(time.time() - t0)

    # tests ===================================================================
    if has_tests:
        if sort_tests:
            tests_src = dataset['tests']
            tests_dest = s.get_or_create_group(destination, 'tests')
            sort_keys = ('patient_id', 'created_at')
            s.sort_on(tests_src, tests_dest, sort_keys)

    # diet ====================================================================
    if has_diet:
        diet_src = dataset['diet']
        if 'diet' not in destination.keys():
            diet_dest = s.get_or_create_group(destination, 'diet')
            sorted_diet_src = diet_dest
            if sort_diet:
                sort_keys = ('patient_id', 'display_name', 'id')
                s.sort_on(diet_src, diet_dest, sort_keys)

    if has_assessments:
        if do_daily_asmts:
            daily_assessments_dest = s.get_or_create_group(
                destination, 'daily_assessments')

    # post process patients
    # TODO: need an transaction table
    print(patients_src.keys())
    print(dataset['assessments'].keys())
    print(dataset['tests'].keys())

    # write_mode = 'overwrite'
    # NOTE(review): write_mode is assigned but never consumed below — confirm
    write_mode = 'write'

    # Daily assessments
    # =================
    if has_assessments:
        if create_daily:
            print("generate daily assessments")
            patient_ids = s.get(sorted_assessments_src['patient_id'])
            created_at_days = s.get(sorted_assessments_src['created_at_day'])
            raw_created_at_days = created_at_days.data[:]
            # prefer a pre-existing fkey in the source, else the one built above
            if 'assessment_patient_id_fkey' in assessments_src.keys():
                patient_id_index = assessments_src[
                    'assessment_patient_id_fkey']
            else:
                patient_id_index = assessments_dest[
                    'assessment_patient_id_fkey']
            patient_id_indices = s.get(patient_id_index)
            raw_patient_id_indices = patient_id_indices.data[:]

            print("Calculating patient id index spans")
            t0 = time.time()
            # spans over (patient, day) pairs — one span per patient-day
            patient_id_index_spans = s.get_spans(
                fields=(raw_patient_id_indices, raw_created_at_days))
            print(
                f"Calculated {len(patient_id_index_spans)-1} spans in {time.time() - t0}s"
            )

            print("Applying spans to 'health_status'")
            t0 = time.time()
            # per-field override of the default span-aggregation behaviour;
            # None means the field is skipped entirely
            default_behavour_overrides = {
                'id': s.apply_spans_last,
                'patient_id': s.apply_spans_last,
                'patient_index': s.apply_spans_last,
                'created_at': s.apply_spans_last,
                'created_at_day': s.apply_spans_last,
                'updated_at': s.apply_spans_last,
                'updated_at_day': s.apply_spans_last,
                'version': s.apply_spans_max,
                'country_code': s.apply_spans_first,
                'date_test_occurred': None,
                'date_test_occurred_guess': None,
                'date_test_occurred_day': None,
                'date_test_occurred_set': None,
            }
            for k in sorted_assessments_src.keys():
                t1 = time.time()
                reader = s.get(sorted_assessments_src[k])
                if k in default_behavour_overrides:
                    apply_span_fn = default_behavour_overrides[k]
                    if apply_span_fn is not None:
                        apply_span_fn(
                            patient_id_index_spans, reader,
                            reader.create_like(daily_assessments_dest, k))
                        print(f"  Field {k} aggregated in {time.time() - t1}s")
                    else:
                        print(f"  Skipping field {k}")
                else:
                    # default behaviour keyed on field/reader type
                    if isinstance(reader, fields.CategoricalField):
                        s.apply_spans_max(
                            patient_id_index_spans, reader,
                            reader.create_like(daily_assessments_dest, k))
                        print(f"  Field {k} aggregated in {time.time() - t1}s")
                    elif isinstance(reader, rw.IndexedStringReader):
                        s.apply_spans_concat(
                            patient_id_index_spans, reader,
                            reader.create_like(daily_assessments_dest, k))
                        print(f"  Field {k} aggregated in {time.time() - t1}s")
                    elif isinstance(reader, rw.NumericReader):
                        s.apply_spans_max(
                            patient_id_index_spans, reader,
                            reader.create_like(daily_assessments_dest, k))
                        print(f"  Field {k} aggregated in {time.time() - t1}s")
                    else:
                        print(f"  No function for {k}")
            print(f"apply_spans completed in {time.time() - t0}s")

    if has_patients and has_assessments:
        if make_patient_level_assessment_metrics:
            if 'assessment_patient_id_fkey' in assessments_dest:
                src = assessments_dest['assessment_patient_id_fkey']
            else:
                src = assessments_src['assessment_patient_id_fkey']
            assessment_patient_id_fkey = s.get(src)
            # generate spans from the assessment-space patient_id foreign key
            spans = s.get_spans(field=assessment_patient_id_fkey.data[:])
            ids = s.get(patients_dest['id'])

            print('calculate assessment counts per patient')
            t0 = time.time()
            writer = s.create_numeric(patients_dest, 'assessment_count', 'uint32')
            aggregated_counts = s.apply_spans_count(spans)
            s.join(ids, assessment_patient_id_fkey, aggregated_counts, writer, spans)
            print(
                f"calculated assessment counts per patient in {time.time() - t0}"
            )

            print('calculate first assessment days per patient')
            t0 = time.time()
            reader = s.get(sorted_assessments_src['created_at_day'])
            writer = s.create_fixed_string(patients_dest, 'first_assessment_day', 10)
            aggregated_counts = s.apply_spans_first(spans, reader)
            s.join(ids, assessment_patient_id_fkey, aggregated_counts, writer, spans)
            print(
                f"calculated first assessment days per patient in {time.time() - t0}"
            )

            print('calculate last assessment days per patient')
            t0 = time.time()
            reader = s.get(sorted_assessments_src['created_at_day'])
            writer = s.create_fixed_string(patients_dest, 'last_assessment_day', 10)
            aggregated_counts = s.apply_spans_last(spans, reader)
            s.join(ids, assessment_patient_id_fkey, aggregated_counts, writer, spans)
            print(
                f"calculated last assessment days per patient in {time.time() - t0}"
            )

            print('calculate maximum assessment test result per patient')
            t0 = time.time()
            reader = s.get(sorted_assessments_src['tested_covid_positive'])
            writer = reader.create_like(patients_dest, 'max_assessment_test_result')
            max_result_value = s.apply_spans_max(spans, reader)
            s.join(ids, assessment_patient_id_fkey, max_result_value, writer, spans)
            print(
                f"calculated maximum assessment test result in {time.time() - t0}"
            )

    if has_assessments and do_daily_asmts and make_patient_level_daily_assessment_metrics:
        print(
            "creating 'daily_assessment_patient_id_fkey' foreign key index for 'patient_id'"
        )
        t0 = time.time()
        patient_ids = s.get(sorted_patients_src['id'])
        daily_assessment_patient_ids =\
            s.get(daily_assessments_dest['patient_id'])
        daily_assessment_patient_id_fkey =\
            s.create_numeric(daily_assessments_dest,
                             'daily_assessment_patient_id_fkey', 'int64')
        s.get_index(patient_ids, daily_assessment_patient_ids,
                    daily_assessment_patient_id_fkey)
        print(f"completed in {time.time() - t0}s")
        spans = s.get_spans(field=s.get(
            daily_assessments_dest['daily_assessment_patient_id_fkey']))

        print('calculate daily assessment counts per patient')
        t0 = time.time()
        writer = s.create_numeric(patients_dest, 'daily_assessment_count', 'uint32')
        aggregated_counts = s.apply_spans_count(spans)
        daily_assessment_patient_id_fkey =\
            s.get(daily_assessments_dest['daily_assessment_patient_id_fkey'])
        s.join(ids, daily_assessment_patient_id_fkey, aggregated_counts, writer, spans)
        print(
            f"calculated daily assessment counts per patient in {time.time() - t0}"
        )

    if has_tests and make_new_test_level_metrics:
        print(
            "creating 'test_patient_id_fkey' foreign key index for 'patient_id'"
        )
        t0 = time.time()
        patient_ids = s.get(sorted_patients_src['id'])
        test_patient_ids = s.get(tests_dest['patient_id'])
        test_patient_id_fkey = s.create_numeric(tests_dest, 'test_patient_id_fkey',
                                                'int64')
        s.get_index(patient_ids, test_patient_ids, test_patient_id_fkey)
        test_patient_id_fkey = s.get(tests_dest['test_patient_id_fkey'])
        spans = s.get_spans(field=test_patient_id_fkey)
        print(f"completed in {time.time() - t0}s")

        print('calculate test_counts per patient')
        t0 = time.time()
        writer = s.create_numeric(patients_dest, 'test_count', 'uint32')
        aggregated_counts = s.apply_spans_count(spans)
        s.join(ids, test_patient_id_fkey, aggregated_counts, writer, spans)
        print(f"calculated test counts per patient in {time.time() - t0}")

        print('calculate test_result per patient')
        t0 = time.time()
        test_results = s.get(tests_dest['result'])
        writer = test_results.create_like(patients_dest, 'max_test_result')
        aggregated_results = s.apply_spans_max(spans, test_results)
        s.join(ids, test_patient_id_fkey, aggregated_results, writer, spans)
        print(f"calculated max_test_result per patient in {time.time() - t0}")

    if has_diet and make_diet_level_metrics:
        with utils.Timer("Making patient-level diet questions count",
                         new_line=True):
            d_pids_ = s.get(diet_dest['patient_id']).data[:]
            d_pid_spans = s.get_spans(d_pids_)
            d_distinct_pids = s.apply_spans_first(d_pid_spans, d_pids_)
            d_pid_counts = s.apply_spans_count(d_pid_spans)
            p_diet_counts = s.create_numeric(patients_dest, 'diet_counts', 'int32')
            s.merge_left(left_on=s.get(patients_dest['id']).data[:],
                         right_on=d_distinct_pids,
                         right_fields=(d_pid_counts, ),
                         right_writers=(p_diet_counts, ))
#pat_asymp = ds.apply_indices(spans_asymp, ds.get_reader(out_pos['patient_id'])) pat_asymp = out_pos['patient_id'].apply_index(spans_asymp) filt_asymp = prst.foreign_key_is_in_primary_key(pat_asymp.data[:], src_pat['id'].data[:]) out_pat_asymp = output.create_dataframe('patient_asymp') for k in list_interest: src_pat[k].create_like(out_pat_asymp, k) src_pat[k].apply_filter(filt_asymp, target=out_pat_asymp[k]) # reader = ds.get_reader(src_pat[k]) # writer = reader.get_writer(out_pat_asymp, k, ts) # ds.apply_filter(filt_asymp, reader, writer) # dict_pata = {} # for k in list_interest: # values = out_pat_asymp[k].data[:] # dict_pata[k] = values # # df_pata = pd.DataFrame.from_dict(dict_pata) # df_pata.to_csv(path + '/PositiveAsymp_PatDetails.csv') save_df_to_csv(out_pat_asymp, 'PositiveAsymp_PatDetails.csv') if __name__ == "__main__": source_file = '/home/jd21/data/post.h5' dst_file = '/home/jd21/data/May17_processed_mrslt.hdf5' with Session() as s: source = s.open_dataset(source_file, 'r', 'src') output = s.open_dataset(dst_file, 'w', 'out') merging_results(s, source, output)
self.values[3] |= 1 if o else 0 self.values[4] |= 0 if m else 1 self.values[5] |= 1 if m else 0 self.values[6] |= 0 if c else 1 self.values[7] |= 1 if c else 0 self.values[8] |= 0 if d else 1 self.values[9] |= 1 if d else 0 self.values[10] |= 0 if z else 1 self.values[11] |= 1 if z else 0 src_file = '/home/ben/covid/ds_20200929_full.hdf5' dest_file = '/home/ben/covid/ds_diet_tmp.hdf5' with h5py.File(src_file, 'r') as hf: with h5py.File(dest_file, 'w') as dest: s = Session() ptnts = hf['patients'] print(hf['diet'].keys()) diet = hf['diet'] p_ids_ = s.get(hf['patients']['id']).data[:] d_pids_ = s.get(hf['diet']['patient_id']).data[:] d_pid_spans = s.get_spans(d_pids_) d_distinct_pids = s.apply_spans_first(d_pid_spans, d_pids_) d_pid_counts = s.apply_spans_count(d_pid_spans) print(np.unique(d_pid_counts, return_counts=True)) p_diet_counts_new = s.create_numeric(dest, 'diet_counts_new', 'int32') dcs = s.merge_left(left_on=p_ids_, right_on=d_distinct_pids, right_fields=(d_pid_counts, ),