예제 #1
0
def new_hs_test(vcount):
    """
    Time an ordered left merge followed by a plain left merge on the
    benchmarking dataset, printing samples of the inputs and outputs.

    The source file '/home/ben/covid/benchmarking.hdf5' is opened read-only
    and must provide the fields 'fk_ids', 'ids' and 'right_data_{v}' for
    every v in range(vcount). The destination file
    '/home/ben/covid/benchmark_dest.hdf5' is opened in 'w' mode and receives
    the fields 'left_data_{v}' (int32) and 'a_to_b' (int64).

    The reported time spans both merge calls as well as the diagnostic
    prints issued between the two timestamps.

    :param vcount: The number of value columns to carry through the merge.
        The body indexes the first value column, so it must be at least 1.
    """
    session = Session()
    with h5py.File('/home/ben/covid/benchmarking.hdf5', 'r') as src, \
            h5py.File('/home/ben/covid/benchmark_dest.hdf5', 'w') as sink:

        print(src.keys())

        left_ids = session.get(src['fk_ids'])
        right_ids = session.get(src['ids'])

        # Destination fields that receive the merged right-hand values.
        merged_sinks = [
            session.create_numeric(sink, 'left_data_{}'.format(col), 'int32')
            for col in range(vcount)
        ]

        # Destination field for the left-to-right index map.
        left_to_right = session.create_numeric(sink, 'a_to_b', 'int64')

        # Right-hand value fields read from the source file.
        right_sources = [
            session.get(src['right_data_{}'.format(col)])
            for col in range(vcount)
        ]

        print("running test")
        started = time.time()

        print(left_ids.data[:100])
        print(right_ids.data[:100])
        print(right_sources[0].data[:100])

        session.ordered_merge_left(
            left_ids,
            right_ids,
            right_field_sources=tuple(right_sources),
            left_field_sinks=tuple(merged_sinks),
            left_to_right_map=left_to_right,
            right_unique=True,
        )
        print(left_to_right.data[:100])

        results = session.merge_left(
            left_ids,
            right_ids,
            right_fields=tuple(right_sources),
        )

        elapsed = time.time() - started
        print("total:", elapsed)

        # Show the first merged column from each of the two merge paths.
        print(merged_sinks[0].data[:100])
        print(results[0][:100])
예제 #2
0
def first_test_date_per_patient(session: Session,
                                patient_table,
                                test_table,
                                test_date_name,
                                dest_patient_table,
                                dest_patient_name):
    """
    Store, for each patient id, the 'created_at' value of the first test row
    belonging to that patient.

    Both tables must already be ordered by their patient id column ('id' in
    the patient table, 'patient_id' in the test table).

    :param session: The Exetera session instance.
    :param patient_table: The patient dataframe; its 'id' field is read.
    :param test_table: The tests dataframe; its 'patient_id' and 'created_at'
        fields are read.
    :param test_date_name: Not used; the date field is always read from
        'created_at'.
    :param dest_patient_table: The destination dataframe to store the results.
    :param dest_patient_name: The name of the destination field to store the results.
    :raises ValueError: If either table is not ordered by its patient id column.
    """

    pid = 'id'
    # Read the id column once and reuse the array for both the ordering check
    # and the merge, rather than fetching it from the field a second time.
    pids_ = session.get(patient_table[pid]).data[:]
    if not ops.is_ordered(pids_):
        raise ValueError("The patient table must be ordered by '{}'".format(pid))

    t_pid = 'patient_id'
    t_pids_ = session.get(test_table[t_pid]).data[:]
    if not ops.is_ordered(t_pids_):
        raise ValueError("The test table must be ordered by '{}'".format(t_pid))

    # Collapse the test rows by patient_id, keeping the first patient id and
    # the first 'created_at' entry of each span.
    created_at = session.get(test_table['created_at'])
    spans_ = session.get_spans(t_pids_)
    first_t_pids_ = session.apply_spans_first(spans_, t_pids_)
    first_created_at_ = session.apply_spans_first(spans_, created_at)

    # Merge the per-patient first 'created_at' values into the patient table.
    # NOTE(review): the destination is created as 'int32'; confirm this matches
    # the representation of 'created_at' values.
    dest = session.create_numeric(dest_patient_table, dest_patient_name, 'int32')
    session.ordered_merge_left(left_on=pids_, right_on=first_t_pids_,
                               right_field_sources=(first_created_at_,),
                               left_field_sinks=(dest,), left_unique=True, right_unique=True)
예제 #3
0
        filt_asmt = tmp.create_group('filt_assessments')
        filt_other_symptoms = other.create_like(filt_asmt, 'other_symptoms')
        s.apply_filter(filter_, other, filt_other_symptoms)
        patient_id = s.get(hf['assessments']['patient_id'])
        filt_patient_id = patient_id.create_like(filt_asmt, 'patient_id')
        s.apply_filter(filter_, patient_id, filt_patient_id)
        print('filtered symptoms len =', len(filt_other_symptoms.data))

        with utils.Timer("merging test_results"):
            p_to_a = s.create_numeric(tmp, 'p_to_a', 'int64')
            a_test_results = s.create_numeric(tmp, 'a_test_results', 'int8')
            s.ordered_merge_left(left_on=s.get(
                tmp['filt_assessments']['patient_id']),
                                 right_on=s.get(hf['patients']['id']),
                                 left_field_sources=(p_test_results, ),
                                 left_field_sinks=(a_test_results, ),
                                 left_to_right_map=p_to_a,
                                 right_unique=True)
        print(len(a_test_results.data))
        print(np.unique(a_test_results.data[:], return_counts=True))

        a_test_results_ = a_test_results.data[:]
        #     filtered_test_results = test_results[filter_]
        #     print("filtered tests:", np.unique(filtered_test_results, return_counts=True))

        indices, text = s.apply_filter(filter_, other)
        istart = indices[:-1]
        iend = indices[1:]
        print(len(indices), len(text))
예제 #4
0
def hs_test_1(length, val_column_count):
    """
    Time an ordered left merge followed by a plain left merge of 'b_ids'
    (left) against 'a_ids' (right), printing samples of inputs and outputs.

    The source file '/home/ben/covid/benchmarking.hdf5' is opened read-only
    and must already provide 'a_ids', 'b_ids' and 'a_vals_{v}' for every v in
    range(val_column_count). The destination file
    '/home/ben/covid/benchmark_dest.hdf5' is opened in 'w' mode and receives
    the fields 'b_vals_{v}' (int32) and 'a_to_b' (int64).

    Disabled generation code that previously sat in this function built those
    inputs with generate_a_ids, generate_a_vals (values 0..100) and
    generate_b_ids, using RandomState(12345678), an id base of 1000000000 and
    the mapping [0, 1, 2, 1], writing them as int64 fields.

    The printed elapsed time spans both merge calls as well as the
    diagnostic prints issued between the two timestamps.

    :param length: Not used by the current body; it was only consumed by the
        disabled data generation code.
    :param val_column_count: The number of value columns to carry through
        the merge. The body indexes the first value column, so it must be at
        least 1.
    """
    session = Session()
    with h5py.File('/home/ben/covid/benchmarking.hdf5', 'r') as src, \
            h5py.File('/home/ben/covid/benchmark_dest.hdf5', 'w') as sink:

        right_ids = session.get(src['a_ids'])
        left_ids = session.get(src['b_ids'])

        # Destination fields that receive the merged right-hand values.
        merged_sinks = [
            session.create_numeric(sink, 'b_vals_{}'.format(col), 'int32')
            for col in range(val_column_count)
        ]

        # Destination field for the left-to-right index map.
        left_to_right = session.create_numeric(sink, 'a_to_b', 'int64')

        # Right-hand value fields read from the source file.
        right_sources = [
            session.get(src['a_vals_{}'.format(col)])
            for col in range(val_column_count)
        ]

        print("running test")
        started = time.time()

        print(left_ids.data[:100])
        print(right_ids.data[:100])

        session.ordered_merge_left(
            left_ids,
            right_ids,
            left_to_right_map=left_to_right,
            right_unique=True,
            right_field_sources=tuple(right_sources),
            left_field_sinks=tuple(merged_sinks),
        )
        print(left_to_right.data[:100])

        results = session.merge_left(
            left_ids,
            right_ids,
            right_fields=tuple(right_sources),
        )

        elapsed = time.time() - started
        print(elapsed)

        # Show the first merged column from each of the two merge paths.
        print(merged_sinks[0].data[:100])
        print(results[0][:100])