Example #1
def generate_dataset(length, val_column_count):
    rng = np.random.RandomState(12345678)
    id_base = 0  #1000000000
    mapping = [0, 1, 2, 1]
    s = Session()
    with h5py.File('/home/ben/covid/benchmarking.hdf5', 'w') as hf:
        with utils.Timer('creating a_ids'):
            a_ids = generate_a_ids(length, id_base)
            a_ids_f = s.create_numeric(hf, 'a_ids', 'int64')
            a_ids_f.data.write(a_ids)
            del a_ids

        print('creating a_vals')
        # all_a_val_fields = list()
        for v in range(val_column_count):
            with utils.Timer("creating a_vals[{}]".format(v)):
                a_vals = generate_a_vals(length, 0, 100, rng)
                a_vals_f = s.create_numeric(hf, 'a_vals_{}'.format(v), 'int64')
                a_vals_f.data.write(a_vals)
                # all_a_val_fields.append(a_vals_f)
                del a_vals

        with utils.Timer('creating b_ids'):
            b_ids = generate_b_ids(length, id_base, mapping)
            b_ids_f = s.create_numeric(hf, 'b_ids', 'int64')
            b_ids_f.data.write(b_ids)
            del b_ids
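
The generate_a_ids / generate_a_vals / generate_b_ids helpers are defined elsewhere in the benchmark module. A minimal sketch of plausible implementations, assuming sequential int64 ids and uniform random values (assumptions, not the published benchmark code):

import numpy as np

# Hypothetical stand-ins for the helpers used above (assumptions only):
# sequential int64 ids and uniform random integer values.
def generate_a_ids(length, id_base):
    return np.arange(id_base, id_base + length, dtype=np.int64)

def generate_a_vals(length, minv, maxv, rng):
    return rng.randint(minv, maxv, size=length).astype(np.int64)

def generate_b_ids(length, id_base, mapping):
    # map each position through the repeating pattern in 'mapping'
    pattern = np.asarray(mapping, dtype=np.int64)
    return id_base + pattern[np.arange(length) % len(pattern)]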
Example #2
def raw_np_test_1(length, count):
    rng = np.random.RandomState(12345678)
    for c in range(count):
        vals = generate_a_vals(length, 0, 100, rng)
        with utils.Timer("writing source vals {}".format(c)):
            np.save('/home/ben/covid/test_save/vals_{}'.format(c), vals)

    for c in range(count):
        vname = '/home/ben/covid/test_save/vals_{}.npy'.format(c)
        with utils.Timer("reading {}".format(vname)):
            vals = np.load(vname)
        vals *= 2
        v2name = '/home/ben/covid/test_save/dest_vals_{}'.format(c)
        with utils.Timer("writing {}".format(v2name)):
            np.save(v2name, vals)
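
All of these snippets time their work with utils.Timer. A minimal stand-in context manager compatible with how it is used here, assuming the real one simply prints elapsed wall-clock time (a sketch, not ExeTeRa's implementation):

import time

# Minimal sketch of a Timer context manager matching the usage in these
# examples; the real utils.Timer may differ in detail.
class Timer:
    def __init__(self, message, new_line=False):
        self.message = message
        self.new_line = new_line
    def __enter__(self):
        if self.new_line:
            print(self.message)
        self.t0 = time.time()
        return self
    def __exit__(self, exc_type, exc_value, traceback):
        print("{} took {:.3f}s".format(self.message, time.time() - self.t0))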
Example #3
def method_paper_model(ds, symptoms_reader_dict, prediction):
    """
    A leaner model to predict Covid positiveness from symptoms.

    :param ds: The Exetera session instance.
    :param symptoms_reader_dict: The dataframe which stores symptoms data.
    :param prediction: A field to store the prediction result.
    """

    intercept = -1.19015973
    weights = {'persistent_cough': 0.23186655,
               'fatigue': 0.56532346,
               'delirium': -0.12935112,
               'shortness_of_breath': 0.58273967,
               'fever': 0.16580974,
               'diarrhoea': 0.10236126,
               'abdominal_pain': -0.11204163,
               'chest_pain': -0.12318634,
               'hoarse_voice': -0.17818597,
               'skipped_meals': 0.25902482,
               'loss_of_smell': 1.82895239}

    with utils.Timer("predicting covid by assessment", new_line=True):
        cumulative = np.zeros(len(symptoms_reader_dict['persistent_cough']), dtype='float32')
        for s in symptoms_reader_dict:
            cumulative += symptoms_reader_dict[s][:] * weights[s]
        cumulative += intercept
        prediction.write(cumulative)
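
A hedged usage sketch: the function only needs a mapping of symptom name to something sliceable and a sink with a write method, so it can be exercised with plain numpy arrays (illustrative stand-ins for the real ExeTeRa readers and field writers; ds is unused by the body above):

import numpy as np

# Stand-ins for illustration only; real callers pass ExeTeRa readers and a
# field's data writer.
class ArraySink:
    def write(self, values):
        self.out = values

rng = np.random.RandomState(0)
names = ('persistent_cough', 'fatigue', 'delirium', 'shortness_of_breath',
         'fever', 'diarrhoea', 'abdominal_pain', 'chest_pain', 'hoarse_voice',
         'skipped_meals', 'loss_of_smell')
symptoms = {k: rng.randint(0, 2, 5) for k in names}
sink = ArraySink()
method_paper_model(None, symptoms, sink)
print(sink.out)  # per-row log-odds scores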
Example #4
def minimal_test_1(length, count):
    rng = np.random.RandomState(12345678)
    with h5py.File('/home/ben/covid/benchmarking.hdf5', 'w') as hf:
        for c in range(count):
            vals = generate_a_vals(length, 0, 100, rng)
            with utils.Timer("writing source vals {}".format(c)):
                hf.create_dataset("vals_{}".format(c),
                                  chunks=(1 << 20, ),
                                  data=vals)

    with h5py.File('/home/ben/covid/benchmarking.hdf5', 'r+') as hf:
        for c in range(count):
            vname = "vals_{}".format(c)
            with utils.Timer("reading {}".format(vname)):
                vals = hf[vname][:]
            vals *= 2
            v2name = "dest_vals_{}".format(c)
            with utils.Timer("writing {}".format(v2name)):
                hf.create_dataset(v2name, chunks=(1 << 20, ), data=vals)
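
chunks=(1 << 20,) asks h5py for 1,048,576-element chunks, i.e. 8 MiB per chunk for int64 data. A quick way to check the layout that was actually created (path and dataset name follow the example and are assumptions about the local setup):

import h5py

with h5py.File('/home/ben/covid/benchmarking.hdf5', 'r') as hf:
    d = hf['vals_0']
    print(d.shape, d.dtype, d.chunks)  # chunks should report (1048576,)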
Example #5
def read_id_from_csv(file_name, field_count):
    import csv
    with open(file_name) as f:
        rdr = csv.reader(f)
        fields = next(rdr)  # skip the header row
        if field_count == 1:
            ids = list()
            with utils.Timer("reading id from dataset"):
                for r in rdr:
                    ids.append(r[0])
        else:
            values = list()
            for _ in range(field_count):
                values.append(list())
            with utils.Timer(
                    "reading {} fields from dataset".format(field_count)):
                for r in rdr:
                    for i in range(field_count):
                        values[i].append(r[i])
                    del r
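
A hedged usage sketch (the CSV path is illustrative, not part of the example):

read_id_from_csv('/home/ben/covid/patients.csv', field_count=1)
read_id_from_csv('/home/ben/covid/patients.csv', field_count=8)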
Example #6
def read_fields_from_hdf5(file_name, field_count):
    fields = ('id', 'created_at', 'updated_at', 'version', 'country_code',
              'reported_by_another', 'same_household_as_reporter',
              'contact_additional_studies', 'year_of_birth', 'height_cm',
              'weight_kg', 'gender', 'race_other', 'ethnicity',
              'profile_attributes_updated_at', 'has_diabetes')
    print(len(fields))
    s = Session()
    with h5py.File(file_name, 'r') as hf:
        with utils.Timer("reading {} fields from dataset".format(field_count)):
            for f in range(field_count):
                field = s.get(hf['patients'][fields[f]])
                if isinstance(field, flds.IndexedStringField):
                    indices = field.indices[:]  # force a full read of the index array
                    values = field.values[:]    # force a full read of the value array
                else:
                    data = field.data[:]  # force a full read of the field data
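
A hedged usage sketch, reading progressively more of the sixteen fields to see how read time scales (the file path is an assumption about the local setup):

for n in (1, 4, 8, 16):
    read_fields_from_hdf5('/home/ben/covid/patients.hdf5', n)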
Example #7
def merging_results(s, source, output):
    list_symptoms = [
        'abdominal_pain', 'altered_smell', 'blisters_on_feet', 'brain_fog',
        'chest_pain', 'chills_or_shivers', 'delirium', 'diarrhoea',
        'diarrhoea_frequency', 'dizzy_light_headed', 'ear_ringing', 'earache',
        'eye_soreness', 'fatigue', 'feeling_down', 'fever', 'hair_loss',
        'headache', 'headache_frequency', 'hoarse_voice',
        'irregular_heartbeat', 'loss_of_smell', 'nausea', 'persistent_cough',
        'rash', 'red_welts_on_face_or_lips', 'runny_nose',
        'shortness_of_breath', 'skin_burning', 'skipped_meals', 'sneezing',
        'sore_throat', 'swollen_glands', 'typical_hayfever',
        'unusual_muscle_pains'
    ]

    #path = '/home/jd21/data'
    #ds = DataStore()
    ts = str(datetime.now(timezone.utc))

    # Same processing, but for the tests table
    src_test = source['tests']
    list_testid = src_test['patient_id']
    list_testcreate = src_test['created_at']
    out_test = output.create_dataframe('tests')
    # ====
    # out_test step 1 copy from src_test
    # ====
    with utils.Timer('copying src_test fields to out_test'):
        for k in src_test.keys():
            dataframe.copy(src_test[k], out_test, k)

    # convert test date
    covid_test_date_v1(s, out_test, out_test, 'date_effective_test')

    # Filtering only definite results

    results_raw = out_test['result'].data[:]
    results_filt = (results_raw == 4) | (results_raw == 3)
    for k in out_test.keys():
        out_test[k].apply_filter(results_filt, in_place=True)

    # Filter check
    # sanity_filter = (date_fin == 0)
    # print(np.sum(sanity_filter))

    # Creating clean mechanism
    reader_mec = out_test['mechanism'].data
    s_reader_mec = s.get(out_test['mechanism'])

    print(len(reader_mec), len(out_test['patient_id'].data))

    reader_ftmec = out_test['mechanism_freetext'].data
    s_reader_ftmec = s.get(out_test['mechanism_freetext'])

    test_type_from_mechanism_v1_standard_input(s, out_test)

    pcr_standard_summarize_v1(s, out_test)

    out_test_fin = output.create_dataframe('tests_fin')
    # ====
    # out_test_fin step 1 copy from out_test
    # ====
    writers_dict = {}
    # other fields
    for k in ('patient_id', 'date_effective_test', 'result', 'pcr_standard'):
        values = out_test[k].data[:]
        if k == 'result':
            values -= 3
        writers_dict[k] = out_test[k].create_like(out_test_fin, k, ts).data
        print(len(values), k)
        writers_dict[k].write_part(values)
    # converted_test
    values = np.zeros(len(out_test_fin['patient_id'].data), dtype='bool')
    writers_dict['converted_test'] = out_test_fin.create_numeric(
        'converted_test', 'bool', timestamp=ts).data
    writers_dict['converted_test'].write_part(values)

    # Taking care of the old test
    src_asmt = source['assessments']
    print(src_asmt.keys())

    # # Remap had_covid_test to 0/1 2 to binary 0,1
    # tcp_flat = np.where(src_asmt['tested_covid_positive'].data[:] < 1, 0, 1)
    # spans = src_asmt['patient_id'].get_spans()
    # # Get the first index at which the hct field is maximum
    # firstnz_tcp_ind = ds.apply_spans_index_of_max(spans, tcp_flat)
    # # Get the index of first element of patient_id when sorted
    # first_hct_ind = spans[:-1]
    # filt_tl = first_hct_ind != firstnz_tcp_ind
    # # Get the indices for which hct changed value (indicating that test happened after the first input)
    # sel_max_ind = ds.apply_filter(filter_to_apply=filt_tl, reader=firstnz_tcp_ind)
    # # Get the index at which test is maximum and for which that hct is possible
    # # max_tcp_ind = ds.apply_spans_index_of_max(spans, src_asmt['tested_covid_positive'].data[:])
    # # filt_max_test = ds.apply_indices(filt_tl, max_tcp )
    # sel_max_tcp = ds.apply_indices(filt_tl, firstnz_tcp_ind)
    # sel_maxtcp_ind = ds.apply_filter(filter_to_apply=filt_tl, reader=firstnz_tcp_ind)
    # # Define usable assessments with correct test based on previous filter on indices

    sel_max_ind, sel_max_tcp = multiple_tests_start_with_negative_v1(
        s, src_asmt)

    usable_asmt_tests = output.create_group('usable_asmt_tests')
    # ====
    # usable_asmt_tests step 1: copy from src_asmt, filter patients w/ multiple test and first ok
    # ====
    for k in ('id', 'patient_id', 'created_at', 'had_covid_test'):
        fld = src_asmt[k].create_like(usable_asmt_tests, k)
        src_asmt[k].apply_index(sel_max_ind, target=fld)
        print(usable_asmt_tests[k].data[0])

    src_asmt['created_at'].create_like(usable_asmt_tests, 'eff_result_time')
    src_asmt['created_at'].apply_index(
        sel_max_tcp, target=usable_asmt_tests['eff_result_time'])

    src_asmt['tested_covid_positive'].create_like(usable_asmt_tests,
                                                  'eff_result')
    src_asmt['tested_covid_positive'].apply_index(
        sel_max_tcp, target=usable_asmt_tests['eff_result'])

    src_asmt['tested_covid_positive'].create_like(usable_asmt_tests,
                                                  'tested_covid_positive')
    src_asmt['tested_covid_positive'].apply_index(
        sel_max_tcp, target=usable_asmt_tests['tested_covid_positive'])

    # ====
    # usable_asmt_tests step 2: filter only positive
    # ====
    # Making sure that the test is definite (either positive or negative)
    filt_deftest = usable_asmt_tests['tested_covid_positive'].data[:] > 1
    # print(len(ds.get_reader(usable_asmt_tests['patient_id'])))
    for k in ('id', 'patient_id', 'created_at', 'had_covid_test',
              'tested_covid_positive', 'eff_result_time', 'eff_result'):
        usable_asmt_tests[k].apply_filter(filt_deftest, in_place=True)

    # ====
    # usable_asmt_tests step 3: add delta_days_test, date_final_test, and pcr_standard fields
    # ====
    # Getting difference between created at (max of hct date) and max of test result (eff_result_time)
    reader_hct = usable_asmt_tests['created_at'].data[:]
    reader_tcp = usable_asmt_tests['eff_result_time'].data[:]
    with utils.Timer('doing delta time'):
        delta_time = reader_tcp - reader_hct
        delta_days = delta_time / 86400
    print(delta_days[:10], delta_time[:10])
    writer = usable_asmt_tests.create_numeric('delta_days_test', 'float32')
    writer.data.write(delta_days)

    # Final day of test
    date_final_test = np.where(delta_days < 7, reader_hct,
                               reader_tcp - 2 * 86400)
    writer = usable_asmt_tests.create_timestamp('date_final_test')
    writer.data.write(date_final_test)
    # print(ds.get_reader(usable_asmt_tests['date_final_test'])[:10], date_final_test[:10])

    pcr_standard = np.ones(len(usable_asmt_tests['patient_id'].data), dtype='int')
    writer = usable_asmt_tests.create_numeric('pcr_standard', 'int')
    writer.data.write(pcr_standard)

    # ====
    # out_test_fin step 2 copy from usable_asmt_tests
    # ====
    list_init = ('patient_id', 'date_final_test', 'tested_covid_positive',
                 'pcr_standard')
    list_final = ('patient_id', 'date_effective_test', 'result',
                  'pcr_standard')
    # Join
    for (i, f) in zip(list_init, list_final):
        values = usable_asmt_tests[i].data[:]
        if f == 'result':
            values -= 2
        # writers_dict[f] = reader.get_writer(out_test_fin, f, ts)
        print(len(values), f)
        writers_dict[f].write(values)
    writers_dict['converted_test'].write(
        np.ones(len(usable_asmt_tests['patient_id'].data), dtype='bool'))

    # ====
    # out_pos step 1: copy from out_test_fin, filter valid result, and write to csv
    # ====
    result_fin = out_test_fin['result'].data[:]
    filt_pos = result_fin == 1
    out_pos = output.create_dataframe('out_pos')
    for k in out_test_fin.keys():
        out_test_fin[k].create_like(out_pos, k)
        out_test_fin[k].apply_filter(filt_pos, target=out_pos[k])
        print(k, len(out_test_fin[k].data), len(filt_pos))

    pat_pos_len = len(out_pos['patient_id'].get_spans()) - 1
    dataset.copy(out_pos, output, 'out_pos_copy')
    save_df_to_csv(out_pos, 'TestedPositiveTestDetails.csv')

    # ====
    # out_pos step 2 filter patient that has assessment
    # ====
    with utils.Timer('Mapping index asmt to pos only'):
        test2pat = prst.foreign_key_is_in_primary_key(
            out_pos['patient_id'].data[:],
            foreign_key=src_asmt['patient_id'].data[:])

    for f in [
            'created_at', 'patient_id', 'treatment', 'other_symptoms',
            'country_code', 'location', 'updated_at'
    ] + list_symptoms:
        #print(f)
        if (f in list(out_pos.keys())):
            out_pos[f].data.clear()
            src_asmt[f].apply_filter(test2pat, target=out_pos[f])
        else:
            src_asmt[f].create_like(out_pos, f)
            src_asmt[f].apply_filter(test2pat, target=out_pos[f])

    # print(len(np.unique(ds.get_reader(out_pos['patient_id'])[:])), len(np.unique(pat_pos[:])))
    print(len(out_pos['patient_id'].get_spans()) - 1, pat_pos_len)
    unique_other, counts = np.unique(out_pos['other_symptoms'].data[:],
                                     return_counts=True)
    dict_other = {'other': unique_other, 'counts': counts}

    df_other = pd.DataFrame.from_dict(dict_other)
    df_other.to_csv('OtherSymptoms.csv')

    #  this is duplicated with 265-273
    # for k in list_symptoms:
    #     print(k)
    #     if k in list(out_pos.keys()):
    #         src_asmt[k].apply_filter(test2pat, target=out_pos[k])
    #     else:
    #         src_asmt[k].create_like(out_pos, k)
    #         src_asmt[k].apply_filter(test2pat, target=out_pos[k])
    # reader = ds.get_reader(src_asmt[k])
    # writer = reader.get_writer(out_pos, k,ts,write_mode='overwrite')
    # ds.apply_filter(test2pat, reader,writer)

    # ====
    # summarize the symptoms
    # ====

    # sum_symp = np.zeros(len(out_pos['patient_id'].data))
    # for k in list_symptoms:
    #     values = out_pos[k].data[:]
    #     if k == 'fatigue' or k == 'shortness_of_breath':
    #         values = np.where(values > 2, np.ones_like(values), np.zeros_like(values))
    #     else:
    #         values = np.where(values > 1, np.ones_like(values), np.zeros_like(values))
    #     sum_symp += values
    sum_symp = sum_up_symptons_v1(out_pos)
    out_pos.create_numeric('sum_symp', 'int').data.write(sum_symp)
    # writer = ds.get_numeric_writer(out_pos, 'sum_symp', dtype='int', timestamp=ts, writemode='overwrite')
    # writer.write(sum_symp)

    # ====
    # filter the symptoms
    # ====
    # symp_flat = np.where(out_pos['sum_symp'].data[:] < 1, 0, 1)
    # spans = out_pos['patient_id'].get_spans()
    # print('Number definitie positive is', len(spans) - 1)
    #
    # # Get the first index at which the hct field is maximum
    # firstnz_symp_ind = ds.apply_spans_index_of_max(spans, symp_flat)
    # max_symp_check = symp_flat[firstnz_symp_ind]
    # # Get the index of first element of patient_id when sorted
    #
    # filt_asymptomatic = max_symp_check == 0
    # print('Number asymptomatic is ', len(spans) - 1 - np.sum(max_symp_check), np.sum(filt_asymptomatic))
    #
    # first_symp_ind = spans[:-1]
    # not_healthy_first = first_symp_ind != firstnz_symp_ind
    # print('Number not healthy first is ', len(spans) - 1 - np.sum(not_healthy_first))
    #
    # spans_valid = ds.apply_filter(not_healthy_first, first_symp_ind)
    # pat_sel = ds.apply_indices(spans_valid, out_pos['patient_id'].data[:])
    # filt_sel = prst.foreign_key_is_in_primary_key(pat_sel, out_pos['patient_id'].data[:])
    #
    # spans_asymp = ds.apply_filter(filt_asymptomatic, first_symp_ind)
    spans_asymp, filt_sel = filter_asymp_and_firstnz_v1(s, out_pos)
    # ====
    # out_pos step 3 filter asymptomatic
    # ====
    pat_asymp = out_pos['patient_id'].apply_index(spans_asymp)
    #pat_asymp = ds.apply_indices(spans_asymp, ds.get_reader(out_pos['patient_id']))
    filt_pata = prst.foreign_key_is_in_primary_key(
        pat_asymp.data[:], out_pos['patient_id'].data[:])

    # ====
    # out_pos_hs step 1 copy from out_pos and apply filter not healthy first
    # ====
    out_pos_hs = output.create_dataframe('out_pos_hs')
    for k in list_symptoms + [
            'created_at', 'patient_id', 'sum_symp', 'country_code', 'location',
            'treatment', 'updated_at'
    ]:
        #print(k)
        out_pos[k].create_like(out_pos_hs, k)
        out_pos[k].apply_filter(filt_sel, target=out_pos_hs[k])
        # reader = ds.get_reader(out_pos[k])
        # writer = reader.get_writer(out_pos_hs, k, ts)
        # ds.apply_filter(filt_sel, reader, writer)

    # dict_final = {}
    # for k in out_pos_hs.keys():
    #     dict_final[k] = out_pos_hs[k].data[:]
    #
    # df_final = pd.DataFrame.from_dict(dict_final)
    # df_final.to_csv(path + '/PositiveSympStartHealthyAllSymptoms.csv')
    save_df_to_csv(out_pos_hs, 'PositiveSympStartHealthyAllSymptoms.csv')

    print('out_pos_asymp')
    # ====
    # out_pos_as 1 out_pos filter asymptomatic
    # ====
    out_pos_as = output.create_dataframe('out_pos_asymp')
    for k in list_symptoms + [
            'created_at', 'patient_id', 'sum_symp', 'country_code', 'location',
            'treatment'
    ]:
        out_pos[k].create_like(out_pos_as, k)
        out_pos[k].apply_filter(filt_pata, target=out_pos_as[k])
        # reader = ds.get_reader(out_pos[k])
        # writer = reader.get_writer(out_pos_as, k, ts)
        # ds.apply_filter(filt_pata, reader, writer)

    # dict_finala = {}
    # for k in out_pos_as.keys():
    #     dict_finala[k] = out_pos_as[k].data[:]
    #
    # df_finala = pd.DataFrame.from_dict(dict_finala)
    # df_finala.to_csv(path + '/PositiveAsympAllSymptoms.csv')
    save_df_to_csv(out_pos_as, 'PositiveAsympAllSymptoms.csv')

    # Based on the final selected patient_id, select the appropriate rows of the patient_table
    src_pat = source['patients']
    filt_pat = prst.foreign_key_is_in_primary_key(
        out_pos_hs['patient_id'].data[:], src_pat['id'].data[:])
    list_interest = [
        'has_cancer', 'has_diabetes', 'has_lung_disease', 'has_heart_disease',
        'has_kidney_disease', 'has_asthma', 'race_is_other',
        'race_is_prefer_not_to_say', 'race_is_uk_asian', 'race_is_uk_black',
        'race_is_uk_chinese', 'race_is_uk_middle_eastern',
        'race_is_uk_mixed_other', 'race_is_uk_mixed_white_black',
        'race_is_uk_white', 'race_is_us_asian', 'race_is_us_black',
        'race_is_us_hawaiian_pacific', 'race_is_us_indian_native',
        'race_is_us_white', 'race_other', 'year_of_birth', 'is_smoker',
        'smoker_status', 'bmi_clean', 'is_in_uk_twins',
        'healthcare_professional', 'gender', 'id', 'blood_group', 'lsoa11cd',
        'already_had_covid'
    ]
    out_pat = output.create_dataframe('patient_pos')
    print('patient_pos')
    for k in list_interest:
        src_pat[k].create_like(out_pat, k)
        src_pat[k].apply_filter(filt_pat, target=out_pat[k])
        # reader = ds.get_reader(src_pat[k])
        # writer = reader.get_writer(out_pat, k, ts)
        # ds.apply_filter(filt_pat, reader, writer)

    # dict_pat = {}
    # for k in list_interest:
    #     values = out_pat[k].data[:]
    #     dict_pat[k] = values
    #
    # df_pat = pd.DataFrame.from_dict(dict_pat)
    # df_pat.to_csv(path + '/PositiveSympStartHealthy_PatDetails.csv')
    save_df_to_csv(out_pat, 'PositiveSympStartHealthy_PatDetails.csv')

    #spans_asymp = ds.apply_filter(filt_asymptomatic, first_symp_ind)
    #pat_asymp = ds.apply_indices(spans_asymp, ds.get_reader(out_pos['patient_id']))
    pat_asymp = out_pos['patient_id'].apply_index(spans_asymp)
    filt_asymp = prst.foreign_key_is_in_primary_key(pat_asymp.data[:],
                                                    src_pat['id'].data[:])
    out_pat_asymp = output.create_dataframe('patient_asymp')
    for k in list_interest:
        src_pat[k].create_like(out_pat_asymp, k)
        src_pat[k].apply_filter(filt_asymp, target=out_pat_asymp[k])
        # reader = ds.get_reader(src_pat[k])
        # writer = reader.get_writer(out_pat_asymp, k, ts)
        # ds.apply_filter(filt_asymp, reader, writer)

    # dict_pata = {}
    # for k in list_interest:
    #     values = out_pat_asymp[k].data[:]
    #     dict_pata[k] = values
    #
    # df_pata = pd.DataFrame.from_dict(dict_pata)
    # df_pata.to_csv(path + '/PositiveAsymp_PatDetails.csv')
    save_df_to_csv(out_pat_asymp, 'PositiveAsymp_PatDetails.csv')
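
The merging steps above rely on prst.foreign_key_is_in_primary_key to build a boolean row filter over the foreign-key side. A minimal in-memory sketch of its semantics using numpy (an illustration, not ExeTeRa's ordered/streamed implementation):

import numpy as np

# Illustrative semantics only: True for each foreign-key row whose key also
# appears in the primary key.
def foreign_key_is_in_primary_key_sketch(primary_key, foreign_key):
    return np.isin(foreign_key, primary_key)

pk = np.array([b'p1', b'p2', b'p3'])
fk = np.array([b'p1', b'p1', b'p4', b'p3'])
print(foreign_key_is_in_primary_key_sketch(pk, fk))  # [ True  True False  True]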
Example #8
def read_file_using_fast_csv_reader(source,
                                    chunk_row_size,
                                    column_offsets,
                                    index_map,
                                    field_importer_list=None,
                                    stop_after_rows=None):
    ESCAPE_VALUE = np.frombuffer(b'"', dtype='S1')[0][0]
    SEPARATOR_VALUE = np.frombuffer(b',', dtype='S1')[0][0]
    NEWLINE_VALUE = np.frombuffer(b'\n', dtype='S1')[0][0]
    WHITE_SPACE_VALUE = np.frombuffer(b' ', dtype='S1')[0][0]

    chunk_row_size *= 2
    time0 = time.time()

    total_byte_size, count_columns, count_rows, chunk_byte_size = get_file_stat(
        source, chunk_row_size)

    column_val_total_count = column_offsets[-1]

    with utils.Timer("read_file_using_fast_csv_reader"):
        chunk_index = 0
        hasHeader = True

        accumulated_written_rows = 0

        # initialize column_inds and column_vals outside of the while-loop
        column_inds = np.zeros(
            (count_columns, count_rows + 1),
            dtype=np.int64)  # add one more row for initial index 0

        # column_vals = np.zeros((count_columns, val_row_count), dtype=np.uint8)
        column_vals = np.zeros(np.int64(column_val_total_count),
                               dtype=np.uint8)

        # growth factor applied when an ndarray fills mid-chunk
        larger_factor = 2
        is_indices_full, is_values_full = False, False

        content = None
        start_index = 0

        ch = 0
        while chunk_index < total_byte_size:
            if stop_after_rows and accumulated_written_rows >= stop_after_rows:
                break

            # read a chunk of file content; when the indices or values arrays
            # filled up, fast_csv_reader must be called again on the same
            # content, so avoid re-reading it from disk
            if not is_indices_full and not is_values_full:
                content = np.fromfile(source,
                                      count=chunk_byte_size,
                                      offset=chunk_index,
                                      dtype=np.uint8)
                start_index = 0

                length_content = content.shape[0]
                if length_content == 0:
                    break

                # check if there's newline at EOF in the last chunk. add one if it's missing
                if chunk_index + length_content == total_byte_size and content[
                        -1] != NEWLINE_VALUE:
                    content = np.append(content, NEWLINE_VALUE)

            offset_pos, written_row_count, is_indices_full, is_values_full, val_full_col_idx = fast_csv_reader(
                content, start_index, column_inds, column_vals, column_offsets,
                hasHeader, ESCAPE_VALUE, SEPARATOR_VALUE, NEWLINE_VALUE,
                WHITE_SPACE_VALUE)

            # convert and write
            for ith, i_c in enumerate(index_map):
                if field_importer_list and field_importer_list[ith]:
                    field_importer_list[ith].transform_and_write_part(
                        column_inds, column_vals, column_offsets, i_c,
                        written_row_count)

            # make column_inds larger if it fills before reaching the end of the chunk
            if is_indices_full:
                indices_row_count = column_inds.shape[1] - 1
                column_inds = np.zeros(
                    (count_columns,
                     np.uint32(indices_row_count * larger_factor + 1)),
                    dtype=np.int64)

            # make column_vals larger if it fills before reaching the end of the chunk
            if is_values_full and val_full_col_idx != -1:
                col_val_count = column_offsets[
                    val_full_col_idx + 1] - column_offsets[val_full_col_idx]
                delta = col_val_count * (larger_factor - 1)
                column_offsets = np.concatenate(
                    (column_offsets[:val_full_col_idx + 1],
                     column_offsets[val_full_col_idx + 1:] + np.int64(delta)))
                column_val_total_count = column_offsets[-1]
                column_vals = np.zeros(np.int64(column_val_total_count),
                                       dtype=np.uint8)

            # reassign
            if is_indices_full or is_values_full:
                start_index = offset_pos
            else:
                chunk_index += offset_pos

            hasHeader = False
            accumulated_written_rows += written_row_count
            ch += 1

            print(
                f"{ch} chunks, {accumulated_written_rows} accumulated_written_rows parsed in {time.time() - time0}s"
            )

        # flush any remaining buffered values at the end
        for ith in range(len(index_map)):
            if field_importer_list and field_importer_list[ith]:
                field_importer_list[ith].flush()

    print(f"Total time {time.time() - time0}s")
Example #9
        ofilter = otherend - otherstart > 0
        print("ofilter:", ofilter.sum(), len(ofilter))
        cfilter = cc == b"GB"
        print("cfilter:", cfilter.sum(), len(cfilter))
        filter_ = ofilter & cfilter
        print("filter_:", filter_.sum(), len(filter_))

        filt_asmt = tmp.create_group('filt_assessments')
        filt_other_symptoms = other.create_like(filt_asmt, 'other_symptoms')
        s.apply_filter(filter_, other, filt_other_symptoms)
        patient_id = s.get(hf['assessments']['patient_id'])
        filt_patient_id = patient_id.create_like(filt_asmt, 'patient_id')
        s.apply_filter(filter_, patient_id, filt_patient_id)
        print('filtered symptoms len =', len(filt_other_symptoms.data))

        with utils.Timer("merging test_results"):
            p_to_a = s.create_numeric(tmp, 'p_to_a', 'int64')
            a_test_results = s.create_numeric(tmp, 'a_test_results', 'int8')
            s.ordered_merge_left(left_on=s.get(
                tmp['filt_assessments']['patient_id']),
                                 right_on=s.get(hf['patients']['id']),
                                 left_field_sources=(p_test_results, ),
                                 left_field_sinks=(a_test_results, ),
                                 left_to_right_map=p_to_a,
                                 right_unique=True)
        print(len(a_test_results.data))
        print(np.unique(a_test_results.data[:], return_counts=True))

        a_test_results_ = a_test_results.data[:]
        #     filtered_test_results = test_results[filter_]
        #     print("filtered tests:", np.unique(filtered_test_results, return_counts=True))
Example #10
def postprocess(dataset, destination, timestamp=None, flags=None):

    if flags is None:
        flags = set()

    do_daily_asmts = 'daily' in flags
    has_patients = 'patients' in dataset.keys()
    has_assessments = 'assessments' in dataset.keys()
    has_tests = 'tests' in dataset.keys()
    has_diet = 'diet' in dataset.keys()

    sort_enabled = lambda x: True
    process_enabled = lambda x: True

    sort_patients = sort_enabled(flags) and True
    sort_assessments = sort_enabled(flags) and True
    sort_tests = sort_enabled(flags) and True
    sort_diet = sort_enabled(flags) and True

    make_assessment_patient_id_fkey = process_enabled(flags) and True
    year_from_age = process_enabled(flags) and True
    clean_weight_height_bmi = process_enabled(flags) and True
    health_worker_with_contact = process_enabled(flags) and True
    clean_temperatures = process_enabled(flags) and True
    check_symptoms = process_enabled(flags) and True
    create_daily = process_enabled(flags) and do_daily_asmts
    make_patient_level_assessment_metrics = process_enabled(flags) and True
    make_patient_level_daily_assessment_metrics = process_enabled(
        flags) and do_daily_asmts
    make_new_test_level_metrics = process_enabled(flags) and True
    make_diet_level_metrics = True
    make_healthy_diet_index = True

    # ds = DataStore(timestamp=timestamp)
    s = Session()

    # patients ================================================================

    sorted_patients_src = None

    if has_patients:
        patients_src = dataset['patients']

        write_mode = 'write'

        if 'patients' not in destination.keys():
            patients_dest = s.get_or_create_group(destination, 'patients')
            sorted_patients_src = patients_dest

            # Patient sort
            # ============
            if sort_patients:
                duplicate_filter = \
                    persistence.filter_duplicate_fields(s.get(patients_src['id']).data[:])

                for k in patients_src.keys():
                    t0 = time.time()
                    r = s.get(patients_src[k])
                    w = r.create_like(patients_dest, k)
                    s.apply_filter(duplicate_filter, r, w)
                    print(f"'{k}' filtered in {time.time() - t0}s")

                print(np.count_nonzero(duplicate_filter),
                      np.count_nonzero(~duplicate_filter))
                sort_keys = ('id', )
                s.sort_on(patients_dest,
                          patients_dest,
                          sort_keys,
                          write_mode='overwrite')

            # Patient processing
            # ==================
            if year_from_age:
                log("year of birth -> age; 18 to 90 filter")
                t0 = time.time()
                yobs = s.get(patients_dest['year_of_birth'])
                yob_filter = s.get(patients_dest['year_of_birth_valid'])
                age = s.create_numeric(patients_dest, 'age', 'uint32')
                age_filter = s.create_numeric(patients_dest, 'age_filter',
                                              'bool')
                age_16_to_90 = s.create_numeric(patients_dest,
                                                '16_to_90_years', 'bool')
                print('year_of_birth:', patients_dest['year_of_birth'])
                for k in patients_dest['year_of_birth'].attrs.keys():
                    print(k, patients_dest['year_of_birth'].attrs[k])
                calculate_age_from_year_of_birth_v1(yobs, yob_filter, 16, 90,
                                                    age, age_filter,
                                                    age_16_to_90, 2020)
                log(f"completed in {time.time() - t0}")

                print('age_filter count:',
                      np.sum(patients_dest['age_filter']['values'][:]))
                print('16_to_90_years count:',
                      np.sum(patients_dest['16_to_90_years']['values'][:]))

            if clean_weight_height_bmi:
                log("height / weight / bmi; standard range filters")
                t0 = time.time()

                weights_clean = s.create_numeric(patients_dest,
                                                 'weight_kg_clean', 'float32')
                weights_filter = s.create_numeric(patients_dest,
                                                  '40_to_200_kg', 'bool')
                heights_clean = s.create_numeric(patients_dest,
                                                 'height_cm_clean', 'float32')
                heights_filter = s.create_numeric(patients_dest,
                                                  '110_to_220_cm', 'bool')
                bmis_clean = s.create_numeric(patients_dest, 'bmi_clean',
                                              'float32')
                bmis_filter = s.create_numeric(patients_dest, '15_to_55_bmi',
                                               'bool')

                weight_height_bmi_v1(s, 40, 200, 110, 220, 15, 55, None, None,
                                     None, None, patients_dest['weight_kg'],
                                     patients_dest['weight_kg_valid'],
                                     patients_dest['height_cm'],
                                     patients_dest['height_cm_valid'],
                                     patients_dest['bmi'],
                                     patients_dest['bmi_valid'], weights_clean,
                                     weights_filter, None, heights_clean,
                                     heights_filter, None, bmis_clean,
                                     bmis_filter, None)
                log(f"completed in {time.time() - t0}")

            if health_worker_with_contact:
                with utils.Timer("health_worker_with_contact field"):
                    #writer = ds.get_categorical_writer(patients_dest, 'health_worker_with_contact', 'int8')
                    combined_hcw_with_contact_v1(
                        s, s.get(patients_dest['healthcare_professional']),
                        s.get(patients_dest['contact_health_worker']),
                        s.get(patients_dest['is_carer_for_community']),
                        patients_dest, 'health_worker_with_contact')

    # assessments =============================================================

    sorted_assessments_src = None
    if has_assessments:
        assessments_src = dataset['assessments']
        if 'assessments' not in destination.keys():
            assessments_dest = s.get_or_create_group(destination,
                                                     'assessments')
            sorted_assessments_src = assessments_dest

            if sort_assessments:
                sort_keys = ('patient_id', 'created_at')
                with utils.Timer("sorting assessments"):
                    s.sort_on(assessments_src, assessments_dest, sort_keys)

            if has_patients:
                if make_assessment_patient_id_fkey:
                    print(
                        "creating 'assessment_patient_id_fkey' foreign key index for 'patient_id'"
                    )
                    t0 = time.time()
                    patient_ids = s.get(sorted_patients_src['id'])
                    assessment_patient_ids =\
                        s.get(sorted_assessments_src['patient_id'])
                    assessment_patient_id_fkey =\
                        s.create_numeric(assessments_dest, 'assessment_patient_id_fkey', 'int64')
                    s.get_index(patient_ids.data[:],
                                assessment_patient_ids.data[:],
                                assessment_patient_id_fkey)
                    print(f"completed in {time.time() - t0}s")

            if clean_temperatures:
                print("clean temperatures")
                t0 = time.time()
                temps = s.get(sorted_assessments_src['temperature'])
                temp_units = s.get(sorted_assessments_src['temperature_unit'])
                temps_valid = s.get(
                    sorted_assessments_src['temperature_valid'])
                dest_temps = temps.create_like(assessments_dest,
                                               'temperature_c_clean')
                dest_temps_valid = temps_valid.create_like(
                    assessments_dest, 'temperature_35_to_42_inclusive')
                dest_temps_modified = temps_valid.create_like(
                    assessments_dest, 'temperature_modified')
                validate_temperature_v1(s, 35.0, 42.0, temps, temp_units,
                                        temps_valid, dest_temps,
                                        dest_temps_valid, dest_temps_modified)
                print(f"temperature cleaning done in {time.time() - t0}")

            if check_symptoms:
                print('check inconsistent health_status')
                t0 = time.time()
                check_inconsistent_symptoms_v1(s, sorted_assessments_src,
                                               assessments_dest)
                print(time.time() - t0)

    # tests ===================================================================

    if has_tests:
        if sort_tests:
            tests_src = dataset['tests']
            tests_dest = s.get_or_create_group(destination, 'tests')
            sort_keys = ('patient_id', 'created_at')
            s.sort_on(tests_src, tests_dest, sort_keys)

    # diet ====================================================================

    if has_diet:
        diet_src = dataset['diet']
        if 'diet' not in destination.keys():
            diet_dest = s.get_or_create_group(destination, 'diet')
            sorted_diet_src = diet_dest
            if sort_diet:
                sort_keys = ('patient_id', 'display_name', 'id')
                s.sort_on(diet_src, diet_dest, sort_keys)

    if has_assessments:
        if do_daily_asmts:
            daily_assessments_dest = s.get_or_create_group(
                destination, 'daily_assessments')

    # post process patients
    # TODO: need a transaction table

    print(patients_src.keys())
    print(dataset['assessments'].keys())
    print(dataset['tests'].keys())

    # write_mode = 'overwrite'
    write_mode = 'write'

    # Daily assessments
    # =================

    if has_assessments:
        if create_daily:
            print("generate daily assessments")
            patient_ids = s.get(sorted_assessments_src['patient_id'])
            created_at_days = s.get(sorted_assessments_src['created_at_day'])
            raw_created_at_days = created_at_days.data[:]

            if 'assessment_patient_id_fkey' in assessments_src.keys():
                patient_id_index = assessments_src[
                    'assessment_patient_id_fkey']
            else:
                patient_id_index = assessments_dest[
                    'assessment_patient_id_fkey']
            patient_id_indices = s.get(patient_id_index)
            raw_patient_id_indices = patient_id_indices.data[:]

            print("Calculating patient id index spans")
            t0 = time.time()
            patient_id_index_spans = s.get_spans(
                fields=(raw_patient_id_indices, raw_created_at_days))
            print(
                f"Calculated {len(patient_id_index_spans)-1} spans in {time.time() - t0}s"
            )

            print("Applying spans to 'health_status'")
            t0 = time.time()
            default_behaviour_overrides = {
                'id': s.apply_spans_last,
                'patient_id': s.apply_spans_last,
                'patient_index': s.apply_spans_last,
                'created_at': s.apply_spans_last,
                'created_at_day': s.apply_spans_last,
                'updated_at': s.apply_spans_last,
                'updated_at_day': s.apply_spans_last,
                'version': s.apply_spans_max,
                'country_code': s.apply_spans_first,
                'date_test_occurred': None,
                'date_test_occurred_guess': None,
                'date_test_occurred_day': None,
                'date_test_occurred_set': None,
            }
            for k in sorted_assessments_src.keys():
                t1 = time.time()
                reader = s.get(sorted_assessments_src[k])
                if k in default_behaviour_overrides:
                    apply_span_fn = default_behaviour_overrides[k]
                    if apply_span_fn is not None:
                        apply_span_fn(
                            patient_id_index_spans, reader,
                            reader.create_like(daily_assessments_dest, k))
                        print(f"  Field {k} aggregated in {time.time() - t1}s")
                    else:
                        print(f"  Skipping field {k}")
                else:
                    if isinstance(reader, fields.CategoricalField):
                        s.apply_spans_max(
                            patient_id_index_spans, reader,
                            reader.create_like(daily_assessments_dest, k))
                        print(f"  Field {k} aggregated in {time.time() - t1}s")
                    elif isinstance(reader, rw.IndexedStringReader):
                        s.apply_spans_concat(
                            patient_id_index_spans, reader,
                            reader.create_like(daily_assessments_dest, k))
                        print(f"  Field {k} aggregated in {time.time() - t1}s")
                    elif isinstance(reader, rw.NumericReader):
                        s.apply_spans_max(
                            patient_id_index_spans, reader,
                            reader.create_like(daily_assessments_dest, k))
                        print(f"  Field {k} aggregated in {time.time() - t1}s")
                    else:
                        print(f"  No function for {k}")

            print(f"apply_spans completed in {time.time() - t0}s")

    if has_patients and has_assessments:
        if make_patient_level_assessment_metrics:
            if 'assessment_patient_id_fkey' in assessments_dest:
                src = assessments_dest['assessment_patient_id_fkey']
            else:
                src = assessments_src['assessment_patient_id_fkey']
            assessment_patient_id_fkey = s.get(src)
            # generate spans from the assessment-space patient_id foreign key
            spans = s.get_spans(field=assessment_patient_id_fkey.data[:])

            ids = s.get(patients_dest['id'])

            print('calculate assessment counts per patient')
            t0 = time.time()
            writer = s.create_numeric(patients_dest, 'assessment_count',
                                      'uint32')
            aggregated_counts = s.apply_spans_count(spans)
            s.join(ids, assessment_patient_id_fkey, aggregated_counts, writer,
                   spans)
            print(
                f"calculated assessment counts per patient in {time.time() - t0}"
            )

            print('calculate first assessment days per patient')
            t0 = time.time()
            reader = s.get(sorted_assessments_src['created_at_day'])
            writer = s.create_fixed_string(patients_dest,
                                           'first_assessment_day', 10)
            aggregated_counts = s.apply_spans_first(spans, reader)
            s.join(ids, assessment_patient_id_fkey, aggregated_counts, writer,
                   spans)
            print(
                f"calculated first assessment days per patient in {time.time() - t0}"
            )

            print('calculate last assessment days per patient')
            t0 = time.time()
            reader = s.get(sorted_assessments_src['created_at_day'])
            writer = s.create_fixed_string(patients_dest,
                                           'last_assessment_day', 10)
            aggregated_counts = s.apply_spans_last(spans, reader)
            s.join(ids, assessment_patient_id_fkey, aggregated_counts, writer,
                   spans)
            print(
                f"calculated last assessment days per patient in {time.time() - t0}"
            )

            print('calculate maximum assessment test result per patient')
            t0 = time.time()
            reader = s.get(sorted_assessments_src['tested_covid_positive'])
            writer = reader.create_like(patients_dest,
                                        'max_assessment_test_result')
            max_result_value = s.apply_spans_max(spans, reader)
            s.join(ids, assessment_patient_id_fkey, max_result_value, writer,
                   spans)
            print(
                f"calculated maximum assessment test result in {time.time() - t0}"
            )

    if has_assessments and do_daily_asmts and make_patient_level_daily_assessment_metrics:
        print(
            "creating 'daily_assessment_patient_id_fkey' foreign key index for 'patient_id'"
        )
        t0 = time.time()
        patient_ids = s.get(sorted_patients_src['id'])
        daily_assessment_patient_ids =\
            s.get(daily_assessments_dest['patient_id'])
        daily_assessment_patient_id_fkey =\
            s.create_numeric(daily_assessments_dest, 'daily_assessment_patient_id_fkey', 'int64')
        s.get_index(patient_ids, daily_assessment_patient_ids,
                    daily_assessment_patient_id_fkey)
        print(f"completed in {time.time() - t0}s")

        spans = s.get_spans(field=s.get(
            daily_assessments_dest['daily_assessment_patient_id_fkey']))

        print('calculate daily assessment counts per patient')
        t0 = time.time()
        writer = s.create_numeric(patients_dest, 'daily_assessment_count',
                                  'uint32')
        aggregated_counts = s.apply_spans_count(spans)
        daily_assessment_patient_id_fkey =\
            s.get(daily_assessments_dest['daily_assessment_patient_id_fkey'])
        s.join(ids, daily_assessment_patient_id_fkey, aggregated_counts,
               writer, spans)
        print(
            f"calculated daily assessment counts per patient in {time.time() - t0}"
        )

    if has_tests and make_new_test_level_metrics:
        print(
            "creating 'test_patient_id_fkey' foreign key index for 'patient_id'"
        )
        t0 = time.time()
        patient_ids = s.get(sorted_patients_src['id'])
        test_patient_ids = s.get(tests_dest['patient_id'])
        test_patient_id_fkey = s.create_numeric(tests_dest,
                                                'test_patient_id_fkey',
                                                'int64')
        s.get_index(patient_ids, test_patient_ids, test_patient_id_fkey)
        test_patient_id_fkey = s.get(tests_dest['test_patient_id_fkey'])
        spans = s.get_spans(field=test_patient_id_fkey)
        print(f"completed in {time.time() - t0}s")

        print('calculate test_counts per patient')
        t0 = time.time()
        writer = s.create_numeric(patients_dest, 'test_count', 'uint32')
        aggregated_counts = s.apply_spans_count(spans)
        s.join(ids, test_patient_id_fkey, aggregated_counts, writer, spans)
        print(f"calculated test counts per patient in {time.time() - t0}")

        print('calculate test_result per patient')
        t0 = time.time()
        test_results = s.get(tests_dest['result'])
        writer = test_results.create_like(patients_dest, 'max_test_result')
        aggregated_results = s.apply_spans_max(spans, test_results)
        s.join(ids, test_patient_id_fkey, aggregated_results, writer, spans)
        print(f"calculated max_test_result per patient in {time.time() - t0}")

    if has_diet and make_diet_level_metrics:
        with utils.Timer("Making patient-level diet questions count",
                         new_line=True):
            d_pids_ = s.get(diet_dest['patient_id']).data[:]
            d_pid_spans = s.get_spans(d_pids_)
            d_distinct_pids = s.apply_spans_first(d_pid_spans, d_pids_)
            d_pid_counts = s.apply_spans_count(d_pid_spans)
            p_diet_counts = s.create_numeric(patients_dest, 'diet_counts',
                                             'int32')
            s.merge_left(left_on=s.get(patients_dest['id']).data[:],
                         right_on=d_distinct_pids,
                         right_fields=(d_pid_counts, ),
                         right_writers=(p_diet_counts, ))
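
Much of the aggregation above hinges on get_spans, which returns the boundary indices of runs of equal keys in a sorted field, so spans[i]:spans[i+1] delimits one patient's rows. A minimal numpy rendering of the idea (a sketch of the semantics, not ExeTeRa's implementation):

import numpy as np

# spans[i]:spans[i+1] delimits the i-th run of equal keys in a sorted array.
def get_spans_sketch(keys):
    changes = np.nonzero(keys[1:] != keys[:-1])[0] + 1
    return np.concatenate(([0], changes, [len(keys)]))

keys = np.array([b'a', b'a', b'b', b'c', b'c', b'c'])
print(get_spans_sketch(keys))  # [0 2 3 6]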
Example #11
def journal_table(session, schema, old_src, new_src, src_pk, result):
    old_keys = set(old_src.keys())
    new_keys = set(new_src.keys())

    common_keys = old_keys.intersection(new_keys)
    common_keys.remove('j_valid_from')
    common_keys.remove('j_valid_to')
    old_only_keys = old_keys.difference(new_keys)
    new_only_keys = new_keys.difference(old_keys)

    with utils.Timer("sorting old ids"):
        old_ids = session.get(old_src[src_pk])
        old_ids_ = old_ids.data[:]
        old_ids_valid_from = session.get(old_src['j_valid_from']).data[:]
        old_sorted_index = session.dataset_sort_index((old_ids_, old_ids_valid_from))
    old_count = len(old_ids_)

    with utils.Timer("sorting new_ids"):
        new_ids_ = session.get(new_src[src_pk]).data[:]
        new_sorted_index = session.dataset_sort_index((new_ids_,))
    new_count = len(new_ids_)

    # print("old_ids:", old_ids_[old_sorted_index[:20]])
    # print("new_ids:", new_ids_[new_sorted_index[:20]])

    # get the row maps for rows that we need to compare
    with utils.Timer("generating row_maps for merging"):
        old_ids_ = old_ids_[old_sorted_index]
        new_ids_ = new_ids_[new_sorted_index]
        old_map, new_map = ops.ordered_generate_journalling_indices(old_ids_, new_ids_)

    to_keep = np.zeros(len(old_map), dtype=bool)

    schema_fields = schema.fields.keys()
    common_keys = [k for k in schema_fields if k in common_keys]
    print("old_map:", old_map)
    print("new_map:", new_map)

    for k in common_keys:
        if k in (src_pk, 'j_valid_from', 'j_valid_to'):
            continue
        old_f = session.get(old_src[k])
        new_f = session.get(new_src[k])
        print(k)
        if isinstance(old_f, flds.IndexedStringField):
            old_f_i_, old_f_v_ = session.apply_index(old_sorted_index, old_f)
            new_f_i_, new_f_v_ = session.apply_index(new_sorted_index, new_f)
            ops.compare_indexed_rows_for_journalling(old_map, new_map,
                                                     old_f_i_, old_f_v_, new_f_i_, new_f_v_,
                                                     to_keep)
        else:
            old_f_ = session.apply_index(old_sorted_index, old_f)
            new_f_ = session.apply_index(new_sorted_index, new_f)
            ops.compare_rows_for_journalling(old_map, new_map, old_f_, new_f_, to_keep)

        print("to_keep:", to_keep.astype(np.uint8))
        print(to_keep.sum(), len(to_keep))

    merged_length = len(old_ids.data) + to_keep.sum()

    only_in_old = 0
    only_in_new = 0
    not_updated = 0
    updated = 0
    for i in range(len(old_map)):
        if old_map[i] == -1:
            only_in_new += 1
        if new_map[i] == -1:
            only_in_old += 1
        if old_map[i] != -1 and to_keep[i]:
            updated += 1
        if new_map[i] != -1 and not to_keep[i]:
            not_updated += 1

    for k in common_keys:
        if k in (src_pk, 'j_valid_from', 'j_valid_to'):
            continue
        old_f = session.get(old_src[k])
        new_f = session.get(new_src[k])
        print(k)
        if isinstance(old_f, flds.IndexedStringField):
            old_f_i_, old_f_v_ = session.apply_index(old_sorted_index, old_f)
            new_f_i_, new_f_v_ = session.apply_index(new_sorted_index, new_f)
            dest_i_ = np.zeros(merged_length + 1, old_f_i_.dtype)
            val_count = ops.merge_indexed_journalled_entries_count(old_map, new_map, to_keep,
                                                                   old_f_i_, new_f_i_)
            dest_v_ = np.zeros(val_count, old_f_v_.dtype)
            ops.merge_indexed_journalled_entries(old_map, new_map, to_keep,
                                                 old_f_i_, old_f_v_, new_f_i_, new_f_v_,
                                                 dest_i_, dest_v_)
            dest_f = new_f.create_like(result, k)
            dest_f.indices.write(dest_i_)
            dest_f.values.write(dest_v_)

        else:
            old_f_v_ = session.apply_index(old_sorted_index, old_f)
            new_f_v_ = session.apply_index(new_sorted_index, new_f)
            dest_ = np.zeros(merged_length, old_f_v_.dtype)
            ops.merge_journalled_entries(old_map, new_map, to_keep, old_f_v_, new_f_v_, dest_)
            dest_f = new_f.create_like(result, k)
            dest_f.data.write(dest_)

    print("old_count:", old_count)
    print("new_count:", new_count)
    print("only in old:", only_in_old)
    print("only in new:", only_in_new)
    print("updated:", updated)
    print("not updated:", not_updated)
    print("post journal count:", merged_length)
Example #12
def method_paper_prediction_pipeline(ds, src_data, dest_data, first_timestamp, last_timestamp):
    s_ptnts = src_data['patients']
    s_asmts = src_data['assessments']
    s_tests = src_data['tests']

    first_dt = datetime.fromtimestamp(first_timestamp)
    last_dt = datetime.fromtimestamp(last_timestamp)
    print(s_tests.keys())

    # Filter patients to be only from England
    # =======================================

    eng_pats = set()
    p_ids_ = ds.get_reader(s_ptnts['id'])[:]
    p_lsoas_ = ds.get_reader(s_ptnts['lsoa11cd'])[:]
    for i in range(len(p_ids_)):
        lsoa = p_lsoas_[i]
        if len(lsoa) > 0 and lsoa[0] == 69:  # 69 == ord('E'): England LSOA codes start with 'E'
            eng_pats.add(p_ids_[i])
    print("eng pats:", len(eng_pats))

    if "flat_asmts" not in dest_data.keys():
        flat_tests = dest_data.create_group('flat_tests')

        # Filter tests
        # ============

        t_cats = ds.get_reader(s_tests['created_at'])
        raw_t_cats = t_cats[:]
        t_dts = ds.get_reader(s_tests['date_taken_specific'])
        raw_t_dts = t_dts[:]
        t_dsbs = ds.get_reader(s_tests['date_taken_between_start'])
        raw_t_dsbs = t_dsbs[:]
        t_dsbe = ds.get_reader(s_tests['date_taken_between_end'])
        raw_t_dsbe = t_dsbe[:]

        # remove non GB tests
        cur_filter = (ds.get_reader(s_tests['country_code'])[:] == b'GB')
        test_filter = cur_filter[:]
        print("standard test filter GB:", np.count_nonzero(test_filter), len(test_filter))

        # remove non england tests
        t_pids_ = ds.get_reader(s_tests['patient_id'])[:]
        cur_filter = np.zeros(len(t_pids_), dtype=bool)
        for i in range(len(t_pids_)):
            cur_filter[i] = t_pids_[i] in eng_pats
        test_filter = test_filter & cur_filter
        print("standard test filter Eng:", np.count_nonzero(test_filter), len(test_filter))

        # remove tests where no dates are set
        cur_filter = np.logical_not((raw_t_dts == 0) & (raw_t_dsbs == 0) & (raw_t_dsbe == 0))
        test_filter = test_filter & cur_filter
        print("standard test filter 1:", np.count_nonzero(test_filter), len(test_filter))

        # remove tests where all three dates are set
        cur_filter = np.logical_not((raw_t_dts != 0) & (raw_t_dsbs != 0) & (raw_t_dsbe != 0))
        test_filter = test_filter & cur_filter
        print("standard test filter 2:", np.count_nonzero(test_filter), len(test_filter))

        # remove tests where only one of the date range tests is set
        cur_filter = np.logical_not((raw_t_dsbs != 0) & (raw_t_dsbe == 0) |
                                    (raw_t_dsbs == 0) & (raw_t_dsbe != 0))
        test_filter = test_filter & cur_filter
        print("standard test filter 3:", np.count_nonzero(test_filter), len(test_filter))

        # remove tests where specific date is set but out of range
        cur_filter =\
            (raw_t_dts == 0) | ((raw_t_dts >= first_timestamp) & (raw_t_dts <= last_timestamp))
        test_filter = test_filter & cur_filter
        print("standard test filter 4:", np.count_nonzero(test_filter), len(test_filter))

        # remove tests where beginning date is set but out of range
        cur_filter =\
            (raw_t_dsbs == 0) | ((raw_t_dsbs >= first_timestamp) & (raw_t_dsbs <= last_timestamp))
        test_filter = test_filter & cur_filter
        print("standard test filter 5:", np.count_nonzero(test_filter), len(test_filter))

        # remove tests where ending date is set but out of range
        cur_filter = \
            (raw_t_dsbe == 0) | ((raw_t_dsbe >= first_timestamp) & (raw_t_dsbe <= last_timestamp))
        test_filter = test_filter & cur_filter
        print("standard test filter 6:", np.count_nonzero(test_filter), len(test_filter))

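        # effective test date: the specific date when set, otherwise the
        # midpoint of the date-taken-between range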
        test_timestamps = np.where(raw_t_dts != 0,
                                   raw_t_dts,
                                   raw_t_dsbs + (raw_t_dsbe - raw_t_dsbs) / 2)

        # remove tests where the test date is after the created at date
        cur_filter = test_timestamps <= raw_t_cats
        test_filter = test_filter & cur_filter
        print("standard test filter 7:", np.count_nonzero(test_filter), len(test_filter))

        t_rsts = ds.get_reader(s_tests['result'])
        t_rsts.get_writer(flat_tests, 'result').write(ds.apply_filter(test_filter, t_rsts))
        t_pids = ds.get_reader(s_tests['patient_id'])
        t_pids.get_writer(flat_tests, 'patient_id').write(ds.apply_filter(test_filter, t_pids))
        ds.get_timestamp_writer(flat_tests, 'eff_test_date').write(
            ds.apply_filter(test_filter, test_timestamps))

        # test_min_ts = datetime.fromtimestamp(test_timestamps[test_filter].min())
        # test_max_ts = datetime.fromtimestamp(test_timestamps[test_filter].max())
        # print(test_min_ts, test_max_ts)
    else:
        flat_tests = dest_data["flat_tests"]


    symptoms = ('persistent_cough', 'fatigue', 'delirium', 'shortness_of_breath', 'fever',
                'diarrhoea', 'abdominal_pain', 'chest_pain', 'hoarse_voice', 'skipped_meals',
                'loss_of_smell')

    if "flat_asmts" not in dest_data.keys():
        flat_asmts = dest_data.create_group('flat_asmts')

        # Filter assessments
        # ------------------

        symptom_thresholds = {s: 2 for s in symptoms}
        symptom_thresholds['fatigue'] = 3
        symptom_thresholds['shortness_of_breath'] = 3

        with utils.Timer("filter all out of date range assessments and non-uk assessments", new_line=True):
            a_cats = ds.get_reader(s_asmts['created_at'])[:]
            # in_date_range = (a_cats >= first_timestamp) & (a_cats < last_timestamp)
            in_date_range = a_cats >= first_timestamp
            in_date_range = in_date_range & (ds.get_reader(s_asmts['country_code'])[:] == b'GB')

            a_pids = ds.get_reader(s_asmts['patient_id'])[:]
            in_eng = np.zeros(len(a_pids), dtype=bool)
            for i in range(len(a_pids)):
                if a_pids[i] in eng_pats:
                    in_eng[i] = True
            print("in_eng:", in_eng.sum(), len(in_eng))
            in_date_range = in_date_range & in_eng

        with utils.Timer("get indices of final assessments of each day for each person"):
            f_a_pids = ds.apply_filter(in_date_range, a_pids)
            f_a_catds = ds.apply_filter(in_date_range, ds.get_reader(s_asmts['created_at_day'])[:])
            spans = ds.get_spans(f_a_pids)

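            # outer spans group the filtered assessments by patient; within each
            # patient, inner spans group rows by day, and the last row of each
            # day is kept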
            last_daily_asmt_filter = np.zeros(len(f_a_pids), dtype=bool)
            for s in range(len(spans)-1):
                sb = spans[s]
                se = spans[s+1]
                subspans = ds.get_spans(f_a_catds[sb:se])
                if s < 3:
                    print(subspans)
                for s2 in range(1, len(subspans)):
                    last_daily_asmt_filter[sb + subspans[s2]-1] = True
            print("last_daily_asmt_filter:", last_daily_asmt_filter.sum())
            print(last_daily_asmt_filter[:50])

            # otherspans = ds.get_spans(f_a_catds)
            # last_daily_asmts = np.zeros(len(otherspans)-1, dtype='int64')
            # ds.apply_spans_index_of_last(otherspans, last_daily_asmts)
            # print("last_daily_asmts:", len(last_daily_asmts))

        # pc = ds.get_reader(s_asmts['persistent_cough'])[:]
        # pc1 = ds.apply_indices(last_daily_asmts, ds.apply_filter(in_date_range, pc))
        # pc2 = ds.apply_indices(last_daily_asmts, pc)
        # print(len(pc1), len(pc2))
        # print(np.array_equal(pc1, pc2))


        with utils.Timer("flattening and filtering symptoms"):
            for s in symptoms:
                reader = ds.get_reader(s_asmts[s])
                writer = ds.get_numeric_writer(flat_asmts, s, 'bool')
                filtered = ds.apply_filter(last_daily_asmt_filter, ds.apply_filter(in_date_range, reader[:]))
                writer.write(filtered >= symptom_thresholds[s])

        with utils.Timer("flattening and filtering other fields", new_line=True):
            for f in ('id', 'patient_id', 'created_at', 'created_at_day', 'tested_covid_positive'):
                reader = ds.get_reader(s_asmts[f])
                writer = reader.get_writer(flat_asmts, f)
                ds.apply_filter(in_date_range, reader, writer)
                reader = ds.get_reader(flat_asmts[f])
                writer = reader.get_writer(flat_asmts, f, write_mode='overwrite')
                ds.apply_filter(last_daily_asmt_filter, reader, writer)
                print("  {}".format(f), len(ds.get_reader(flat_asmts[f])))

        # telemetry only
        for s in symptoms:
            print(s, len(ds.get_reader(flat_asmts[s])),
                  np.count_nonzero(ds.get_reader(flat_asmts[s])[:]))
    else:
        flat_asmts = dest_data["flat_asmts"]


    # Filter tests
    # ------------

    # # filter tests within day range first
    # t_cats = ds.get_reader(s_tests['created_at'])
    # raw_t_cats = t_cats[:]
    # t_rsts = ds.get_reader(s_tests['result'])
    # t_pids = ds.get_reader(s_tests['patient_id'])
    # # test_date_filter = (raw_t_cats >= first_timestamp) & (raw_t_cats < last_timestamp)
    # test_date_filter = raw_t_cats >= first_timestamp
    # test_date_filter = test_date_filter & (ds.get_reader(s_tests['country_code'])[:] == b'GB')
    # t_cats.get_writer(flat_tests, 'created_at').write(ds.apply_filter(test_date_filter, raw_t_cats))
    # t_rsts.get_writer(flat_tests, 'result').write(ds.apply_filter(test_date_filter, t_rsts))
    # t_pids.get_writer(flat_tests, 'patient_id').write(ds.apply_filter(test_date_filter, t_pids))
    #
    # raw_t_cats = ds.get_reader(flat_tests['created_at'])[:]
    # min_test_day = datetime.fromtimestamp(np.min(raw_t_cats))
    # max_test_day = datetime.fromtimestamp(np.max(raw_t_cats))
    # print(min_test_day, max_test_day)

    # Calculate prevalence
    # --------------------

    if 'prediction' not in flat_asmts:
        intercept = -1.19015973
        weights = {'persistent_cough': 0.23186655,
                   'fatigue': 0.56532346,
                   'delirium': -0.12935112,
                   'shortness_of_breath': 0.58273967,
                   'fever': 0.16580974,
                   'diarrhoea': 0.10236126,
                   'abdominal_pain': -0.11204163,
                   'chest_pain': -0.12318634,
                   'hoarse_voice': -0.17818597,
                   'skipped_meals': 0.25902482,
                   'loss_of_smell': 1.82895239}

        with utils.Timer("predicting covid by assessment", new_line=True):
            cumulative = np.zeros(len(ds.get_reader(flat_asmts['persistent_cough'])), dtype='float64')
            for s in symptoms:
                reader = ds.get_reader(flat_asmts[s])
                cumulative += reader[:] * weights[s]
            cumulative += intercept
            print("  {}".format(len(cumulative)))
            ds.get_numeric_writer(flat_asmts, 'prediction', 'float32', writemode='overwrite').write(cumulative)
            pos_filter = cumulative > 0.0
            print("pos_filter: ", np.count_nonzero(pos_filter), len(pos_filter))
    else:
        cumulative = ds.get_reader(flat_asmts['prediction'])[:]

    # apply
    # positive test -> imputed positive -> negative test
    spans = ds.get_spans(ds.get_reader(flat_asmts['patient_id'])[:])
    print('spans:', len(spans))

    # generate a numpy array for each day, where each entry in the array is a patient with
    # assessments still in the dataset after the initial filter

    daydict = defaultdict(int)
    with utils.Timer("checking date deltas", new_line=True):
        a_cats = ds.get_reader(flat_asmts['created_at'])[:]
        first_day = datetime.fromtimestamp(first_timestamp)
        for i_r in range(len(a_cats)):
            daydict[(datetime.fromtimestamp(a_cats[i_r]) - first_day).days] += 1
        sdaydict = sorted(daydict.items())
        print(sdaydict)

    # build a combined id index for assessments and tests
    # ---------------------------------------------------
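    # get_shared_index maps both id arrays onto a single shared index space, so
    # a patient's assessment rows and test rows land on the same integer index
    # in the per-day arrays built below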
    remaining_a_pids = ds.get_reader(flat_asmts['patient_id'])[:]
    remaining_t_pids = ds.get_reader(flat_tests['patient_id'])[:]
    print("pids from assessments and tests:", len(remaining_a_pids), len(remaining_t_pids),
          len(set(remaining_a_pids).union(set(remaining_t_pids))))
    a_pid_index, t_pid_index = ds.get_shared_index((remaining_a_pids, remaining_t_pids))
    print("merging indices:", len(a_pid_index), len(t_pid_index), max(np.max(a_pid_index), np.max(t_pid_index)))

    max_index = max(a_pid_index[-1], t_pid_index[-1])
    print('max indices:', a_pid_index[-1], t_pid_index[-1])


    # calculate offset days for assessments
    # -------------------------------------

    first_day = datetime.fromtimestamp(first_timestamp)
    a_cats = ds.get_reader(flat_asmts['created_at'])[:]
    a_tcps = ds.get_reader(flat_asmts['tested_covid_positive'])[:]
    a_offset_days = np.zeros(len(a_cats), dtype='int16')

    with utils.Timer("calculate offset days for assessments", new_line=True):
        for i_r, r in enumerate(a_cats):
            a_offset_days[i_r] = (datetime.fromtimestamp(a_cats[i_r]) - first_day).days
        print("assessment_dates:", sorted(utils.build_histogram(a_offset_days)))


    # calculate offset days for tests
    # -------------------------------

    t_etds = ds.get_reader(flat_tests['eff_test_date'])
    raw_t_etds = t_etds[:]
    t_rsts = ds.get_reader(flat_tests['result'])
    t_pids = ds.get_reader(flat_tests['patient_id'])

    t_offset_days = np.zeros(len(raw_t_etds), dtype='int16')
    t_offset_dates = [None] * len(raw_t_etds)
    for i_r, r in enumerate(raw_t_etds):
        t_offset_days[i_r] = (datetime.fromtimestamp(raw_t_etds[i_r]) - first_day).days
        t_offset_dates[i_r] = datetime.fromtimestamp(raw_t_etds[i_r]).date()
    print("test_dates:", sorted(utils.build_histogram(t_offset_days)))
    print("test_dates2:", sorted(utils.build_histogram(t_offset_dates)))


    # create the destination arrays to hold daily data per patient
    # ------------------------------------------------------------
    daycount = max(a_offset_days.max(), t_offset_days.max()) + 1
    i_days = list([None] * daycount)
    t_days = list([None] * daycount)
    print("daycount:", daycount)
    for i in range(daycount):
        i_days[i] = np.zeros(max_index+1, dtype='int16')
        t_days[i] = np.zeros(max_index+1, dtype='int16')


    # incorporate assessment predictions and positive test results
    # note: a_offset_days is in assessment space
    print("len(a_offset_days):", len(a_offset_days))
    print("len(a_pid_index):", len(a_pid_index))
    with utils.Timer("incorporating assessments and assessment-based tests"):
        for i_r, r in enumerate(a_offset_days):
            # i_days[a_offset_days[i_r]][a_pid_index[i_r]] =\
            #     from_tcp if from_tcp != 0 else from_prediction
            from_prediction = 7 if cumulative[i_r] > 0.0 else -7
            i_days[a_offset_days[i_r]][a_pid_index[i_r]] = from_prediction
            from_tcp = 7 if a_tcps[i_r] == 3 else -7 if a_tcps[i_r] == 2 else 0
            t_days[a_offset_days[i_r]][a_pid_index[i_r]] = from_tcp

    # incorporate test results into the appropriate day's entry
    with utils.Timer("incorporating test_results"):
        for i_r, r in enumerate(t_offset_days):
            day = t_days[t_offset_days[i_r]]
            if t_rsts[i_r] == 4:
                day[t_pid_index[i_r]] = 7
            elif t_rsts[i_r] == 3:
                day[t_pid_index[i_r]] = -7
            # day[t_pid_index[i_r]] = 7 if t_rsts[i_r] == 4 else -7 if t_rsts[i_r] == 3 else 0
            # if day[t_pid_index[i_r]] == 0:
            #     day[t_pid_index[i_r]] = 7 if t_rsts[i_r] == 4 else -7 if t_rsts[i_r] == 3 else 0
            # else:
            #     day[t_pid_index[i_r]] = 7 if t_rsts[i_r] == 4 else max(day[t_pid_index[i_r]], -7)

    for i_d, d in enumerate(i_days):
        print(i_d, np.count_nonzero(d))

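    # carry signals forward day by day: an observed +/-7 decays by one per day
    # toward zero, so each observation influences up to seven subsequent days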
    with utils.Timer("calculating progression"):
        for da in (i_days, t_days):
            for i_d in range(len(da)-1):
                prior_d = da[i_d]
                next_d = da[i_d + 1]
                next_d[:] = np.where(next_d != 0,
                                     next_d,
                                     np.where(prior_d > 0, prior_d-1, np.minimum(prior_d+1, 0)))
    for d in range(len(i_days)):
        i_d = i_days[d]
        t_d = t_days[d]
        i_present = np.count_nonzero(i_d != 0)
        i_positive = np.count_nonzero(i_d > 0)
        t_present = np.count_nonzero(t_d != 0)
        t_positive = np.count_nonzero(t_d > 0)
        c_d = np.where(t_d == 0, i_d, t_d)
        c_present = np.count_nonzero(c_d != 0)
        c_positive = np.count_nonzero(c_d > 0)

        day = first_day + timedelta(days=d)
        if c_present != 0:
            print(day, i_present, i_positive, t_present, t_positive, c_present, c_positive,
                  c_positive / c_present)
        else:
            print(day, i_present, i_positive, t_present, t_positive, c_present, c_positive,
                  "NA")
Example #13
def method_paper_summary_pipeline(ds, src_data, dest_data, first_timestamp,
                                  last_timestamp):
    s_ptnts = src_data['patients']
    s_asmts = src_data['assessments']
    filters = ds.get_or_create_group(dest_data, 'filters')
    print(s_ptnts.keys())
    print(src_data['tests'].keys())

    conditions = ('has_kidney_disease', 'has_lung_disease',
                  'has_heart_disease', 'has_diabetes', 'has_hayfever',
                  'has_cancer')

    symptoms = ('persistent_cough', 'fatigue', 'delirium',
                'shortness_of_breath', 'fever', 'diarrhoea', 'abdominal_pain',
                'chest_pain', 'hoarse_voice', 'skipped_meals', 'loss_of_smell')
    symptom_thresholds = {s: 2 for s in symptoms}
    symptom_thresholds['fatigue'] = 3
    symptom_thresholds['shortness_of_breath'] = 3

    intercept = -1.19015973
    weights = {
        'persistent_cough': 0.23186655,
        'fatigue': 0.56532346,
        'delirium': -0.12935112,
        'shortness_of_breath': 0.58273967,
        'fever': 0.16580974,
        'diarrhoea': 0.10236126,
        'abdominal_pain': -0.11204163,
        'chest_pain': -0.12318634,
        'hoarse_voice': -0.17818597,
        'skipped_meals': 0.25902482,
        'loss_of_smell': 1.82895239
    }

    # Filter patients to be only from England
    # =======================================

    eng_pats = set()
    p_ids_ = ds.get_reader(s_ptnts['id'])[:]
    p_lsoas_ = ds.get_reader(s_ptnts['lsoa11cd'])[:]
    for i in range(len(p_ids_)):
        lsoa = p_lsoas_[i]
        if len(lsoa) > 0 and lsoa[0] == 69:  # E
            eng_pats.add(p_ids_[i])
    print("eng pats:", len(eng_pats))

    # generating patient filter
    # -------------------------
    if 'patient_filter' not in filters.keys():
        with utils.Timer("generating patient filter", new_line=True):
            p_filter = ds.get_reader(s_ptnts['year_of_birth_valid'])[:]

            # valid age ranges
            r_ = ds.get_reader(s_ptnts['age'])[:]
            f_ = (r_ >= 18) & (r_ <= 100)
            p_filter = p_filter & f_

            # gender filter
            r_ = ds.get_reader(s_ptnts['gender'])[:]
            f_ = (r_ == 1) | (r_ == 2)
            p_filter = p_filter & f_

            # country code
            r_ = ds.get_reader(s_ptnts['country_code'])[:]
            f_ = r_ == b'GB'
            p_filter = p_filter & f_
            print("UK:", p_filter.sum(), len(p_filter))

            # # England only
            # r_ = ds.get_reader(s_ptnts['lsoa11cd'])[:]
            # f_ = np.zeros(len(r_), dtype=np.bool)
            # for i in range(len(r_)):
            #     lsoa = r_[i]
            #     if len(lsoa) > 0 and lsoa[0] == 69: # E
            #         f_[i] = True
            # p_filter = p_filter & f_
            # print("Eng:", p_filter.sum(), len(p_filter))

            # no assessments
            r_ = ds.get_reader(s_ptnts['assessment_count'])[:]
            f_ = r_ > 0
            p_filter = p_filter & f_
            print("No asmts:", p_filter.sum(), len(p_filter))

            print("  {}, {}".format(np.count_nonzero(p_filter),
                                    np.count_nonzero(p_filter == False)))
            ds.get_numeric_writer(filters, 'patient_filter',
                                  'bool').write(p_filter)

    # generating assessment filter
    # ----------------------------
    if 'assessment_filter' not in filters.keys():
        with utils.Timer("generating assessment filter", new_line=True):
            a_filter = np.ones(len(ds.get_reader(s_asmts['id'])),
                               dtype=bool)

            # created_at in range
            r_ = ds.get_reader(s_asmts['created_at'])[:]
            f_ = (r_ >= first_timestamp) & (r_ < last_timestamp)
            a_filter = a_filter & f_

            # country code
            r_ = ds.get_reader(s_asmts['country_code'])[:]
            f_ = r_ == b'GB'
            a_filter = a_filter & f_

            with utils.Timer(f"filtering out orphaned assessments"):
                p_ids_ = ds.get_reader(s_ptnts['id'])[:]
                p_ids_ = ds.apply_filter(
                    ds.get_reader(filters['patient_filter'])[:], p_ids_)
                a_pids_ = ds.get_reader(s_asmts['patient_id'])[:]
                f_ = persistence.foreign_key_is_in_primary_key(p_ids_, a_pids_)
            a_filter = a_filter & f_

            print("  {}, {}".format(np.count_nonzero(a_filter),
                                    np.count_nonzero(a_filter == False)))
            ds.get_numeric_writer(filters, 'assessment_filter',
                                  'bool').write(a_filter)

    # filtering patients
    # ------------------
    if 'filtered_patients' not in dest_data.keys():
        flt_ptnts = dest_data.create_group('filtered_patients')
        with utils.Timer("filtering/flattening patient fields", new_line=True):
            p_filter = ds.get_reader(filters['patient_filter'])[:]

            r = ds.get_reader(s_ptnts['age'])
            r.get_writer(flt_ptnts,
                         'age').write(ds.apply_filter(p_filter, r[:]))

            for k in conditions:
                r = ds.get_reader(s_ptnts[k])
                ds.get_numeric_writer(
                    flt_ptnts, k,
                    'bool').write(ds.apply_filter(p_filter, r[:]) == 2)

            smoker1 = ds.get_reader(s_ptnts['is_smoker'])
            smoker2 = ds.get_reader(s_ptnts['smoker_status'])
            smoker = (smoker1[:] == 2) | (smoker2[:] == 3)
            ds.get_numeric_writer(flt_ptnts, 'smoker', 'bool').write(
                ds.apply_filter(p_filter, smoker))

            gender_ = ds.get_reader(s_ptnts['gender'])
            ds.get_numeric_writer(
                flt_ptnts, 'gender',
                'uint8').write(ds.apply_filter(p_filter, gender_) - 1)
    else:
        flt_ptnts = dest_data['filtered_patients']

    # filtering assessments
    # ---------------------
    if 'filtered_assessments' not in dest_data.keys():
        flt_asmts = dest_data.create_group('filtered_assessments')
        with utils.Timer("filtering/flattening symptoms", new_line=True):
            a_filter = ds.get_reader(filters['assessment_filter'])[:]
            for s in symptoms:
                r_ = ds.get_reader(s_asmts[s])[:]
                ds.get_numeric_writer(flt_asmts, s, 'bool').write(
                    ds.apply_filter(a_filter, r_) >= symptom_thresholds[s])
            a_pids = ds.get_reader(s_asmts['patient_id'])
            a_pids.get_writer(flt_asmts, 'patient_id').write(
                ds.apply_filter(a_filter, a_pids[:]))
    else:
        flt_asmts = dest_data['filtered_assessments']

    # predicting covid
    # ----------------
    if 'prediction' not in dest_data['filtered_assessments']:
        with utils.Timer("generating covid prediction", new_line=True):
            cumulative = np.zeros(len(
                ds.get_reader(flt_asmts['persistent_cough'])),
                                  dtype='float64')
            for s in symptoms:
                reader = ds.get_reader(flt_asmts[s])
                cumulative += reader[:] * weights[s]
            cumulative += intercept
            print("positive predictions", np.count_nonzero(cumulative > 0.0),
                  len(cumulative))

            a_pids_ = ds.get_reader(flt_asmts['patient_id'])[:]
            spans = ds.get_spans(a_pids_)
            max_prediction_inds = ds.apply_spans_index_of_max(
                spans, cumulative)
            max_predictions = cumulative[max_prediction_inds]

            ds.get_numeric_writer(flt_asmts, 'prediction',
                                  'float32').write(max_predictions)
            pos_filter = max_predictions > 0.0
            print("pos_filter: ", np.count_nonzero(pos_filter),
                  len(pos_filter))

    # generating table results
    print('total_assessments:',
          np.count_nonzero(ds.get_reader(filters['assessment_filter'])[:]))
    subjects = np.count_nonzero(ds.get_reader(filters['patient_filter'])[:])
    genders = ds.get_reader(flt_ptnts['gender'])[:]
    predicted_c19 = np.count_nonzero(
        ds.get_reader(flt_asmts['prediction'])[:] > 0.0)
    age_mean = np.mean(ds.get_reader(flt_ptnts['age'])[:])
    age_std = np.std(ds.get_reader(flt_ptnts['age'])[:])
    print('subjects:', subjects)
    male = np.count_nonzero(genders)
    female = np.count_nonzero(genders == False)
    print('gender: {}:{}, {:.2%}:{:.2%}'.format(male, female,
                                                male / len(genders),
                                                female / len(genders)))
    # print('predicted covid-19:', predicted_c19)
    print(
        '{}:'.format('predicted covid-19'), '{} {:.2%}'.format(
            predicted_c19,
            predicted_c19 / len(ds.get_reader(flt_asmts['prediction']))))
    print('age {:.2f} ({:.2f})'.format(age_mean, age_std))
    for k in conditions + ('smoker', ):
        kr_ = ds.get_reader(flt_ptnts[k])[:]
        pos = np.count_nonzero(kr_)
        print('{}:'.format(k), '{} {:.2%}'.format(pos, pos / len(kr_)))
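Example #13 collapses per-assessment predictions to one value per patient by taking the maximum within each patient's contiguous span of rows. A rough NumPy sketch of the get_spans / apply_spans_index_of_max pair used above (hypothetical data; the real ExeTeRa implementations may differ):

import numpy as np

def get_spans(ids):
    # span boundaries: positions where the sorted id column changes value,
    # bracketed by the two ends
    changes = np.nonzero(ids[1:] != ids[:-1])[0] + 1
    return np.concatenate(([0], changes, [len(ids)]))

def apply_spans_index_of_max(spans, values):
    # index of the maximum value within each [start, end) span
    return np.array([sb + np.argmax(values[sb:se])
                     for sb, se in zip(spans[:-1], spans[1:])], dtype=np.int64)

pids = np.array([b'a', b'a', b'b', b'b', b'b', b'c'])
preds = np.array([-0.2, 1.3, 0.4, -0.9, 2.1, -1.0])
spans = get_spans(pids)                       # [0, 2, 5, 6]
inds = apply_spans_index_of_max(spans, preds)
print(preds[inds])                            # [ 1.3  2.1 -1. ], one per patient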
Example #14
def ppe_use_and_travel(ds, src, tmp, start_timestamp):

    logging = True

    s_asmts = src['assessments']

    if 'filtered_assessments' not in tmp.keys():
        f_asmts = tmp.create_group('filtered_assessments')
        cats = ds.get_reader(s_asmts['created_at'])
        asmt_filter = cats[:] >= start_timestamp

        ccs = ds.get_reader(s_asmts['country_code'])
        asmt_filter = asmt_filter & (ccs[:] == b'GB')

        symptom_keys = ('persistent_cough', 'fatigue', 'delirium',
                        'shortness_of_breath', 'fever', 'diarrhoea',
                        'abdominal_pain', 'chest_pain', 'hoarse_voice',
                        'skipped_meals', 'loss_of_smell')
        mask_keys = ('mask_cloth_or_scarf', 'mask_surgical', 'mask_n95_ffp')
        isolation_keys = ('isolation_healthcare_provider',
                          'isolation_little_interaction',
                          'isolation_lots_of_people')
        other_keys = ('patient_id', )
        symptom_thresholds = {s: 2 for s in symptom_keys}
        symptom_thresholds.update({m: 2 for m in mask_keys})
        symptom_thresholds['fatigue'] = 3
        symptom_thresholds['shortness_of_breath'] = 3

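        # flatten symptoms and mask usage to binary flags: encoded values at or
        # above the per-field threshold become 1, everything else 0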
        for k in symptom_keys + mask_keys + isolation_keys + other_keys:
            with utils.Timer("filtering {}".format(k)):
                reader = ds.get_reader(s_asmts[k])
                if k in mask_keys + symptom_keys:
                    values = np.where(reader[:] >= symptom_thresholds[k], 1, 0)
                    ds.get_numeric_writer(f_asmts, k, 'int8').write(
                        ds.apply_filter(asmt_filter, values))
                    hist = np.unique(reader[:], return_counts=True)
                    print(sorted(zip(hist[0], hist[1])))
                    hist = np.unique(values, return_counts=True)
                    print(sorted(zip(hist[0], hist[1])))
                else:
                    reader.get_writer(f_asmts, k).write(
                        ds.apply_filter(asmt_filter, reader))

        print('filtered assessments:', np.count_nonzero(asmt_filter),
              len(asmt_filter))
        #
        #
        # if 'filtered_assessment_predictions' not in tmp.keys():
        #     f_pred_asmts = tmp.create_group('filtered_assessment_predictions')
        symptom_readers = dict()
        for s in symptom_keys:
            symptom_readers[s] = ds.get_reader(f_asmts[s])
        predictions = ds.get_numeric_writer(f_asmts, 'prediction', 'float32')
        method_paper_model(ds, symptom_readers, predictions)
        predictions = ds.get_reader(f_asmts['prediction'])
        print('predictions:', np.count_nonzero(predictions[:] > 0),
              len(predictions))
    else:
        # reopen previously written fields so the rest of the pipeline also
        # works when 'filtered_assessments' already exists
        f_asmts = tmp['filtered_assessments']
        predictions = ds.get_reader(f_asmts['prediction'])

    if 'patient_assessment_summaries' not in tmp.keys():
        asmt_psum = tmp.create_group('patient_assessment_summaries')
        pids = ds.get_reader(f_asmts['patient_id'])
        mcos = ds.get_reader(f_asmts['mask_cloth_or_scarf'])
        msurg = ds.get_reader(f_asmts['mask_surgical'])
        m95 = ds.get_reader(f_asmts['mask_n95_ffp'])
        with utils.Timer("generating patient_id spans"):
            asmt_spans = ds.get_spans(field=pids[:])

        for k in mask_keys:
            with utils.Timer(
                    "getting per patient mask summary for {}".format(k)):
                writer = ds.get_numeric_writer(asmt_psum, k, 'int8')
                ds.apply_spans_max(asmt_spans,
                                   ds.get_reader(f_asmts[k])[:], writer)
                print(
                    sorted(
                        utils.build_histogram(ds.get_reader(asmt_psum[k])[:])))

        for k in isolation_keys:
            with utils.Timer(
                    "getting per patient isolation summary for {}".format(k)):
                writer = ds.get_numeric_writer(asmt_psum, k, 'int32')
                ds.apply_spans_max(asmt_spans,
                                   ds.get_reader(f_asmts[k])[:], writer)
                print(
                    sorted(
                        utils.build_histogram(ds.get_reader(asmt_psum[k])[:])))

        with utils.Timer("getting prediction maxes for patients"):
            p_predictions = predictions.get_writer(asmt_psum, 'prediction')
            ds.apply_spans_max(asmt_spans, predictions, p_predictions)
            p_predictions = ds.get_reader(asmt_psum['prediction'])
            positives = p_predictions[:] > 0
            print("max covid prediction:", np.count_nonzero(positives),
                  len(positives))

        with utils.Timer("getting patient ids from assessments"):
            writer = pids.get_writer(asmt_psum, 'patient_id')
            writer.write(pd.unique(pids[:]))
    else:
        asmt_psum = tmp['patient_assessment_summaries']

    s_ptnts = src['patients']
    print(s_ptnts.keys())

    pdf = pd.DataFrame({
        'id':
        ds.get_reader(s_ptnts['id'])[:],
        'hwwc':
        ds.get_reader(s_ptnts['health_worker_with_contact'])[:]
    })
    adf = pd.DataFrame(
        {'patient_id': ds.get_reader(asmt_psum['patient_id'])[:]})
    jdf = pd.merge(left=adf,
                   right=pdf,
                   left_on='patient_id',
                   right_on='id',
                   how='left')
    print(len(jdf['hwwc']))

    class TestResults:
        def __init__(self):
            self.positive = 0
            self.total = 0

        def add(self, result):
            if result:
                self.positive += 1
            self.total += 1

    results = defaultdict(TestResults)
    positives = ds.get_reader(asmt_psum['prediction'])[:]
    positives = positives > 0
    mask_0 = ds.get_reader(asmt_psum['mask_cloth_or_scarf'])[:]
    mask_1 = ds.get_reader(asmt_psum['mask_surgical'])[:]
    mask_2 = ds.get_reader(asmt_psum['mask_n95_ffp'])[:]
    # mask = mask_0 | mask_1 | mask_2
    mask = mask_0
    print(np.unique(mask, return_counts=True))
    isol_lots = ds.get_reader(asmt_psum['isolation_lots_of_people'])[:]
    isol_lots_7 = np.where(isol_lots > 7, 7, isol_lots)
    print(np.unique(isol_lots_7, return_counts=True))
    print(len(mask), len(positives), len(isol_lots_7))

    # isolation lots of users
    for i_r in range(len(mask)):
        results[(isol_lots_7[i_r], mask[i_r])].add(positives[i_r])

    groupings = sorted(
        list((r[0], (r[1].positive, r[1].total)) for r in results.items()))

    for g in groupings:
        print(g[0], g[1][0], g[1][1], g[1][0] / g[1][1])
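The tally at the end of Example #14 groups patients by (isolation level, mask use) and reports the positive fraction per group. The same grouping can be done with NumPy alone; a small sketch with illustrative stand-ins for isol_lots_7, mask and positives:

import numpy as np

isol = np.array([0, 0, 1, 1, 1, 7])                    # capped isolation counts
mask = np.array([0, 1, 0, 0, 1, 0], dtype='int8')      # mask flag per patient
positive = np.array([False, True, True, False, True, False])

# encode each (isolation, mask) pair as one integer key, then tally per key
keys = isol * 2 + mask
for key in np.unique(keys):
    in_group = keys == key
    total = np.count_nonzero(in_group)
    pos = np.count_nonzero(positive & in_group)
    print((key // 2, key % 2), pos, total, pos / total)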
Example #15
def ppe_use_and_travel_2(ds, src, dest, start_ts):
    s_ptnts = src['patients']
    s_asmts = src['assessments']
    print(s_asmts.keys())
    s_tests = src['tests']

    if 'filtered_patients' not in dest.keys():
        f_ptnts = dest.create_group('filtered_patients')
        f_asmts = dest.create_group('filtered_assessments')
        f_tests = dest.create_group('filtered_tests')

        # calculate patient first positives
        raw_p_ids = ds.get(s_ptnts['id']).data[:]
        raw_p_acts = ds.get(s_ptnts['assessment_count']).data[:]
        raw_a_pids = ds.get(s_asmts['patient_id']).data[:]
        raw_t_pids = ds.get(s_tests['patient_id']).data[:]

        # filter out anyone without assessments
        patient_filter = raw_p_acts > 0

        print("patient_filter:", np.count_nonzero(patient_filter),
              np.count_nonzero(patient_filter == 0))

        # filter patients
        f_p_ids = ds.get(s_ptnts['id']).create_like(f_ptnts, 'id')
        f_p_ids.data.write(ds.apply_filter(patient_filter, raw_p_ids))

        # filter out any orphaned assessments
        with utils.Timer("fk in pk"):
            assessment_filter = persistence.foreign_key_is_in_primary_key(
                raw_p_ids, raw_a_pids)
        print("assessment_filter:", np.count_nonzero(assessment_filter),
              np.count_nonzero(assessment_filter == False))
        f_a_pids = ds.get(s_asmts['patient_id']).create_like(
            f_asmts, 'patient_id')
        f_a_pids.data.write(ds.apply_filter(assessment_filter, raw_a_pids))
        for k in ('created_at', 'tested_covid_positive'):
            field = ds.get(s_asmts[k]).create_like(f_asmts, k)
            field.data.write(
                ds.apply_filter(assessment_filter,
                                ds.get(s_asmts[k]).data[:]))

        # filter out any orphaned tests
        test_filter = persistence.foreign_key_is_in_primary_key(
            raw_p_ids, raw_t_pids)
        print("test_filter:", np.count_nonzero(test_filter),
              np.count_nonzero(test_filter == False))
        f_t_pids = ds.get(s_tests['patient_id']).create_like(
            f_tests, 'patient_id')
        f_t_pids.data.write(ds.apply_filter(test_filter, raw_t_pids))

    else:
        f_ptnts = dest['filtered_patients']
        f_asmts = dest['filtered_assessments']
        f_tests = dest['filtered_tests']
        f_p_ids = ds.get(f_ptnts['id'])
        f_a_pids = ds.get(f_asmts['patient_id'])
        f_t_pids = ds.get(f_tests['patient_id'])

    # calculate the shared set of indices for assessments / tests back to patients
    with utils.Timer("get_shared_index"):
        p_inds, a_pinds, t_pinds = ds.get_shared_index(
            (f_p_ids, f_a_pids, f_t_pids))
    print(max(p_inds.max(), a_pinds.max(), t_pinds.max()))

    # now filter only assessments with positive test results
    pos_asmt_tests = ds.get(f_asmts['tested_covid_positive']).data[:] == 3
    print("old tests positive:", np.count_nonzero(pos_asmt_tests),
          np.count_nonzero(pos_asmt_tests == False))

    # now filter only tests with positive test results

    s_asmts = src['assessments']
    a_cats = ds.get(f_asmts['created_at'])
    asmt_filter = a_cats.data[:] >= start_ts
    print(np.count_nonzero(asmt_filter), len(asmt_filter))
    raw_a_cats = ds.apply_filter(asmt_filter, a_cats.data[:])
    a_days = np.zeros(len(raw_a_cats), dtype=np.int32)
    start_dt = datetime.fromtimestamp(start_ts)
    for i_r in range(len(raw_a_cats)):
        a_days[i_r] = (datetime.fromtimestamp(raw_a_cats[i_r]) - start_dt).days
    print(sorted(utils.build_histogram(a_days)))
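persistence.foreign_key_is_in_primary_key, used in Examples #13 and #15 to drop orphaned assessments and tests, marks each child row whose foreign key appears in the parent table's primary key. A hedged NumPy sketch of that contract (illustrative ids; the real helper's implementation may differ):

import numpy as np

def foreign_key_is_in_primary_key(primary_key, foreign_key):
    # True for each foreign-key entry present in the primary key; np.isin
    # performs the membership test in a single vectorised pass
    return np.isin(foreign_key, primary_key)

p_ids = np.array([b'p1', b'p2', b'p3'])
a_pids = np.array([b'p1', b'p4', b'p2', b'p2'])
keep = foreign_key_is_in_primary_key(p_ids, a_pids)
print(keep)  # [ True False  True  True]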