Example #1
def create_db_SURF(data_train):
    # Extract SURF descriptors for each training image, build a visual
    # vocabulary by clustering them, then save one BoW histogram per image.
    descriptors, fnames = SURF_feature_extraction(data_train)
    kmeans = clustering(np.concatenate(descriptors, axis=0))
    for des, fname in tqdm.tqdm(zip(descriptors, fnames),
                                desc='Creating database'):
        representation = build_histogram(des, kmeans)
        np.save('./database/BoW/SURF/' + fname[:-4], representation)
    joblib.dump(kmeans, './database/BoW/SURF/kmeans_trained.pkl')
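build_histogram is called with a descriptor array and a fitted clusterer but is not defined in this example. A minimal sketch of a bag-of-visual-words version, assuming kmeans is a fitted sklearn.cluster.KMeans and des is an (N, D) descriptor array (the behaviour here is an assumption, not the original code):

import numpy as np

def build_histogram(des, kmeans):
    # Assign each descriptor to its nearest cluster centre ("visual word")
    # and count occurrences per word.
    labels = kmeans.predict(des)
    hist = np.bincount(labels, minlength=kmeans.n_clusters)
    # L1-normalise so images with different descriptor counts are comparable.
    return hist / max(hist.sum(), 1)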
Example #2
def most_frequent(s):
  histogram = build_histogram(s)

  # Sort (count, value) pairs so the highest counts come first.
  res = []
  for k, v in histogram.items():
    res.append((v, k))
  res.sort(reverse=True)

  # Keep only the values, ordered from most to least frequent.
  final_res = []
  for k, v in res:
    final_res.append(v)
  return final_res
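build_histogram itself is not shown here; given the .items() call above, it plausibly returns a value-to-count dict. A one-line sketch under that assumption:

from collections import Counter

def build_histogram(s):
    # Hypothetical: map each distinct element of s to its count.
    return dict(Counter(s))

With that reading, most_frequent is essentially [value for value, _ in Counter(s).most_common()], apart from how ties are ordered.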
Example #3
def eval(feat_type, q_valid):
    maps = {}
    db_embeddings, mAP, time_running = [], [], []
    for fname in os.listdir('./database/BoW/%s/' % feat_type):
        if fname.endswith('.npy'):
            embedding = torch.FloatTensor(
                np.load('./database/BoW/%s/' % feat_type + fname))
            # Key the map by position in db_embeddings so the indices
            # returned by topk below line up with the stacked tensor;
            # enumerating all directory entries (including the .pkl file)
            # would leave gaps and misalign the two.
            maps[len(db_embeddings)] = fname[:-4] + '.jpg'
            db_embeddings.append(embedding)
    db_embeddings = torch.stack(db_embeddings, dim=0).squeeze(dim=0)
    cs_func = torch.nn.CosineSimilarity(dim=1, eps=1e-6)

    kmeans = joblib.load('./database/BoW/%s/kmeans_trained.pkl' % feat_type)

    for q_name, attribute in tqdm.tqdm(q_valid.get_queries().items(),
                                       desc='Evaluating'):
        bbox, class_idx = attribute[0], attribute[1]
        start = time.time()
        query_img = cv2.imread(hp.image_dir + q_name)
        des = sift_detect_and_compute([query_img],
                                      normalize=True,
                                      keep_top_k=320)
        representation = build_histogram(des, kmeans)
        query_embedding = torch.FloatTensor(representation).unsqueeze(dim=0)
        if use_gpu:
            query_embedding = query_embedding.cuda()
            db_embeddings = db_embeddings.cuda()
            cs_func = cs_func.cuda()
        similarity = cs_func(query_embedding, db_embeddings).topk(
            len(q_valid.get_groundtruth()[class_idx]))
        prediction = [maps[idx] for idx in similarity[1].cpu().numpy()]
        end = time.time()
        score = similarity[0].cpu().numpy()
        AP = calculate_AP(prediction=prediction,
                          score=score,
                          groundtruth=q_valid.get_groundtruth()[class_idx])
        mAP.append(AP)
        time_running.append(end - start)

    print('mAP: %f' % (np.mean(mAP) * 100))
    print('Time running: %f secs' % np.mean(time_running))
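The retrieval core above is just cosine similarity plus topk. A self-contained toy run of that pattern (the tensors here are illustrative, not from the original code):

import torch

# Three 4-dim database embeddings and one query.
db = torch.tensor([[1.0, 0.0, 0.0, 0.0],
                   [0.0, 1.0, 0.0, 0.0],
                   [0.9, 0.1, 0.0, 0.0]])
query = torch.tensor([[1.0, 0.0, 0.0, 0.0]])

cs = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
scores, indices = cs(query, db).topk(2)
print(indices.tolist())  # [0, 2]: the two rows most similar to the query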
Example #4

data_schema = data_schemas.DataSchema(1)

src_genders = ds.field_by_name('gender')
src_yobs = ds.field_by_name('year_of_birth')
src_weights = ds.field_by_name('weight_kg')
src_heights = ds.field_by_name('height_cm')
src_bmis = ds.field_by_name('bmi')

# ages
age_fn = CalculateAgeFromYearOfBirth(0x1, 0x2,
                                     utils.valid_range_fac_inc(0, 90), 2020)
ages = np.zeros(len(src_yobs), dtype=np.uint32)
filter_status = np.zeros(len(src_yobs), dtype=np.uint32)  # per-row filter flag bits
age_fn(src_yobs, ages, filter_status)

print(sorted(utils.build_histogram(ages)))

# gender check
print('gender:', utils.build_histogram(src_genders))

cast_weights = [-1.0 if v == '' else float(v) for v in src_weights]
bins = [
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 32, 34, 36, 38, 40, 50, 60, 70,
    80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 202, 204,
    206, 208, 210, 212, 214, 216, 218, 220, 230, 240, 250, 260, 270, 280, 290,
    300, 320, 340, 360, 380, 400, 450, 500, 600, 700, 800, 900, 1000, 10000,
    100000, 1000000, 100000000
]
cast_weights_hist = np.histogram(cast_weights, bins=bins)
for k, v in zip(cast_weights_hist[1], cast_weights_hist[0]):
    print(f'{k}: {v}')
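utils.valid_range_fac_inc(0, 90) is used as a validator factory above but is not shown. A plausible sketch based on its name and usage (an assumption, not the actual utils code):

def valid_range_fac_inc(lo, hi):
    # Hypothetical factory: return a predicate accepting values in [lo, hi].
    def is_valid(value):
        return lo <= value <= hi
    return is_valid

valid_range_fac_inc(0, 90)(45)  # True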
Example #5

            else:
                self.last_yes_date = max(self.last_yes_date, created_at)
        if self.latest_assessment_date is None:
            self.latest_assessment_date = created_at
        else:
            self.latest_assessment_date = max(self.latest_assessment_date,
                                              created_at)


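The TestCount class is truncated above; only the tail of its add_assessment method survives. A sketch of a shape consistent with that fragment and with the reads of .count and .had_covid_test_count below (everything outside the visible lines is assumed):

class TestCount:
    # Hypothetical reconstruction based on usage in this example.
    def __init__(self):
        self.count = 0
        self.had_covid_test_count = 0
        self.last_yes_date = None
        self.latest_assessment_date = None

    def add_assessment(self, had_covid_test, tested_covid_positive, created_at):
        self.count += 1
        if had_covid_test:
            self.had_covid_test_count += 1
        if tested_covid_positive:  # assumed truthy for a positive result
            if self.last_yes_date is None:
                self.last_yes_date = created_at
            else:
                self.last_yes_date = max(self.last_yes_date, created_at)
        if self.latest_assessment_date is None:
            self.latest_assessment_date = created_at
        else:
            self.latest_assessment_date = max(self.latest_assessment_date,
                                              created_at)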
by_patient = defaultdict(TestCount)
a_pids = a_ds.field_by_name('patient_id')
a_hcts = a_ds.field_by_name('had_covid_test')
a_tcps = a_ds.field_by_name('tested_covid_positive')
a_c_ats = a_ds.field_by_name('created_at')

print(utils.build_histogram(a_hcts))
for i_r in range(a_ds.row_count()):
    by_patient[a_pids[i_r]].add_assessment(a_hcts[i_r], a_tcps[i_r],
                                           a_c_ats[i_r])

asmt_count = 0
hct_count = 0
count_yes_only = 0
count_no_only = 0
count_yes_and_no = 0
count_yes_after_no = 0
count_no_after_yes = 0
hgram_test_counts = defaultdict(int)
for k, v in by_patient.items():
    asmt_count += v.count
    if v.had_covid_test_count > 0:
Example #6
def pipeline(patient_filename, assessment_filename, data_schema, parsing_schema, year, territory=None):

    categorical_maps = data_schema.assessment_categorical_maps
    # TODO: use proper logging throughout
    print(); print()
    print('load patients')
    print('=============')
    with open(patient_filename) as f:
        geoc_ds = dataset.Dataset(f, data_schema.patient_categorical_maps, progress=True)
    print("sorting patients")
    geoc_ds.sort(('id',))
    geoc_ds.show()
    print("patient row count:", geoc_ds.row_count())


    print(); print()
    print('load assessments')
    print('================')
    with open(assessment_filename) as f:
        asmt_ds = dataset.Dataset(f, data_schema.assessment_categorical_maps, progress=True)
    print('sorting assessments')
    asmt_ds.sort(('patient_id', 'updated_at'))
    asmt_ds.show()
    print("assessment row count:", asmt_ds.row_count())


    print(); print()
    print("pre-sort by patient id")
    print("======================")

    print("pre-sort patient data")
    geoc_filter_status = np.zeros(geoc_ds.row_count(), dtype=np.uint32)

    print(); print("pre-sort assessment data")
    asmt_filter_status = np.zeros(asmt_ds.row_count(), dtype=np.uint32)

    if territory is not None:
        print(); print()
        print("filter patients from outside the territory of interest")
        print("------------------------------------------------------")

        country_codes = geoc_ds.field_by_name('country_code')
        for ir, r in enumerate(country_codes):
            if r != territory:
                geoc_filter_status[ir] |= PFILTER_OTHER_TERRITORY
        print(f'other territories: filtered {count_flag_set(geoc_filter_status, PFILTER_OTHER_TERRITORY)} patients')

    print('patients:', len(geoc_filter_status))
    # print('patients with no assessments:',
    #       count_flag_set(geoc_filter_status, PFILTER_NO_ASSESSMENTS))
    # print('patients with one assessment:',
    #       count_flag_set(geoc_filter_status, PFILTER_ONE_ASSESSMENT))
    # print('patients with sufficient assessments:', geoc_filter_status.count(0))

    print(); print()
    print("patients")
    print("--------")

    ptnt_dest_fields = dict()

    print()
    print("checking age")
    src_yobs = geoc_ds.field_by_name('year_of_birth')
    ages = np.zeros(len(src_yobs), dtype=np.uint32)
    fn = CalculateAgeFromYearOfBirth(FILTER_MISSING_AGE, FILTER_BAD_AGE,
                                     valid_range_fac_inc(MIN_AGE, MAX_AGE), year)
    fn(src_yobs, ages, geoc_filter_status)
    ptnt_dest_fields['age'] = ages
    print(f'age: filtered {count_flag_set(geoc_filter_status, FILTER_MISSING_AGE)} missing values')
    print(f'age: filtered {count_flag_set(geoc_filter_status, FILTER_BAD_AGE)} bad values')


    print()
    print('checking weight / height / bmi')
    src_genders = geoc_ds.field_by_name('gender')
    src_weights = geoc_ds.field_by_name('weight_kg')
    src_heights = geoc_ds.field_by_name('height_cm')
    src_bmis = geoc_ds.field_by_name('bmi')

    fn_fac = parsing_schema.class_entries['validate_weight_height_bmi']
    fn = fn_fac(MIN_WEIGHT, MAX_WEIGHT, MIN_HEIGHT, MAX_HEIGHT, MIN_BMI, MAX_BMI,
                FILTER_MISSING_AGE, FILTER_BAD_AGE,
                FILTER_MISSING_WEIGHT, FILTER_BAD_WEIGHT,
                FILTER_MISSING_HEIGHT, FILTER_BAD_HEIGHT,
                FILTER_MISSING_BMI, FILTER_BAD_BMI)
    weight_clean, height_clean, bmi_clean =\
        fn(src_genders, ages, src_weights, src_heights, src_bmis, geoc_filter_status)
    ptnt_dest_fields['weight_clean'] = weight_clean
    ptnt_dest_fields['height_clean'] = height_clean
    ptnt_dest_fields['bmi_clean'] = bmi_clean
    print(f'weight: filtered {count_flag_set(geoc_filter_status, FILTER_MISSING_WEIGHT)} missing values')
    print(f'weight: filtered {count_flag_set(geoc_filter_status, FILTER_BAD_WEIGHT)} bad values')
    print(f'height: filtered {count_flag_set(geoc_filter_status, FILTER_MISSING_HEIGHT)} missing values')
    print(f'height: filtered {count_flag_set(geoc_filter_status, FILTER_BAD_HEIGHT)} bad values')
    print(f'bmi: filtered {count_flag_set(geoc_filter_status, FILTER_MISSING_BMI)} missing values')
    print(f'bmi: filtered {count_flag_set(geoc_filter_status, FILTER_BAD_BMI)} bad values')

    print(); print('unfiltered patients:', count_flag_empty(geoc_filter_status))


    print(); print()
    print("assessments")
    print("-----------")

    asmt_dest_fields = dict()
    asmt_dest_keys = dict()

    patient_ids = set()
    src_patient_ids = geoc_ds.field_by_name('id')
    for ir, r in enumerate(src_patient_ids):
        if geoc_filter_status[ir] == 0:
            patient_ids.add(r)
    src_asmt_patient_ids = asmt_ds.field_by_name('patient_id')
    for ir, r in enumerate(src_asmt_patient_ids):
        if r not in patient_ids:
            asmt_filter_status[ir] |= AFILTER_PATIENT_FILTERED

    print('assessments filtered due to patient filtering:',
          count_flag_set(asmt_filter_status, AFILTER_PATIENT_FILTERED))
    print('assessments filtered total:',
          count_flag_set(asmt_filter_status, FILTERA_ALL))

    print(); print("checking temperature")
    fn_fac = parsing_schema.class_entries['validate_temperature']
    fn = fn_fac(MIN_TEMP, MAX_TEMP, FILTER_MISSING_TEMP, FILTER_BAD_TEMP)
    temperature_c = fn(asmt_ds.field_by_name('temperature'), asmt_filter_status)
    asmt_dest_fields['temperature_C'] = temperature_c
    print(f'temperature: filtered {count_flag_set(asmt_filter_status, FILTER_BAD_TEMP)} bad values')

    print(); print("checking inconsistent test / test results fields")
    src_had_test = asmt_ds.field_by_name('had_covid_test')
    src_tested_covid_positive = asmt_ds.field_by_name('tested_covid_positive')
    fn = CheckTestingConsistency(FILTER_INCONSISTENT_NOT_TESTED, FILTER_INCONSISTENT_TESTED)
    fn(src_had_test, src_tested_covid_positive, asmt_filter_status)
    print(f'inconsistent_not_tested: filtered {count_flag_set(asmt_filter_status, FILTER_INCONSISTENT_NOT_TESTED)} missing values')
    print(f'inconsistent_tested: filtered {count_flag_set(asmt_filter_status, FILTER_INCONSISTENT_TESTED)} missing values')


    print(); print('unfiltered assessments:', np.count_nonzero(asmt_filter_status == 0))


    print(); print()
    print("convert symptomatic, exposure, flattened and miscellaneous fields to bool")
    print("-------------------------------------------------------------------------")

    any_symptoms = np.zeros(asmt_ds.row_count(), dtype=bool)
    for s in symptomatic_fields:
        print(f"symptomatic_field '{s}' to categorical")

        asmt_dest_fields[s] = copy_field(asmt_ds.field_by_name(s))
        any_symptoms |= asmt_dest_fields[s] > 1
        print(np.count_nonzero(asmt_dest_fields[s] == True))
        print(np.count_nonzero(any_symptoms == True))

    print(build_histogram(asmt_ds.field_by_name('tested_covid_positive')))

    for f in flattened_fields:
        print(f"flattened_field '{f[0]}' to categorical field '{f[1]}'")
        remap = map_between_categories(categorical_maps[f[0]].strings_to_values,
                                       categorical_maps[f[1]].strings_to_values)
        asmt_dest_fields[f[1]] =\
            to_categorical(asmt_ds.field_by_name(f[0]), remap)
        # TODO: this shouldn't be necessary as the fields were covered in 'symptomatic_fields'
        any_symptoms |= asmt_dest_fields[f[1]] > 1

    for e in exposure_fields:
        print(f"exposure_field '{e}' to categorical")
        asmt_dest_fields[e] = copy_field(asmt_ds.field_by_name(e))
    for m in miscellaneous_fields:
        print(f"miscellaneous_field '{m}' to categorical")
        asmt_dest_fields[m] = copy_field(asmt_ds.field_by_name(m))


    print(); print()
    print("validate health status with symptoms")
    print("---------------------------------")
    fn = CheckInconsistentSymptoms(FILTER_HEALTHY_BUT_SYMPTOMS, FILTER_NOT_HEALTHY_BUT_NO_SYMPTOMS)
    # TODO: keys should be got from the dataset once it is loaded rather than referring to the categorical maps directly
    fn(asmt_dest_fields['health_status'], any_symptoms, asmt_filter_status,
       categorical_maps['health_status'].strings_to_values['healthy'],
       categorical_maps['health_status'].strings_to_values['not_healthy'])
    for f in (FILTER_HEALTHY_BUT_SYMPTOMS, FILTER_NOT_HEALTHY_BUT_NO_SYMPTOMS):
        print(f'{assessment_flag_descs[f]}: {count_flag_set(asmt_filter_status, f)}')

    print(); print('unfiltered assessments:', np.count_nonzero(asmt_filter_status == 0))


    # validate assessments per patient
    print(); print()
    print("validate covid progression")
    print("--------------------------")
    sanitised_hct_covid_results = np.ndarray(asmt_ds.row_count(), dtype=np.uint8)
    sanitised_covid_results = np.ndarray(asmt_ds.row_count(), dtype=np.uint8)
    sanitised_covid_results_key = categorical_maps['tested_covid_positive'].values_to_strings[:]

    fn_fac = parsing_schema.class_entries['clean_covid_progression']
    fn = fn_fac(asmt_ds.field_by_name('had_covid_test'), asmt_ds.field_by_name('tested_covid_positive'),
                asmt_filter_status,
                sanitised_hct_covid_results, sanitised_covid_results,
                FILTER_INVALID_COVID_PROGRESSION)
    iterate_over_patient_assessments2(
        asmt_ds.field_by_name('patient_id'), asmt_filter_status, fn)

    print(f'{assessment_flag_descs[FILTER_INVALID_COVID_PROGRESSION]}:',
          count_flag_set(asmt_filter_status, FILTER_INVALID_COVID_PROGRESSION))

    asmt_dest_fields['tested_covid_positive_clean'] = sanitised_covid_results
    asmt_dest_keys['tested_covid_positive_clean'] = sanitised_covid_results_key
    asmt_dest_fields['had_covid_test_clean'] = sanitised_hct_covid_results
    asmt_dest_keys['had_covid_test_clean'] = sanitised_covid_results_key

    print('remaining assessments before squashing', np.count_nonzero(asmt_filter_status == 0))

    # create a new assessment space with only unfiltered rows
    print(); print()
    print("discard all filtered assessments")
    print("--------------------------------")
    remaining_asmt_fields = list()
    remaining_dest_fields = dict()

    filter_map = list()
    for ir, r in enumerate(asmt_filter_status):
        if r == 0:
            filter_map.append(ir)

    for ir, r in enumerate(asmt_ds.fields_):
        remaining_asmt_fields.append(filtered_field.FilteredField(r, filter_map))

    for k, v in asmt_dest_fields.items():
        remaining_dest_fields[k] = filtered_field.FilteredField(v, filter_map)

    print("remaining asmt fields: ", len(filter_map))
    remaining_asmt_filter_status = [0] * len(filter_map)

    print(); print()
    print("quantise assessments by day")
    print("---------------------------")

    fn = CalculateMergedFieldCount(remaining_asmt_fields[asmt_ds.field_to_index('updated_at')])
    print(len(filter_map))
    print(len(remaining_asmt_filter_status))
    remaining_patient_ids = remaining_asmt_fields[asmt_ds.field_to_index('patient_id')]
    iterate_over_patient_assessments2(remaining_patient_ids, remaining_asmt_filter_status, fn)
    remaining_asmt_row_count = len(filter_map) - fn.merged_row_count
    print(f'{len(filter_map)} - {fn.merged_row_count} = {remaining_asmt_row_count}')

    existing_field_indices = [(f, asmt_ds.field_to_index(f)) for f in existing_fields]

    resulting_fields = dict()
    for e in existing_fields:
        resulting_fields[e] = [None] * remaining_asmt_row_count
    for dk, dv in remaining_dest_fields.items():
        resulting_fields[dk] = np.zeros((remaining_asmt_row_count, ), dtype=dv.dtype)

    resulting_field_keys = dict()
    for dk, dv in asmt_dest_keys.items():
        resulting_field_keys[dk] = dv

    print('remaining_dest_len:', len(remaining_dest_fields['fatigue_binary']))
    print('resulting_fields:', len(resulting_fields['patient_id']))

    print(build_histogram(remaining_dest_fields['tested_covid_positive_clean']))
    concat_field_indices =\
        [asmt_ds.field_to_index('other_symptoms'), asmt_ds.field_to_index('treatment')]
    merge = MergeAssessmentRows(concat_field_indices,
                                resulting_fields, remaining_dest_fields,
                                existing_field_indices, custom_field_aggregators)
    iterate_over_patient_assessments(remaining_asmt_fields, remaining_asmt_filter_status, merge)
    print(merge.rfindex)

    unique_patients = defaultdict(int)
    for r in remaining_patient_ids:
        unique_patients[r] += 1
    print('unique patients in remaining assessments:', len(unique_patients))

    print(); print()
    print("filter summaries")
    print("----------------")

    print(); print('patient flags set')
    for v in patient_flag_descs.keys():
        print(f'{patient_flag_descs[v]}: {count_flag_set(geoc_filter_status, v)}')

    print(); print('assessment flags set')
    for v in assessment_flag_descs.keys():
        print(f'{assessment_flag_descs[v]}: {count_flag_set(asmt_filter_status, v)}')

    return (geoc_ds, geoc_filter_status, ptnt_dest_fields,
            asmt_ds, asmt_filter_status,
            remaining_asmt_fields, remaining_asmt_filter_status,
            resulting_fields, resulting_field_keys)
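count_flag_set and count_flag_empty are used throughout the pipeline but not defined here. Given that the filter-status arrays hold per-row bit flags (note the |= updates above), plausible one-liners are:

import numpy as np

def count_flag_set(filter_status, flag):
    # Hypothetical helper: rows whose status has this flag bit set.
    return np.count_nonzero(filter_status & flag)

def count_flag_empty(filter_status):
    # Hypothetical helper: rows with no filter flags set at all.
    return np.count_nonzero(filter_status == 0)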
Example #7
def split_data(patient_data, assessment_data, bucket_size=500000):

    with open(patient_data) as f:
        p_ds = dataset.Dataset(f, keys=('id', 'created_at'),
                               progress=True)
                               # progress=True, stop_after=500000)
        p_ds.sort(('created_at', 'id'))
        p_ids = p_ds.field_by_name('id')
        p_dts = p_ds.field_by_name('created_at')

    # put assessment ids into buckets
    buckets = dict()
    bucket_index = 0
    bucket_count = 0
    for i_r in range(p_ds.row_count()):
        if bucket_index == bucket_size:
            bucket_index = 0
            bucket_count += 1
        buckets[p_ids[i_r]] = bucket_count
        bucket_index += 1

    filenames = list()
    for b in range(bucket_count+1):
        destination_filename = patient_data[:-4] + f"_{b:04d}" + ".csv"
        filenames.append(destination_filename)
    print(filenames)
    sorted_indices = p_ds.index_
    del p_ds

    patient_splitter(patient_data, filenames, sorted_indices, bucket_size)

    print('buckets:', bucket_count + 1)
    with open(assessment_data) as f:
        a_ds = dataset.Dataset(f, keys=('patient_id', 'other_symptoms'), progress=True)

    print(utils.build_histogram(buckets.values()))

    print('associating assessments with patients')
    orphaned_assessments = 0
    a_buckets = list()
    a_pids = a_ds.field_by_name('patient_id')
    a_os = a_ds.field_by_name('other_symptoms')
    for i_r in range(a_ds.row_count()):
        if a_pids[i_r] in buckets:
            a_buckets.append(buckets[a_pids[i_r]])
        else:
            orphaned_assessments += 1
            a_buckets.append(-1)

    del a_ds
    print('orphaned_assessments:', orphaned_assessments)

    print(f'{bucket_count + 1} buckets')
    for i in range(bucket_count + 1):
        print('bucket', i)
        destination_filename = assessment_data[:-4] + f"_{i:04d}" + ".csv"
        print(destination_filename)
        # with open(assessment_data) as f:
        #     a_ds = dataset.Dataset(f, filter_fn=lambda j: a_buckets[j] == i, progress=True)
        #
        # del a_ds
        assessment_splitter(assessment_data, destination_filename, a_buckets, i)

    print('done!')
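A typical invocation might look like this (the file names are placeholders, not from the original script):

# Splits patients.csv into patients_0000.csv, patients_0001.csv, ...
# and routes each assessment row to the bucket holding its patient.
split_data('patients.csv', 'assessments.csv', bucket_size=500000)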
Example #8
        self.u = max(self.u, day)
        self.c += 1


assessment_filter_count = 0
for i_p, p in enumerate(a_pids):
    if p in filtered_patients:
        assessment_filter_count += 1
        day = utils.timestamp_to_day(a_updateds[i_p])
        if filtered_patients[p] is None:
            filtered_patients[p] = AsmtEntry(day)
        else:
            filtered_patients[p].add(day)

updated_ats = [v.u for v in filtered_patients.values() if v is not None]
asmt_counts = [v.c for v in filtered_patients.values() if v is not None]
# print(updated_ats)
h_updated_ats = sorted(utils.build_histogram(updated_ats))
print(h_updated_ats)
sumv = 0
for h in h_updated_ats:
    sumv += h[1]
    print(h[0], 31310 - sumv)
print(sumv)

h_asmt_counts = sorted(utils.build_histogram(asmt_counts))
# print(h_asmt_counts)
print(assessment_filter_count - sumv)
for h in h_asmt_counts:
    print(h[0], h[1])
print('assessments filtered by patient_filtering:', assessment_filter_count)
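The AsmtEntry class is truncated above; only the body of its add method survives. A reconstruction consistent with that fragment and with the v.u / v.c reads (the constructor is assumed):

class AsmtEntry:
    # Hypothetical reconstruction: u tracks the latest assessment day,
    # c counts assessments for the patient.
    def __init__(self, day):
        self.u = day
        self.c = 1

    def add(self, day):
        self.u = max(self.u, day)
        self.c += 1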
Example #9
# Copyright 2020 KCL-BMEIS - King's College London
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import dataset
import pipeline
import utils

patients_filename = '/home/ben/covid/patients_export_geocodes_20200423050002.csv'
assessments_filename = '/home/ben/covid/assessments_export_20200423050002.csv'
#fn = '/home/ben/covid/assessments_short.csv'
print(f'loading {patients_filename}')
with open(patients_filename) as f:
    ds = dataset.Dataset(f, progress=True)

print(utils.build_histogram(ds.field_by_name('version')))

print(f'loading {assessments_filename}')
with open(assessments_filename) as f:
    ds = dataset.Dataset(f, progress=True)

print(utils.build_histogram(ds.field_by_name('version')))
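Unlike the dict-returning build_histogram in Example #2, utils.build_histogram is passed straight to sorted() and its items are indexed as h[0] / h[1] elsewhere in these examples, suggesting it yields (value, count) pairs. A minimal sketch under that assumption:

from collections import Counter

def build_histogram(values):
    # Hypothetical sketch of the utils helper: (value, count) pairs.
    return list(Counter(values).items())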
Example #10
# =============================================================
import random
import utils


def choose_from_hist(histogram):
    swap_map = utils.swap_counter_and_value_histogram(histogram)
    counter_list = list(swap_map.keys())
    max_counter = max(counter_list)
    min_counter = min(counter_list)

    res = None
    while not res:
        rand = random.randint(min_counter, max_counter)
        # .get avoids a KeyError when rand is not one of the counts present
        # in swap_map, and keeps rand paired with the res it selected.
        res = swap_map.get(rand)

    return (res, rand, utils.sumall(counter_list))


if __name__ == "__main__":
    t = ['a', 'a', 'b']
    histogram = utils.build_histogram(t)
    res = choose_from_hist(histogram)
    print("The choosen one is {} with probability of {}/{}".format(
        res[0], res[1], res[2]))

    # OR using built-in function
    rand_word = random.choice(list(histogram.keys()))
    print("Random word using built-in function:", rand_word)
Example #11

past_symptom_keys = (
     'past_symptoms_days_ago',
     'past_symptom_anosmia', 'past_symptom_shortness_of_breath',
     'past_symptom_fatigue', 'past_symptom_fever',
     'past_symptom_skipped_meals', 'past_symptom_persistent_cough',
     'past_symptom_diarrhoea', 'past_symptom_chest_pain',
     'past_symptom_hoarse_voice', 'past_symptom_abdominal_pain',
     'past_symptom_delirium')

with open(patient_filename) as f:
    ds = dataset.Dataset(f, keys=core_keys + past_symptom_keys, progress=True)
    # progress=True, stop_after=999999)
    ds.sort(('created_at', 'id'))

for p in past_symptom_keys:
    field = ds.field_by_name(p)
    if p == 'past_symptoms_days_ago':
        histogram = build_histogram(field)
        nones = None
        for h in histogram:
            if h[0] == '':
                nones = h[1]
        histogram = [(int(v[0]), v[1]) for v in build_histogram(field)
                     if v[0] != '']
        if nones is not None:
            histogram = [(None, nones)] + sorted(histogram)
        else:
            histogram = sorted(histogram)
        print(f"{p}:", histogram)
    else:
        print(f"{p}:", sorted(build_histogram(field)))

p_ids = ds.field_by_name('id')
def check_other_symptoms(input_filename):
    with open(input_filename) as f:
        ds = dataset.Dataset(f,
                             keys=('patient_id', 'updated_at',
                                   'other_symptoms', 'treatment', 'location'),
                             progress=True)
        # progress=True, stop_after=2999999)

    by_patient = defaultdict(Locations)
    p_id = ds.field_by_name('patient_id')
    other = ds.field_by_name('other_symptoms')
    treatment = ds.field_by_name('treatment')
    location = ds.field_by_name('location')

    word_dict = defaultdict(int)
    _2ple_dict = defaultdict(int)
    _3ple_dict = defaultdict(int)
    other_symptoms_empty = 0
    other_treatment_empty = 0
    table = str.maketrans('', '', string.punctuation)
    for i_r in range(len(other)):
        by_patient[p_id[i_r]].add(location[i_r])
        if other[i_r] == '':
            other_symptoms_empty += 1
        else:
            # split and clean words, then add to dictionary
            words = other[i_r].split()
            cwords = [(w.lower()).translate(table) for w in words]
            cwords = [w for w in cwords if w != '']
            for c in cwords:
                word_dict[c] += 1
            for i_c in range(len(cwords) - 1):
                _2ple_dict[(cwords[i_c], cwords[i_c + 1])] += 1
            for i_c in range(len(cwords) - 2):
                _3ple_dict[(cwords[i_c], cwords[i_c + 1],
                            cwords[i_c + 2])] += 1
    for i_r in range(len(treatment)):
        if treatment[i_r] == '':
            other_treatment_empty += 1

    by_patient_values = list(by_patient.values())
    by_patient_hist = utils.build_histogram([(v.hosp, v.bfhosp)
                                             for v in by_patient_values])
    by_patient_hist = sorted(by_patient_hist, reverse=True)
    print(by_patient_hist)

    print(utils.build_histogram(location))
    print('other_symptoms - non-empty', ds.row_count() - other_symptoms_empty)
    print('other_treatment - non-empty',
          ds.row_count() - other_treatment_empty)
    by_max_freq = sorted([w for w in word_dict.items()],
                         key=lambda x: (-x[1], x[0]))
    by_max_freq_2ple =\
        sorted([w for w in _2ple_dict.items()], key=lambda x: (-x[1], x[0]))
    by_max_freq_3ple =\
        sorted([w for w in _3ple_dict.items()], key=lambda x: (-x[1], x[0]))

    threshold = 100
    for w in by_max_freq:
        if w[1] < threshold:
            break
        print(w[0], w[1])

    for w in by_max_freq_2ple:
        if w[1] < threshold:
            break
        print(w[0], w[1])

    for w in by_max_freq_3ple:
        if w[1] < threshold:
            break
        print(w[0], w[1])

    for i_r in range(len(treatment)):
        if ',' in treatment[i_r]:
            print(i_r, treatment[i_r])
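The Locations class used by check_other_symptoms is not shown. A sketch consistent with the .add(location) calls and the .hosp / .bfhosp reads above (the location labels are assumptions):

class Locations:
    # Hypothetical reconstruction: per-patient counts of assessments
    # logged from hospital and back-from-hospital locations.
    def __init__(self):
        self.hosp = 0
        self.bfhosp = 0

    def add(self, location):
        if location == 'hospital':
            self.hosp += 1
        elif location == 'back_from_hospital':
            self.bfhosp += 1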