def create_db_SURF(data_train):
    # extract SURF descriptors for every training image
    descriptors, fnames = SURF_feature_extraction(data_train)
    # fit the visual vocabulary on all descriptors pooled together
    kmeans = clustering(np.concatenate(descriptors, axis=0))
    # encode each image as a bag-of-visual-words histogram and save it
    for des, fname in tqdm.tqdm(zip(descriptors, fnames), desc='Creating database'):
        representation = build_histogram(des, kmeans)
        np.save('./database/BoW/SURF/' + fname[:-4], representation)
    joblib.dump(kmeans, './database/BoW/SURF/kmeans_trained.pkl')
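# build_histogram is defined elsewhere in this project; the sketch below shows the
# usual bag-of-visual-words encoding it is assumed to perform (nearest visual word
# per descriptor, then a normalised count per word). The sklearn-style
# kmeans.predict / kmeans.n_clusters interface is an assumption, not the project's
# actual implementation.
def build_histogram_sketch(descriptors, kmeans):
    words = kmeans.predict(descriptors)                   # nearest cluster per descriptor
    hist, _ = np.histogram(words, bins=np.arange(kmeans.n_clusters + 1))
    return hist.astype(np.float32) / max(hist.sum(), 1)   # L1-normalised histogram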
def most_frequent(s):
    # count occurrences, then return the values ordered from most to least frequent
    histogram = build_histogram(s)
    #print("> debug: histogram is", histogram)
    res = []
    for k, v in histogram.items():
        res.append((v, k))
    #print("> debug: res is", res)
    res.sort(reverse=True)
    #print("> debug: res after sort", res)
    final_res = []
    for count, value in res:
        final_res.append(value)
    return final_res
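# Here build_histogram is assumed to return a dict mapping each value to its
# occurrence count (it is defined elsewhere); a minimal sketch plus a usage example
# under that assumption:
from collections import defaultdict

def build_histogram_dict_sketch(values):
    counts = defaultdict(int)
    for v in values:
        counts[v] += 1
    return dict(counts)

# most_frequent(['a', 'a', 'b']) would then return ['a', 'b']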
def eval(feat_type, q_valid):
    maps = {}
    db_embeddings, mAP, time_running = [], [], []
    # load every stored database representation; key the name map by the embedding's
    # position so that top-k indices (which index into db_embeddings) map back to the
    # right image even when non-.npy files share the directory
    for fname in sorted(os.listdir('./database/BoW/%s/' % feat_type)):
        if fname.endswith('.npy'):
            embedding = torch.FloatTensor(
                np.load('./database/BoW/%s/' % feat_type + fname))
            maps[len(db_embeddings)] = fname[:-4] + '.jpg'
            db_embeddings.append(embedding)
    db_embeddings = torch.stack(db_embeddings, dim=0).squeeze(dim=0)
    cs_func = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
    kmeans = joblib.load('./database/BoW/%s/kmeans_trained.pkl' % feat_type)

    for q_name, attribute in tqdm.tqdm(q_valid.get_queries().items(), desc='Evaluating'):
        bbox, class_idx = attribute[0], attribute[1]
        start = time.time()
        query_img = cv2.imread(hp.image_dir + q_name)
        # NOTE: query descriptors are computed with SIFT here regardless of feat_type
        des = sift_detect_and_compute([query_img], normalize=True, keep_top_k=320)
        representation = build_histogram(des, kmeans)
        query_embedding = torch.FloatTensor(representation).unsqueeze(dim=0)
        if use_gpu:
            query_embedding = query_embedding.cuda()
            db_embeddings = db_embeddings.cuda()
            cs_func = cs_func.cuda()
        similarity = cs_func(query_embedding, db_embeddings).topk(
            len(q_valid.get_groundtruth()[class_idx]))
        prediction = [maps[idx] for idx in similarity[1].cpu().numpy()]
        end = time.time()
        score = similarity[0].cpu().numpy()
        AP = calculate_AP(prediction=prediction, score=score,
                          groundtruth=q_valid.get_groundtruth()[class_idx])
        mAP.append(AP)
        time_running.append(end - start)
    print('mAP: %f' % (np.mean(mAP) * 100))
    print('Time running: %f secs' % np.mean(time_running))
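# calculate_AP is defined elsewhere; a minimal sketch of standard retrieval average
# precision, assuming `prediction` is already ranked by descending similarity and
# `groundtruth` is the collection of relevant file names (the `score` argument is
# accepted but unused in this hypothetical version):
def calculate_AP_sketch(prediction, score, groundtruth):
    relevant = set(groundtruth)
    hits, precisions = 0, []
    for rank, p in enumerate(prediction, start=1):
        if p in relevant:
            hits += 1
            precisions.append(hits / rank)   # precision at each relevant rank
    return sum(precisions) / len(relevant) if relevant else 0.0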
data_schema = data_schemas.DataSchema(1)

src_genders = ds.field_by_name('gender')
src_yobs = ds.field_by_name('year_of_birth')
src_weights = ds.field_by_name('weight_kg')
src_heights = ds.field_by_name('height_cm')
src_bmis = ds.field_by_name('bmi')

# ages
age_fn = CalculateAgeFromYearOfBirth(0x1, 0x2, utils.valid_range_fac_inc(0, 90), 2020)
ages = np.zeros(len(src_yobs), dtype=np.uint32)
age_fn(src_yobs, ages, filter_status)
print(sorted(utils.build_histogram(ages)))

# gender check
print('gender:', utils.build_histogram(src_genders))

cast_weights = [-1.0 if v == '' else float(v) for v in src_weights]
bins = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 32, 34, 36, 38, 40,
        50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190,
        200, 202, 204, 206, 208, 210, 212, 214, 216, 218, 220, 230, 240, 250,
        260, 270, 280, 290, 300, 320, 340, 360, 380, 400, 450, 500, 600, 700,
        800, 900, 1000, 10000, 100000, 100000, 1000000, 100000000]
cast_weights_hist = np.histogram(cast_weights, bins=bins)
for k, v in zip(cast_weights_hist[1], cast_weights_hist[0]):
    print(f'{k}: {v}')
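# utils.build_histogram is defined elsewhere in the repo; the prints above only rely
# on it reducing an iterable to (value, count) pairs, so a minimal sketch under that
# assumption (not the repo's actual implementation) would be:
from collections import defaultdict

def build_histogram_pairs_sketch(values):
    counts = defaultdict(int)
    for v in values:
        counts[v] += 1
    return list(counts.items())   # e.g. [(24, 10321), (25, 9876), ...]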
        else:
            self.last_yes_date = max(self.last_yes_date, created_at)
        if self.latest_assessment_date is None:
            self.latest_assessment_date = created_at
        else:
            self.latest_assessment_date = max(self.latest_assessment_date, created_at)


by_patient = defaultdict(TestCount)
a_pids = a_ds.field_by_name('patient_id')
a_hcts = a_ds.field_by_name('had_covid_test')
a_tcps = a_ds.field_by_name('tested_covid_positive')
a_c_ats = a_ds.field_by_name('created_at')
print(utils.build_histogram(a_hcts))

for i_r in range(a_ds.row_count()):
    by_patient[a_pids[i_r]].add_assessment(a_hcts[i_r], a_tcps[i_r], a_c_ats[i_r])

asmt_count = 0
hct_count = 0
count_yes_only = 0
count_no_only = 0
count_yes_and_no = 0
count_yes_after_no = 0
count_no_after_yes = 0
hgram_test_counts = defaultdict(int)
for k, v in by_patient.items():
    asmt_count += v.count
    if v.had_covid_test_count > 0:
def pipeline(patient_filename, assessment_filename, data_schema, parsing_schema,
             year, territory=None):
    categorical_maps = data_schema.assessment_categorical_maps
    # TODO: use proper logging throughout
    print(); print()
    print('load patients')
    print('=============')
    with open(patient_filename) as f:
        geoc_ds = dataset.Dataset(f, data_schema.patient_categorical_maps, progress=True)
    print("sorting patients")
    geoc_ds.sort(('id',))
    geoc_ds.show()
    print("patient row count:", geoc_ds.row_count())

    print(); print()
    print('load assessments')
    print('================')
    with open(assessment_filename) as f:
        asmt_ds = dataset.Dataset(f, data_schema.assessment_categorical_maps, progress=True)
    print('sorting assessments')
    asmt_ds.sort(('patient_id', 'updated_at'))
    asmt_ds.show()
    print("assessment row count:", asmt_ds.row_count())

    print(); print()
    print("pre-sort by patient id")
    print("======================")
    print("pre-sort patient data")
    geoc_filter_status = np.zeros(geoc_ds.row_count(), dtype=np.uint32)
    print(); print("pre-sort assessment data")
    asmt_filter_status = np.zeros(asmt_ds.row_count(), dtype=np.uint32)

    if territory is not None:
        print(); print()
        print("filter patients from outside the territory of interest")
        print("------------------------------------------------------")
        country_codes = geoc_ds.field_by_name('country_code')
        for ir, r in enumerate(country_codes):
            if r != territory:
                geoc_filter_status[ir] |= PFILTER_OTHER_TERRITORY
        print(f'other territories: filtered '
              f'{count_flag_set(geoc_filter_status, PFILTER_OTHER_TERRITORY)} patients')

    print('patients:', len(geoc_filter_status))
    # print('patients with no assessments:',
    #       count_flag_set(geoc_filter_status, PFILTER_NO_ASSESSMENTS))
    # print('patients with one assessment:',
    #       count_flag_set(geoc_filter_status, PFILTER_ONE_ASSESSMENT))
    # print('patients with sufficient assessments:', geoc_filter_status.count(0))

    print(); print()
    print("patients")
    print("--------")
    ptnt_dest_fields = dict()

    print()
    print("checking age")
    src_yobs = geoc_ds.field_by_name('year_of_birth')
    ages = np.zeros(len(src_yobs), dtype=np.uint32)
    fn = CalculateAgeFromYearOfBirth(FILTER_MISSING_AGE, FILTER_BAD_AGE,
                                     valid_range_fac_inc(MIN_AGE, MAX_AGE), year)
    fn(src_yobs, ages, geoc_filter_status)
    ptnt_dest_fields['age'] = ages
    print(f'age: filtered {count_flag_set(geoc_filter_status, FILTER_MISSING_AGE)} missing values')
    print(f'age: filtered {count_flag_set(geoc_filter_status, FILTER_BAD_AGE)} bad values')

    print()
    print('checking weight / height / bmi')
    src_genders = geoc_ds.field_by_name('gender')
    src_weights = geoc_ds.field_by_name('weight_kg')
    src_heights = geoc_ds.field_by_name('height_cm')
    src_bmis = geoc_ds.field_by_name('bmi')
    fn_fac = parsing_schema.class_entries['validate_weight_height_bmi']
    fn = fn_fac(MIN_WEIGHT, MAX_WEIGHT, MIN_HEIGHT, MAX_HEIGHT, MIN_BMI, MAX_BMI,
                FILTER_MISSING_AGE, FILTER_BAD_AGE,
                FILTER_MISSING_WEIGHT, FILTER_BAD_WEIGHT,
                FILTER_MISSING_HEIGHT, FILTER_BAD_HEIGHT,
                FILTER_MISSING_BMI, FILTER_BAD_BMI)
    weight_clean, height_clean, bmi_clean =\
        fn(src_genders, ages, src_weights, src_heights, src_bmis, geoc_filter_status)
    ptnt_dest_fields['weight_clean'] = weight_clean
    ptnt_dest_fields['height_clean'] = height_clean
    ptnt_dest_fields['bmi_clean'] = bmi_clean
    print(f'weight: filtered {count_flag_set(geoc_filter_status, FILTER_MISSING_WEIGHT)} missing values')
    print(f'weight: filtered {count_flag_set(geoc_filter_status, FILTER_BAD_WEIGHT)} bad values')
    print(f'height: filtered {count_flag_set(geoc_filter_status, FILTER_MISSING_HEIGHT)} missing values')
    print(f'height: filtered {count_flag_set(geoc_filter_status, FILTER_BAD_HEIGHT)} bad values')
    print(f'bmi: filtered {count_flag_set(geoc_filter_status, FILTER_MISSING_BMI)} missing values')
    print(f'bmi: filtered {count_flag_set(geoc_filter_status, FILTER_BAD_BMI)} bad values')

    print(); print('unfiltered patients:', count_flag_empty(geoc_filter_status))

    print(); print()
    print("assessments")
    print("-----------")
    asmt_dest_fields = dict()
    asmt_dest_keys = dict()

    patient_ids = set()
    src_patient_ids = geoc_ds.field_by_name('id')
    for ir, r in enumerate(src_patient_ids):
        if geoc_filter_status[ir] == 0:
            patient_ids.add(r)
    src_asmt_patient_ids = asmt_ds.field_by_name('patient_id')
    for ir, r in enumerate(src_asmt_patient_ids):
        if r not in patient_ids:
            asmt_filter_status[ir] |= AFILTER_PATIENT_FILTERED
    print('assessments filtered due to patient filtering:',
          count_flag_set(asmt_filter_status, AFILTER_PATIENT_FILTERED))
    print('assessments filtered total:',
          count_flag_set(asmt_filter_status, FILTERA_ALL))

    print(); print("checking temperature")
    fn_fac = parsing_schema.class_entries['validate_temperature']
    fn = fn_fac(MIN_TEMP, MAX_TEMP, FILTER_MISSING_TEMP, FILTER_BAD_TEMP)
    temperature_c = fn(asmt_ds.field_by_name('temperature'), asmt_filter_status)
    asmt_dest_fields['temperature_C'] = temperature_c
    print(f'temperature: filtered {count_flag_set(asmt_filter_status, FILTER_BAD_TEMP)} bad values')

    print(); print("checking inconsistent test / test results fields")
    src_had_test = asmt_ds.field_by_name('had_covid_test')
    src_tested_covid_positive = asmt_ds.field_by_name('tested_covid_positive')
    fn = CheckTestingConsistency(FILTER_INCONSISTENT_NOT_TESTED, FILTER_INCONSISTENT_TESTED)
    fn(src_had_test, src_tested_covid_positive, asmt_filter_status)
    print(f'inconsistent_not_tested: filtered {count_flag_set(asmt_filter_status, FILTER_INCONSISTENT_NOT_TESTED)} values')
    print(f'inconsistent_tested: filtered {count_flag_set(asmt_filter_status, FILTER_INCONSISTENT_TESTED)} values')

    print(); print('unfiltered assessments:', np.count_nonzero(asmt_filter_status == 0))

    print(); print()
    print("convert symptomatic, exposure, flattened and miscellaneous fields to bool")
    print("--------------------------------------------------------------------------")
    any_symptoms = np.zeros(asmt_ds.row_count(), dtype=bool)
    for s in symptomatic_fields:
        print(f"symptomatic_field '{s}' to categorical")
        asmt_dest_fields[s] = copy_field(asmt_ds.field_by_name(s))
        any_symptoms |= asmt_dest_fields[s] > 1
        print(np.count_nonzero(asmt_dest_fields[s] == True))

    print(np.count_nonzero(any_symptoms == True))

    print(build_histogram(asmt_ds.field_by_name('tested_covid_positive')))

    for f in flattened_fields:
        print(f"flattened_field '{f[0]}' to categorical field '{f[1]}'")
        remap = map_between_categories(categorical_maps[f[0]].strings_to_values,
                                       categorical_maps[f[1]].strings_to_values)
        asmt_dest_fields[f[1]] =\
            to_categorical(asmt_ds.field_by_name(f[0]), remap)
        # TODO: this shouldn't be necessary as the fields were covered in 'symptomatic_fields'
        any_symptoms |= asmt_dest_fields[f[1]] > 1

    for e in exposure_fields:
        print(f"exposure_field '{e}' to categorical")
        asmt_dest_fields[e] = copy_field(asmt_ds.field_by_name(e))

    for m in miscellaneous_fields:
        print(f"miscellaneous_field '{m}' to categorical")
        asmt_dest_fields[m] = copy_field(asmt_ds.field_by_name(m))

    print(); print()
    print("validate health status with symptoms")
    print("-------------------------------------")
    fn = CheckInconsistentSymptoms(FILTER_HEALTHY_BUT_SYMPTOMS,
                                   FILTER_NOT_HEALTHY_BUT_NO_SYMPTOMS)
    # TODO: keys should be got from the dataset once it is loaded rather than
    # referring to the categorical maps directly
    fn(asmt_dest_fields['health_status'], any_symptoms, asmt_filter_status,
       categorical_maps['health_status'].strings_to_values['healthy'],
       categorical_maps['health_status'].strings_to_values['not_healthy'])
    for f in (FILTER_HEALTHY_BUT_SYMPTOMS, FILTER_NOT_HEALTHY_BUT_NO_SYMPTOMS):
        print(f'{assessment_flag_descs[f]}: {count_flag_set(asmt_filter_status, f)}')

    print(); print('unfiltered assessments:', np.count_nonzero(asmt_filter_status == 0))

    # validate assessments per patient
    print(); print()
    print("validate covid progression")
    print("--------------------------")
    sanitised_hct_covid_results = np.ndarray(asmt_ds.row_count(), dtype=np.uint8)
    sanitised_covid_results = np.ndarray(asmt_ds.row_count(), dtype=np.uint8)
    sanitised_covid_results_key = categorical_maps['tested_covid_positive'].values_to_strings[:]

    fn_fac = parsing_schema.class_entries['clean_covid_progression']
    fn = fn_fac(asmt_ds.field_by_name('had_covid_test'),
                asmt_ds.field_by_name('tested_covid_positive'),
                asmt_filter_status,
                sanitised_hct_covid_results, sanitised_covid_results,
                FILTER_INVALID_COVID_PROGRESSION)
    iterate_over_patient_assessments2(
        asmt_ds.field_by_name('patient_id'), asmt_filter_status, fn)
    print(f'{assessment_flag_descs[FILTER_INVALID_COVID_PROGRESSION]}:',
          count_flag_set(asmt_filter_status, FILTER_INVALID_COVID_PROGRESSION))

    asmt_dest_fields['tested_covid_positive_clean'] = sanitised_covid_results
    asmt_dest_keys['tested_covid_positive_clean'] = sanitised_covid_results_key
    asmt_dest_fields['had_covid_test_clean'] = sanitised_hct_covid_results
    asmt_dest_keys['had_covid_test_clean'] = sanitised_covid_results_key

    print('remaining assessments before squashing',
          np.count_nonzero(asmt_filter_status == 0))

    # create a new assessment space with only unfiltered rows
    print(); print()
    print("discard all filtered assessments")
    print("--------------------------------")
    remaining_asmt_fields = list()
    remaining_dest_fields = dict()

    filter_map = list()
    for ir, r in enumerate(asmt_filter_status):
        if r == 0:
            filter_map.append(ir)

    for ir, r in enumerate(asmt_ds.fields_):
        remaining_asmt_fields.append(filtered_field.FilteredField(r, filter_map))

    for k, v in asmt_dest_fields.items():
        remaining_dest_fields[k] = filtered_field.FilteredField(v, filter_map)

    print("remaining asmt fields: ", len(filter_map))

    remaining_asmt_filter_status = [0] * len(filter_map)

    print(); print()
    print("quantise assessments by day")
    print("---------------------------")
    fn = CalculateMergedFieldCount(remaining_asmt_fields[asmt_ds.field_to_index('updated_at')])
    print(len(filter_map))
    print(len(remaining_asmt_filter_status))
    remaining_patient_ids = remaining_asmt_fields[asmt_ds.field_to_index('patient_id')]
    iterate_over_patient_assessments2(remaining_patient_ids, remaining_asmt_filter_status, fn)
    remaining_asmt_row_count = len(filter_map) - fn.merged_row_count
    print(f'{len(filter_map)} - {fn.merged_row_count} = {remaining_asmt_row_count}')

    existing_field_indices = [(f, asmt_ds.field_to_index(f)) for f in existing_fields]

    resulting_fields = dict()
    for e in existing_fields:
        resulting_fields[e] = [None] * remaining_asmt_row_count
    for dk, dv in remaining_dest_fields.items():
        resulting_fields[dk] = np.zeros((remaining_asmt_row_count,), dtype=dv.dtype)
    resulting_field_keys = dict()
    for dk, dv in asmt_dest_keys.items():
        resulting_field_keys[dk] = dv

    print('remaining_dest_len:', len(remaining_dest_fields['fatigue_binary']))
    print('resulting_fields:', len(resulting_fields['patient_id']))

    print(build_histogram(remaining_dest_fields['tested_covid_positive_clean']))

    concat_field_indices =\
        [asmt_ds.field_to_index('other_symptoms'), asmt_ds.field_to_index('treatment')]
    merge = MergeAssessmentRows(concat_field_indices,
                                resulting_fields, remaining_dest_fields,
                                existing_field_indices, custom_field_aggregators)
    iterate_over_patient_assessments(remaining_asmt_fields, remaining_asmt_filter_status, merge)
    print(merge.rfindex)

    unique_patients = defaultdict(int)
    for r in remaining_patient_ids:
        unique_patients[r] += 1
    print('unique patients in remaining assessments:', len(unique_patients))

    print(); print()
    print("filter summaries")
    print("----------------")
    print(); print('patient flags set')
    for v in patient_flag_descs.keys():
        print(f'{patient_flag_descs[v]}: {count_flag_set(geoc_filter_status, v)}')
    print(); print('assessment flags set')
    for v in assessment_flag_descs.keys():
        print(f'{assessment_flag_descs[v]}: {count_flag_set(asmt_filter_status, v)}')

    return (geoc_ds, geoc_filter_status, ptnt_dest_fields,
            asmt_ds, asmt_filter_status,
            remaining_asmt_fields, remaining_asmt_filter_status,
            resulting_fields, resulting_field_keys)
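# count_flag_set and count_flag_empty come from the surrounding module; the pipeline
# above only needs them to count rows whose bitwise filter flag is set, and rows with
# no flags set at all. A minimal sketch under that assumption (numpy is already
# imported as np in this module):
def count_flag_set_sketch(filter_status, flag):
    return int(np.count_nonzero(filter_status & flag))

def count_flag_empty_sketch(filter_status):
    return int(np.count_nonzero(filter_status == 0))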
def split_data(patient_data, assessment_data, bucket_size=500000):
    with open(patient_data) as f:
        p_ds = dataset.Dataset(f, keys=('id', 'created_at'), progress=True)
        # progress=True, stop_after=500000)
    p_ds.sort(('created_at', 'id'))
    p_ids = p_ds.field_by_name('id')
    p_dts = p_ds.field_by_name('created_at')

    # put assessment ids into buckets
    buckets = dict()
    bucket_index = 0
    bucket_count = 0
    for i_r in range(p_ds.row_count()):
        if bucket_index == bucket_size:
            bucket_index = 0
            bucket_count += 1
        buckets[p_ids[i_r]] = bucket_count
        bucket_index += 1

    filenames = list()
    for b in range(bucket_count + 1):
        destination_filename = patient_data[:-4] + f"_{b:04d}" + ".csv"
        filenames.append(destination_filename)
    print(filenames)
    sorted_indices = p_ds.index_
    del p_ds
    patient_splitter(patient_data, filenames, sorted_indices, bucket_size)

    print('buckets:', bucket_index)

    with open(assessment_data) as f:
        a_ds = dataset.Dataset(f, keys=('patient_id', 'other_symptoms'), progress=True)

    print(utils.build_histogram(buckets.values()))

    print('associating assessments with patients')
    orphaned_assessments = 0
    a_buckets = list()
    a_pids = a_ds.field_by_name('patient_id')
    a_os = a_ds.field_by_name('other_symptoms')
    for i_r in range(a_ds.row_count()):
        if a_pids[i_r] in buckets:
            a_buckets.append(buckets[a_pids[i_r]])
        else:
            orphaned_assessments += 1
            a_buckets.append(-1)
    del a_ds

    print('orphaned_assessments:', orphaned_assessments)

    print(f'{bucket_count + 1} buckets')
    for i in range(bucket_count + 1):
        print('bucket', i)
        destination_filename = assessment_data[:-4] + f"_{i:04d}" + ".csv"
        print(destination_filename)
        # with open(assessment_data) as f:
        #     a_ds = dataset.Dataset(f, filter_fn=lambda j: a_buckets[j] == i, progress=True)
        # del a_ds
        assessment_splitter(assessment_data, destination_filename, a_buckets, i)

    print('done!')
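# A hedged usage example: splitting the April exports (the file names here are the
# ones referenced elsewhere in this repo and are illustrative only):
if __name__ == '__main__':
    split_data('/home/ben/covid/patients_export_geocodes_20200423050002.csv',
               '/home/ben/covid/assessments_export_20200423050002.csv',
               bucket_size=500000)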
        self.u = max(self.u, day)
        self.c += 1


assessment_filter_count = 0
for i_p, p in enumerate(a_pids):
    if p in filtered_patients:
        assessment_filter_count += 1
        day = utils.timestamp_to_day(a_updateds[i_p])
        if filtered_patients[p] is None:
            filtered_patients[p] = AsmtEntry(day)
        else:
            filtered_patients[p].add(day)

updated_ats = [v.u for v in filtered_patients.values() if v is not None]
asmt_counts = [v.c for v in filtered_patients.values() if v is not None]
# print(updated_ats)
h_updated_ats = sorted(utils.build_histogram(updated_ats))
print(h_updated_ats)
sumv = 0
for h in h_updated_ats:
    sumv += h[1]
    print(h[0], 31310 - sumv)
print(sumv)

h_asmt_counts = sorted(utils.build_histogram(asmt_counts))
# print(h_asmt_counts)
print(assessment_filter_count - sumv)
for h in h_asmt_counts:
    print(h[0], h[1])

print('assessments filtered by patient_filtering:', assessment_filter_count)
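# utils.timestamp_to_day is defined elsewhere; assuming the 'updated_at' values are
# ISO-8601 timestamp strings, a minimal sketch would simply keep the date part:
def timestamp_to_day_sketch(timestamp):
    return timestamp[:10]   # '2020-04-23 05:00:02' -> '2020-04-23'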
# Copyright 2020 KCL-BMEIS - King's College London
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import dataset
import pipeline
import utils

patients_filename = '/home/ben/covid/patients_export_geocodes_20200423050002.csv'
assessments_filename = '/home/ben/covid/assessments_export_20200423050002.csv'
#fn = '/home/ben/covid/assessments_short.csv'

print(f'loading {patients_filename}')
with open(patients_filename) as f:
    ds = dataset.Dataset(f, progress=True)
print(utils.build_histogram(ds.field_by_name('version')))

print(f'loading {assessments_filename}')
with open(assessments_filename) as f:
    ds = dataset.Dataset(f, progress=True)
print(utils.build_histogram(ds.field_by_name('version')))
# =============================================================

import random

import utils


def choose_from_hist(histogram):
    swap_map = utils.swap_counter_and_value_histogram(histogram)
    counter_list = list(swap_map.keys())
    max_counter = max(counter_list)
    min_counter = min(counter_list)
    res = None
    while not res:
        rand = random.randint(min_counter, max_counter)
        # use .get so a draw with no matching counter retries instead of raising KeyError
        res = swap_map.get(rand)
    return (res, rand, utils.sumall(counter_list))


if __name__ == "__main__":
    t = ['a', 'a', 'b']
    histogram = utils.build_histogram(t)
    res = choose_from_hist(histogram)
    print("The chosen one is {} with probability of {}/{}".format(
        res[0], res[1], res[2]))

    # OR using built-in function
    rand_word = random.choice(list(histogram.keys()))
    print("Random word using built-in function:", rand_word)
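    # random.choices from the standard library can also draw a value weighted by its
    # frequency, directly from the histogram counts (histogram is assumed dict-like
    # here, as the .keys() call above already implies):
    weighted_word = random.choices(list(histogram.keys()),
                                   weights=list(histogram.values()), k=1)[0]
    print("Random word weighted by frequency:", weighted_word)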
                     'past_symptom_anosmia', 'past_symptom_shortness_of_breath',
                     'past_symptom_fatigue', 'past_symptom_fever',
                     'past_symptom_skipped_meals', 'past_symptom_persistent_cough',
                     'past_symptom_diarrhoea', 'past_symptom_chest_pain',
                     'past_symptom_hoarse_voice', 'past_symptom_abdominal_pain',
                     'past_symptom_delirium')

with open(patient_filename) as f:
    ds = dataset.Dataset(f, keys=core_keys + past_symptom_keys, progress=True)
    # progress=True, stop_after=999999)

ds.sort(('created_at', 'id'))

for p in past_symptom_keys:
    field = ds.field_by_name(p)
    if p == 'past_symptoms_days_ago':
        histogram = build_histogram(field)
        nones = None
        for h in histogram:
            if h[0] == '':
                nones = h[1]
        histogram = [(int(v[0]), v[1]) for v in build_histogram(field) if v[0] != '']
        if nones is not None:
            histogram = [(None, nones)] + sorted(histogram)
        else:
            histogram = sorted(histogram)
        print(f"{p}:", histogram)
    else:
        print(f"{p}:", sorted(build_histogram(field)))

p_ids = ds.field_by_name('id')
def check_other_symptoms(input_filename):
    with open(input_filename) as f:
        ds = dataset.Dataset(f, keys=('patient_id', 'updated_at', 'other_symptoms',
                                      'treatment', 'location'),
                             progress=True)
        # progress=True, stop_after=2999999)

    by_patient = defaultdict(Locations)
    p_id = ds.field_by_name('patient_id')
    other = ds.field_by_name('other_symptoms')
    treatment = ds.field_by_name('treatment')
    location = ds.field_by_name('location')

    word_dict = defaultdict(int)
    _2ple_dict = defaultdict(int)
    _3ple_dict = defaultdict(int)
    other_symptoms_empty = 0
    other_treatment_empty = 0
    table = str.maketrans('', '', string.punctuation)
    for i_r in range(len(other)):
        by_patient[p_id[i_r]].add(location[i_r])
        if other[i_r] == '':
            other_symptoms_empty += 1
        else:
            # split and clean words, then add to dictionary
            words = other[i_r].split()
            cwords = [(w.lower()).translate(table) for w in words]
            cwords = [w for w in cwords if w != '']
            for c in cwords:
                word_dict[c] += 1
            for i_c in range(len(cwords) - 1):
                _2ple_dict[(cwords[i_c], cwords[i_c + 1])] += 1
            for i_c in range(len(cwords) - 2):
                _3ple_dict[(cwords[i_c], cwords[i_c + 1], cwords[i_c + 2])] += 1

    for i_r in range(len(treatment)):
        if treatment[i_r] == '':
            other_treatment_empty += 1

    by_patient_values = [x for x in by_patient.values()]
    by_patient_hist = utils.build_histogram([(v.hosp, v.bfhosp) for v in by_patient_values])
    by_patient_hist = sorted(by_patient_hist, reverse=True)
    print(by_patient_hist)

    print(utils.build_histogram(location))

    print('other_symptoms - non-empty', ds.row_count() - other_symptoms_empty)
    print('other_treatment - non-empty', ds.row_count() - other_treatment_empty)

    by_max_freq = sorted([w for w in word_dict.items()], key=lambda x: (-x[1], x[0]))
    by_max_freq_2ple =\
        sorted([w for w in _2ple_dict.items()], key=lambda x: (-x[1], x[0]))
    by_max_freq_3ple =\
        sorted([w for w in _3ple_dict.items()], key=lambda x: (-x[1], x[0]))

    threshold = 100
    for w in by_max_freq:
        if w[1] < threshold:
            break
        print(w[0], w[1])
    for w in by_max_freq_2ple:
        if w[1] < threshold:
            break
        print(w[0], w[1])
    for w in by_max_freq_3ple:
        if w[1] < threshold:
            break
        print(w[0], w[1])

    for i_r in range(len(treatment)):
        if ',' in treatment[i_r]:
            print(i_r, treatment[i_r])
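# The uni/bi/trigram counting above can be expressed more compactly with
# collections.Counter; an equivalent sketch (illustrative only, operating on
# already-cleaned token lists):
from collections import Counter

def ngram_counts(token_lists):
    words, bigrams, trigrams = Counter(), Counter(), Counter()
    for tokens in token_lists:
        words.update(tokens)
        bigrams.update(zip(tokens, tokens[1:]))
        trigrams.update(zip(tokens, tokens[1:], tokens[2:]))
    return words, bigrams, trigrams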