def gen_sepsis_json_data():
    """Build the per-patient, per-time sepsis record dict from the vital CSV.

    Keeps only rows for patients present in sepsis_time_dict.json whose
    timestamp does not exceed the patient's sepsis time, maps raw values to
    their rank-order scores, keys each record by the (negative) offset from
    the sepsis time, and writes sepsis_time_record_dict.json.
    """
    vital_file = args.vital_file
    feature_index_dict = py_op.myreadjson(os.path.join(args.result_dir, 'feature_index_dict.json'))
    index_feature_list = py_op.myreadjson(os.path.join(args.result_dir, 'index_feature_list.json'))
    feature_value_order_dict = py_op.myreadjson(os.path.join(args.result_dir, 'feature_value_order_dict.json'))
    # Re-key the order dict by feature index, dropping time-related features.
    feature_value_order_dict = {
        str(feature_index_dict[name]): order
        for name, order in feature_value_order_dict.items()
        if 'time' not in name
    }
    patient_time_dict = py_op.myreadjson(os.path.join(args.result_dir, 'sepsis_time_dict.json'))

    patient_time_record_dict = dict()
    for i_line, line in enumerate(open(vital_file)):
        if i_line and i_line % 10000 == 0:
            print('line', i_line)
        if not i_line:
            continue  # header row
        fields = line.strip().split(',')
        patient, time = fields[:2]
        time = time_to_min(time)
        if patient not in patient_time_dict:
            continue
        if time > patient_time_dict[patient]:
            # Only keep observations at or before the recorded sepsis time.
            continue
        time = int(float(time))
        if patient not in patient_time_record_dict:
            patient_time_record_dict[patient] = dict()
        record = dict()
        for idx, raw in enumerate(fields[2:]):
            if not raw:
                continue
            value_order = feature_value_order_dict[str(idx)]
            record[idx] = float('{:3.3f}'.format(value_order[raw]))
        # Key is the offset from sepsis time (always negative).
        patient_time_record_dict[patient][time - patient_time_dict[patient] - 1] = record

    with open(os.path.join(args.result_dir, 'sepsis_time_record_dict.json'), 'w') as f:
        f.write(json.dumps(patient_time_record_dict))
def analyze_sepsis():
    """Summarize how much pre-onset history sepsis-positive patients have.

    Counts positive patients (per sepsis_label_dict.json) and, for each
    window size in `d` (30/60/120 minutes), how many positive patients have
    an earliest record closer to onset than that window.
    """
    sepsis_time_record_dict = py_op.myreadjson(os.path.join(args.result_dir, 'sepsis_time_record_dict.json'))
    sepsis_label_dict = py_op.myreadjson(os.path.join(args.result_dir, 'sepsis_label_dict.json'))
    print(len(sepsis_time_record_dict))
    # BUG FIX: the original used `np = 0`, shadowing the numpy alias used
    # elsewhere in this module; the counter is renamed.
    n = 0
    d = {30: 0, 60: 0, 120: 0}
    for p, vd in sepsis_time_record_dict.items():
        if not sepsis_label_dict[p]:
            continue
        n += 1
        # BUG FIX: JSON keys are strings, so min() compared them
        # lexicographically ("-9" > "-10"); compare numerically instead.
        min_t = -min(int(float(k)) for k in vd.keys())
        for k in d:
            if min_t < k:
                d[k] += 1
    print(n)
    print(d)
    # NOTE(review): the original ended with a dead local reassignment of
    # sepsis_label_dict (never returned or written); it has been removed.
def compare_sepsis():
    """Restrict sepsis labels to patients that also have general labels.

    Keeps only sepsis-labelled admissions present in patient_label_dict,
    binarizes the labels, and rewrites sepsis_label_dict.json and
    sepsis_time_dict.json with the filtered sets.
    """
    print('reading')
    sepsis_label_dict = py_op.myreadjson(
        os.path.join(args.result_dir, 'sepsis_label_dict.json'))
    print('reading')
    patient_label_dict = py_op.myreadjson(
        os.path.join(args.result_dir, 'patient_label_dict.json'))
    print(len(set(sepsis_label_dict) & set(patient_label_dict)))
    # sepsis_label_dict = [k for k,v in sepsis_label_dict.items() if v ]
    print(len(set(sepsis_label_dict) & set(patient_label_dict)))

    # Binarize while dropping patients missing from patient_label_dict.
    filtered = {
        p: (0 if label == 0 else 1)
        for p, label in sepsis_label_dict.items()
        if p in patient_label_dict
    }
    print(len(filtered))
    print(sum(filtered.values()))
    py_op.mywritejson(os.path.join(args.result_dir, 'sepsis_label_dict.json'), filtered)

    sepsis_time_dict = py_op.myreadjson(
        os.path.join(args.result_dir, 'sepsis_time_dict.json'))
    sepsis_time_dict = {p: t for p, t in sepsis_time_dict.items() if p in filtered}
    print(len(sepsis_time_dict))
    py_op.mywritejson(os.path.join(args.result_dir, 'sepsis_time_dict.json'), sepsis_time_dict)
def ana_feat_dist(task):
    """Build per-(feature, label) value histograms and save them.

    Bins each observed (rank-normalized, in [0, 1)) value into `n_split`
    buckets per feature and patient label, normalizes each histogram to a
    probability distribution, and saves the (143, 2, n_split) array to
    ../file/feature_label_count.npy.

    NOTE(review): the `task` parameter is ignored; args.task is used instead
    (kept for interface compatibility).
    """
    n_split = 100
    feature_label_count = np.zeros((143, 2, n_split))
    patient_time_record_dict = py_op.myreadjson(
        os.path.join(args.result_dir, 'json_data', '{:s}.json'.format(args.task)))
    patient_label_dict = py_op.myreadjson(
        os.path.join(args.file_dir, 'patient_label_dict.{:s}.json'.format(args.task)))
    # (The original built and discarded a nested list literal here; removed.)
    for ip, (p, t_dict) in enumerate(patient_time_record_dict.items()):
        if ip % 10000 == 0:
            print(ip, len(patient_time_record_dict))
        label = patient_label_dict[p]
        for t, vs in t_dict.items():
            for v in vs:
                feature, value = v
                idx = int(value * n_split)
                feature_label_count[feature, label, idx] += 1
    for f in range(143):
        for l in range(2):
            # BUG FIX: the original indexed with the leftover loop variables
            # `feature, label`, so it repeatedly re-normalized a single
            # histogram instead of normalizing each (f, l) pair.
            total = feature_label_count[f, l].sum()
            if total > 0:  # guard against features never observed
                feature_label_count[f, l] /= total
    np.save('../file/feature_label_count.npy', feature_label_count)
def gen_json_data():
    """Convert the vital CSV into patient -> time -> [[feature, order], ...] JSON.

    Values are replaced by their rank-order scores, grouped feature indices
    are merged onto a canonical index, and -- when the file carries absolute
    timestamps (mx_time > 0) -- times are re-based relative to each patient's
    reference time from patient_time_dict.json.
    """
    vital_file = args.vital_file
    patient_time_record_dict = dict()
    feature_index_dict = py_op.myreadjson(
        os.path.join(args.file_dir, 'feature_index_dict.json'))
    feature_value_order_dict = py_op.myreadjson(
        os.path.join(args.file_dir, 'feature_value_order_dict.json'))
    # Re-key the order dict by feature index; drop event-time features.
    feature_value_order_dict = {
        str(feature_index_dict[k]): v
        for k, v in feature_value_order_dict.items() if 'event' not in k
    }
    index_group_dict = py_op.myreadjson(
        os.path.join(args.file_dir, 'index_group_dict.json'))
    patient_time_dict = py_op.myreadjson(
        os.path.join(args.result_dir, 'patient_time_dict.json'))
    mx_time = -100  # stays negative when all timestamps are relative
    for i_line, line in enumerate(open(vital_file)):
        if i_line % 10000 == 0:
            print 'line', i_line
        # Skip the header, identified by the 'event_time' column name.
        if 'event_time' not in line:
            data = line.strip().split(',')
            patient, time = data[:2]
            time = int(float(time))
            mx_time = max(mx_time, time)
            if patient not in patient_time_record_dict:
                patient_time_record_dict[patient] = dict()
            if time not in patient_time_record_dict[patient]:
                patient_time_record_dict[patient][time] = dict()
            data = data[2:]
            vs = dict()
            for idx, val in enumerate(data):
                if len(val) == 0:
                    continue  # missing value
                # Map grouped (merged) feature indices onto their canonical index.
                if str(idx) in index_group_dict:
                    idx = index_group_dict[str(idx)]
                value_order = feature_value_order_dict[str(idx)]
                vs[idx] = value_order[val]
            patient_time_record_dict[patient][time].update(vs)
    new_d = dict()
    for p, tr in patient_time_record_dict.items():
        new_d[p] = dict()
        for t, vs in tr.items():
            # mx_time > 0 means absolute timestamps were seen: re-base each
            # record relative to the patient's reference time (with a 4-unit
            # margin) and drop records older than 102 units before it.
            if mx_time > 0:
                t = int(t - patient_time_dict[p] - 4)
                if t < -102:
                    continue
            nvs = []
            for k in sorted(vs.keys()):
                nvs.append([k, vs[k]])
            new_d[p][t] = nvs
    with open(os.path.join(args.result_dir, 'patient_time_record_dict.json'), 'w') as f:
        # f.write(json.dumps(new_d, indent=4))
        f.write(json.dumps(new_d))
def analyse_variation_trend(task='task1'):
    '''
    Collect per-feature time/value trajectories from the raw vital file.

    For each patient, accumulates feature -> {time: value}; when the patient
    id changes, every feature with more than 4 observations contributes one
    [times, values] trajectory. Scanning stops after 500000 lines. The
    result is written to feature_variation_trend_dict.json.

    NOTE(review): trajectories are only flushed on a patient change, so the
    final patient in the file is never flushed -- presumably acceptable for
    this sampling-style analysis; confirm if completeness matters.
    '''
    feature_variation_trend_dict = dict()
    feature_value_order_dict = py_op.myreadjson(os.path.join(args.file_dir, 'feature_value_order_dict.{:s}.json'.format(task)))
    patient_time_dict = py_op.myreadjson(os.path.join(args.file_dir, 'patient_time_dict.json'))
    task_dir = os.path.join(args.data_dir, 'sepsis2_{:s}_training'.format(task))
    vital_file = os.path.join(task_dir, 'sepsis2_{:s}_vital_training.csv'.format(task))
    vital_dict = { }  # key-valuelist-dict (NOTE(review): unused below)
    last_patient = ''
    feature_time_value_dict = dict()  # feature idx -> {time: value} for the current patient
    for i_line,line in enumerate(open(vital_file)):
        if i_line % 10000 == 0:
            print i_line
        if i_line:  # skip header
            ctt_list = line.strip().split(',')[2:]
            new_ctt = line.strip().split(',')[:2]
            if task == 'task1':
                # task1 carries absolute times: re-base relative to the
                # patient's reference time (minus a 4-unit margin).
                patient, time = new_ctt
                new_time = float(time) - patient_time_dict[patient] - 4.0
                new_ctt = [patient, str(new_time)]
            patient, time = new_ctt
            time = int(float(time))
            if patient != last_patient:
                # Flush the finished patient's trajectories.
                for feature, tv in feature_time_value_dict.items():
                    if len(tv) > 4:
                        ts = sorted(tv.keys())
                        vs = [tv[t] for t in ts]
                        feature_variation_trend_dict[feature] = feature_variation_trend_dict.get(feature, []) + [[ts, vs]]
                if i_line >= 500000:
                    break  # cap the scan; only flush at patient boundaries
                feature_time_value_dict = dict()
                last_patient = patient
            for idx, value in enumerate(ctt_list):
                if len(value.strip()):
                    value = float(value.strip())
                    if idx not in feature_time_value_dict:
                        feature_time_value_dict[idx] = { }
                    feature_time_value_dict[idx][time] = value
    # py_op.mywritejson(os.path.join(args.file_dir, 'feature_variation_trend_dict.json'), feature_variation_trend_dict)
    with open (os.path.join(args.file_dir, 'feature_variation_trend_dict.json'), 'w') as f:
        f.write(json.dumps(feature_variation_trend_dict))
def feature_change():
    """Print, per feature, the CDF of absolute step-to-step value changes.

    For every positive-labelled patient, collects |v_t - v_prev| for each
    feature across that patient's records, then prints for each feature the
    cumulative fraction of changes at each threshold in `ds`, followed by
    the total sample count.
    """
    print('reading')
    patient_time_record_dict = json.load(open(os.path.join(args.result_dir, 'patient_time_record_dict.json')))
    print(patient_time_record_dict.keys())
    patient_label_dict = py_op.myreadjson(os.path.join(args.result_dir, 'patient_label_dict.json'))
    feature_list_dict = {str(i): [] for i in range(8)}
    for p in patient_time_record_dict:
        if p in patient_label_dict and patient_label_dict[p]:
            tr = patient_time_record_dict[p]
            last_v = {}
            for rs in tr.values():
                for i, v in rs.items():
                    if i in last_v:
                        feature_list_dict[i].append(abs(v - last_v[i]))
                    last_v[i] = v
    for f, l in feature_list_dict.items():
        l = sorted(l)
        # BUG FIX: the original assigned `ds` twice in a row; the first list
        # ([0.01, 0.02, 0.05, ...]) was dead code and has been removed.
        ds = [0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1.0]
        ns = []
        for i, delta in enumerate(l):
            # Guard `ds` so an out-of-range delta cannot raise IndexError
            # once all thresholds have been consumed.
            if ds and delta > ds[0]:
                ns.append(float('{:3.2f}'.format(1.0 * i / len(l))))
                ds = ds[1:]
        ns.append(len(l))
        print(f, ns)
def split_data():
    """Split the patient list 80/20 into train/valid files and report labels.

    Reads the ordered patient list, writes train.json / valid.json under
    args.result_dir, and prints the positive-label counts of each split plus
    the training-set size.
    """
    patient_label_dict = py_op.myreadjson(
        os.path.join(args.result_dir, 'patient_label_dict.json'))
    # patients = patient_label_dict.keys()
    # patients = sorted(patients)
    patients = py_op.myreadjson(
        os.path.join(args.result_dir, 'patient_list.json'))
    cut = int(len(patients) * 0.8)
    patient_train, patient_valid = patients[:cut], patients[cut:]
    py_op.mywritejson(os.path.join(args.result_dir, 'train.json'), patient_train)
    py_op.mywritejson(os.path.join(args.result_dir, 'valid.json'), patient_valid)
    print(sum(patient_label_dict[p] for p in patient_train))
    print(sum(patient_label_dict[p] for p in patient_valid))
    print(len([patient_label_dict[p] for p in patient_train]))
def ana_patient():
    """Print the sizes of the patient lists in each task's latest run dir.

    Walks /home/yin/comparison/<task>/<latest run>/ and, for every JSON file
    found there, prints the file name and the number of patients it holds.
    """
    fo = '/home/yin/comparison'
    for task in os.listdir(fo):
        print '\n', fo
        task_dir = os.path.join(fo, task)
        # Pick the last entry by directory order -- presumably the most
        # recent run; confirm listdir ordering is meaningful here.
        task_dir = os.path.join(task_dir, os.listdir(task_dir)[-1])
        for fi in os.listdir(task_dir):
            patients = py_op.myreadjson(os.path.join(task_dir, fi))
            print fi, len(patients)
def gen_normal_range_order():
    """Record each feature's rank-order scores at its normal-range bounds.

    Walks each non-time feature's observed values in ascending numeric order
    and keeps the order score of the first value above the lower bound and
    the first value above the upper bound, then writes
    feature_normal_range_order_dict.json.
    """
    feature_value_order_dict = py_op.myreadjson(os.path.join(args.result_dir, 'feature_value_order_dict.json'))
    index_vital_list = py_op.myreadjson(os.path.join(args.result_dir, 'index_feature_list.json'))
    vital_normal_range_dict = py_op.myreadjson(os.path.join(args.result_dir, 'vital_normal_range_dict.json'))
    feature_normal_range_order_dict = { }
    for feature, order_map in feature_value_order_dict.items():
        if 'time' in feature:
            continue
        normal_range = vital_normal_range_dict[feature]
        bounds = []
        for v in sorted(order_map.keys(), key=float):
            numeric = float(v)
            # First value strictly above the lower bound.
            if numeric > normal_range[0] and not bounds:
                bounds.append(order_map[v])
            # First value strictly above the upper bound.
            if numeric > normal_range[1] and len(bounds) == 1:
                bounds.append(order_map[v])
                break
        feature_normal_range_order_dict[feature] = bounds
    print(feature_normal_range_order_dict)
    py_op.mywritejson(os.path.join(args.result_dir, 'feature_normal_range_order_dict.json'), feature_normal_range_order_dict)
def stati_mse_diff_distribution(clean_dir, stain_dir, pred_dir, stati_bad=False, result_json='../../data/result/7_311.json', use_max=False):
    '''
    Report which gray-difference regions produce the MSE remaining in the
    generated (predicted) images, split into bands bounded by threshhold_list.
    stati_bad: only include the worst-scoring 10% of images from result_json
    result_json: JSON file of per-image prediction scores
    use_max: forwarded to get_mask to control how bands are assigned
    '''
    if stati_bad:
        result_dict = py_op.myreadjson(result_json)
        # Worst 10% of images by ascending score.
        result_list = sorted([(v, k) for k, v in result_dict.items()
                              ])[:int(0.1 * len(result_dict))]
        result_list = set([k.split('.')[0] for v, k in result_list])
    threshhold_list = [0, 2, 6, 20, 60, 100, 150, 250]
    mse_sum = np.zeros(len(threshhold_list))   # per-band accumulated squared error
    mask_sum = np.zeros(len(threshhold_list))  # per-band pixel counts
    for fi in tqdm(os.listdir(pred_dir)):
        if stati_bad:
            if fi.split('.')[0] not in result_list:
                continue
        pred_fi = os.path.join(pred_dir, fi)
        clean_fi = os.path.join(clean_dir, fi.replace('png', 'jpg'))
        stain_fi = os.path.join(stain_dir, fi.replace('.png', '_.jpg'))
        try:
            pred_image = np.array(Image.open(pred_fi).resize(
                (250, 250))).astype(np.float32)
            clean_image = np.array(Image.open(clean_fi).resize(
                (250, 250))).astype(np.float32)
            stain_image = np.array(Image.open(stain_fi).resize(
                (250, 250))).astype(np.float32)
            # Per-pixel band index based on the |clean - stain| gray difference.
            mask = get_mask(
                np.abs(clean_image.astype(np.float32) - stain_image),
                threshhold_list, use_max)
        except:
            # Missing or unreadable image triple: skip this sample.
            continue
        for n in range(len(threshhold_list)):
            mask_sum[n] += (mask == n).sum()
            mse_sum[n] += ((clean_image[mask == n] -
                            pred_image[mask == n])**2).sum()
    print '生成的图片主要的mse分布'
    print '灰度差别 \t 区域占比例 \t mse占比例'
    for n in range(len(threshhold_list) - 1):
        threshhold_min = threshhold_list[n]
        threshhold_max = threshhold_list[n + 1]
        print '[{:d}, {:d}] \t {:2.2f} \t\t {:2.2f}'.format(
            threshhold_min, threshhold_max, mask_sum[n] / sum(mask_sum),
            mse_sum[n] / sum(mse_sum))
def ana_data_similar():
    """Find evaluation patients that appear in two task splits with near-identical data.

    Matches patients across two tasks by their master record (pid column
    stripped, whitespace removed), then flags patient pairs sharing more
    than 5 identical data lines.
    """
    def get_master(task):
        # Build master-record-string -> [patient ids] for one task's master CSV.
        if task == 'task2':
            master_file = '/home/yin/contestdata2/DII_sepsis2_task2_evaluation/sepsis2_task2_evaluation_master.csv'
        elif task == 'case1':
            master_file = '/home/yin/contestdata2/DII_sepsis2_task1_evaluation/sepsis2_task1_evaluation_case1_master.csv'
        else:
            master_file = '/home/yin/contestdata2/DII_sepsis2_task1_evaluation/sepsis2_task1_evaluation_case2_master.csv'
        master_pid_dict = dict()
        for i,line in enumerate(open(master_file)):
            if i == 0:
                continue  # header
            pid = line.split(',')[0]
            # master = line.replace(pid+',', '')
            master = line[len(pid) + 1:]         # drop the pid column only
            master = ''.join(master.split())     # normalize away all whitespace
            master_pid_dict[master] = master_pid_dict.get(master, []) + [pid]
        return master_pid_dict
    task_master_pid_dict = dict()
    task_patient_data = py_op.myreadjson('../result/task_patient_data.json')
    for k in ['case1', 'case2', 'task2']:
        task_master_pid_dict[k] = get_master(k)
    kf = 'case1'
    ks = 'task2'
    ks = 'case2'  # NOTE(review): overrides the line above -- comparison is case1 vs case2
    # Masters present in both tasks.
    master_set = set(task_master_pid_dict[kf]) & set(task_master_pid_dict[ks])
    cset = set()  # patients from kf confirmed to share data with some kt patient
    n = 0         # number of kf patients examined
    for master in master_set:
        pc = task_master_pid_dict[kf][master]
        pt = task_master_pid_dict[ks][master]
        if len(pc) + len(pt) >= 2:
            for ppc in pc:
                n += 1
                for ppt in pt:
                    ppc_data = set(task_patient_data[kf][ppc])
                    ppt_data = set(task_patient_data[ks][ppt])
                    # Count identical data lines between the two patients.
                    same = 0
                    for cline in ppc_data:
                        for tline in ppt_data:
                            if cline == tline:
                                # print ppc, ppt
                                # cset.add(ppc)
                                # print cline
                                # print tline
                                same += 1
                    if same > 5:
                        print same, len(ppc_data), len(ppt_data)
                        cset.add(ppc)
    print len(cset), n
def get_cases():
    """Merge sepsis onset annotations into the label/time dictionaries.

    Resets every admission's label to 0, then marks as positive each
    admission with an onset hour in sepsis_onset_time.csv (onset time =
    ICU intime + 60 * hour), and rewrites sepsis_label_dict.json and
    sepsis_time_dict.json under args.result_dir.
    """
    sepsis_label_dict = py_op.myreadjson(os.path.join(args.result_dir, 'sepsis_label_dict.json'))
    print(len(sepsis_label_dict))
    icu_file = '../data/icustays.csv'
    print('reading icustays.csv')
    icu_data = pd.read_csv(icu_file)
    icu_adm_dict = dict()     # icustay_id -> hadm_id
    icu_intime_dict = dict()  # icustay_id -> ICU admission time (minutes)
    for iline in range(len(icu_data)):
        icu = icu_data.loc[iline, 'icustay_id']
        intime = icu_data.loc[iline, 'intime']
        adm = icu_data.loc[iline, 'hadm_id']
        icu_adm_dict[icu] = adm
        icu_intime_dict[icu] = time_to_min(intime)
    # Start from all-negative labels; positives are re-added below.
    sepsis_label_dict = { k:0 for k in sepsis_label_dict }
    sepsis_time_dict = py_op.myreadjson(os.path.join(args.result_dir, 'sepsis_time_dict.json'))
    for iline, line in enumerate(open('../data/sepsis_onset_time.csv')):
        icustay_id, h = line.strip().split(',')
        adm = icu_adm_dict[int(icustay_id)]
        sepsis_label_dict[adm] = 1
        # Onset = ICU intime plus the annotated onset hour.
        time = icu_intime_dict[int(icustay_id)] + 60 * int(h)
        sepsis_time_dict[adm] = time
    for iline, line in enumerate(open('../data/sepsis3_cases.csv')):
        # NOTE(review): this loop is deliberately disabled by the leading
        # break -- the sepsis3_cases.csv path below never executes.
        break
        if iline:
            icustay_id,intime,outtime,length_of_stay,delta_score,sepsis_onset,sepsis_onset_day,sepsis_onset_hour = line.strip().split(',')
            adm = icu_adm_dict[int(icustay_id)]
            sepsis_label_dict[adm] = 1
            time = time_to_min(sepsis_onset)
            sepsis_time_dict[adm] = time
    py_op.mywritejson(os.path.join(args.result_dir, 'sepsis_label_dict.json'), sepsis_label_dict)
    py_op.mywritejson(os.path.join(args.result_dir, 'sepsis_time_dict.json'), sepsis_time_dict)
def draw_pic():
    """Plot example variation-trend curves for each of the 143 features.

    For every feature, plots up to ~11 of its [times, values] trajectories
    from feature_variation_trend_dict.json on one figure and saves it to
    ../result/variation_trend/<feature>.png.
    """
    import numpy as np
    import matplotlib.pyplot as plt
    # Loaded but not used in the plotting below.
    flc = np.load('../file/feature_label_count.npy')
    fvt = py_op.myreadjson(os.path.join(args.file_dir, 'feature_variation_trend_dict.json'))
    for f in range(143):
        vt = fvt[str(f)]
        print vt
        for i, (t, v) in enumerate(vt):
            plt.plot(t,v)
            if i > 10:
                break  # cap the number of curves per figure
        plt.savefig('../result/variation_trend/{:d}.png'.format(f))
        plt.clf()  # reset the figure for the next feature
def test_all():
    """Score every predicted-clean directory and prune low-scoring ones.

    For each prediction directory under ../data/test_clean, reuses the
    cached score from result.json when available (deleting dirs below 0.85),
    otherwise computes a PSNR-based score (deleting dirs below 0.88), then
    writes the score dict, sorted ascending by score, back to result.json.
    """
    test_clean = '../data/test_clean'
    try:
        pred_dict = py_op.myreadjson('../data/result/result.json')
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit. Missing/unreadable result file
        # still falls back to an empty score dict.
        pred_dict = dict()
    for i, pred_clean in enumerate(os.listdir(test_clean)):
        if pred_clean in pred_dict:
            # Already scored: prune if the cached score is too low, and
            # skip recomputation either way.
            if pred_dict[pred_clean] < 0.85:
                os.system('rm -r {:s}'.format(os.path.join(test_clean, pred_clean)))
            continue
        result = measures.compute_pred_clean_psnr(pred_clean, '../data/AI/testB', '../data/result')
        if result < 0.88:
            os.system('rm -r {:s}'.format(os.path.join(test_clean, pred_clean)))
        pred_dict[pred_clean] = result
    pred_dict = py_op.mysorteddict(pred_dict, key=lambda s: pred_dict[s])
    py_op.mywritejson('../data/result/result.json', pred_dict)
def stati_gray_stain(clean_dir, stain_dir, pred_dir, stati_bad=True, result_json='../../data/result/7_311.json', use_max=False):
    '''
    Report the fraction of stain (moire) pixels within each gray-level band.
    stati_bad: only include the worst-scoring 10% of images from result_json
    use_max: forwarded to get_mask to control how bands are assigned
    '''
    if stati_bad:
        result_dict = py_op.myreadjson(result_json)
        # Worst 10% of images by ascending score.
        result_list = sorted([(v, k) for k, v in result_dict.items()
                              ])[:int(0.1 * len(result_dict))]
        result_list = set([k.split('.')[0] for v, k in result_list])
    threshhold_list = [0, 80, 120, 256]
    diff_sum = np.zeros(len(threshhold_list))  # per-band stain-pixel counts
    mask_sum = np.zeros(len(threshhold_list))  # per-band total pixel counts
    for fi in tqdm(os.listdir(pred_dir)):
        if stati_bad:
            if fi.split('.')[0] not in result_list:
                continue
        # NOTE(review): pred_fi is computed but never used in this function.
        pred_fi = os.path.join(pred_dir, fi)
        clean_fi = os.path.join(clean_dir, fi.replace('png', 'jpg'))
        stain_fi = os.path.join(stain_dir, fi.replace('.png', '_.jpg'))
        try:
            clean_image = np.array(Image.open(clean_fi).resize(
                (250, 250))).astype(np.float32)
            stain_image = np.array(Image.open(stain_fi).resize(
                (250, 250))).astype(np.float32)
            # Per-pixel gray-level band index of the stained image.
            mask = get_mask(stain_image, threshhold_list, use_max)
        except:
            traceback.print_exc()
            continue
        for n in range(len(threshhold_list)):
            mask_sum[n] += (mask == n).sum()
            # A pixel counts as a stain point when |clean - stain| > 20.
            diff_sum[n] += (np.abs(clean_image[mask == n] -
                                   stain_image[mask == n]) > 20).sum()
    print '灰度区域 \t 区域占比例 \t 网纹点占比例'
    for n in range(len(threshhold_list) - 1):
        threshhold_min = threshhold_list[n]
        threshhold_max = threshhold_list[n + 1]
        print '[{:d}, {:d}] \t {:2.2f} \t\t {:2.2f}'.format(
            threshhold_min, threshhold_max, mask_sum[n] / sum(mask_sum),
            diff_sum[n] / sum(diff_sum))
def scp_files(json_file):
    """Copy the 20 worst-scoring samples to the remote `ycclab` host.

    For each of the 20 lowest-scoring images in `json_file`, ships a
    downscaled predicted image, the clean/stain source images, and the
    predicted mask to ycclab:tmp/bad/ via scp.
    """
    result_dict = py_op.myreadjson(json_file)
    ranked = sorted([(v, k) for k, v in result_dict.items()])
    for score, jpg in ranked[:20]:
        png = jpg.replace('jpg', 'png')
        # Downscale the predicted image before shipping it.
        image = Image.open('../../data/pred_clean/AI/testB/' + png)
        image = image.resize((250, 250))
        tmp_png = 'tmp.png'
        image.save(tmp_png)
        os.system('scp {:s} ycclab:tmp/bad/{:s}'.format(tmp_png, png))
        os.remove(tmp_png)
        # Clean reference image.
        os.system('scp ../../data/AI/testB/{:s} ycclab:tmp/bad/'.format(jpg))
        # Stained source image, renamed with a .qs suffix on the remote side.
        os.system('scp ../../data/AI/testA/{:s} ycclab:tmp/bad/{:s}'.format(
            jpg.replace('.jpg', '_.jpg'), jpg.replace('.jpg', '.qs.jpg')))
        # Predicted mask, renamed with a .rm suffix on the remote side.
        os.system('scp ../../data/pred_mask/AI/testB/{:s} ycclab:tmp/bad/{:s}'.format(
            png, png.replace('.png', '.rm.png')))
def ensemble(level=2):
    """Ensemble the high-scoring prediction directories into `ensemble_1`.

    Selects every prediction dir under ../data/test_clean with a score
    above 0.94 (per result.json, excluding the output dir itself) and fans
    out write_ensemble_image over a 15-process pool, one task per test image.
    """
    test_clean = '../data/test_clean/'
    clean_dir = '../data/AI/testB/'
    rgb_prob = np.load('../data/rgb_stati/rgb_prob_{:d}.npy'.format(level))
    obj_dir = 'ensemble_1'
    pred_dict = py_op.myreadjson('../data/result/result.json')
    file_names = os.listdir(clean_dir)
    # pred_dir_list = [os.path.join(test_clean,d) for d in os.listdir(test_clean) if obj_dir not in d and pred_dict.get(d,0)>0.94 and pred_dict.get(d,0)<0.95]
    # Candidate prediction dirs: score > 0.94, excluding the output dir.
    pred_dir_list = [os.path.join(test_clean,d) for d in os.listdir(test_clean) if obj_dir not in d and pred_dict.get(d,0)>0.94]
    if len(pred_dir_list) == 0:
        return  # nothing qualifies; skip the ensemble
    # print pred_dir_list
    # return
    pool = multiprocessing.Pool(processes=15)
    for fi,file_name in enumerate(os.listdir(clean_dir)):
        pool.apply_async(write_ensemble_image, (fi, file_name,pred_dir_list, os.path.join(test_clean, obj_dir), rgb_prob, level, pred_dict))
        # write_ensemble_image(fi, file_name,pred_dir_list, os.path.join(test_clean, obj_dir), rgb_prob, level, pred_dict)
    pool.close()
    pool.join()
    print 'processed all'
def gen_sepsis_label_dict():
    """Derive sepsis labels and onset times from MIMIC-style CSV extracts.

    Pipeline: (1) read sepsis3.csv to collect suspected-infection times and
    the non-excluded admission set; (2) map icustay_id -> hadm_id from
    icustays.csv; (3) (currently disabled) map SOFA windows onto admissions;
    (4) label an admission positive when its SOFA score rises by >= 2 within
    [-48h, +24h] of the suspected infection time. Writes
    sepsis_time_dict.json and sepsis_label_dict.json under args.result_dir.
    """
    sepsis_label_dict = dict()
    sepsis_file = '../data/sepsis3.csv'
    print('reading sepsis3.csv')
    sepsis_data = pd.read_csv(sepsis_file)
    sepsis_infection_dict = dict()  # hadm_id -> suspected infection time (minutes)
    sepsis_set = set()              # hadm_ids not excluded by the sepsis3 criteria
    for iline in range(len(sepsis_data)):
        adm = sepsis_data.loc[iline, 'hadm_id']
        adm = str(adm)
        excluded = sepsis_data.loc[iline, 'excluded']
        suspected_infection_time_poe = sepsis_data.loc[
            iline, 'suspected_infection_time_poe']
        # A string longer than 5 chars is treated as a real timestamp
        # (rules out NaN rendered as 'nan').
        if len(str(suspected_infection_time_poe)) > 5:
            sepsis_infection_dict[adm] = time_to_min(
                suspected_infection_time_poe)
            # sepsis_set.add(adm)
        if excluded == 0:
            sepsis_set.add(adm)
        # if len(str(suspected_infection_time_poe)) > 5:
        #     sepsis_infection_dict[adm] = time_to_min(suspected_infection_time_poe)
        #     print(suspected_infection_time_poe)
        # if excluded == 0 and len(str(suspected_infection_time_poe)) > 0:
        #     sepsis_set.add(adm)
        # return
    # print(len(sepsis_infection_dict))
    # print(len(sepsis_set))
    print('Infection No: {:d}'.format(len(sepsis_infection_dict)))
    print('Sepsis No: {:d}'.format(len(sepsis_set)))
    # py_op.mywritejson(os.path.join(args.result_dir, 'sepsis_time_dict.json'), patient_label_dict)
    # return
    icu_file = '../data/icustays.csv'
    print('reading icustays.csv')
    icu_data = pd.read_csv(icu_file)
    icu_adm_dict = dict()  # icustay_id -> hadm_id
    for iline in range(len(icu_data)):
        icu = icu_data.loc[iline, 'icustay_id']
        adm = icu_data.loc[iline, 'hadm_id']
        icu_adm_dict[icu] = adm
    sofa_file = '../data/sofa.csv'
    print('reading sofa.csv')
    sofa_data = pd.read_csv(sofa_file)
    print('mapping sofa to adm')
    adm_sofa_dict = dict()  # hadm_id -> [[sofa, starttime, endtime], ...]
    for iline in range(len(sofa_data)):
        # NOTE(review): this loop is deliberately disabled by the leading
        # break, so adm_sofa_dict stays empty here -- yet it is written to
        # ../result/adm_sofa_dict.json below, which would overwrite any
        # previously cached (full) mapping. Confirm this is intended.
        break
        if iline and iline % 10000 == 0:
            print('mapping sofa to adm', iline, len(sofa_data))
        icu = sofa_data.loc[iline, 'icustay_id']
        sofa = sofa_data.loc[iline, 'sofa_24hours']
        starttime = sofa_data.loc[iline, 'starttime']
        endtime = sofa_data.loc[iline, 'endtime']
        adm = icu_adm_dict[icu]
        adm_sofa_dict[adm] = adm_sofa_dict.get(
            adm, []) + [[sofa, starttime, endtime]]
    #
    py_op.mywritejson('../result/adm_sofa_dict.json', adm_sofa_dict)
    # return
    adm_sofa_dict = py_op.myreadjson('../result/adm_sofa_dict.json')
    print('set sepsis label')
    pos_num = 0
    for iline, (adm, sofa_list) in enumerate(adm_sofa_dict.items()):
        # print(adm, type(adm))
        if iline and iline % 10000 == 0:
            print('set sepsis label', iline, len(adm_sofa_dict))
        # if adm not in sepsis_infection_dict:
        if adm in sepsis_infection_dict:
            # Default to negative with the infection time attached; may be
            # upgraded to positive below.
            sepsis_label_dict[adm] = [0, sepsis_infection_dict[adm]]
        else:
            continue
        if adm not in sepsis_set:
            continue
        # sofa_list = sofa_list
        # if time_to_min(sofa_list[0][1]) < sepsis_infection_dict[adm] :
        #     continue
        # print('have data')
        sofa_init = ''  # baseline SOFA within the onset window; '' = unset
        for sofa in sofa_list:
            starttime = sofa[1]
            endtime = sofa[2]
            time = time_to_min(endtime)
            sofa = int(sofa[0])
            # Sepsis-3: SOFA rise >= 2 within -48h..+24h of suspected infection.
            if time - sepsis_infection_dict[
                    adm] >= -48 * 60 and time - sepsis_infection_dict[
                        adm] <= 24 * 60:
                if sofa_init == '':
                    sofa_init = sofa
                elif sofa - sofa_init >= 2 and sofa >= 2:
                    sepsis_label_dict[adm] = [1, sepsis_infection_dict[adm]]
                    # Onset time is the later of the SOFA window end and the
                    # suspected infection time.
                    sepsis_infection_dict[adm] = max(
                        time, sepsis_infection_dict[adm])
                    pos_num += 1
                    break
    print('writing sepsis_label_dict')
    py_op.mywritejson(os.path.join(args.result_dir, 'sepsis_time_dict.json'),
                      sepsis_infection_dict)
    py_op.mywritejson(os.path.join(args.result_dir, 'sepsis_label_dict.json'),
                      {k: v[0] for k, v in sepsis_label_dict.items()})
    print('There are {:d} positive samples.'.format(pos_num))
    print('There are {:d} negtive samples.'.format(
        len(sepsis_label_dict) - pos_num))
def gen_feature_order_dict():
    '''
    Generate, for every feature, a mapping from observed value to its
    normalized rank order: (first rank + last rank) / 2 / total count.

    Reads the vital CSV column by column, merges columns belonging to the
    same similarity group onto the group's minimum-index member, and writes
    feature_value_order_dict.json under args.file_dir.
    '''
    feature_value_order_dict = dict()
    # vital information
    vital_file = args.vital_file
    vital_dict = {}  # column name -> list of observed (float) values
    for i_line, line in enumerate(open(vital_file)):
        if i_line % 10000 == 0:
            print i_line
        # if i_line > 10000:
        #     break
        if i_line == 0:
            # Header: commas inside double quotes belong to column names,
            # not the CSV structure -- replace them with ';' before splitting.
            new_line = ''
            vis = 0  # toggles at each '"': 1 while inside a quoted span
            for c in line:
                if c == '"':
                    vis = (vis + 1) % 2
                if vis == 1 and c == ',':
                    c = ';'
                new_line += c
            line = new_line
            col_list = line.strip().split(',')[1:]
            for col in col_list:
                vital_dict[col] = []
        else:
            ctt_list = line.strip().split(',')[1:]
            assert len(ctt_list) == len(col_list)
            for col, ctt in zip(col_list, ctt_list):
                if len(ctt):  # skip missing values
                    vital_dict[col].append(float(ctt))
        # if i_line > 10000:
        #     break
        # if i_line % 10000 == 0:
        #     print i_line
    # add group info: merge each similarity group's values into its
    # minimum-index member and drop the other members.
    groups = py_op.myreadjson(os.path.join(args.file_dir, 'similar.json'))
    feature_index_dict = py_op.myreadjson(
        os.path.join(args.file_dir, 'feature_index_dict.json'))
    index_feature_list = py_op.myreadjson(
        os.path.join(args.file_dir, 'index_feature_list.json'))
    for g in groups:
        for k in g:
            mg = min(g)
            if k != mg:
                kf = index_feature_list[k]
                mf = index_feature_list[mg]
                vital_dict[mf] = vital_dict[mf] + vital_dict[kf]
                vital_dict.pop(kf)
    print 'features', len(vital_dict)
    # feature_count_dict = { k: len(v) for k,v in vital_dict.items() }
    # py_op.mywritejson(os.path.join(args.file_dir, 'feature_count_dict.json'), feature_count_dict)
    ms_list = []  # NOTE(review): collected nowhere below; appears unused
    for col in col_list:
        if col not in vital_dict:
            continue  # column was merged away above
        value_list = sorted(vital_dict[col])
        value_order_dict = dict()
        value_minorder_dict = dict()  # value -> first rank where it appears
        value_maxorder_dict = dict()  # value -> last rank where it appears
        for i_value, value in enumerate(value_list):
            if value not in value_minorder_dict:
                value_minorder_dict[value] = i_value
            if value == value_list[-1]:
                # Reached the maximum value: its last rank is the list end.
                value_maxorder_dict[value] = len(value_list) - 1
                break
            if value != value_list[i_value + 1]:
                value_maxorder_dict[value] = i_value
        # Order score: mean of first/last rank, normalized by the count.
        for value in value_maxorder_dict:
            value_order_dict[value] = (
                value_maxorder_dict[value] +
                value_minorder_dict[value]) / 2.0 / len(value_list)
        feature_value_order_dict[col] = value_order_dict
    py_op.mywritejson(
        os.path.join(args.file_dir, 'feature_value_order_dict.json'),
        feature_value_order_dict)
def main(): p_dict = dict() # All the parameters p_dict['args'] = args args.split_nn = args.split_num + args.split_nor * 3 args.vocab_size = args.split_nn * 145 + 1 print 'vocab_size', args.vocab_size ### load data print 'read data ...' patient_time_record_dict = py_op.myreadjson( os.path.join(args.result_dir, 'patient_time_record_dict.json')) patient_master_dict = py_op.myreadjson( os.path.join(args.result_dir, 'patient_master_dict.json')) patient_label_dict = py_op.myreadjson( os.path.join(args.result_dir, 'patient_label_dict.json')) patient_train = list( json.load(open(os.path.join(args.file_dir, args.task, 'train.json')))) patient_valid = list( json.load(open(os.path.join(args.file_dir, args.task, 'val.json')))) if len(patient_train) > len(patient_label_dict): patients = patient_time_record_dict.keys() patients = patient_label_dict.keys() n = int(0.8 * len(patients)) patient_train = patients[:n] patient_valid = patients[n:] print 'data loading ...' train_dataset = dataloader.DataSet(patient_train, patient_time_record_dict, patient_label_dict, patient_master_dict, args=args, phase='train') train_loader = DataLoader(dataset=train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=8, pin_memory=True) val_dataset = dataloader.DataSet(patient_valid, patient_time_record_dict, patient_label_dict, patient_master_dict, args=args, phase='val') val_loader = DataLoader(dataset=val_dataset, batch_size=args.batch_size, shuffle=False, num_workers=8, pin_memory=True) p_dict['train_loader'] = train_loader p_dict['val_loader'] = val_loader cudnn.benchmark = True net = lstm.LSTM(args) if args.gpu: net = net.cuda() p_dict['loss'] = loss.Loss().cuda() else: p_dict['loss'] = loss.Loss() parameters = [] for p in net.parameters(): parameters.append(p) optimizer = torch.optim.Adam(parameters, lr=args.lr) p_dict['optimizer'] = optimizer p_dict['model'] = net start_epoch = 0 # args.epoch = start_epoch # print ('best_f1score' + str(best_f1score)) p_dict['epoch'] = 0 
p_dict['best_metric'] = [0, 0] ### resume pretrained model if os.path.exists(args.resume): print 'resume from model ' + args.resume function.load_model(p_dict, args.resume) print 'best_metric', p_dict['best_metric'] # return if args.phase == 'train': best_f1score = 0 for epoch in range(p_dict['epoch'] + 1, args.epochs): p_dict['epoch'] = epoch for param_group in optimizer.param_groups: param_group['lr'] = args.lr train_eval(p_dict, 'train') train_eval(p_dict, 'val')
def main():
    """Train/evaluate the attention model for the mortality or sepsis task.

    Loads the record/master/label dicts and splits for args.task, builds
    train/val/test DataLoaders, constructs the Attention model + loss +
    Adam optimizer, optionally resumes from args.resume, then either runs
    train/val epochs (args.phase == 'train', appending the best metric to
    ../result/log.txt) or a single test pass.
    """
    p_dict = dict()  # All the parameters
    p_dict['args'] = args
    args.split_nn = 3 * 5
    # Vocabulary: split_nn buckets per feature, 145 features, +2 special ids.
    args.vocab_size = args.split_nn * 145 + 2
    print('vocab_size', args.vocab_size)

    ### load data
    print('read data ...')
    if args.task == 'mortality':
        patient_time_record_dict = py_op.myreadjson(
            os.path.join(args.result_dir, 'patient_time_record_dict.json'))
        patient_master_dict = py_op.myreadjson(
            os.path.join(args.result_dir, 'patient_master_dict.json'))
        patient_label_dict = py_op.myreadjson(
            os.path.join(args.result_dir, 'patient_label_dict.json'))
        if os.path.exists(os.path.join(args.result_dir, 'train.json')):
            patient_train = list(
                json.load(open(os.path.join(args.result_dir, 'train.json'))))
            patient_valid = list(
                json.load(open(os.path.join(args.result_dir, 'valid.json'))))
            patient_test = list(
                json.load(open(os.path.join(args.result_dir, 'test.json'))))
        else:
            # No saved split: deterministically split the usable patients 70/20/10.
            patients = sorted(
                set(patient_label_dict.keys()) & set(patient_time_record_dict)
                & set(patient_master_dict))
            print(len(patient_master_dict), len(patient_label_dict),
                  len(patient_time_record_dict))
            print('There are {:d} patients.'.format(len(patients)))
            n_train = int(0.7 * len(patients))
            n_valid = int(0.2 * len(patients))
            patient_train = patients[:n_train]
            patient_valid = patients[n_train:n_train + n_valid]
            patient_test = patients[n_train + n_valid:]
        args.master_size = len(patient_master_dict[patients[0]])
    elif args.task == 'sepsis':
        patient_time_record_dict = py_op.myreadjson(
            os.path.join(args.result_dir, 'sepsis_time_record_dict.json'))
        patient_master_dict = py_op.myreadjson(
            os.path.join(args.result_dir, 'patient_master_dict.json'))
        patient_label_dict = py_op.myreadjson(
            os.path.join(args.result_dir, 'sepsis_label_dict.json'))
        sepsis_split = py_op.myreadjson(
            os.path.join(args.result_dir, 'sepsis_split.json'))
        print(sepsis_split.keys())
        # Splits are keyed by the (negated) prediction horizon.
        sepsis_split = sepsis_split[str(-args.last_time)]
        patient_train = sepsis_split['train']
        patient_valid = sepsis_split['valid']
        # BUG FIX: patient_test was only defined on the mortality branch,
        # so building test_dataset below raised NameError for the sepsis
        # task. The sepsis split carries no test set, so reuse the
        # validation patients (matches the val_loader fallback behavior).
        patient_test = patient_valid
    print('train: {:d}'.format(len(patient_train)))
    print('valid: {:d}'.format(len(patient_valid)))

    print('data loading ...')
    train_dataset = dataloader.DataSet(patient_train,
                                       patient_time_record_dict,
                                       patient_label_dict,
                                       patient_master_dict,
                                       args=args,
                                       phase='train')
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=8,
                              pin_memory=True)
    val_dataset = dataloader.DataSet(patient_valid,
                                     patient_time_record_dict,
                                     patient_label_dict,
                                     patient_master_dict,
                                     args=args,
                                     phase='val')
    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=args.batch_size,
                            shuffle=True,
                            num_workers=8,
                            pin_memory=True)
    test_dataset = dataloader.DataSet(patient_test,
                                      patient_time_record_dict,
                                      patient_label_dict,
                                      patient_master_dict,
                                      args=args,
                                      phase='val')
    test_loader = DataLoader(dataset=test_dataset,
                             batch_size=args.batch_size,
                             shuffle=True,
                             num_workers=8,
                             pin_memory=True)
    p_dict['train_loader'] = train_loader
    # During training, evaluate on the validation set; otherwise on test.
    if args.phase == 'train':
        p_dict['val_loader'] = val_loader
    else:
        p_dict['val_loader'] = test_loader

    cudnn.benchmark = True
    net = attention.Attention(args)
    if args.gpu:
        net = net.cuda()
        p_dict['loss'] = loss.Loss().cuda()
    else:
        p_dict['loss'] = loss.Loss()
    parameters = []
    for p in net.parameters():
        parameters.append(p)
    optimizer = torch.optim.Adam(parameters, lr=args.lr)
    p_dict['optimizer'] = optimizer
    p_dict['model'] = net
    start_epoch = 0
    # args.epoch = start_epoch
    # print ('best_f1score' + str(best_f1score))
    p_dict['epoch'] = 0
    p_dict['best_metric'] = [0, 0]  # [best AUC, companion metric]

    ### resume pretrained model
    if os.path.exists(args.resume):
        print('resume from model ' + args.resume)
        function.load_model(p_dict, args.resume)
        print('best_metric', p_dict['best_metric'])

    if args.phase == 'train':
        best_f1score = 0
        for epoch in range(p_dict['epoch'] + 1, args.epochs):
            p_dict['epoch'] = epoch
            # Constant learning rate, re-applied each epoch.
            for param_group in optimizer.param_groups:
                param_group['lr'] = args.lr
            train_eval(p_dict, 'train')
            train_eval(p_dict, 'val')
        log_info = '# task : {:s}; model: {:s} ; last_time: {:d} ; auc: {:3.4f} \n'.format(
            args.task, args.model, args.last_time, p_dict['best_metric'][0])
        with open('../result/log.txt', 'a') as f:
            f.write(log_info)
    else:
        train_eval(p_dict, 'test')