def ana_patient():
    '''Collect the patient ids of the evaluation sets and save them as JSON.

    Reads the first column of the case1, case2 and task2 evaluation master
    CSV files (header row skipped), prints the size of each id set together
    with its overlap with the case2 set, and writes the sorted id lists to
    ``test_patient_dict.json`` under ``args.file_dir``.
    '''
    master_files = {
        'task2': '/home/yin/contestdata2/DII_sepsis2_task2_evaluation/sepsis2_task2_evaluation_master.csv',
        'case1': '/home/yin/contestdata2/DII_sepsis2_task1_evaluation/sepsis2_task1_evaluation_case1_master.csv',
    }
    # Any task name other than 'task2' / 'case1' falls back to the case2 file.
    default_master_file = '/home/yin/contestdata2/DII_sepsis2_task1_evaluation/sepsis2_task1_evaluation_case2_master.csv'

    def get_patients(task):
        '''Return the set of first-column values of the master file for ``task``.'''
        master_file = master_files.get(task, default_master_file)
        with open(master_file) as master_fp:
            next(master_fp, None)  # skip the header row
            return {line.split(',')[0] for line in master_fp}

    pids_case1 = get_patients('case1')
    pids_case2 = get_patients('case2')
    pids_task2 = get_patients('task2')

    # Single-argument print calls behave the same on Python 2 and Python 3.
    print('case1 {:d} {:d}'.format(len(pids_case1), len(pids_case1 & pids_case2)))
    print('case2 {:d}'.format(len(pids_case2)))
    print('task2 {:d} {:d}'.format(len(pids_task2), len(pids_task2 & pids_case2)))
    print(pids_task2 & pids_case2)

    # The case2 ids are stored under the 'task1' key; the key names are part
    # of the output file format and are kept unchanged.
    test_patient_dict = {
        'case1': sorted(pids_case1),
        'task1': sorted(pids_case2),
        'task2': sorted(pids_task2),
    }
    py_op.mywritejson(os.path.join(args.file_dir, 'test_patient_dict.json'),
                      test_patient_dict)
def compare_sepsis():
    '''Restrict the saved sepsis labels and times to patients that have a label.

    Loads ``sepsis_label_dict.json`` and ``patient_label_dict.json`` from
    ``args.result_dir``, keeps only the sepsis entries whose key also appears
    in the patient label dict, maps every non-zero label to 1, and overwrites
    ``sepsis_label_dict.json``. ``sepsis_time_dict.json`` is then filtered to
    the same keys and overwritten as well. Several counts are printed along
    the way.
    '''
    result_dir = args.result_dir
    label_path = os.path.join(result_dir, 'sepsis_label_dict.json')
    time_path = os.path.join(result_dir, 'sepsis_time_dict.json')

    print('reading')
    sepsis_labels = py_op.myreadjson(label_path)
    print('reading')
    patient_labels = py_op.myreadjson(
        os.path.join(result_dir, 'patient_label_dict.json'))

    # The size of the key overlap is reported twice.
    n_shared = len(set(sepsis_labels) & set(patient_labels))
    print(n_shared)
    print(n_shared)

    binary_labels = {
        pid: (0 if label == 0 else 1)
        for pid, label in sepsis_labels.items()
        if pid in patient_labels
    }
    print(len(binary_labels))
    print(sum(binary_labels.values()))
    py_op.mywritejson(label_path, binary_labels)

    onset_times = py_op.myreadjson(time_path)
    kept_times = {
        pid: onset for pid, onset in onset_times.items()
        if pid in binary_labels
    }
    print(len(kept_times))
    py_op.mywritejson(time_path, kept_times)
def gen_patient_time_dict_dii():
    '''Write the largest time value seen for every patient in the vital file.

    Reads ``args.vital_file`` as comma-separated text, skips the header row,
    and takes the first two fields of each row as the patient id and a
    numeric time. The per-patient maximum (never below the initial value 0)
    is written to ``patient_time_dict.json`` under ``args.result_dir``.

    Raises:
        ValueError: if a data row has fewer than two fields or the time field
            is not a valid float.
    '''
    patient_time_dict = dict()
    # The context manager closes the file even if a row fails to parse.
    with open(args.vital_file) as vital_fp:
        next(vital_fp, None)  # skip the header row
        for line in vital_fp:
            patient, time_field = line.strip().split(',')[:2]
            patient_time_dict[patient] = max(
                patient_time_dict.get(patient, 0), float(time_field))
    py_op.mywritejson(os.path.join(args.result_dir, 'patient_time_dict.json'),
                      patient_time_dict)
def gen_patient_label_dict():
    '''Write a patient-id -> integer-label mapping built from the label file.

    Reads ``args.label_file`` as comma-separated text, skips the header row,
    and uses the first field of each row as the patient id and the last field
    (parsed with ``int``) as the label. The mapping is written to
    ``patient_label_dict.json`` under ``args.result_dir``.

    NOTE: a second function with this name is defined later in this file and
    replaces this one when the module is loaded.

    Raises:
        ValueError: if the last field of a data row is not an integer literal.
    '''
    patient_label_dict = dict()
    # The context manager closes the file even if a row fails to parse.
    with open(args.label_file) as label_fp:
        next(label_fp, None)  # skip the header row
        for line in label_fp:
            data = line.strip().split(',')
            patient_label_dict[data[0]] = int(data[-1])
    py_op.mywritejson(os.path.join(args.result_dir, 'patient_label_dict.json'),
                      patient_label_dict)
def gen_feature_index():
    '''Write the feature names of the vital file and their positions.

    Takes the header row of ``args.vital_file``, removes every double quote,
    splits it on commas and drops the first two columns. The remaining names
    are written to ``index_feature_list.json`` (position -> name) and
    ``feature_index_dict.json`` (name -> position) under ``args.result_dir``.

    Raises:
        ValueError: if the vital file is empty and therefore has no header.
    '''
    # Only the header is needed, so read a single line and close the file.
    with open(args.vital_file) as vital_fp:
        header = vital_fp.readline()
    if not header:
        raise ValueError(
            '{} is empty; there is no header row to index'.format(args.vital_file))

    index_feature_list = header.replace('"', '').strip().split(',')[2:]
    feature_index_dict = {f: i for i, f in enumerate(index_feature_list)}
    py_op.mywritejson(os.path.join(args.result_dir, 'feature_index_dict.json'),
                      feature_index_dict)
    py_op.mywritejson(os.path.join(args.result_dir, 'index_feature_list.json'),
                      index_feature_list)
def gen_patient_time_dict():
    '''Write the largest converted time seen for every patient in the vital file.

    Reads ``args.vital_file`` as comma-separated text, skips the header row,
    and takes the first two fields of each row as the patient id and a time
    value that is converted with ``time_to_min`` (presumably to minutes;
    confirm against that helper). The per-patient maximum (never below the
    initial value 0) is written to ``patient_time_dict.json`` under
    ``args.result_dir``. The current line number is printed every 10000 lines.

    Raises:
        ValueError: if a data row has fewer than two fields.
    '''
    patient_time_dict = dict()
    # The context manager closes the file even if a row fails to parse.
    with open(args.vital_file) as vital_fp:
        for i_line, line in enumerate(vital_fp):
            if i_line % 10000 == 0:
                print(i_line)
            if i_line == 0:
                continue  # header row
            patient, time_field = line.strip().split(',')[:2]
            patient_time_dict[patient] = max(
                patient_time_dict.get(patient, 0), time_to_min(time_field))
    py_op.mywritejson(os.path.join(args.result_dir, 'patient_time_dict.json'),
                      patient_time_dict)
def test_all():
    '''Score every entry of ``../data/test_clean`` and delete low-scoring ones.

    Scores already stored in ``../data/result/result.json`` are reused; for
    those an entry is deleted when its stored score is below 0.85. Every other
    entry is scored with ``measures.compute_pred_clean_psnr`` and deleted when
    the new score is below 0.88. The scores, sorted by value through
    ``py_op.mysorteddict``, are written back to ``result.json``.

    NOTE(review): the two thresholds (0.85 for cached scores, 0.88 for new
    ones) differ in the original code; confirm that this is intended.
    NOTE(review): the results are saved once after the loop. Confirm that no
    per-entry checkpoint of ``result.json`` is expected.
    '''
    # Function-scope import: the block does not rely on a file-level import.
    import subprocess

    test_clean = '../data/test_clean'
    result_json = '../data/result/result.json'

    def remove_entry(name):
        '''Delete ``name`` below ``test_clean`` (file or directory tree).'''
        # An argument list avoids the shell, so names containing spaces or
        # shell metacharacters are handled correctly. As with the previous
        # os.system call, a non-zero exit status of ``rm`` is ignored.
        subprocess.call(['rm', '-r', os.path.join(test_clean, name)])

    try:
        pred_dict = py_op.myreadjson(result_json)
    except Exception:
        # Best effort: start from scratch when no usable result file exists,
        # without swallowing KeyboardInterrupt / SystemExit.
        pred_dict = dict()

    for pred_clean in os.listdir(test_clean):
        if pred_clean in pred_dict:
            if pred_dict[pred_clean] < 0.85:
                remove_entry(pred_clean)
            continue
        result = measures.compute_pred_clean_psnr(
            pred_clean, '../data/AI/testB', '../data/result')
        if result < 0.88:
            remove_entry(pred_clean)
        pred_dict[pred_clean] = result

    scores = pred_dict
    pred_dict = py_op.mysorteddict(scores, key=lambda name: scores[name])
    py_op.mywritejson(result_json, pred_dict)
def gen_patient_label_dict():
    '''Write a patient-id -> integer-label mapping and print the label counts.

    Reads ``args.label_file`` as comma-separated text and skips the header
    row. The first field of each row is normalised to an integer string (so
    ``"12.0"`` becomes ``"12"``) and used as the patient id; the last field is
    parsed as a float and truncated to an int label. The mapping is written to
    ``patient_label_dict.json`` under ``args.result_dir``. The sum of the
    labels is printed as the positive count and the remaining entries as the
    negative count.

    Raises:
        ValueError: if the id or label field of a data row is not numeric.
    '''
    patient_label_dict = dict()
    # The context manager closes the file even if a row fails to parse.
    with open(args.label_file) as label_fp:
        next(label_fp, None)  # skip the header row
        for line in label_fp:
            data = line.strip().split(',')
            patient = str(int(float(data[0])))
            patient_label_dict[patient] = int(float(data[-1]))
    py_op.mywritejson(os.path.join(args.result_dir, 'patient_label_dict.json'),
                      patient_label_dict)

    n_positive = sum(patient_label_dict.values())
    print('There are {:d} positive samples.'.format(n_positive))
    print('There are {:d} negtive samples.'.format(
        len(patient_label_dict) - n_positive))
def split_data():
    '''Split the saved patient list 80/20 into training and validation parts.

    Loads ``patient_label_dict.json`` and ``patient_list.json`` from
    ``args.result_dir``. The first 80% of the list (rounded down) is written
    to ``train.json`` and the rest to ``valid.json`` in the same directory;
    the order of the list is kept. Finally the label sum of each part and the
    size of the training part are printed.
    '''
    result_dir = args.result_dir
    labels = py_op.myreadjson(os.path.join(result_dir, 'patient_label_dict.json'))
    ordered_patients = py_op.myreadjson(
        os.path.join(result_dir, 'patient_list.json'))

    cut = int(len(ordered_patients) * 0.8)
    train_part, valid_part = ordered_patients[:cut], ordered_patients[cut:]

    py_op.mywritejson(os.path.join(result_dir, 'train.json'), train_part)
    py_op.mywritejson(os.path.join(result_dir, 'valid.json'), valid_part)

    train_labels = [labels[pid] for pid in train_part]
    valid_labels = [labels[pid] for pid in valid_part]
    print(sum(train_labels))
    print(sum(valid_labels))
    print(len(train_labels))
def gen_patient_master_dict(master_list):
    '''Encode every row of the master file as a 43-character 0/1 string.

    Reads ``args.master_file`` as comma-separated text and skips the header
    row. For each data row the first field is the patient id; every following
    field is turned into the code ``str(column_position) + value`` and the
    character at the position of that code in ``master_list`` is set to '1'.
    The resulting patient -> bit-string mapping is written to
    ``patient_master_dict.json`` under ``args.result_dir``.

    Args:
        master_list: sequence of code strings; the position of a code selects
            the bit that is set. Assumed to contain hashable items (strings).

    Raises:
        ValueError: if a code built from a row is not in ``master_list``.
        IndexError: if a matched code sits at position 43 or later.
    '''
    n_feature = 43  # fixed width of the encoded string

    # Map each code to its first position once, instead of scanning the list
    # with list.index for every field of every row.
    code_index = dict()
    for position, code in enumerate(master_list):
        code_index.setdefault(code, position)

    patient_master_dict = dict()
    # The context manager closes the file even if a row fails to encode.
    with open(args.master_file) as master_fp:
        next(master_fp, None)  # skip the header row
        for line in master_fp:
            data = line.strip().split(',')
            feature = ['0'] * n_feature
            for i, d in enumerate(data[1:]):
                code = str(i) + d
                if code not in code_index:
                    # Same exception type that list.index raised before.
                    raise ValueError('{!r} is not in list'.format(code))
                feature[code_index[code]] = '1'
            patient_master_dict[data[0]] = ''.join(feature)
    py_op.mywritejson(
        os.path.join(args.result_dir, 'patient_master_dict.json'),
        patient_master_dict)
def gen_normal_range_order():
    '''Translate each feature's normal range into value-order positions.

    Loads ``feature_value_order_dict.json`` and ``vital_normal_range_dict.json``
    from ``args.result_dir``. For every feature whose name does not contain
    'time', the feature's values are walked in ascending numeric order: the
    order of the first value above the lower bound is recorded, then the order
    of the first value above the upper bound. The per-feature lists (zero, one
    or two entries) are printed and written to
    ``feature_normal_range_order_dict.json``.
    '''
    result_dir = args.result_dir
    value_orders = py_op.myreadjson(
        os.path.join(result_dir, 'feature_value_order_dict.json'))
    # Loaded as in the original flow; the list is not used below.
    index_vital_list = py_op.myreadjson(
        os.path.join(result_dir, 'index_feature_list.json'))
    normal_ranges = py_op.myreadjson(
        os.path.join(result_dir, 'vital_normal_range_dict.json'))

    range_orders = {}
    for name, order_of in value_orders.items():
        if 'time' in name:
            continue
        bounds = normal_ranges[name]
        picked = []
        range_orders[name] = picked
        for raw in sorted(order_of.keys(), key=float):
            number = float(raw)
            # A single value may cross both bounds and be recorded twice.
            if number > bounds[0] and not picked:
                picked.append(order_of[raw])
            if number > bounds[1] and len(picked) == 1:
                picked.append(order_of[raw])
                break

    print(range_orders)
    py_op.mywritejson(
        os.path.join(result_dir, 'feature_normal_range_order_dict.json'),
        range_orders)
def get_cases():
    '''Rebuild the sepsis label and onset-time files from the onset-time CSV.

    Every key of the existing ``sepsis_label_dict.json`` is reset to label 0.
    Each row of ``../data/sepsis_onset_time.csv`` (``icustay_id,hours``, no
    header) is mapped to its admission id through ``../data/icustays.csv``;
    that admission gets label 1 and an onset time of the ICU ``intime``
    (converted with ``time_to_min``) plus ``60 * hours``. Both dicts are
    written back to ``args.result_dir``. Entries of the existing
    ``sepsis_time_dict.json`` that are not overwritten are kept.

    The disabled loop over ``../data/sepsis3_cases.csv`` (it broke out before
    its first iteration) has been removed, so that file is no longer opened.

    Raises:
        KeyError: if an ``icustay_id`` of the onset file is not in icustays.csv.
        ValueError: if a row of the onset file does not have exactly two
            comma-separated fields or they are not integers.
    '''
    label_path = os.path.join(args.result_dir, 'sepsis_label_dict.json')
    time_path = os.path.join(args.result_dir, 'sepsis_time_dict.json')

    sepsis_label_dict = py_op.myreadjson(label_path)
    print(len(sepsis_label_dict))

    print('reading icustays.csv')
    icu_data = pd.read_csv('../data/icustays.csv')
    icu_adm_dict = dict()
    icu_intime_dict = dict()
    # Iterate the three columns in lockstep instead of one .loc lookup per cell.
    for icu, intime, adm in zip(icu_data['icustay_id'],
                                icu_data['intime'],
                                icu_data['hadm_id']):
        icu_adm_dict[icu] = adm
        icu_intime_dict[icu] = time_to_min(intime)

    sepsis_label_dict = {k: 0 for k in sepsis_label_dict}
    sepsis_time_dict = py_op.myreadjson(time_path)

    with open('../data/sepsis_onset_time.csv') as onset_fp:
        for line in onset_fp:
            icustay_id, hours = line.strip().split(',')
            icu = int(icustay_id)
            # Keys loaded from JSON are strings, so the admission id must be a
            # string too; an integer key would add a second entry instead of
            # updating the existing one. str(adm) mirrors how
            # gen_sepsis_label_dict builds its keys.
            adm = str(icu_adm_dict[icu])
            sepsis_label_dict[adm] = 1
            sepsis_time_dict[adm] = icu_intime_dict[icu] + 60 * int(hours)

    py_op.mywritejson(label_path, sepsis_label_dict)
    py_op.mywritejson(time_path, sepsis_time_dict)
def gen_sepsis_label_dict():
    '''Label admissions as sepsis / non-sepsis from infection times and SOFA.

    Steps, in order:

    1. Read ``../data/sepsis3.csv`` and record, per admission id (as string),
       the suspected infection time converted with ``time_to_min``; admissions
       with ``excluded == 0`` are collected in ``sepsis_set``.
    2. Read ``../data/icustays.csv`` and ``../data/sofa.csv``. The loop that
       would map SOFA rows to admissions is disabled by its leading ``break``;
       the mapping is loaded from ``../result/adm_sofa_dict.json`` instead.
    3. Every admission of that mapping with an infection time starts with
       label 0. If it is in ``sepsis_set``, its SOFA records whose end time
       lies between 48 * 60 before and 24 * 60 after the infection time are
       scanned: the first one is the baseline, and the first later score that
       is at least 2 and at least 2 above the baseline sets the label to 1 and
       moves the stored time to ``max(record end time, infection time)``.
    4. Write ``sepsis_time_dict.json`` and ``sepsis_label_dict.json`` to
       ``args.result_dir`` and print the positive / negative counts.

    NOTE(review): the 48 * 60 / 24 * 60 window suggests that ``time_to_min``
    returns minutes; confirm against that helper.
    '''
    sepsis_label_dict = dict()
    sepsis_file = '../data/sepsis3.csv'
    print('reading sepsis3.csv')
    sepsis_data = pd.read_csv(sepsis_file)
    sepsis_infection_dict = dict()  # admission id (str) -> infection time
    sepsis_set = set()  # admission ids with excluded == 0
    for iline in range(len(sepsis_data)):
        adm = sepsis_data.loc[iline, 'hadm_id']
        adm = str(adm)
        excluded = sepsis_data.loc[iline, 'excluded']
        suspected_infection_time_poe = sepsis_data.loc[
            iline, 'suspected_infection_time_poe']
        # Presumably filters out missing values, whose string form (such as
        # 'nan') is at most 5 characters long; confirm against the data.
        if len(str(suspected_infection_time_poe)) > 5:
            sepsis_infection_dict[adm] = time_to_min(
                suspected_infection_time_poe)
            # sepsis_set.add(adm)
            if excluded == 0:
                sepsis_set.add(adm)
    print('Infection No: {:d}'.format(len(sepsis_infection_dict)))
    print('Sepsis No: {:d}'.format(len(sepsis_set)))

    icu_file = '../data/icustays.csv'
    print('reading icustays.csv')
    icu_data = pd.read_csv(icu_file)
    icu_adm_dict = dict()  # icustay_id -> hadm_id
    for iline in range(len(icu_data)):
        icu = icu_data.loc[iline, 'icustay_id']
        adm = icu_data.loc[iline, 'hadm_id']
        icu_adm_dict[icu] = adm

    sofa_file = '../data/sofa.csv'
    print('reading sofa.csv')
    sofa_data = pd.read_csv(sofa_file)
    print('mapping sofa to adm')
    adm_sofa_dict = dict()
    for iline in range(len(sofa_data)):
        # The unconditional break disables this loop, so nothing below it runs
        # and adm_sofa_dict stays empty until it is loaded from JSON.
        break
        if iline and iline % 10000 == 0:
            print('mapping sofa to adm', iline, len(sofa_data))
        icu = sofa_data.loc[iline, 'icustay_id']
        sofa = sofa_data.loc[iline, 'sofa_24hours']
        starttime = sofa_data.loc[iline, 'starttime']
        endtime = sofa_data.loc[iline, 'endtime']
        adm = icu_adm_dict[icu]
        adm_sofa_dict[adm] = adm_sofa_dict.get(
            adm, []) + [[sofa, starttime, endtime]]
    # Disabled write of the mapping built by the loop above:
    # py_op.mywritejson('../result/adm_sofa_dict.json', adm_sofa_dict)
    # return
    adm_sofa_dict = py_op.myreadjson('../result/adm_sofa_dict.json')

    print('set sepsis label')
    pos_num = 0
    for iline, (adm, sofa_list) in enumerate(adm_sofa_dict.items()):
        if iline and iline % 10000 == 0:
            print('set sepsis label', iline, len(adm_sofa_dict))
        # Admissions without an infection time get no label at all.
        if adm in sepsis_infection_dict:
            sepsis_label_dict[adm] = [0, sepsis_infection_dict[adm]]
        else:
            continue
        # Admissions outside sepsis_set keep label 0.
        if adm not in sepsis_set:
            continue
        # '' marks "no baseline score seen yet".
        sofa_init = ''
        for sofa in sofa_list:
            starttime = sofa[1]
            endtime = sofa[2]
            time = time_to_min(endtime)
            sofa = int(sofa[0])
            # Only records ending inside the window around the infection time
            # are considered.
            if time - sepsis_infection_dict[
                    adm] >= -48 * 60 and time - sepsis_infection_dict[
                        adm] <= 24 * 60:
                if sofa_init == '':
                    sofa_init = sofa
                elif sofa - sofa_init >= 2 and sofa >= 2:
                    # The label entry keeps the original infection time; the
                    # time dict is then moved to the later of the two times.
                    sepsis_label_dict[adm] = [1, sepsis_infection_dict[adm]]
                    sepsis_infection_dict[adm] = max(
                        time, sepsis_infection_dict[adm])
                    pos_num += 1
                    break

    print('writing sepsis_label_dict')
    py_op.mywritejson(os.path.join(args.result_dir, 'sepsis_time_dict.json'),
                      sepsis_infection_dict)
    # Only the 0/1 label (first element of each entry) is saved.
    py_op.mywritejson(os.path.join(args.result_dir, 'sepsis_label_dict.json'),
                      {k: v[0]
                       for k, v in sepsis_label_dict.items()})
    print('There are {:d} positive samples.'.format(pos_num))
    print('There are {:d} negtive samples.'.format(
        len(sepsis_label_dict) - pos_num))
def gen_feature_order_dict():
    '''Generate the relative order of every value of each feature.

    Reads ``args.vital_file`` as comma-separated text. In the header row,
    commas inside double-quoted names are replaced by ';' before splitting.
    The first column is dropped and every non-empty cell is parsed as a float.
    Columns listed together in a group of ``similar.json`` (indices into
    ``index_feature_list.json``, both under ``args.file_dir``) are merged into
    the column with the smallest index of the group.

    For each remaining column the values are sorted and every distinct value
    is mapped to the midpoint of its first and last sorted position divided by
    the number of values. The result is written to
    ``feature_value_order_dict.json`` under ``args.file_dir``.

    NOTE: a second function with this name is defined later in this file and
    replaces this one when the module is loaded.

    Raises:
        AssertionError: if a data row has a different number of columns than
            the header.
        ValueError: if a non-empty cell is not a valid float.
    '''
    feature_value_order_dict = dict()
    vital_dict = {}  # column name -> list of float values
    col_list = []
    # The context manager closes the file even if a row fails to parse.
    with open(args.vital_file) as vital_fp:
        for i_line, line in enumerate(vital_fp):
            if i_line % 10000 == 0:
                print(i_line)
            if i_line == 0:
                # Protect commas inside quoted header names from the split.
                chars = []
                in_quotes = False
                for c in line:
                    if c == '"':
                        in_quotes = not in_quotes
                    if in_quotes and c == ',':
                        c = ';'
                    chars.append(c)
                col_list = ''.join(chars).strip().split(',')[1:]
                for col in col_list:
                    vital_dict[col] = []
            else:
                ctt_list = line.strip().split(',')[1:]
                assert len(ctt_list) == len(col_list)
                for col, ctt in zip(col_list, ctt_list):
                    if len(ctt):
                        vital_dict[col].append(float(ctt))

    # Merge the values of grouped features into the group's smallest index.
    groups = py_op.myreadjson(os.path.join(args.file_dir, 'similar.json'))
    index_feature_list = py_op.myreadjson(
        os.path.join(args.file_dir, 'index_feature_list.json'))
    for g in groups:
        mg = min(g)
        mf = index_feature_list[mg]
        for k in g:
            if k != mg:
                kf = index_feature_list[k]
                vital_dict[mf] = vital_dict[mf] + vital_dict.pop(kf)
    print('features {:d}'.format(len(vital_dict)))

    for col in col_list:
        if col not in vital_dict:
            continue  # column was merged into another one
        value_list = sorted(vital_dict[col])
        n_value = len(value_list)
        first_index = dict()  # value -> first position in the sorted list
        last_index = dict()   # value -> last position in the sorted list
        for i_value, value in enumerate(value_list):
            if value not in first_index:
                first_index[value] = i_value
            if value == value_list[-1]:
                # Largest value: its run extends to the end of the list.
                last_index[value] = n_value - 1
                break
            if value != value_list[i_value + 1]:
                last_index[value] = i_value
        feature_value_order_dict[col] = {
            value: (last_index[value] + first_index[value]) / 2.0 / n_value
            for value in last_index
        }
    py_op.mywritejson(
        os.path.join(args.file_dir, 'feature_value_order_dict.json'),
        feature_value_order_dict)
def gen_feature_order_dict():
    '''Generate the relative order of every value of each feature.

    Reads ``args.vital_file`` as comma-separated text after removing every
    double quote from each line. The first column is dropped; the first
    remaining column of each data row is replaced by the string form of
    ``time_to_min`` applied to it, and every non-empty cell is kept as a
    string.

    Two files are written to ``args.result_dir``:

    * ``feature_count_dict.json``: the number of non-empty cells per column.
    * ``feature_value_order_dict.json``: for each column, every distinct cell
      string mapped to the midpoint of its first and last position in the
      numerically sorted value list, divided by the number of values.

    Raises:
        AssertionError: if a data row has a different number of columns than
            the header.
        IndexError: if a data row has no column after the first one.
        ValueError: if a non-empty cell cannot be converted to float when the
            values are sorted.
    '''
    feature_value_order_dict = dict()
    vital_dict = {}  # column name -> list of non-empty cell strings
    col_list = []
    # The context manager closes the file even if a row fails to parse.
    with open(args.vital_file) as vital_fp:
        for i_line, line in enumerate(vital_fp):
            # All quotes are removed up front, so the header needs no special
            # handling of quoted commas.
            line = line.strip().replace('"', '')
            if i_line % 10000 == 0:
                print(i_line)
            if i_line == 0:
                col_list = line.strip().split(',')[1:]
                for col in col_list:
                    vital_dict[col] = []
            else:
                ctt_list = line.strip().split(',')[1:]
                ctt_list[0] = str(time_to_min(ctt_list[0]))
                assert len(ctt_list) == len(col_list)
                for col, ctt in zip(col_list, ctt_list):
                    if len(ctt):
                        vital_dict[col].append(ctt)

    feature_count_dict = {k: len(v) for k, v in vital_dict.items()}
    py_op.mywritejson(os.path.join(args.result_dir, 'feature_count_dict.json'),
                      feature_count_dict)

    for col in col_list:
        if col not in vital_dict:
            continue
        # Values stay strings; only the sort order is numeric.
        value_list = sorted(vital_dict[col], key=float)
        n_value = len(value_list)
        first_index = dict()  # value -> first position in the sorted list
        last_index = dict()   # value -> last position in the sorted list
        for i_value, value in enumerate(value_list):
            if value not in first_index:
                first_index[value] = i_value
            if value == value_list[-1]:
                # Final value of the list: its run extends to the end.
                last_index[value] = n_value - 1
                break
            if value != value_list[i_value + 1]:
                last_index[value] = i_value
        feature_value_order_dict[col] = {
            value: (last_index[value] + first_index[value]) / 2.0 / n_value
            for value in last_index
        }
    py_op.mywritejson(
        os.path.join(args.result_dir, 'feature_value_order_dict.json'),
        feature_value_order_dict)