def predicted_time_horizon():
    """
    Distribution plot of the predicted time horizon, i.e. how many days in
    advance the outcome is predicted; corresponds to panels B and C of
    Figure 3 in the main text.

    孙川 2020.04.02
    """
    # Load the 375-patient and 110-patient cohorts, restricted to the three
    # key features plus the outcome columns:
    # 乳酸脱氢酶 = LDH, 超敏C反应蛋白 = hs-CRP, 淋巴细胞(%) = lymphocyte (%),
    # 出院时间 = discharge time, 出院方式 = discharge outcome
    cols = ['乳酸脱氢酶', '超敏C反应蛋白', '淋巴细胞(%)', '出院时间', '出院方式']
    data1 = pd.read_parquet('data/time_series_375.parquet')[cols]
    data2 = pd.read_parquet('data/time_series_test_110.parquet')[cols]

    # Two passes: first the combined 375 + 110 = 485 patients (Figure 3B),
    # then the 110-patient test set alone (Figure 3C)
    for data in [utils.concat_data(data1, data2), data2]:
        # Drop samples where all three key features are missing
        data = data.dropna(how='all',
                           subset=['乳酸脱氢酶', '超敏C反应蛋白', '淋巴细胞(%)'])
        # Merge records with a one-day sliding window
        data = utils.merge_data_by_sliding_window(data, n_days=1, dropna=True,
                                                  subset=utils.top3_feats_cols,
                                                  time_form='diff')
        # Sort by the primary (PATIENT_ID) and secondary (days to discharge) index
        data = data.sort_index(level=(0, 1))
        # Apply the paper's decision tree to each sample
        data['pred'] = data.apply(utils.decision_tree, axis=1)
        # How many days in advance each prediction was made
        time_advance = utils.get_time_in_advance_of_predict(data)['time_advance']

        # Figure
        plt.figure(dpi=200)
        plt.hist(time_advance, bins=100)
        plt.title('Predicted time horizon', fontdict=font)
        plt.xticks(fontsize=font['size'])
        plt.yticks(fontsize=font['size'])
        plt.xlabel('days to outcome', fontdict=font)
        plt.ylabel('Frequency', fontdict=font)
        x_max = plt.gca().get_xlim()[1]
        y_max = plt.gca().get_ylim()[1]
        plt.text(0.65 * x_max, 0.75 * y_max,
                 f"mean {np.mean(time_advance):.2f}\n"
                 f"std {np.std(time_advance):.2f}",
                 fontdict=font)
        plt.show()
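# For reference, a minimal sketch of what `utils.decision_tree` might look
# like. This is an assumption, not the repository's actual implementation:
# the thresholds (LDH 365 U/L, hs-CRP 41.2 mg/L, lymphocyte 14.7%) are the
# splits reported for the single decision tree in the associated paper, and
# the label encoding (0 = survival, 1 = death) is assumed to match '出院方式'.
def decision_tree_sketch(row):
    if row['乳酸脱氢酶'] >= 365:      # LDH >= 365 U/L -> predicted death
        return 1
    if row['超敏C反应蛋白'] < 41.2:   # hs-CRP < 41.2 mg/L -> predicted survival
        return 0
    if row['淋巴细胞(%)'] > 14.7:     # lymphocyte > 14.7% -> predicted survival
        return 0
    return 1                          # otherwise -> predicted death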
def decision_tree_top3_feats_predict_result():
    """
    Predict using every available measurement of
    ['乳酸脱氢酶', '超敏C反应蛋白', '淋巴细胞(%)'] (LDH, hs-CRP, lymphocyte (%)).

    孙川 2020.04.07
    """
    # Whether to use only the last available sample of each patient
    last_sample = False

    cols = ['乳酸脱氢酶', '超敏C反应蛋白', '淋巴细胞(%)', '出院时间', '出院方式']
    data1 = pd.read_parquet('data/time_series_375.parquet')[cols]
    data2 = pd.read_parquet('data/time_series_test_110.parquet')[cols]

    # data1 is the 375-patient set, data2 the 110-patient set, and their
    # concatenation the combined 485-patient set
    for data in [data1, data2, utils.concat_data(data1, data2)]:
        # Merge records with a one-day sliding window
        data = utils.merge_data_by_sliding_window(data, n_days=1, dropna=True,
                                                  subset=utils.top3_feats_cols,
                                                  time_form='diff')
        # Optionally keep only each patient's last sample. After the groupby,
        # each patient's secondary index t_diff (days to discharge) is in
        # ascending order because merge_data_by_sliding_window sorts it, so
        # first() returns the record closest to discharge.
        if last_sample:
            data = data.groupby('PATIENT_ID').first()
        # The paper's decision tree prediction
        data['pred'] = data.apply(utils.decision_tree, axis=1)
        # Report results with the project's own utils.Metrics helper
        metrics = utils.Metrics(acc='overall', f1='overall',
                                conf_mat='overall', report='overall')
        metrics.record(data['出院方式'], data['pred'])
        metrics.print_metrics()
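# A minimal sketch of what `utils.merge_data_by_sliding_window` might do,
# under stated assumptions: records carry a timestamp column named 'RE_DATE'
# (hypothetical if the real name differs), and time_form='diff' means the
# window key is "days remaining until discharge". Within each
# (patient, window) bucket, the latest non-null value of each column is kept.
def merge_by_sliding_window_sketch(data, n_days=1, dropna=True, subset=None):
    data = data.reset_index()
    # Days remaining until discharge, bucketed into n_days-wide windows
    t_diff = (data['出院时间'] - data['RE_DATE']).dt.total_seconds() / 86400
    data['t_diff'] = (t_diff // n_days).astype(int) * n_days
    # groupby(...).last() takes the last non-null value per column, i.e. the
    # most recent measurement inside each window
    data = (data.sort_values('RE_DATE')
                .groupby(['PATIENT_ID', 't_diff'])
                .last())
    if dropna and subset is not None:
        data = data.dropna(subset=subset)
    return data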
def main():
    data_path = Path('./data')
    # Concatenate all '*L6.nc*' files under ./data, then plot min/max/mean
    data = concat_data(data_path, '*L6.nc*')
    plot_minmaxmean(data, 7)
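# A minimal sketch of what this `concat_data` might do here, assuming the
# '*L6.nc*' files are netCDF datasets that xarray can align by coordinates
# (hypothetical implementation; the real helper may differ).
from pathlib import Path
import xarray as xr

def concat_data_sketch(data_path: Path, pattern: str) -> xr.Dataset:
    files = sorted(data_path.glob(pattern))
    # Open all matching files lazily and merge them on shared coordinates
    return xr.open_mfdataset(files, combine='by_coords')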
w.writerow(row[1:3] + [row[-1], '', ''])

dfl = pd.read_csv('%s/ALL_CODES_filtered.csv' % mimic3_folder, index_col=None)
len(dfl['HADM_ID'].unique())  # notebook leftover: unique admission count (value unused)
dfl = dfl.sort_values(['SUBJECT_ID', 'HADM_ID'])
dfl.to_csv('%s/ALL_CODES_filtered.csv' % mimic3_folder, index=False)
df.to_csv('%s/disch_full.csv' % mimic3_folder, index=False)
print('STEP 4. filter ALL_CODES takes {} seconds'.format(int(time() - t1)))  ## ~47 seconds

#### 5. concatenate notes with their code labels
t1 = time()
# disch_full.csv was just written above, so this read/write round-trip is a
# no-op kept from the original notebook; sorted_file simply points at it
df = pd.read_csv('%s/disch_full.csv' % mimic3_folder)
sorted_file = '%s/disch_full.csv' % mimic3_folder
df.to_csv(sorted_file, index=False)
# Join each discharge summary with its filtered ICD codes; the output is
# data/mimic3/notes_labeled.csv (52727 rows)
labeled = utils.concat_data('%s/ALL_CODES_filtered.csv' % mimic3_folder, sorted_file)
print('STEP 5: concatenate takes {} seconds'.format(int(time() - t1)))  ### ~63 seconds

####################################################
#### FLAG notes_labeled.csv
####################################################

#### 6. compute word counts for notes_labeled.csv
t1 = time()
labeled = '{}/notes_labeled.csv'.format(mimic3_folder)
dfnl = pd.read_csv(labeled)
# Tokens and types
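# A minimal sketch of what this concatenation step amounts to, assuming both
# CSVs share SUBJECT_ID/HADM_ID columns and the codes file has one row per
# (HADM_ID, ICD9_CODE). This is a hypothetical pandas version of the join;
# the real utils.concat_data may stream line-by-line over the sorted files.
def concat_data_sketch(codes_file, notes_file, out_file):
    codes = pd.read_csv(codes_file, dtype={'ICD9_CODE': str})
    notes = pd.read_csv(notes_file)
    # Collapse each admission's ICD-9 codes into one ';'-joined label string
    labels = (codes.groupby(['SUBJECT_ID', 'HADM_ID'])['ICD9_CODE']
                   .apply(lambda c: ';'.join(c.dropna()))
                   .reset_index(name='LABELS'))
    # Keep only admissions that have both a note and at least one code
    labeled = notes.merge(labels, on=['SUBJECT_ID', 'HADM_ID'], how='inner')
    labeled.to_csv(out_file, index=False)
    return out_file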
hadm_id = int(row[2])
if hadm_id in hadm_ids:
    w.writerow(row[1:3] + [row[-1], '', ''])

dfl = pd.read_csv('%s/ALL_CODES_filtered.csv' % args.MIMIC_3_DIR, index_col=None)
dfl = dfl.sort_values(['SUBJECT_ID', 'HADM_ID'])
dfl.to_csv('%s/ALL_CODES_filtered.csv' % args.MIMIC_3_DIR, index=False)

sorted_file = '%s/disch_full.csv' % args.MIMIC_3_DIR
df.to_csv(sorted_file, index=False)

# step 4: link notes with their code labels
labeled = concat_data('%s/ALL_CODES_filtered.csv' % args.MIMIC_3_DIR,
                      sorted_file,
                      '%s/notes_labeled.csv' % args.MIMIC_3_DIR)
dfnl = pd.read_csv(labeled)

# step 5: count unique words (types), total words (tokens), and unique IDs
types = set()
num_tok = 0
for row in dfnl.itertuples():
    # row[3] is the note text column (itertuples index 0 is the DataFrame index)
    for tok in row[3].split():
        types.add(tok)
        num_tok += 1
print("num types", len(types), "num tokens", num_tok)
print("HADM_ID: {}".format(len(dfnl['HADM_ID'].unique())))
print("SUBJECT_ID: {}".format(len(dfnl['SUBJECT_ID'].unique())))
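# An equivalent, slightly more idiomatic count using collections.Counter,
# assuming the note text column is named 'TEXT' (an assumption; adjust to the
# actual column name). It also yields per-token frequencies if needed later.
from collections import Counter

tok_counts = Counter(tok
                     for text in dfnl['TEXT'].astype(str)
                     for tok in text.split())
print("num types", len(tok_counts), "num tokens", sum(tok_counts.values()))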