Example #1
def predicted_time_horizon():
    """
    Distribution plots of the predicted time horizon, corresponding to panels B and C of Figure 3 in the main text

    孙川  2020.04.02
    """
    # Read the 375- and 110-patient datasets, restricted to the features
    # ['乳酸脱氢酶', '超敏C反应蛋白', '淋巴细胞(%)', '出院时间', '出院方式']
    # (LDH, hs-CRP, lymphocyte %, discharge time, discharge outcome)
    data1 = pd.read_parquet('data/time_series_375.parquet')[[
        '乳酸脱氢酶', '超敏C反应蛋白', '淋巴细胞(%)', '出院时间', '出院方式'
    ]]
    data2 = pd.read_parquet('data/time_series_test_110.parquet')[[
        '乳酸脱氢酶', '超敏C反应蛋白', '淋巴细胞(%)', '出院时间', '出院方式'
    ]]
    # Loop twice: draw Figure 3 panel B (375 + 110 = 485 patients), then panel C (the 110 test patients)
    for data in [utils.concat_data(data1, data2), data2]:
        # Drop samples in which all three features are missing
        data = data.dropna(how='all', subset=['乳酸脱氢酶', '超敏C反应蛋白', '淋巴细胞(%)'])

        # Merge the data with a sliding window
        data = utils.merge_data_by_sliding_window(data,
                                                  n_days=1,
                                                  dropna=True,
                                                  subset=utils.top3_feats_cols,
                                                  time_form='diff')
        # Sort by the first-level (PATIENT_ID) and second-level (time to discharge) index
        data = data.sort_index(level=(0, 1))

        # Row-wise apply of the paper's decision tree to predict each sample
        data['pred'] = data.apply(utils.decision_tree, axis=1)

        # Compute how many days in advance each prediction was made
        time_advance = utils.get_time_in_advance_of_predict(
            data)['time_advance']

        # Figure
        plt.figure(dpi=200)
        plt.hist(time_advance, bins=100)
        plt.title('Predicted time horizon', fontdict=font)
        plt.xticks(fontsize=font['size'])
        plt.yticks(fontsize=font['size'])
        plt.xlabel('days to outcome', fontdict=font)
        plt.ylabel('Frequency', fontdict=font)
        x_max = plt.gca().get_xlim()[1]
        y_max = plt.gca().get_ylim()[1]
        plt.text(
            0.65 * x_max,
            0.75 * y_max,
            f"mean {np.mean(time_advance):.2f}\nstd     {np.std(time_advance):.2f}",
            fontdict=font)
        plt.show()
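
Both examples on this page call utils.decision_tree row by row via DataFrame.apply, but its source is not shown here. Below is a minimal sketch of a three-feature rule of that shape; the cut-off constants and the 0 = survival / 1 = death encoding of '出院方式' are illustrative assumptions, not values confirmed by this page.

import pandas as pd

# Sketch of a utils.decision_tree-style rule. All thresholds are assumed
# placeholders; 0 = survival, 1 = death is an assumed label encoding.
LDH_CUT = 365.0   # 乳酸脱氢酶 (lactate dehydrogenase), assumed
CRP_CUT = 41.2    # 超敏C反应蛋白 (hs-CRP), assumed
LYM_CUT = 14.7    # 淋巴细胞(%) (lymphocyte %), assumed

def decision_tree_sketch(row: pd.Series) -> int:
    if row['乳酸脱氢酶'] >= LDH_CUT:
        return 1                          # high LDH -> predicted death
    if row['超敏C反应蛋白'] < CRP_CUT:
        return 0                          # low hs-CRP -> predicted survival
    return 0 if row['淋巴细胞(%)'] > LYM_CUT else 1

# Usage mirrors the example: data['pred'] = data.apply(decision_tree_sketch, axis=1)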
Example #2
def decision_tree_top3_feats_predict_result():
    """
    Predict using every measurement of ['乳酸脱氢酶', '超敏C反应蛋白', '淋巴细胞(%)']

    孙川  2020.04.07
    """
    # Whether to use only each patient's last-day sample
    last_sample = False

    data1 = pd.read_parquet('data/time_series_375.parquet')[[
        '乳酸脱氢酶', '超敏C反应蛋白', '淋巴细胞(%)', '出院时间', '出院方式'
    ]]
    data2 = pd.read_parquet('data/time_series_test_110.parquet')[[
        '乳酸脱氢酶', '超敏C反应蛋白', '淋巴细胞(%)', '出院时间', '出院方式'
    ]]

    # data1 is the 375-patient set, data2 the 110-patient test set; their concatenation has 485 patients
    for data in [data1, data2, utils.concat_data(data1, data2)]:
        # Merge the data with a sliding window
        data = utils.merge_data_by_sliding_window(data,
                                                  n_days=1,
                                                  dropna=True,
                                                  subset=utils.top3_feats_cols,
                                                  time_form='diff')

        # To keep only each patient's most recent sample: after groupby, the
        # second-level index t_diff is ascending (merge_data_by_sliding_window
        # sorts it), so first() returns the record closest to discharge
        if last_sample:
            data = data.groupby('PATIENT_ID').first()

        # Decision tree prediction from the paper
        data['pred'] = data.apply(utils.decision_tree, axis=1)

        # Record results with the project's own helper, utils.Metrics
        metrics = utils.Metrics(acc='overall',
                                f1='overall',
                                conf_mat='overall',
                                report='overall')
        metrics.record(data['出院方式'], data['pred'])
        metrics.print_metrics()
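
utils.Metrics is the project's own result recorder; its implementation is likewise not shown. A minimal stand-in with the same record/print_metrics surface, built on scikit-learn, could look like the sketch below; treating each keyword argument as a switch that enables the corresponding metric is an assumption.

from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score)

class MetricsSketch:
    """Assumed interface of utils.Metrics: record() accumulates labels,
    print_metrics() reports whichever metrics were requested."""

    def __init__(self, **metrics):
        self.metrics = metrics              # e.g. acc='overall', f1='overall'
        self.y_true, self.y_pred = [], []

    def record(self, y_true, y_pred):
        self.y_true.extend(y_true)
        self.y_pred.extend(y_pred)

    def print_metrics(self):
        if 'acc' in self.metrics:
            print('acc:', accuracy_score(self.y_true, self.y_pred))
        if 'f1' in self.metrics:
            print('f1:', f1_score(self.y_true, self.y_pred, average='macro'))
        if 'conf_mat' in self.metrics:
            print(confusion_matrix(self.y_true, self.y_pred))
        if 'report' in self.metrics:
            print(classification_report(self.y_true, self.y_pred))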
Example #3
def main():
    data_path = Path('./data')
    data = concat_data(data_path, '*L6.nc*')
    plot_minmaxmean(data, 7)
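
In this example concat_data is a different helper from the one above: it takes a directory and a glob pattern for NetCDF files ('*L6.nc*') rather than two DataFrames. A plausible sketch using xarray follows; combining the files by coordinates is an assumption about its behaviour.

from pathlib import Path
import xarray as xr

def concat_data_sketch(data_path: Path, pattern: str) -> xr.Dataset:
    # Assumed behaviour: open every NetCDF file matching `pattern`
    # under `data_path` and combine them into a single dataset.
    files = sorted(data_path.glob(pattern))
    return xr.open_mfdataset(files, combine='by_coords')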
Example #4
            hadm_id = int(row[2])
            if hadm_id in hadm_ids:
                w.writerow(row[1:3] + [row[-1], '', ''])
dfl = pd.read_csv('%s/ALL_CODES_filtered.csv' % mimic3_folder, index_col=None)
len(dfl['HADM_ID'].unique())  # number of admissions kept
dfl = dfl.sort_values(['SUBJECT_ID', 'HADM_ID'])
dfl.to_csv('%s/ALL_CODES_filtered.csv' % mimic3_folder, index=False)
df.to_csv('%s/disch_full.csv' % mimic3_folder, index=False)
print('STEP 4. filter ALL_CODES takes {} seconds'.format(int(time() - t1)))
## 47 seconds

####  5. concatenate (link each note with its codes)

t1 = time()
df = pd.read_csv('%s/disch_full.csv' % mimic3_folder)
sorted_file = '%s/disch_full.csv' % mimic3_folder
df.to_csv(sorted_file, index=False)
labeled = utils.concat_data('%s/ALL_CODES_filtered.csv' % mimic3_folder,
                            sorted_file)  ## 52727
### labeled is data/mimic3/notes_labeled.csv => output
print('STEP 5: concatenate takes {} seconds'.format(int(time() - t1)))

### 63 seconds

####################################################
#### FLAG notes_labeled.csv
####################################################

####  6. compute word counts for notes_labeled.csv

t1 = time()
labeled = '{}/notes_labeled.csv'.format(mimic3_folder)
dfnl = pd.read_csv(labeled)
# Tokens and types: count unique words, total words, and unique IDs
types = set()
num_tok = 0
for row in dfnl.itertuples():
    for w in row[3].split():
        types.add(w)
        num_tok += 1

print("num types", len(types), "num tokens", num_tok)
print("HADM_ID: {}".format(len(dfnl['HADM_ID'].unique())))
print("SUBJECT_ID: {}".format(len(dfnl['SUBJECT_ID'].unique())))