def generate_diagnosis_data():
    """Collect, for each admission, the ICD codes diagnosed in the same
    patient's *earlier* admissions and write them to hadm_icd_dict.json.

    Reads sid_hadm_dict.json / hadm_sid_dict.json from args.data_dir and
    DIAGNOSES_ICD.csv from args.mimic_dir.
    """
    sid_hadm_dict = py_op.myreadjson(
        os.path.join(args.data_dir, 'sid_hadm_dict.json'))
    hadm_sid_dict = py_op.myreadjson(
        os.path.join(args.data_dir, 'hadm_sid_dict.json'))

    # hadm_map_dict: earlier admission id -> list of the patient's later
    # admission ids (admission ids are assumed to be chronological when
    # sorted numerically -- TODO confirm for this dataset).
    hadm_map_dict = dict()
    for hadm in hadm_sid_dict:
        sid = hadm_sid_dict[hadm]
        hadm_list = sid_hadm_dict[sid]
        if len(hadm_list) > 1:
            hadm_list = sorted(hadm_list, key=lambda k: int(k))
            idx = hadm_list.index(hadm)
            if idx > 0:
                for h in hadm_list[:idx]:
                    if h not in hadm_map_dict:
                        hadm_map_dict[h] = []
                    hadm_map_dict[h].append(hadm)

    # For every diagnosis row, credit the ICD code to every later admission
    # of the same patient.
    hadm_icd_dict = dict()
    # BUGFIX: the CSV handle was opened and never closed; 'with' closes it.
    with open(os.path.join(args.mimic_dir, 'DIAGNOSES_ICD.csv')) as csv_file:
        for i_line, line in enumerate(csv_file):
            if i_line:  # skip the header row
                if i_line % 10000 == 0:
                    print(i_line)
                line_data = [x.strip('"') for x in py_op.csv_split(line.strip())]
                ROW_ID, SUBJECT_ID, hadm_id, SEQ_NUM, icd = line_data
                if hadm_id in hadm_map_dict:
                    for h in hadm_map_dict[hadm_id]:
                        if h not in hadm_icd_dict:
                            hadm_icd_dict[h] = []
                        hadm_icd_dict[h].append(icd)
    # Deduplicate codes per admission before writing.
    hadm_icd_dict = {h: list(set(icds)) for h, icds in hadm_icd_dict.items()}
    py_op.mywritejson(os.path.join(args.data_dir, 'hadm_icd_dict.json'),
                      hadm_icd_dict)
# ---- Example #2 ----
def stat_drug_effect():
    """For each data split, keep only patients with drug records, report
    heart-failure (hf) rates for drug vs. no-drug groups, and write the
    filtered records to 'new_<split>.json'.

    Relies on find_drug(patient_dict) -> (vis, new_patient_dict); vis is
    presumably 0/1 for drug absence/presence -- TODO confirm.
    """
    for fi in ['train.json', 'valid.json', 'test.json']:
        # BUGFIX: json.load(open(...)) leaked the file handle; 'with'
        # guarantees it is closed.
        with open(os.path.join(args.data_dir, args.dataset, fi)) as f:
            ehr_data = json.load(f)
        new_ehr_data = []
        has_drug = []
        has_hf = [[], []]  # has_hf[vis]: hf labels grouped by drug status
        for pdata in ehr_data:
            patient_dict = pdata[0]
            hf = pdata[1]
            vis, new_patient_dict = find_drug(patient_dict)
            if len(new_patient_dict):
                has_drug.append(vis)
                has_hf[vis].append(hf)
                new_ehr_data.append([new_patient_dict, hf])
        print('')
        print('In {:s}:'.format(fi.split('.')[0]))
        print(
            'There are {:d} patients. {:d} patients has drug. {:d} patients hasn\'t drugs.'
            .format(len(new_ehr_data), sum(has_drug),
                    len(ehr_data) - sum(has_drug)))
        # NOTE(review): np.mean of an empty group warns and prints nan;
        # confirm both groups are always non-empty in practice.
        print(
            'Drug patients with hf: {:3.4f}. No drug patients with hf: {:3.4}.'
            .format(np.mean(has_hf[1]), np.mean(has_hf[0])))
        py_op.mywritejson(
            os.path.join(args.data_dir, args.dataset, 'new_' + fi),
            new_ehr_data)
# ---- Example #3 ----
def main():
    """Build the train/valid data loaders, construct the FC model, then
    train (args.phase == 'train') or evaluate a checkpoint ('test')."""
    dataset = data_loader.DataBowl(args, phase='train')
    train_loader = DataLoader(dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.workers,
                              pin_memory=True)
    dataset = data_loader.DataBowl(args, phase='valid')
    valid_loader = DataLoader(dataset,
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.workers,
                              pin_memory=True)
    # Vocab/relation are taken from the *valid* dataset instance;
    # presumably identical to the train split's -- TODO confirm.
    args.vocab = dataset.vocab
    args.relation = dataset.relation

    net, loss = model.FCModel(args), model.Loss()

    net = _cuda(net, 0)
    loss = _cuda(loss, 0)

    parameters_all = list(net.parameters())

    optimizer = torch.optim.Adam(parameters_all, args.lr)

    # Tracks the best validation metrics seen so far; updated by test().
    best_auc = [0, 0, 0, 0, 0, 0]

    cui_con_dict = {}
    if args.phase == 'train':
        for epoch in range(args.epochs):
            train(train_loader, net, loss, epoch, optimizer, best_auc)
            best_auc, cui_con_dict = test(valid_loader, net, loss, epoch,
                                          best_auc, 'valid', cui_con_dict)
            # BUGFIX: was a Python-2 print statement ('print args.words'),
            # a syntax error under the Python 3 the rest of the file uses.
            print(args.words)

        # Dump the accumulated concept-connection dict, numbered after the
        # existing files in the result directory.
        cons_dir = '../result/cons/{:s}/{:d}'.format(
            args.model, args.predict_day)
        py_op.mkdir(cons_dir)
        num = len(os.listdir(cons_dir))
        py_op.mywritejson(os.path.join(cons_dir, '{:d}.json'.format(num)),
                          cui_con_dict)

        # BUGFIX: was a Python-2 print statement.
        print('best auc', best_auc)
        auc = best_auc[0]
        with open('../result/log.txt', 'a') as f:
            f.write('#model {:s} #auc {:3.4f}\n'.format(args.model, auc))

    elif args.phase == 'test':
        net.load_state_dict(torch.load(args.resume))
        test(valid_loader, net, loss, 0, best_auc, 'valid', cui_con_dict)
# ---- Example #4 ----
def generate_icu_mortality_dict(icustay_id_list):
    """Parse sepsis_mortality.csv into {icustay_id: mortality} and write it
    to icu_mortality_dict.json.

    Args:
        icustay_id_list: currently UNUSED -- every CSV row is kept.
            Retained in the signature for backward compatibility.
    """
    icu_mortality_dict = dict()
    # BUGFIX: the CSV handle was opened and never closed; 'with' closes it.
    with open(os.path.join(args.mimic_dir, 'sepsis_mortality.csv')) as f:
        for i_line, line in enumerate(f):
            if i_line:  # skip the header row
                if i_line % 10000 == 0:
                    print(i_line)
                line_data = line.strip().split(',')
                icustay_id = line_data[0]
                # Last column is assumed to hold the mortality label
                # -- TODO confirm against the CSV schema.
                icu_mortality_dict[icustay_id] = int(line_data[-1])
    py_op.mywritejson(os.path.join(args.data_dir, 'icu_mortality_dict.json'),
                      icu_mortality_dict)
def map_ehr_id():
    """Select EHR codes seen more than 100 times and save them to
    ehr_list.json.

    Also partitions the kept codes into drug-like (second character is a
    digit) and med-like groups, purely for the printed counts.
    """
    print('start')
    count_path = os.path.join(args.data_dir, 'ehr_count_dict.json')
    ehr_count_dict = py_op.myreadjson(count_path)
    ehr_list = []
    for code, count in ehr_count_dict.items():
        if count > 100:
            ehr_list.append(code)
    ns = set('0123456789')
    print(ns)
    drug_list = []
    med_list = []
    for code in ehr_list:
        if code[1] in ns:
            drug_list.append(code)
        else:
            med_list.append(code)
    print(len(drug_list))
    print(len(med_list))
    py_op.mywritejson(os.path.join(args.data_dir, 'ehr_list.json'), ehr_list)
# ---- Example #6 ----
def split_data_to_ten_set():
    """Shuffle the train_with_missing files and cut them into 10 folds,
    saved as '<dataset>_splits.json' under args.file_dir."""
    pattern = os.path.join(args.data_dir, args.dataset,
                           'train_with_missing/*')
    files = sorted(glob(pattern))
    np.random.shuffle(files)
    n = len(files)
    # Boundary indices partition the list exactly, even when n is not
    # divisible by 10.
    bounds = [int(n * i / 10) for i in range(11)]
    splits = [files[bounds[i]:bounds[i + 1]] for i in range(10)]
    py_op.mywritejson(
        os.path.join(args.file_dir, args.dataset + '_splits.json'), splits)
# ---- Example #7 ----
def compute_pred_clean_psnr(epoch='ensemble',
                            clean_dir='../../data/AI/testB/',
                            result_dir='../../data/result'):
    """Compare de-stained predictions against clean ground-truth images.

    For every clean image, compute the PSNR of the stained input and of
    the prediction, append a row to <result_dir>/<epoch>.csv, write a
    json of per-file prediction PSNRs, and return the mean relative gain
    (pred over stain).
    """
    psnr_list = []
    file_psnr_dict = dict()
    if not os.path.exists(result_dir):
        os.mkdir(result_dir)
    # BUGFIX: 'with' guarantees the CSV is closed even if a later line
    # raises (the original leaked the handle on any exception).
    with open(os.path.join(result_dir, epoch + '.csv'), 'w') as f:
        for i, clean in enumerate(os.listdir(clean_dir)):
            file = clean
            clean = os.path.join(clean_dir, clean)
            clean_file = clean
            # Prediction path: same name as .png under data/test_clean/<epoch>.
            pred = clean.replace('.jpg', '.png').replace(
                'data', 'data/test_clean/{:s}'.format(epoch))
            # Stained input lives in the *A directories with a trailing '_'.
            stain = clean.replace('trainB', 'trainA').replace('testB',
                                                              'testA').replace(
                                                                  '.jpg', '_.jpg')
            try:
                pred = Image.open(pred)
                pred = pred.resize((256, 256))
                pred = pred.resize((250, 250))
                pred = np.array(pred).astype(np.float32)
                clean = np.array(Image.open(clean)).astype(np.float32)
                stain = np.array(Image.open(stain)).astype(np.float32)
                psnr_pred = psnr(clean, pred)
                psnr_stain = psnr(clean, stain)
                psnr_list.append([psnr_stain, psnr_pred])
                file_psnr_dict[file] = psnr_pred
            except Exception:
                # BUGFIX: narrowed from a bare 'except:' so that
                # KeyboardInterrupt/SystemExit are no longer swallowed;
                # missing or corrupt images are logged and skipped.
                traceback.print_exc()
                continue
            # BUGFIX: was a Python-2 print statement ('print i, 1000').
            print(i, 1000)
            f.write(clean_file.split('/')[-1])
            f.write(',')
            f.write(str(psnr_stain))
            f.write(',')
            f.write(str(psnr_pred))
            f.write(',')
            f.write(str(psnr_pred / psnr_stain - 1))
            f.write('\n')
        # NOTE(review): raises IndexError if no image was processed
        # (psnr_list empty) -- confirm that is acceptable.
        psnr_list = np.array(psnr_list)
        psnr_mean = ((psnr_list[:, 1] - psnr_list[:, 0]) / psnr_list[:, 0]).mean()
        f.write(str(psnr_mean))
    py_op.mywritejson(
        os.path.join(result_dir, epoch + '.json'),
        py_op.mysorteddict(file_psnr_dict, key=lambda s: file_psnr_dict[s]))
    # BUGFIX: Python-2 print statements converted to print() calls;
    # the (Chinese) label strings are unchanged.
    print('网纹图PSNR', psnr_list[:, 0].mean())
    print('预测图PSNR', psnr_list[:, 1].mean())
    print('增益率', psnr_mean)
    return psnr_mean
def generate_ehr_files():
    """Write one '<hadm_id>.json' EHR record per admission into
    train_groundtruth/, plus a global ehr_count_dict.json of code counts.

    Each record holds the admission's ICD codes and demographic tokens
    ('icd_demo'), and its prescriptions keyed by 'start -- end' hour
    offsets relative to the admission time ('drug').
    """

    hadm_time_dict = py_op.myreadjson(
        os.path.join(args.data_dir, 'hadm_time_dict.json'))
    hadm_demo_dict = py_op.myreadjson(
        os.path.join(args.data_dir, 'hadm_demo_dict.json'))
    hadm_sid_dict = py_op.myreadjson(
        os.path.join(args.data_dir, 'hadm_sid_dict.json'))
    hadm_icd_dict = py_op.myreadjson(
        os.path.join(args.data_dir, 'hadm_icd_dict.json'))
    hadm_time_drug_dict = py_op.myreadjson(
        os.path.join(args.data_dir, 'hadm_time_drug_dict.json'))
    groundtruth_dir = os.path.join(args.data_dir, 'train_groundtruth')
    py_op.mkdir(groundtruth_dir)
    ehr_count_dict = dict()  # code -> occurrence count across admissions

    for hadm_id in hadm_sid_dict:

        time_drug_dict = hadm_time_drug_dict.get(hadm_id, {})
        icd_list = hadm_icd_dict.get(hadm_id, [])
        demo = hadm_demo_dict[hadm_id]
        # Turn demographics into vocabulary tokens: gender gets a '1'
        # suffix, age is bucketed into 9-year bins with an 'A' prefix
        # -- presumably to keep them distinct from ICD codes; confirm.
        demo[0] = demo[0] + '1'
        demo[1] = 'A' + str(int(demo[1] / 9))
        icd_demo = icd_list + demo

        for icd in icd_demo:
            ehr_count_dict[icd] = ehr_count_dict.get(icd, 0) + 1

        ehr_dict = {'drug': {}, 'icd_demo': icd_demo}

        # Convert absolute 'start -- end' timestamps into hour offsets
        # from the admission time.
        for setime, drug_list in time_drug_dict.items():
            try:
                stime, etime = setime.split(' -- ')
                start_second = time_to_second(hadm_time_dict[hadm_id])
                # NOTE(review): '/ 3600' yields float offsets under
                # Python 3 (integer under Python 2; this file mixes both
                # print styles) -- confirm which string format downstream
                # readers expect.
                stime = str((time_to_second(stime) - start_second) / 3600)
                etime = str((time_to_second(etime) - start_second) / 3600)
                setime = stime + ' -- ' + etime
                for drug in drug_list:
                    ehr_count_dict[drug] = ehr_count_dict.get(drug, 0) + 1
                ehr_dict['drug'][setime] = list(set(drug_list))
            except:
                # Best-effort: entries with malformed times or a missing
                # admission time are skipped silently.
                pass

        py_op.mywritejson(os.path.join(groundtruth_dir, hadm_id + '.json'),
                          ehr_dict)
        # break
    py_op.mywritejson(os.path.join(args.data_dir, 'ehr_count_dict.json'),
                      ehr_count_dict)
# ---- Example #9 ----
def generate_feature_mm_dict():
    """Scan the ground-truth CSVs and derive per-feature statistics.

    Writes four json files under args.file_dir:
      * <dataset>_feature_mm_dict.json  -- trimmed [low, high] bounds
      * <dataset>_feature_ms_dict.json  -- [mean, std]
      * <dataset>_feature_list.json     -- header of the last CSV read
      * <dataset>_feature_value_dict_<k>.json -- k quantile split points
    """
    csv_files = sorted(
        glob(os.path.join(args.data_dir, args.dataset, 'train_groundtruth/*')))
    feature_value_dict = dict()
    for fpath in tqdm(csv_files):
        if 'csv' not in fpath:
            continue
        for line_no, raw in enumerate(open(fpath)):
            fields = raw.strip().split(',')
            if line_no == 0:
                # Header row: column names; also written out at the end.
                feat_list = fields
                continue
            for col, cell in enumerate(fields):
                if cell in ('NA', ''):
                    continue
                name = feat_list[col]
                if name not in feature_value_dict:
                    feature_value_dict[name] = []
                feature_value_dict[name].append(float(cell))

    feature_mm_dict = dict()
    feature_ms_dict = dict()
    feature_range_dict = dict()
    for name, values in feature_value_dict.items():
        values = sorted(values)
        count = len(values)
        # args.split_num equally spaced rank values, plus the maximum.
        splits = [values[int(i * count / args.split_num)]
                  for i in range(args.split_num)]
        splits.append(values[-1])
        feature_range_dict[name] = splits

        # Robust bounds: trim 1/split_num of the mass from each tail.
        trim = int(count / args.split_num)
        feature_mm_dict[name] = [values[trim], values[-trim - 1]]
        feature_ms_dict[name] = [np.mean(values), np.std(values)]

    py_op.mkdir(args.file_dir)
    py_op.mywritejson(
        os.path.join(args.file_dir, args.dataset + '_feature_mm_dict.json'),
        feature_mm_dict)
    py_op.mywritejson(
        os.path.join(args.file_dir, args.dataset + '_feature_ms_dict.json'),
        feature_ms_dict)
    py_op.mywritejson(
        os.path.join(args.file_dir, args.dataset + '_feature_list.json'),
        feat_list)
    py_op.mywritejson(
        os.path.join(
            args.file_dir, args.dataset +
            '_feature_value_dict_{:d}.json'.format(args.split_num)),
        feature_range_dict)
# ---- Example #10 ----
def wkmeans(n_cluster):
    """Run an iterative (k-means-like) clustering over a precomputed
    admission distance matrix, checkpointing assignments every 10 epochs.

    Args:
        n_cluster: number of clusters; each is seeded with 10 random
            admission indices.
    """
    subtyping_dir = os.path.join(args.result_dir, args.dataset, 'subtyping')
    hadm_id_list = py_op.myreadjson(os.path.join(subtyping_dir, 'hadm_id_list.json'))
    hadm_dist_matrix = np.load(os.path.join(subtyping_dir, 'hadm_dist_matrix.npy'))
    assert len(hadm_dist_matrix) == len(hadm_id_list)

    # initialization
    # BUGFIX: np.random.shuffle(range(...)) raises TypeError on Python 3
    # because a range object is immutable; materialize a list first.
    indices = list(range(len(hadm_id_list)))
    np.random.shuffle(indices)
    init_groups = [indices[i*10: i*10 + 10] for i in range(n_cluster)]

    groups = init_groups
    for epoch in range(100):
        groups = wkmeans_epoch(hadm_dist_matrix, groups)
        print([len(g) for g in groups])
        # Checkpoint cluster membership every 10 epochs (skipping epoch 0).
        if epoch and epoch % 10 == 0:
            cluster_results = []
            for g in groups:
                cluster_results.append([hadm_id_list[i] for i in g])
            py_op.mywritejson(os.path.join(subtyping_dir, 'cluster_results.json'), cluster_results)
# ---- Example #11 ----
def generate_drug_data():
    """Parse PRESCRIPTIONS.csv into
    {hadm_id: {'startdate -- enddate': [drug, ...]}} for admissions known
    to hadm_sid_dict, written to hadm_time_drug_dict.json."""
    hadm_sid_dict = py_op.myreadjson(
        os.path.join(args.data_dir, 'hadm_sid_dict.json'))
    hadm_id_set = set(hadm_sid_dict)
    hadm_time_drug_dict = dict()
    # BUGFIX: the CSV handle was opened and never closed; 'with' closes it.
    with open(os.path.join(args.mimic_dir, 'PRESCRIPTIONS.csv')) as csv_file:
        for i_line, line in enumerate(csv_file):
            if i_line:  # skip the header row
                if i_line % 10000 == 0:
                    print(i_line)
                line_data = [x.strip('"') for x in py_op.csv_split(line.strip())]
                # Column order assumed to match the MIMIC PRESCRIPTIONS
                # schema -- TODO confirm; rows with a different field count
                # will raise ValueError here.
                (_, SUBJECT_ID, hadm_id, _, startdate, enddate, _, drug,
                 DRUG_NAME_POE, DRUG_NAME_GENERIC, FORMULARY_DRUG_CD, gsn,
                 ndc, PROD_STRENGTH, DOSE_VAL_RX, DOSE_UNIT_RX,
                 FORM_VAL_DISP, FORM_UNIT_DISP, ROUTE) = line_data
                if len(hadm_id) and hadm_id in hadm_id_set:
                    drugs_by_time = hadm_time_drug_dict.setdefault(hadm_id, dict())
                    time = startdate + ' -- ' + enddate
                    drugs_by_time.setdefault(time, []).append(drug)
    py_op.mywritejson(os.path.join(args.data_dir, 'hadm_time_drug_dict.json'),
                      hadm_time_drug_dict)
# ---- Example #12 ----
def generate_demo():
    """Build demographic and admission-time lookup tables from MIMIC
    PATIENTS.csv and ICUSTAYS.csv, restricted to ICU stays listed in
    icu_hadm_dict.json.

    Writes icu_hadm_dict, sid_demo_dict, hadm_demo_dict, hadm_time_dict,
    sid_hadm_dict and hadm_sid_dict json files into args.data_dir.
    """
    icu_hadm_dict = py_op.myreadjson('../../src/icu_hadm_dict.json')
    py_op.mywritejson(os.path.join(args.data_dir, 'icu_hadm_dict.json'),
                      icu_hadm_dict)

    # sid -> [gender, birth_year] parsed from PATIENTS.csv
    sid_demo_dict = dict()
    sid_hadm_dict = dict()
    for i_line, line in enumerate(
            open(os.path.join(args.mimic_dir, 'PATIENTS.csv'))):
        if i_line:  # skip the header row
            data = line.split(',')
            # Column layout assumed: [ROW_ID, SUBJECT_ID, GENDER, DOB, ...]
            # -- TODO confirm against the PATIENTS.csv schema.
            sid = data[1]
            gender = data[2].replace('"', '')
            dob = data[3][:4]  # birth year only
            sid_demo_dict[sid] = [gender, int(dob)]
    py_op.mywritejson(os.path.join(args.data_dir, 'sid_demo_dict.json'),
                      sid_demo_dict)

    hadm_sid_dict = dict()
    hadm_demo_dict = dict()
    hadm_time_dict = dict()
    for i_line, line in enumerate(
            open(os.path.join(args.mimic_dir, 'ICUSTAYS.csv'))):
        if i_line:  # skip the header row
            line = line.replace('"', '')
            data = line.split(',')
            sid = data[1]
            hadm_id = data[2]
            icu_id = data[3]
            intime = data[-3]  # ICU admission timestamp ('YYYY-...')
            # Record every admission per subject, even those whose ICU
            # stay is filtered out just below.
            sid_hadm_dict[sid] = sid_hadm_dict.get(sid, []) + [hadm_id]
            if icu_id not in icu_hadm_dict:
                continue
            hadm_sid_dict[hadm_id] = sid
            gender = sid_demo_dict[sid][0]
            dob = sid_demo_dict[sid][1]
            # Age at ICU admission from year difference only.
            age = int(intime[:4]) - dob
            if age < 18:
                print(age)
            # NOTE(review): 'assert' is stripped under 'python -O'; an
            # explicit raise would be safer for input validation.
            assert age >= 18
            # Implausibly large ages are clamped to 90 -- presumably the
            # de-identified >89 age shift in MIMIC; TODO confirm.
            if age > 150:
                age = 90
            hadm_demo_dict[hadm_id] = [gender, age]
            hadm_time_dict[hadm_id] = intime
    py_op.mywritejson(os.path.join(args.data_dir, 'hadm_demo_dict.json'),
                      hadm_demo_dict)
    py_op.mywritejson(os.path.join(args.data_dir, 'hadm_time_dict.json'),
                      hadm_time_dict)
    py_op.mywritejson(os.path.join(args.data_dir, 'sid_hadm_dict.json'),
                      sid_hadm_dict)
    py_op.mywritejson(os.path.join(args.data_dir, 'hadm_sid_dict.json'),
                      hadm_sid_dict)
# ---- Example #13 ----
def compute_dist_mat():
    """Compute a pairwise DTW distance matrix over the first 24 hours of
    each admission's imputed time series and save it with the id list.

    Relies on module-level names not visible in this chunk: n_variables,
    inf, norm(), dist_func(), compute_dtw().
    """
    files = glob(
        os.path.join(args.result_dir, args.dataset,
                     'imputation_result/*.csv'))  # [:100]
    feature_ms_dict = py_op.myreadjson(
        os.path.join(args.file_dir, args.dataset + '_feature_ms_dict.json'))
    subtyping_dir = os.path.join(args.result_dir, args.dataset, 'subtyping')
    py_op.mkdir(subtyping_dir)
    hadm_id_list = []
    mean_variables = []
    hadm_variable_dict = {}  # hadm_id -> (time, n_variables) float32 array
    all_values = []

    for i_fi, fi in enumerate(tqdm(files)):
        # Admission id is the csv file stem.
        hadm_id = fi.split('/')[-1].split('.')[0]
        hadm_data = []
        for i_line, line in enumerate(open(fi)):
            if i_line:
                line_data = line.strip().split(',')
                line_data = np.array([float(x) for x in line_data])
                if len(line_data) != n_variables + 1:
                    print(i_fi, fi)
                # First column is the hour offset; keep rows in [0, 24).
                if line_data[0] < 0:
                    continue
                elif line_data[0] < 24:
                    hadm_data.append(line_data)
                else:
                    # Rows are assumed time-sorted, so we can stop early
                    # -- TODO confirm.
                    break
            else:
                head = line.strip().split(',')[1:]
                assert len(head) == n_variables

        values = np.array(hadm_data, dtype=np.float32)
        # NOTE(review): if hadm_data is empty this array is 1-D and the
        # slicing below raises -- confirm every file has in-range rows.
        values = values[-24:]
        times = values[:, 0]
        values = values[:, 1:]  # drop the time column

        assert len(values.shape) == 2
        assert values.shape[1] == n_variables

        hadm_variable_dict[hadm_id] = values
        hadm_id_list.append(hadm_id)
        all_values.append(values)

    # Global per-variable mean/std used to normalize every series.
    all_values = np.concatenate(all_values, 0)
    ms = [all_values.mean(0), all_values.std(0)]

    # Distance matrix; -1 marks "not yet computed".
    hadm_dist_matrix = np.zeros((len(hadm_id_list), len(hadm_id_list))) - 1
    for i in tqdm(range(len(hadm_id_list))):
        hadm_dist_matrix[i, i] = 0
        for j in range(i + 1, len(hadm_id_list)):
            # j starts at i + 1, so the 'i == j' guard is redundant here.
            if hadm_dist_matrix[i, j] >= 0 or i == j:
                continue
            s1 = hadm_variable_dict[hadm_id_list[i]]
            s2 = hadm_variable_dict[hadm_id_list[j]]
            s1 = norm(s1, ms)
            s2 = norm(s2, ms)
            dist_mat = dist_func(s1, s2)
            # DTW path table seeded with a large negative sentinel;
            # compute_dtw presumably writes hadm_dist_matrix[i, j] in
            # place -- confirm against its definition.
            path = np.zeros([dist_mat.shape[0], dist_mat.shape[1], 3
                             ]) - inf - 1
            compute_dtw(dist_mat, path, hadm_dist_matrix, i, j)

    py_op.mywritejson(os.path.join(subtyping_dir, 'hadm_id_list.json'),
                      hadm_id_list)
    np.save(os.path.join(subtyping_dir, 'hadm_dist_matrix.npy'),
            hadm_dist_matrix)