Exemplo n.º 1
0
def check_interpolation_and_subgraphs():
    """Run subgraph mining for a single fold into an observer folder.

    Builds a ``run.Experiment`` for seed 2222 / raw features / MICE data with
    the ``cz`` standardization, prints the train and test CSV paths of fold 2
    and calls ``subgraph_mining`` with frequency threshold ``'011'``, passing
    the observer folder as ``cfolder``.
    """
    seed = 2222
    ft = 'raw'
    minp = 0.5
    minc = 0.6

    # Switch both values together ("z" / False) to use plain z-scores.
    standardize_method = "cz"
    is_cz = True

    freq_list = [
        '001', '002', '003', '004', '005', '006', '008', '009', '01', '011'
    ]
    freq_to_trainFreq_map = {
        '001': '001',
        '002': '002',
        '003': '004',
        '004': '005',
        '005': '007',
        '006': '008',
        '008': '01',
        '009': '011',
        '01': '013',
        '011': '014'
    }
    nel_graph_length = 13

    # The same five values parameterize the observer and experiment folders.
    path_args = (seed, ft, minp, minc, standardize_method)

    fout = '../observer/check_interpolation_and_subgraphs/seed%s_%s_mice_mp%s_mc%s_%s' % path_args
    cu.checkAndCreate(fout)

    experiment_dir = '../data/seed%s/%s/mice/mp%s_mc%s/%s' % path_args
    cu.checkAndCreate(experiment_dir)
    dataset_dir = '../data/seed%s/%s/mice/mp%s_mc%s/dataset' % (seed, ft, minp,
                                                                 minc)
    e = run.Experiment(experiment_dir, dataset_dir, seed, is_cz,
                       standardize_method, freq_list, freq_to_trainFreq_map,
                       nel_graph_length)

    foldi = 2

    fold_args = (e.dataset_folder, foldi, e.standardize_method)
    train = e.ftrain % fold_args
    test = e.ftest % fold_args

    print(train)
    print(test)

    ftrnel = "%s/mimic_train_fold%d.nel" % (fout, foldi)
    ftrnode = "%s/mimic_train_fold%d.node" % (fout, foldi)
    fnel = "%s/mimic_fold%d.nel" % (fout, foldi)
    fnode = "%s/mimic_fold%d.node" % (fout, foldi)

    # The interpolation step is disabled here; presumably the .nel files were
    # produced by an earlier run of:
    # e.interpolation(trcsv=train, tecsv=test, ftrnel=ftrnel, ftrnode=ftrnode, fnel=fnel, fnode=fnode)
    e.subgraph_mining(tr_nel=ftrnel,
                      tr_te_nel=fnel,
                      freq_t='011',
                      foldi=foldi,
                      cfolder=fout)
Exemplo n.º 2
0
def split_test_by_patient(cdn, out_folder, suffix=''):
    """Helper for MICE imputation: split each test fold into per-patient CSVs.

    For folds 0-4 this reads ``<cdn>/test_fold<i><suffix>.csv``, groups the
    rows by the ``sid`` column and writes each group (without the index) to
    ``<out_folder>/test_fold<i>/<sid>.csv``.  The ids are also listed, one per
    line, in ``<out_folder>/test_fold<i>/sid_list.txt``.

    Args:
        cdn: folder containing the ``test_fold<i><suffix>.csv`` files.
        out_folder: folder under which one ``test_fold<i>`` directory per
            fold is created.
        suffix: optional text inserted before ``.csv`` in the input names.
    """
    for i in range(5):
        test = pd.read_csv('%s/test_fold%d%s.csv' % (cdn, i, suffix))
        fold_dir = '%s/test_fold%d' % (out_folder, i)
        cu.checkAndCreate(fold_dir)
        # The context manager closes the id list even if a write fails.
        with open('%s/sid_list.txt' % fold_dir, 'w') as fn:
            for sid, group in test.groupby('sid'):
                # '%d' formatting requires sid to be numeric.
                group.to_csv('%s/%d.csv' % (fold_dir, sid), index=False)
                fn.write('%d\n' % sid)
Exemplo n.º 3
0
def split_by_feature_type(cdn, fn_prefix):
    """Write the raw-feature subset of every train/test fold.

    For folds 0-4 this reads ``<fn_prefix>_train_fold<i>.csv`` and
    ``<fn_prefix>_test_fold<i>.csv``, keeps only the columns listed in the
    module-level ``raw_features_for_classify`` and writes the result (without
    the index) to ``<cdn>/raw/train_fold<i>.csv`` and
    ``<cdn>/raw/test_fold<i>.csv``.

    Only the raw set is produced; no standardized (``z``) set is written.

    Args:
        cdn: output root; the ``raw`` sub-folder is created under it.
        fn_prefix: path prefix of the per-fold input CSV files.
    """
    print('split_by_feature_type')
    # The output folder does not depend on the fold, so create it once.
    cu.checkAndCreate('%s/raw/' % cdn)
    for i in range(5):
        training = pd.read_csv('%s_train_fold%d.csv' % (fn_prefix, i))
        testing = pd.read_csv('%s_test_fold%d.csv' % (fn_prefix, i))
        raw_train = training[raw_features_for_classify]
        raw_test = testing[raw_features_for_classify]
        raw_train.to_csv('%s/raw/train_fold%d.csv' % (cdn, i), index=False)
        raw_test.to_csv('%s/raw/test_fold%d.csv' % (cdn, i), index=False)
Exemplo n.º 4
0
def split_by_feature_type(cdn, fn_prefix, raw_colname, z_colname):
    """Split every fold into a raw-column set and a z-column set.

    For folds 0-4 the train and test CSVs at ``<fn_prefix>_train_fold<i>.csv``
    and ``<fn_prefix>_test_fold<i>.csv`` are read, restricted once to
    ``raw_colname`` and once to ``z_colname``, and written (without the index)
    under ``<cdn>/raw/`` and ``<cdn>/z/`` respectively.
    """
    for fold in range(5):
        train_df = pd.read_csv('%s_train_fold%d.csv' % (fn_prefix, fold))
        test_df = pd.read_csv('%s_test_fold%d.csv' % (fn_prefix, fold))

        # Select all four subsets before touching the output folders.
        outputs = [
            ('%s/raw/train_fold%d.csv', train_df[raw_colname]),
            ('%s/raw/test_fold%d.csv', test_df[raw_colname]),
            ('%s/z/train_fold%d.csv', train_df[z_colname]),
            ('%s/z/test_fold%d.csv', test_df[z_colname]),
        ]

        for folder_pattern in ('%s/raw/', '%s/z/'):
            cu.checkAndCreate(folder_pattern % cdn)

        for path_pattern, subset in outputs:
            subset.to_csv(path_pattern % (cdn, fold), index=False)
Exemplo n.º 5
0
    def nmfClassfyExperiments(l):
        """Sweep the ``nc`` setting of the NMF classifier and write a report.

        For each ``isg`` in (0, 3) and each frequency threshold, every ``nc``
        in 10..120 (step 10) is run through ``nmfclassify`` on folds 0-4 and
        summarised with ``mean_auc``.  Per threshold the report records the
        ``nc`` with the highest ``best_auc``, that AUC, and the largest
        ``highest_tr_auc`` seen over all ``nc`` values (not necessarily the
        one belonging to the reported ``nc``).

        The report is written to
        ``../data/mice/nmfresult_mp<minp>_mc<minc>.txt``.

        Args:
            l: indexable whose first two items are ``minp`` and ``minc``.
        """
        minp = l[0]
        minc = l[1]
        # Single formatted argument prints the same on Python 2 and 3.
        print('%s %s' % (minp, minc))

        # Collect the pieces and join once instead of repeated string +=.
        report = []
        for isg in [0, 3]:
            report.append('isg %d:\n' % isg)

            cu.checkAndCreate('%s/isg%d/nmf_piks' % (self.cdn, isg))

            for freq_t in [
                    '001', '002', '003', '004', '005', '006', '008', '009',
                    '01', '011'
            ]:
                report.append('freq_t %s:\n' % freq_t)
                bauc = 0.
                htauc = 0.
                bnc = 0
                for nc in [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]:
                    for foldi in range(5):
                        nmfclassify(isg,
                                    freq_t,
                                    2,
                                    foldi,
                                    nc=nc,
                                    minp=minp,
                                    minc=minc)
                    best_auc, highest_tr_auc = mean_auc(isg,
                                                        freq_t,
                                                        2,
                                                        nc=nc,
                                                        nmf=True)
                    if best_auc > bauc:
                        bnc = nc
                        bauc = best_auc
                    htauc = max(htauc, highest_tr_auc)
                report.append('nc %s: %s\n%s\n' % (bnc, bauc, htauc))
            report.append('\n')

        # The context manager closes the file even if the write fails.
        with open('../data/mice/nmfresult_mp%s_mc%s.txt' % (minp, minc),
                  'w') as fn:
            fn.write(''.join(report))
Exemplo n.º 6
0
    def tuneSGParamForClassification(nmf=False):
        """Tune classifier parameters per ``isg`` and write a summary file.

        For each ``isg`` in (0, 3) and each frequency threshold the prediction
        matrices are read and passed to ``tuneCLFParamForClassification``.
        With ``nmf=True`` that call is repeated for every ``nc`` in 10..120
        (step 10) and the result with the highest AUC is kept.

        One line ``isg <n>: <auc> (<params>)`` per ``isg`` is written to
        ``<cdn>/nmfResult.txt`` when ``nmf`` is true, else to
        ``<cdn>/dirResult.txt``.

        Args:
            nmf: whether to tune the NMF variant (also creates the
                ``nmf_piks`` folder for each ``isg``).
        """
        output = ''
        for isg in [0, 3]:
            output += 'isg %d: ' % isg
            if nmf:
                cu.checkAndCreate('%s/isg%d/nmf_piks' % (self.cdn, isg))

            for freq_t in [
                    '001', '002', '003', '004', '005', '006', '008', '009',
                    '01', '011'
            ]:

                prediction_matrics = read_prediction_matrics(isg, freq_t)

                if nmf:
                    bauc = 0.
                    bparams = ''
                    for nc in [
                            10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120
                    ]:
                        # The third returned value was only ever stored in a
                        # never-read (and mis-initialised) local; discard it.
                        (tbauc, tbparams,
                         _unused) = tuneCLFParamForClassification(
                             l, prediction_matrics, nmf=True, nc=nc)
                        if tbauc > bauc:
                            bauc = tbauc
                            bparams = tbparams
                else:
                    (bauc, bparams,
                     _unused) = tuneCLFParamForClassification(
                         l, prediction_matrics, nmf=False)

            # NOTE(review): this line runs once per isg, after the freq_t
            # loop, so only the LAST frequency threshold's result is reported
            # (bauc/bparams are overwritten on each threshold). Confirm
            # whether the best over all thresholds was intended.
            output += '%f (%s)\n' % (bauc, bparams)

        # NOTE(review): the folder above comes from self.cdn while the result
        # path uses the free name cdn -- confirm they refer to the same place.
        if nmf:
            result_path = '%s/nmfResult.txt' % (cdn)
        else:
            result_path = '%s/dirResult.txt' % (cdn)
        # The context manager closes the file even if the write fails.
        with open(result_path, 'w') as fn:
            fn.write(output)
Exemplo n.º 7
0
    def run(feature_type, minp, minc):
        """Run interpolation, subgraph mining and file generation on 5 folds.

        Creates the working folders under ``self.cdn`` for ``isg`` 0 and 3,
        then for each fold: interpolates the train/test CSVs into .nel/.node
        files, refreshes the frequency map, mines subgraphs for every
        frequency threshold and generates the patient/subgraph files for both
        ``isg`` values.

        The three parameters are not read by the body; the folder comes from
        ``self.cdn``.
        """
        freq_thresholds = [
            '001', '002', '003', '004', '005', '006', '008', '009', '01', '011'
        ]
        isg_values = [0, 3]

        cu.checkAndCreate(self.cdn)
        for isg in isg_values:
            for folder_pattern in ('%s/isg%d', '%s/isg%d/pt_sg_w',
                                   '%s/isg%d/res'):
                cu.checkAndCreate(folder_pattern % (self.cdn, isg))

        for foldi in range(5):
            fold_args = (self.cdn, foldi)

            train = self.ftrain % fold_args
            test = self.ftest % fold_args

            ftrnel = "%s/mimic_train_fold%d.nel" % fold_args
            ftrnode = "%s/mimic_train_fold%d.node" % fold_args
            fnel = "%s/mimic_fold%d.nel" % fold_args
            fnode = "%s/mimic_fold%d.node" % fold_args

            interpolation(trcsv=train,
                          tecsv=test,
                          ftrnel=ftrnel,
                          ftrnode=ftrnode,
                          fnel=fnel,
                          fnode=fnode)

            get_freq_to_trainFreq_map(foldi)

            for freq_t in freq_thresholds:
                subgraph_mining(tr_nel=ftrnel,
                                tr_te_nel=fnel,
                                freq_t=freq_t,
                                foldi=foldi)

                for isg in isg_values:
                    gen_pt_sg_files(isg, freq_t, foldi)
Exemplo n.º 8
0
def run_best_model(cdn):
    """Run preprocessing, the temporal model and the baseline for one setup.

    Everything is rooted at ``<cdn>/seed2222`` with raw features and ``z``
    standardization.  The steps are:

    1. ``pp.split_nfolds`` / ``pp.split_by_feature_type`` on
       ``<cdn>/alldata_readmit.csv``.
    2. Per fold (0-4): interpolation imputation, mean imputation and
       standardization through the ``pp`` helpers.
    3. Temporal model via ``rn.Experiment``: interpolation, subgraph mining
       and patient/subgraph file generation for ``isg`` 0.
    4. Baseline: last measurements per fold, then ``rfe`` and ``lr``; the
       selected features and the baseline AUC are printed.
    5. ``nmfClassify_ob`` per fold with the fixed settings below; the mean
       AUCs are printed and the per-fold results are pickled under
       ``<model folder>/isg0/res``.

    Args:
        cdn: root data folder.  All paths are derived from it; with
            ``cdn='../data'`` they match the paths that used to be
            hard-coded in the last steps.
    """
    ft = 'raw'
    seed = 2222
    standardize_method = 'z'
    is_cz = False

    # Shared path prefixes, built once instead of re-formatted at every use.
    seed_dir = '%s/seed%d' % (cdn, seed)
    raw_dir = '%s/%s' % (seed_dir, ft)
    interp_dir = '%s/interp' % raw_dir
    dataset_dir = '%s/mean/dataset' % interp_dir
    model_dir = '%s/mean/%s' % (interp_dir, standardize_method)
    last_dir = '%s/mean/last_measures' % interp_dir
    last_dataset_dir = '%s/dataset' % last_dir

    cu.checkAndCreate(seed_dir)
    pp.split_nfolds('%s/alldata_readmit.csv' % cdn,
                    '%s/alldata_readmit' % seed_dir,
                    shuffle=True,
                    seed=seed)
    pp.split_by_feature_type(cdn=seed_dir,
                             fn_prefix='%s/alldata_readmit' % seed_dir)

    cu.checkAndCreate(interp_dir)
    cu.checkAndCreate(dataset_dir)
    for i in range(5):
        # Same order as before: the test fold is interpolated first.
        for split in ('test', 'train'):
            pp.impute_by_interpolation_on_last12h(
                '%s/%s_fold%d.csv' % (raw_dir, split, i),
                '%s/%s_fold%d.csv' % (interp_dir, split, i),
                '%s/extrapolation_log_%s_fold%d.txt' % (interp_dir, split, i))
        mean_train = '%s/train_fold%d.csv' % (dataset_dir, i)
        mean_test = '%s/test_fold%d.csv' % (dataset_dir, i)
        pp.impute_by_mean('%s/train_fold%d.csv' % (interp_dir, i),
                          '%s/test_fold%d.csv' % (interp_dir, i), mean_train,
                          mean_test)
        pp.standardize_data(
            mean_train, mean_test,
            '%s/train_fold%d_%s.csv' % (dataset_dir, i, standardize_method),
            '%s/test_fold%d_%s.csv' % (dataset_dir, i, standardize_method))

    # run temporal model
    freq_list = ['011']
    freq_to_trainFreq_map = {'011': '014'}
    nel_graph_length = 13

    cu.checkAndCreate(model_dir)
    e = rn.Experiment(model_dir, dataset_dir, seed, is_cz, standardize_method,
                      freq_list, freq_to_trainFreq_map, nel_graph_length)

    # Fixed settings of the model being evaluated.
    isg = 0
    freq_t = '011'
    nc = 110
    c = 2
    pl = 'l1'
    cw = 'balanced'
    ntestth = 2

    for sub_folder in ('', '/pt_sg_w', '/res', '/nmf_piks'):
        cu.checkAndCreate('%s/isg%d%s' % (e.cdn, isg, sub_folder))

    for foldi in range(5):

        train = e.ftrain % (e.dataset_folder, foldi, e.standardize_method)
        test = e.ftest % (e.dataset_folder, foldi, e.standardize_method)

        print(train)
        print(test)

        ftrnel = "%s/mimic_train_fold%d.nel" % (e.cdn, foldi)
        ftrnode = "%s/mimic_train_fold%d.node" % (e.cdn, foldi)
        fnel = "%s/mimic_fold%d.nel" % (e.cdn, foldi)
        fnode = "%s/mimic_fold%d.node" % (e.cdn, foldi)

        e.interpolation(trcsv=train,
                        tecsv=test,
                        ftrnel=ftrnel,
                        ftrnode=ftrnode,
                        fnel=fnel,
                        fnode=fnode)

        e.get_freq_to_trainFreq_map(foldi)

        # A separate loop name keeps the chosen ``freq_t`` above intact for
        # the classification step further down.
        for mined_freq_t in e.moss_freq_threshold_list:
            e.subgraph_mining(tr_nel=ftrnel,
                              tr_te_nel=fnel,
                              freq_t=mined_freq_t,
                              foldi=foldi)

            e.gen_pt_sg_files(isg, mined_freq_t, foldi)

    cu.checkAndCreate(last_dataset_dir)
    # run baseline model
    for i in range(5):
        for split in ('train', 'test'):
            fold_file = '%s_fold%d_%s.csv' % (split, i, standardize_method)
            pp.get_last_measurements('%s/%s' % (dataset_dir, fold_file),
                                     '%s/%s' % (last_dataset_dir, fold_file))

    best_features = rfe(last_dir, 50, standardize_method, 5, 'l1', 'balanced')
    print(best_features)

    # Feature list kept from the original source (presumably the outcome of
    # an earlier rfe run -- confirm before relying on it):
    # best_features = ['urineByHrByWeight', 'HCT', 'INR', 'Platelets', 'RBC',
    # 'DeliveredTidalVolume', 'PlateauPres', 'RAW', 'RSBI', 'mDBP', 'CV_HR',
    # 'Art_BE', 'Art_CO2', 'Art_PaCO2', 'Art_pH', 'Cl', 'Mg', 'Anticoagulant',
    # 'beta.Blocking_agent', 'Somatostatin_preparation', 'Vasodilating_agent',
    # 'AIDS', 'MetCarcinoma']

    baseline_auc = lr(last_dir, standardize_method, 5, 'l1', 'balanced', 50)
    print('baseline AUC: %s' % baseline_auc)

    res_list = []
    for foldi in range(5):
        # Derived from cdn/seed rather than a hard-coded '../data/seed2222'.
        fnaddtr = '%s/train_fold%d_%s.csv' % (last_dataset_dir, foldi,
                                              standardize_method)
        fnaddte = '%s/test_fold%d_%s.csv' % (last_dataset_dir, foldi,
                                             standardize_method)
        prediction_matrics = e.read_prediction_matrics(isg, freq_t)
        nmf_pik = '%s/isg%d/nmf_piks/nmf_%s_fold%d_%d.pik' % (
            e.cdn, isg, freq_t, foldi, nc)
        (res, gt_te, pt_te, res_baseline) = e.nmfClassify_ob(
            prediction_matrics['ptsg'][foldi],
            prediction_matrics['ptwd'][foldi],
            prediction_matrics['sgs'][foldi], prediction_matrics['pt'][foldi],
            prediction_matrics['gt'][foldi], nmf_pik, ntestth, foldi, nc, c,
            pl, cw, fnaddtr, fnaddte, best_features)
        res_list.append(res)

    (auc, tr_auc) = e.get_mean_auc(res_list)
    # Single formatted argument prints the same on Python 2 and 3.
    print('%s %s' % (auc, tr_auc))

    # Same location as the former hard-coded '.../mean/z/isg0/res' when
    # cdn is '../data'.
    res_dir = '%s/isg%d/res' % (model_dir, isg)
    for i, fold_res in enumerate(res_list):
        with open('%s/c_pre_te_fold%d' % (res_dir, i), 'wb') as f:
            pickle.dump(fold_res['c_pre_te'], f)
        with open('%s/res_fold%d' % (res_dir, i), 'wb') as f:
            pickle.dump(fold_res, f)