Example #1
0
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold

from my_utils import data_util

if __name__ == '__main__':
    subtype = 'he'
    # hold_out_round = 1
    for hold_out_round in range(0, 10, 1):
        if subtype == 'is':
            sub_class = 'ischemic'
        else:
            sub_class = 'hemorrhagic'
        id_train_all, x_train_all, y_train_all = data_util.get_poor_god('training_' + subtype + '_' + str(hold_out_round) + '.csv', sub_class=sub_class)
        feature_names = x_train_all.columns.values
        forest = ExtraTreesClassifier()
        kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=hold_out_round)
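        # 10-fold stratified CV inside each hold-out round; feature
        # importances are ranked independently for every fold.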
        for index, (train, test) in enumerate(kfold.split(x_train_all, y_train_all)):
            x_train = data_util.scale(x_train_all.iloc[train])
            y_train = y_train_all.iloc[train]
            forest.fit(x_train, y_train)
            importances = forest.feature_importances_
            std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
            indices = np.argsort(importances)[::-1]
            # Print the feature ranking
            imp = []
            print("Feature ranking:")
            for i in range(x_train.shape[1]):
                print("%d. feature %d (%f) %s" % (i + 1, indices[i], importances[indices[i]], feature_names[indices[i]]))
                imp.append(importances[indices[i]])
            importance_df = pd.DataFrame(data={'f_index': indices, 'score': imp})
            importance_df.sort_values(by=['f_index'], inplace=True)
            importance_df.reset_index(drop=True, inplace=True)
            if index == 0:
Example #2
0
    #                                to_categorical(y_data.iloc[train]),
    #                                parameter)
    #     history_array.append(history)
    #     loss, acc = model.evaluate(data_util.scale(x_data.iloc[test]),
    #                                to_categorical(y_data.iloc[test]),
    #                                verbose=0)
    #     test_acc_array.append(acc)
    #     test_loss_array.append(loss)
    # ====== Binary
    id_data, x_data, y_data = data_util.get_poor_god(
        'wholeset_Jim_nomissing_validated.csv')
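    # Per fold: the binary MLP is trained only on the CNN-branch features
    # returned by split_cnn_mlp_input; the MLP-branch features are not used
    # in this snippet.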
    for index, (train, test) in enumerate(kfold.split(x_data, y_data)):
        x_train_cnn, x_train_mlp = data_util.split_cnn_mlp_input(
            x_data.iloc[train])
        # x_train = data_util.kera_feature(x_data.iloc[train])
        history, model = mlp_binary(data_util.scale(x_train_cnn),
                                    to_categorical(y_data.iloc[train]),
                                    parameter)
        history_array.append(history)

        x_test_cnn, x_test_mlp = data_util.split_cnn_mlp_input(
            x_data.iloc[test])
        # x_test = data_util.kera_feature(x_data.iloc[test])
        loss, acc = model.evaluate(data_util.scale(x_test_cnn),
                                   to_categorical(y_data.iloc[test]),
                                   verbose=0)
        test_acc_array.append(acc)
        test_loss_array.append(loss)

        y_pred = model.predict(data_util.scale(x_test_cnn))
        # predict_array.append(y_pred, y_data.iloc[test])
Example #3
0
        id_data, x_data, y_data = data_util.get_poor_god(
            'wholeset_Jim_nomissing_validated_fs.csv')
        model_name = 'svm_2c_fs'
    else:
        id_data, x_data, y_data = data_util.get_poor_god(
            'wholeset_Jim_nomissing_validated_fs.csv')
        model_name = 'svm_2c_fe'

    kfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
    classifier = SVC(kernel='linear',
                     probability=True,
                     random_state=seed,
                     verbose=True)
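    # Per fold: scale train and test separately, optionally append t-SNE
    # features (experiment 2), fit the linear SVM, then record class
    # probabilities alongside the id rows.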
    for index, (train, test) in enumerate(kfold.split(x_data, y_data)):
        # Training
        x_train = data_util.scale(x_data.iloc[train])
        if experiment == 2:
            x_train = tsne.tsne_features_add(x_train, seed)
        classifier.fit(x_train, y_data.iloc[train])

        # Testing
        x_test = data_util.scale(x_data.iloc[test])
        if experiment == 2:
            x_test = tsne.tsne_features_add(x_test, seed)

        # Evaluation
        # Explicit copy avoids SettingWithCopyWarning when adding columns.
        predict_result_train = id_data.iloc[train].copy()
        train_probas = classifier.predict_proba(x_train)
        predict_result_train['label'] = y_data.iloc[train]
        predict_result_train['0'] = train_probas[:, 0]
        predict_result_train['1'] = train_probas[:, 1]
Example #4
0
    elif experiment == 1:
        id_data, x_data, y_data = data_util.get_poor_god(
            'wholeset_Jim_nomissing_validated_fs.csv')
        model_name = 'rf_2c_fs'
    else:
        id_data, x_data, y_data = data_util.get_poor_god(
            'wholeset_Jim_nomissing_validated_fs.csv')
        model_name = 'rf_2c_fe'

    kfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
    rf = RandomForestClassifier(n_estimators=100,
                                criterion='entropy',
                                random_state=seed)
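    # Per fold: scale, optionally append t-SNE features (experiment 2), fit
    # the random forest, and record train-fold class probabilities.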
    for index, (train, test) in enumerate(kfold.split(x_data, y_data)):
        # Training
        x_train = data_util.scale(x_data.iloc[train])
        if experiment == 2:
            x_train = tsne.tsne_features_add(x_train, seed)
        # x_train is already scaled above, so fit on it directly (avoid double scaling).
        rf.fit(x_train, y_data.iloc[train])

        # Testing
        x_test = data_util.scale(x_data.iloc[test])
        if experiment == 2:
            x_test = tsne.tsne_features_add(x_test, seed)

        # Evaluation
        predict_result_train = id_data.iloc[train].copy()
        train_probas = rf.predict_proba(x_train)
        predict_result_train['label'] = y_data.iloc[train]
        predict_result_train['0'] = train_probas[:, 0]
        predict_result_train['1'] = train_probas[:, 1]
Example #5
0
    test_loss_array = []
    kfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
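    # Per fold: split features into CNN and MLP branches, optionally apply
    # feature selection (experiments 1 and 2) and t-SNE augmentation
    # (experiment 2), then scale both branches before training.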

    for index, (train, test) in enumerate(kfold.split(x_data, y_data)):
        # training
        x_train_cnn, x_train_mlp = data_util.split_cnn_mlp_input(
            x_data.iloc[train])
        if experiment == 1:
            x_train_cnn, x_train_mlp = data_util.selected_cnn_mlp_input(
                x_train_cnn, x_train_mlp, selected_features)
        elif experiment == 2:
            x_train_cnn, x_train_mlp = data_util.selected_cnn_mlp_input(
                x_train_cnn, x_train_mlp, selected_features)
            x_train_mlp = tsne.tsne_features_add(x_train_mlp, seed)

        x_train_cnn = data_util.scale(x_train_cnn)
        x_train_mlp = data_util.scale(x_train_mlp)
        history, model = mlp_cnn_binary(x_train_cnn, x_train_mlp,
                                        to_categorical(y_data.iloc[train]),
                                        parameter, index)
        performance_util.save_train_validation(
            save_path + parameter['model_name'], history, 'acc', str(index))

        # Testing
        x_test_cnn, x_test_mlp = data_util.split_cnn_mlp_input(
            x_data.iloc[test])
        if experiment == 1:
            x_test_cnn, x_test_mlp = data_util.selected_cnn_mlp_input(
                x_test_cnn, x_test_mlp, selected_features)
        elif experiment == 2:
            x_test_cnn, x_test_mlp = data_util.selected_cnn_mlp_input(
Example #6
0
def do_svm(hold_out_round, sub_class, experiment):
    np.random.seed(hold_out_round)
    if sub_class == 'ischemic':
        id_train_all, x_train_all, y_train_all = data_util.get_poor_god_downsample(
            'training_is_' + str(hold_out_round) + '.csv', sub_class=sub_class)
        id_hold, x_hold, y_hold = data_util.get_poor_god(
            'hold_is_' + str(hold_out_round) + '.csv', sub_class=sub_class)
    else:
        id_train_all, x_train_all, y_train_all = data_util.get_poor_god_downsample(
            'training_he_' + str(hold_out_round) + '.csv', sub_class=sub_class)
        id_hold, x_hold, y_hold = data_util.get_poor_god(
            'hold_he_' + str(hold_out_round) + '.csv', sub_class=sub_class)
    # Feature columns dropped in the *_nf experiments.
    nf_columns = [
        'FLU_ID_1_1.0', 'FLU_ID_1_2.0', 'FLU_ID_1_3.0', 'FLU_ID_1_4.0',
        'FLU_ID_1_5.0', 'FLU_ID_1_6.0', 'VERS_1', 'VEIHD_1', 'MRS_TX_1'
    ]
    if experiment == 0:
        save_path = '..' + os.sep + 'result' + os.sep + 'svm' + os.sep + 'all' + os.sep
        model_name = 'svm_' + sub_class + '_h_' + str(hold_out_round)
    elif experiment == 1:
        x_train_all = data_util.feature_selection(x_train_all, sub_class)
        x_hold = data_util.feature_selection(x_hold, sub_class)
        save_path = '..' + os.sep + 'result' + os.sep + 'svm' + os.sep + 'fs' + os.sep
        model_name = 'svm_fs_' + sub_class + '_h_' + str(hold_out_round)
    elif experiment == 2:
        x_train_all = x_train_all.drop(nf_columns, errors='ignore', axis=1)
        x_hold = x_hold.drop(nf_columns, errors='ignore', axis=1)
        save_path = '..' + os.sep + 'result' + os.sep + 'svm' + os.sep + 'all_nf' + os.sep
        model_name = 'svm_nf_' + sub_class + '_h_' + str(hold_out_round)
    else:
        x_train_all = data_util.feature_selection(x_train_all, sub_class)
        x_train_all = x_train_all.drop(nf_columns, errors='ignore', axis=1)
        x_hold = data_util.feature_selection(x_hold, sub_class)
        x_hold = x_hold.drop(nf_columns, errors='ignore', axis=1)
        save_path = '..' + os.sep + 'result' + os.sep + 'svm' + os.sep + 'fs_nf' + os.sep
        model_name = 'svm_fs_nf_' + sub_class + '_h_' + str(hold_out_round)
    #
    test_acc_array = []
    kfold = StratifiedKFold(n_splits=10,
                            shuffle=True,
                            random_state=hold_out_round)
    classifier = SVC(kernel='linear',
                     probability=True,
                     random_state=hold_out_round,
                     verbose=True)
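    # 10-fold CV: every fold model is saved; the fold with the best test
    # accuracy is reloaded afterwards for hold-out evaluation.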
    for index, (train, test) in enumerate(kfold.split(x_train_all,
                                                      y_train_all)):
        # Training
        x_train = data_util.scale(x_train_all.iloc[train])
        y_train = y_train_all.iloc[train]
        # Testing
        x_test = data_util.scale(x_train_all.iloc[test])
        y_test = y_train_all.iloc[test]
        # train on 90% training
        classifier.fit(x_train, y_train)
        # Explicit copy avoids SettingWithCopyWarning when adding columns.
        predict_result_train = id_train_all.iloc[train].copy()
        train_probas = classifier.predict_proba(x_train)
        predict_result_train['label'] = y_train
        predict_result_train['0'] = train_probas[:, 0]
        predict_result_train['1'] = train_probas[:, 1]
        predict_result_train.to_csv(save_path + model_name + '_train_cv' +
                                    str(index) + '.csv',
                                    sep=',',
                                    encoding='utf-8')
        # Evaluation on 10% training
        predict_result_test = id_train_all.iloc[test].copy()
        test_probas = classifier.predict_proba(x_test)
        predict_result_test['label'] = y_test
        predict_result_test['0'] = test_probas[:, 0]
        predict_result_test['1'] = test_probas[:, 1]
        predict_result_test.to_csv(save_path + model_name + '_test_cv' +
                                   str(index) + '.csv',
                                   sep=',',
                                   encoding='utf-8')
        test_acc = accuracy_score(y_test, classifier.predict(x_test))
        test_acc_array.append(test_acc)
        performance_util.save_model(classifier, model_name + '_' + str(index))
    print('10-CV Done')
    # --
    best_model_inx = test_acc_array.index(max(test_acc_array))
    hold_model = performance_util.load_ml_model(model_name, best_model_inx)
    x_hold = data_util.scale(x_hold)
    # Copy so the original id_hold frame is not modified in place.
    predict_result_hold = id_hold.copy()
    holdout_probas = hold_model.predict_proba(x_hold)
    predict_result_hold['label'] = y_hold
    predict_result_hold['0'] = holdout_probas[:, 0]
    predict_result_hold['1'] = holdout_probas[:, 1]
    predict_result_hold.to_csv(save_path + model_name + '_hold.csv',
                               sep=',',
                               encoding='utf-8')
    print('hold-out Done')
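# A minimal driver sketch (an assumption, not part of the original snippet):
# sweep the ten hold-out rounds, both stroke sub-classes, and the four
# experiment variants handled by do_svm above.
if __name__ == '__main__':
    for hold_out_round in range(10):
        for sub_class in ('ischemic', 'hemorrhagic'):
            for experiment in (0, 1, 2, 3):
                do_svm(hold_out_round, sub_class, experiment)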
Example #7
0
def do_mlp_cnn(hold_out_round, sub_class, experiment):
    np.random.seed(hold_out_round)
    if sub_class == 'ischemic':
        id_train_all, x_train_all, y_train_all = data_util.get_poor_god_downsample(
            'training_is_' + str(hold_out_round) + '.csv', sub_class=sub_class)
        id_hold, x_hold, y_hold = data_util.get_poor_god(
            'hold_is_' + str(hold_out_round) + '.csv', sub_class=sub_class)
    else:
        id_train_all, x_train_all, y_train_all = data_util.get_poor_god_downsample(
            'training_he_' + str(hold_out_round) + '.csv', sub_class=sub_class)
        id_hold, x_hold, y_hold = data_util.get_poor_god(
            'hold_he_' + str(hold_out_round) + '.csv', sub_class=sub_class)
    # Feature columns dropped in the *_nf experiments.
    nf_columns = [
        'FLU_ID_1_1.0', 'FLU_ID_1_2.0', 'FLU_ID_1_3.0', 'FLU_ID_1_4.0',
        'FLU_ID_1_5.0', 'FLU_ID_1_6.0', 'VERS_1', 'VEIHD_1', 'MRS_TX_1'
    ]
    if experiment == 0:
        save_path = '..' + os.sep + 'result' + os.sep + 'mlp_cnn' + os.sep + 'all' + os.sep
        parameter = {
            'model_name': 'mlp_cnn_' + sub_class + '_h_' + str(hold_out_round),
            'size_of_batch': 56,
            'nb_epoch': 150,
            'drop_rate': 0.5
        }
    elif experiment == 1:
        save_path = '..' + os.sep + 'result' + os.sep + 'mlp_cnn' + os.sep + 'fs' + os.sep
        selected_features = data_util.get_selected_feature_name(sub_class)
        parameter = {
            'model_name':
            'mlp_cnn_fs_' + sub_class + '_h_' + str(hold_out_round),
            'size_of_batch': 56,
            'nb_epoch': 150,
            'drop_rate': 0.5
        }
    elif experiment == 2:
        save_path = '..' + os.sep + 'result' + os.sep + 'mlp_cnn' + os.sep + 'all_nf' + os.sep
        parameter = {
            'model_name':
            'mlp_cnn_nf_' + sub_class + '_h_' + str(hold_out_round),
            'size_of_batch': 56,
            'nb_epoch': 150,
            'drop_rate': 0.5
        }
    else:
        save_path = '..' + os.sep + 'result' + os.sep + 'mlp_cnn' + os.sep + 'fs_nf' + os.sep
        selected_features = data_util.get_selected_feature_name(sub_class)
        parameter = {
            'model_name':
            'mlp_cnn_fs_nf_' + sub_class + '_h_' + str(hold_out_round),
            'size_of_batch': 56,
            'nb_epoch': 150,
            'drop_rate': 0.5
        }

    test_acc_array = []
    test_loss_array = []
    kfold = StratifiedKFold(n_splits=10,
                            shuffle=True,
                            random_state=hold_out_round)
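    # 10-fold stratified CV on the downsampled training set; accuracy and
    # loss per fold are collected for model selection below.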
    for index, (train, test) in enumerate(kfold.split(x_train_all,
                                                      y_train_all)):
        # training
        x_train_cnn, x_train_mlp = data_util.split_cnn_mlp_input(
            x_train_all.iloc[train])
        if experiment == 1 or experiment == 3:
            x_train_cnn, x_train_mlp = data_util.selected_cnn_mlp_input(
                x_train_cnn, x_train_mlp, selected_features)
        if experiment == 2 or experiment == 3:
            x_train_cnn = x_train_cnn.drop(nf_columns, errors='ignore', axis=1)
            x_train_mlp = x_train_mlp.drop(nf_columns, errors='ignore', axis=1)
        x_train_cnn = data_util.scale(x_train_cnn)
        x_train_mlp = data_util.scale(x_train_mlp)
        y_train = y_train_all.iloc[train]

        # Testing
        x_test_cnn, x_test_mlp = data_util.split_cnn_mlp_input(
            x_train_all.iloc[test])
        if experiment == 1 or experiment == 3:
            x_test_cnn, x_test_mlp = data_util.selected_cnn_mlp_input(
                x_test_cnn, x_test_mlp, selected_features)
        if experiment == 2 or experiment == 3:
            x_test_cnn = x_test_cnn.drop(nf_columns, errors='ignore', axis=1)
            x_test_mlp = x_test_mlp.drop(nf_columns, errors='ignore', axis=1)
        x_test_cnn = np.expand_dims(data_util.scale(x_test_cnn), 2)
        x_test_mlp = data_util.scale(x_test_mlp)
        y_test = y_train_all.iloc[test]

        # train on 90% training
        history, model = mlp_cnn_binary(x_train_cnn, x_train_mlp,
                                        to_categorical(y_train), parameter,
                                        index)
        performance_util.save_train_validation(
            save_path + parameter['model_name'], history, 'acc', str(index))
        # Explicit copy avoids SettingWithCopyWarning when adding columns.
        predict_result_train = id_train_all.iloc[train].copy()
        x_train_cnn = np.expand_dims(x_train_cnn, 2)
        train_probas = model.predict([x_train_cnn, x_train_mlp])
        predict_result_train['label'] = y_train
        predict_result_train['0'] = train_probas[:, 0]
        predict_result_train['1'] = train_probas[:, 1]
        predict_result_train.to_csv(save_path + parameter['model_name'] +
                                    '_train_cv' + str(index) + '.csv',
                                    sep=',',
                                    encoding='utf-8')
        # Evaluation on 10% training
        predict_result_test = id_train_all.iloc[test].copy()
        test_probas = model.predict([x_test_cnn, x_test_mlp])
        predict_result_test['label'] = y_test
        predict_result_test['0'] = test_probas[:, 0]
        predict_result_test['1'] = test_probas[:, 1]
        predict_result_test.to_csv(save_path + parameter['model_name'] +
                                   '_test_cv' + str(index) + '.csv',
                                   sep=',',
                                   encoding='utf-8')

        loss, acc = model.evaluate([x_test_cnn, x_test_mlp],
                                   to_categorical(y_test),
                                   verbose=0)
        test_acc_array.append(acc)
        test_loss_array.append(loss)
        # plot_fig.plot_acc_loss(history, 'acc')
    performance_util.save_test(save_path + parameter['model_name'],
                               test_acc_array, test_loss_array)
    print('10-CV Done')
    # --
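    # Hold-out evaluation: reload the fold model with the highest CV test
    # accuracy and score the untouched hold-out set.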
    best_model_inx = test_acc_array.index(max(test_acc_array))
    hold_model = performance_util.load_nn_model(parameter['model_name'],
                                                best_model_inx)
    x_hold_cnn, x_hold_mlp = data_util.split_cnn_mlp_input(x_hold)

    if experiment == 1 or experiment == 3:
        x_hold_cnn, x_hold_mlp = data_util.selected_cnn_mlp_input(
            x_hold_cnn, x_hold_mlp, selected_features)
    if experiment == 2 or experiment == 3:
        x_hold_cnn = x_hold_cnn.drop(nf_columns, errors='ignore', axis=1)
        x_hold_mlp = x_hold_mlp.drop(nf_columns, errors='ignore', axis=1)

    x_hold_cnn = np.expand_dims(data_util.scale(x_hold_cnn), 2)
    x_hold_mlp = data_util.scale(x_hold_mlp)
    # Copy so the original id_hold frame is not modified in place.
    predict_result_hold = id_hold.copy()
    holdout_probas = hold_model.predict([x_hold_cnn, x_hold_mlp])
    predict_result_hold['label'] = y_hold
    predict_result_hold['0'] = holdout_probas[:, 0]
    predict_result_hold['1'] = holdout_probas[:, 1]
    predict_result_hold.to_csv(save_path + parameter['model_name'] +
                               '_hold.csv',
                               sep=',',
                               encoding='utf-8')
    print('hold-out Done')
Example #8
0
    sub_class = 'hemorrhagic'
    id_data, x_data, y_data = data_util.get_poor_god(
        'wholeset_Jim_nomissing_validated.csv', sub_class)
    feature_names = x_data.columns.values

    # Create a selector that uses the random forest classifier to keep
    # features whose importance exceeds the 5e-3 threshold.
    rf = RandomForestClassifier(n_estimators=100,
                                criterion='entropy',
                                random_state=seed)
    sfm = SelectFromModel(rf, threshold=5e-3)
    x_data, x_hold, y_data, y_hold = train_test_split(x_data,
                                                      y_data,
                                                      test_size=0.3,
                                                      random_state=seed)
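    # 30% of the data is held out before fitting, so the importance plot and
    # the selector are based on the training split only.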
    rf.fit(data_util.scale(x_data), y_data)
    importances = rf.feature_importances_
    plot_all_features(importances, feature_names)

    # Fit the selector on the 70% training split
    sfm.fit(x_data, y_data)
    selected_feature_names = feature_names[sfm.get_support(indices=True)]
    for sfn in reversed(selected_feature_names):
        print(sfn)
    data_util.save_np_array_to_csv(selected_feature_names,
                                   'selected_features_' + sub_class)
    '''
    id_data_all, x_data_all, y_data_all = data_util.get_individual('wholeset_Jim_nomissing_validated.csv')
    x_data_selected = x_data_all[x_data_all.columns[sfm.get_support(indices=True)]]
    data_fs = pd.concat([id_data_all, x_data_selected, y_data_all], axis=1)
    data_util.save_dataframe_to_csv(data_fs, 'wholeset_Jim_nomissing_validated_fs')
Example #9
0
        'nb_epoch': 500,
        'drop_rate': 0.
    }
    kfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
    history_array = []
    test_acc_array = []
    test_loss_array = []
    predict_array = []
    # ====== Binary
    id_data, x_data, y_data = data_util.get_poor_god(
        'wholeset_Jim_nomissing_validated.csv')
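    # Per fold: the 1-D CNN is trained on the CNN-branch features; the
    # MLP-branch output of split_cnn_mlp_input is unused in this snippet.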
    for index, (train, test) in enumerate(kfold.split(x_data, y_data)):
        x_train_cnn, x_train_mlp = data_util.split_cnn_mlp_input(
            x_data.iloc[train])

        history, model = cnn_binary(data_util.scale(x_train_cnn),
                                    to_categorical(y_data.iloc[train]),
                                    parameter)
        history_array.append(history)

        x_test_cnn, x_test_mlp = data_util.split_cnn_mlp_input(
            x_data.iloc[test])
        x_test_cnn = np.expand_dims(data_util.scale(x_test_cnn), 2)
        loss, acc = model.evaluate(x_test_cnn,
                                   to_categorical(y_data.iloc[test]),
                                   verbose=0)
        test_acc_array.append(acc)
        test_loss_array.append(loss)

        y_pred = model.predict(x_test_cnn)
        # predict_array.append(y_pred, y_data.iloc[test])