from my_utils import data_util if __name__ == '__main__': subtype = 'he' # hold_out_round = 1 for hold_out_round in range(0, 10, 1): if subtype == 'is': sub_class = 'ischemic' else: sub_class = 'hemorrhagic' id_train_all, x_train_all, y_train_all = data_util.get_poor_god('training_' + subtype + '_' + str(hold_out_round) + '.csv', sub_class=sub_class) feature_names = x_train_all.columns.values forest = ExtraTreesClassifier() kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=hold_out_round) for index, (train, test) in enumerate(kfold.split(x_train_all, y_train_all)): x_train = data_util.scale(x_train_all.iloc[train]) y_train = y_train_all.iloc[train] forest.fit(x_train, y_train) importances = forest.feature_importances_ std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0) indices = np.argsort(importances)[::-1] # Print the feature ranking imp = [] print("Feature ranking:") for i in range(x_train.shape[1]): print("%d. feature %d (%f) %s" % (i + 1, indices[i], importances[indices[i]], feature_names[indices[i]])) imp.append(importances[indices[i]]) importance_df = pd.DataFrame(data={'f_index': indices, 'score': imp}) importance_df.sort_values(by=['f_index'], inplace=True) importance_df.reset_index(drop=True, inplace=True) if index == 0:
# to_categorical(y_data.iloc[train]), # parameter) # history_array.append(history) # loss, acc = model.evaluate(data_util.scale(x_data.iloc[test]), # to_categorical(y_data.iloc[test]), # verbose=0) # test_acc_array.append(acc) # test_loss_array.append(loss) # ====== Binary id_data, x_data, y_data = data_util.get_poor_god( 'wholeset_Jim_nomissing_validated.csv') for index, (train, test) in enumerate(kfold.split(x_data, y_data)): x_train_cnn, x_train_mlp = data_util.split_cnn_mlp_input( x_data.iloc[train]) # x_train = data_util.kera_feature(x_data.iloc[train]) history, model = mlp_binary(data_util.scale(x_train_cnn), to_categorical(y_data.iloc[train]), parameter) history_array.append(history) x_test_cnn, x_test_mlp = data_util.split_cnn_mlp_input( x_data.iloc[test]) # x_test = data_util.kera_feature(x_data.iloc[test]) loss, acc = model.evaluate(data_util.scale(x_test_cnn), to_categorical(y_data.iloc[test]), verbose=0) test_acc_array.append(acc) test_loss_array.append(loss) y_pred = model.predict(data_util.scale(x_test_cnn)) # predict_array.append(y_pred, y_data.iloc[test])
id_data, x_data, y_data = data_util.get_poor_god( 'wholeset_Jim_nomissing_validated_fs.csv') model_name = 'svm_2c_fs' else: id_data, x_data, y_data = data_util.get_poor_god( 'wholeset_Jim_nomissing_validated_fs.csv') model_name = 'svm_2c_fe' kfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed) classifier = SVC(kernel='linear', probability=True, random_state=seed, verbose=True) for index, (train, test) in enumerate(kfold.split(x_data, y_data)): # Training x_train = data_util.scale(x_data.iloc[train]) if experiment == 2: x_train = tsne.tsne_features_add(x_train, seed) classifier.fit(x_train, y_data.iloc[train]) # Testing x_test = data_util.scale(x_data.iloc[test]) if experiment == 2: x_test = tsne.tsne_features_add(x_test, seed) # Evaluation predict_result_train = id_data.iloc[train] train_probas = classifier.predict_proba(x_train) predict_result_train['label'] = y_data.iloc[train] predict_result_train['0'] = train_probas[:, 0] predict_result_train['1'] = train_probas[:, 1]
elif experiment == 1: id_data, x_data, y_data = data_util.get_poor_god( 'wholeset_Jim_nomissing_validated_fs.csv') model_name = 'rf_2c_fs' else: id_data, x_data, y_data = data_util.get_poor_god( 'wholeset_Jim_nomissing_validated_fs.csv') model_name = 'rf_2c_fe' kfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed) rf = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=seed) for index, (train, test) in enumerate(kfold.split(x_data, y_data)): # Training x_train = data_util.scale(x_data.iloc[train]) if experiment == 2: x_train = tsne.tsne_features_add(x_train, seed) rf.fit(data_util.scale(x_train), y_data.iloc[train]) # Testing x_test = data_util.scale(x_data.iloc[test]) if experiment == 2: x_test = tsne.tsne_features_add(x_test, seed) # Evaluation predict_result_train = id_data.iloc[train] train_probas = rf.predict_proba(x_train) predict_result_train['label'] = y_data.iloc[train] predict_result_train['0'] = train_probas[:, 0] predict_result_train['1'] = train_probas[:, 1]
test_loss_array = [] kfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed) for index, (train, test) in enumerate(kfold.split(x_data, y_data)): # training x_train_cnn, x_train_mlp = data_util.split_cnn_mlp_input( x_data.iloc[train]) if experiment == 1: x_train_cnn, x_train_mlp = data_util.selected_cnn_mlp_input( x_train_cnn, x_train_mlp, selected_features) elif experiment == 2: x_train_cnn, x_train_mlp = data_util.selected_cnn_mlp_input( x_train_cnn, x_train_mlp, selected_features) x_train_mlp = tsne.tsne_features_add(x_train_mlp, seed) x_train_cnn = data_util.scale(x_train_cnn) x_train_mlp = data_util.scale(x_train_mlp) history, model = mlp_cnn_binary(x_train_cnn, x_train_mlp, to_categorical(y_data.iloc[train]), parameter, index) performance_util.save_train_validation( save_path + parameter['model_name'], history, 'acc', str(index)) # Testing x_test_cnn, x_test_mlp = data_util.split_cnn_mlp_input( x_data.iloc[test]) if experiment == 1: x_test_cnn, x_test_mlp = data_util.selected_cnn_mlp_input( x_test_cnn, x_test_mlp, selected_features) elif experiment == 2: x_test_cnn, x_test_mlp = data_util.selected_cnn_mlp_input(
# Columns removed in the "_nf" experiment variants (experiment 2 and the
# fall-through variant). Names that are absent from a frame are ignored.
_SVM_NF_COLUMNS = [
    'FLU_ID_1_1.0', 'FLU_ID_1_2.0', 'FLU_ID_1_3.0', 'FLU_ID_1_4.0',
    'FLU_ID_1_5.0', 'FLU_ID_1_6.0', 'VERS_1', 'VEIHD_1', 'MRS_TX_1'
]


def _svm_prediction_frame(ids, labels, probas):
    """Return a copy of *ids* extended with 'label', '0' and '1' columns.

    Working on a copy keeps the caller's id table untouched and avoids
    assigning columns onto an ``.iloc`` slice (pandas chained assignment).
    """
    frame = ids.copy()
    frame['label'] = labels
    frame['0'] = probas[:, 0]
    frame['1'] = probas[:, 1]
    return frame


def do_svm(hold_out_round, sub_class, experiment):
    """Run 10-fold CV of a linear SVM, then score the hold-out set.

    For every fold the train/test class probabilities are written to CSV,
    the fold's test accuracy is recorded and the fitted classifier is stored
    through ``performance_util.save_model``. The fold with the highest test
    accuracy is then reloaded and applied to the hold-out data.

    Args:
        hold_out_round: Round number; used in the input file names, in the
            model name and as the random seed for NumPy, the fold splitter
            and the SVM.
        sub_class: ``'ischemic'`` selects the ``*_is_*`` files, anything else
            the ``*_he_*`` files. It is also forwarded to ``data_util``.
        experiment: Feature-set variant.
            0 = all features (``all``),
            1 = ``data_util.feature_selection`` applied (``fs``),
            2 = ``_SVM_NF_COLUMNS`` dropped (``all_nf``),
            any other value = selection and drop (``fs_nf``).

    Returns:
        None. Results are written below ``../result/svm/<variant>/``.
    """
    np.random.seed(hold_out_round)

    file_tag = 'is' if sub_class == 'ischemic' else 'he'
    round_tag = str(hold_out_round)
    id_train_all, x_train_all, y_train_all = data_util.get_poor_god_downsample(
        'training_' + file_tag + '_' + round_tag + '.csv', sub_class=sub_class)
    id_hold, x_hold, y_hold = data_util.get_poor_god(
        'hold_' + file_tag + '_' + round_tag + '.csv', sub_class=sub_class)

    # Two independent switches describe the four experiment variants.
    select_features = experiment not in (0, 2)
    drop_nf_columns = experiment not in (0, 1)

    if select_features:
        x_train_all = data_util.feature_selection(x_train_all, sub_class)
        x_hold = data_util.feature_selection(x_hold, sub_class)
    if drop_nf_columns:
        x_train_all = x_train_all.drop(_SVM_NF_COLUMNS, errors='ignore', axis=1)
        x_hold = x_hold.drop(_SVM_NF_COLUMNS, errors='ignore', axis=1)

    # 'all' / 'fs' / 'all_nf' / 'fs_nf'
    variant = ('fs' if select_features else 'all') + ('_nf' if drop_nf_columns else '')
    # The trailing '' makes join() end the path with a separator, matching
    # the '..' + os.sep + ... + os.sep prefix the file names are appended to.
    save_path = os.path.join('..', 'result', 'svm', variant, '')
    model_name = ('svm_' + ('fs_' if select_features else '')
                  + ('nf_' if drop_nf_columns else '')
                  + sub_class + '_h_' + round_tag)
    # Fail-safe: make sure the target directory exists before the long fit,
    # otherwise to_csv raises only after training has finished.
    os.makedirs(save_path, exist_ok=True)

    test_acc_array = []
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=hold_out_round)
    classifier = SVC(kernel='linear', probability=True,
                     random_state=hold_out_round, verbose=True)
    for index, (train, test) in enumerate(kfold.split(x_train_all, y_train_all)):
        fold_tag = str(index)
        # NOTE(review): train and test parts are scaled independently of each
        # other by data_util.scale — confirm this is intended.
        x_train = data_util.scale(x_train_all.iloc[train])
        y_train = y_train_all.iloc[train]
        x_test = data_util.scale(x_train_all.iloc[test])
        y_test = y_train_all.iloc[test]

        # Train on the 90% part of the fold.
        classifier.fit(x_train, y_train)
        predict_result_train = _svm_prediction_frame(
            id_train_all.iloc[train], y_train, classifier.predict_proba(x_train))
        predict_result_train.to_csv(
            save_path + model_name + '_train_cv' + fold_tag + '.csv',
            sep=',', encoding='utf-8')

        # Evaluate on the remaining 10%.
        predict_result_test = _svm_prediction_frame(
            id_train_all.iloc[test], y_test, classifier.predict_proba(x_test))
        predict_result_test.to_csv(
            save_path + model_name + '_test_cv' + fold_tag + '.csv',
            sep=',', encoding='utf-8')

        test_acc_array.append(accuracy_score(y_test, classifier.predict(x_test)))
        performance_util.save_model(classifier, model_name + '_' + fold_tag)
    print('10-CV Done')

    # Hold-out evaluation with the fold model that scored best (first one on ties).
    best_model_inx = test_acc_array.index(max(test_acc_array))
    hold_model = performance_util.load_ml_model(model_name, best_model_inx)
    x_hold = data_util.scale(x_hold)
    predict_result_hold = _svm_prediction_frame(
        id_hold, y_hold, hold_model.predict_proba(x_hold))
    predict_result_hold.to_csv(save_path + model_name + '_hold.csv',
                               sep=',', encoding='utf-8')
    print('hold-out Done')
# Columns removed in the "_nf" experiment variants (experiment 2 and the
# fall-through variant). Names that are absent from a frame are ignored.
_MLP_CNN_NF_COLUMNS = [
    'FLU_ID_1_1.0', 'FLU_ID_1_2.0', 'FLU_ID_1_3.0', 'FLU_ID_1_4.0',
    'FLU_ID_1_5.0', 'FLU_ID_1_6.0', 'VERS_1', 'VEIHD_1', 'MRS_TX_1'
]


def _mlp_cnn_prediction_frame(ids, labels, probas):
    """Return a copy of *ids* extended with 'label', '0' and '1' columns.

    Working on a copy keeps the caller's id table untouched and avoids
    assigning columns onto an ``.iloc`` slice (pandas chained assignment).
    """
    frame = ids.copy()
    frame['label'] = labels
    frame['0'] = probas[:, 0]
    frame['1'] = probas[:, 1]
    return frame


def do_mlp_cnn(hold_out_round, sub_class, experiment):
    """Run 10-fold CV of the combined MLP/CNN model, then score the hold-out set.

    For every fold the model is trained through ``mlp_cnn_binary``, the
    training history and the train/test class probabilities are saved, and
    the fold's test loss/accuracy is recorded. The fold with the highest test
    accuracy is then loaded via ``performance_util.load_nn_model`` and
    applied to the hold-out data.

    Args:
        hold_out_round: Round number; used in the input file names, in the
            model name and as the random seed for NumPy and the fold splitter.
        sub_class: ``'ischemic'`` selects the ``*_is_*`` files, anything else
            the ``*_he_*`` files. It is also forwarded to ``data_util``.
        experiment: Feature-set variant.
            0 = all features (``all``),
            1 = selected features only (``fs``),
            2 = ``_MLP_CNN_NF_COLUMNS`` dropped (``all_nf``),
            any other value = selection and drop (``fs_nf``).

    Returns:
        None. Results are written below ``../result/mlp_cnn/<variant>/``.
    """
    np.random.seed(hold_out_round)

    file_tag = 'is' if sub_class == 'ischemic' else 'he'
    round_tag = str(hold_out_round)
    id_train_all, x_train_all, y_train_all = data_util.get_poor_god_downsample(
        'training_' + file_tag + '_' + round_tag + '.csv', sub_class=sub_class)
    id_hold, x_hold, y_hold = data_util.get_poor_god(
        'hold_' + file_tag + '_' + round_tag + '.csv', sub_class=sub_class)

    # Two independent switches describe the four experiment variants. They
    # are derived once so that the output naming and the feature preparation
    # can never disagree (previously the naming used a catch-all ``else``
    # while the preparation tested ``experiment == 3`` literally).
    select_features = experiment not in (0, 2)
    drop_nf_columns = experiment not in (0, 1)

    # 'all' / 'fs' / 'all_nf' / 'fs_nf'
    variant = ('fs' if select_features else 'all') + ('_nf' if drop_nf_columns else '')
    # The trailing '' makes join() end the path with a separator, matching
    # the '..' + os.sep + ... + os.sep prefix the file names are appended to.
    save_path = os.path.join('..', 'result', 'mlp_cnn', variant, '')
    # Fail-safe: make sure the target directory exists before the long fit,
    # otherwise the first save raises only after training has finished.
    os.makedirs(save_path, exist_ok=True)

    selected_features = None
    if select_features:
        selected_features = data_util.get_selected_feature_name(sub_class)
    parameter = {
        'model_name': ('mlp_cnn_' + ('fs_' if select_features else '')
                       + ('nf_' if drop_nf_columns else '')
                       + sub_class + '_h_' + round_tag),
        'size_of_batch': 56,
        'nb_epoch': 150,
        'drop_rate': 0.5
    }
    result_prefix = save_path + parameter['model_name']

    def prepare_inputs(frame):
        """Split *frame* into scaled (cnn, mlp) inputs for the active variant."""
        cnn_part, mlp_part = data_util.split_cnn_mlp_input(frame)
        if select_features:
            cnn_part, mlp_part = data_util.selected_cnn_mlp_input(
                cnn_part, mlp_part, selected_features)
        if drop_nf_columns:
            cnn_part = cnn_part.drop(_MLP_CNN_NF_COLUMNS, errors='ignore', axis=1)
            mlp_part = mlp_part.drop(_MLP_CNN_NF_COLUMNS, errors='ignore', axis=1)
        return data_util.scale(cnn_part), data_util.scale(mlp_part)

    test_acc_array = []
    test_loss_array = []
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=hold_out_round)
    for index, (train, test) in enumerate(kfold.split(x_train_all, y_train_all)):
        fold_tag = str(index)
        # NOTE(review): train and test parts are scaled independently of each
        # other by data_util.scale — confirm this is intended.
        x_train_cnn, x_train_mlp = prepare_inputs(x_train_all.iloc[train])
        y_train = y_train_all.iloc[train]

        x_test_cnn, x_test_mlp = prepare_inputs(x_train_all.iloc[test])
        # predict()/evaluate() are fed the CNN input with an added trailing axis.
        x_test_cnn = np.expand_dims(x_test_cnn, 2)
        y_test = y_train_all.iloc[test]

        # Train on the 90% part of the fold. mlp_cnn_binary receives the CNN
        # input *before* the trailing axis is added, exactly as before.
        history, model = mlp_cnn_binary(x_train_cnn, x_train_mlp,
                                        to_categorical(y_train), parameter, index)
        performance_util.save_train_validation(result_prefix, history, 'acc', fold_tag)

        x_train_cnn = np.expand_dims(x_train_cnn, 2)
        predict_result_train = _mlp_cnn_prediction_frame(
            id_train_all.iloc[train], y_train,
            model.predict([x_train_cnn, x_train_mlp]))
        predict_result_train.to_csv(result_prefix + '_train_cv' + fold_tag + '.csv',
                                    sep=',', encoding='utf-8')

        # Evaluate on the remaining 10%.
        predict_result_test = _mlp_cnn_prediction_frame(
            id_train_all.iloc[test], y_test,
            model.predict([x_test_cnn, x_test_mlp]))
        predict_result_test.to_csv(result_prefix + '_test_cv' + fold_tag + '.csv',
                                   sep=',', encoding='utf-8')

        loss, acc = model.evaluate([x_test_cnn, x_test_mlp],
                                   to_categorical(y_test), verbose=0)
        test_acc_array.append(acc)
        test_loss_array.append(loss)

    performance_util.save_test(result_prefix, test_acc_array, test_loss_array)
    print('10-CV Done')

    # Hold-out evaluation with the fold model that scored best (first one on ties).
    best_model_inx = test_acc_array.index(max(test_acc_array))
    hold_model = performance_util.load_nn_model(parameter['model_name'], best_model_inx)
    x_hold_cnn, x_hold_mlp = prepare_inputs(x_hold)
    x_hold_cnn = np.expand_dims(x_hold_cnn, 2)
    predict_result_hold = _mlp_cnn_prediction_frame(
        id_hold, y_hold, hold_model.predict([x_hold_cnn, x_hold_mlp]))
    predict_result_hold.to_csv(result_prefix + '_hold.csv', sep=',', encoding='utf-8')
    print('hold-out Done')
sub_class = 'hemorrhagic' id_data, x_data, y_data = data_util.get_poor_god( 'wholeset_Jim_nomissing_validated.csv', sub_class) feature_names = x_data.columns.values # Create a selector object that will use the random forest classifier to identify # features that have an importance of more than XX rf = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=seed) sfm = SelectFromModel(rf, threshold=5e-3) x_data, x_hold, y_data, y_hold = train_test_split(x_data, y_data, test_size=0.3, random_state=seed) rf.fit(data_util.scale(x_data), y_data) importances = rf.feature_importances_ plot_all_features(importances, feature_names) # Train the selector for all dataset sfm.fit(x_data, y_data) selected_feature_names = feature_names[sfm.get_support(indices=True)] for sfn in reversed(selected_feature_names): print(sfn) data_util.save_np_array_to_csv(selected_feature_names, 'selected_features_' + sub_class) ''' id_data_all, x_data_all, y_data_all = data_util.get_individual('wholeset_Jim_nomissing_validated.csv') x_data_selected = x_data_all[x_data_all.columns[sfm.get_support(indices=True)]] data_fs = pd.concat([id_data_all, x_data_selected, y_data_all], axis=1) data_util.save_dataframe_to_csv(data_fs, 'wholeset_Jim_nomissing_validated_fs')
'nb_epoch': 500, 'drop_rate': 0. } kfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed) history_array = [] test_acc_array = [] test_loss_array = [] predict_array = [] # ====== Binary id_data, x_data, y_data = data_util.get_poor_god( 'wholeset_Jim_nomissing_validated.csv') for index, (train, test) in enumerate(kfold.split(x_data, y_data)): x_train_cnn, x_train_mlp = data_util.split_cnn_mlp_input( x_data.iloc[train]) history, model = cnn_binary(data_util.scale(x_train_cnn), to_categorical(y_data.iloc[train]), parameter) history_array.append(history) x_test_cnn, x_test_mlp = data_util.split_cnn_mlp_input( x_data.iloc[test]) x_test_cnn = np.expand_dims(data_util.scale(x_test_cnn), 2) loss, acc = model.evaluate(x_test_cnn, to_categorical(y_data.iloc[test]), verbose=0) test_acc_array.append(acc) test_loss_array.append(loss) y_pred = model.predict(x_test_cnn) # predict_array.append(y_pred, y_data.iloc[test])