def get_feature_names():
    """Return the ischemic training-set feature names with display labels.

    Loads ``training_is_0.csv`` through ``data_util.get_poor_god`` (ischemic
    sub-class), replaces the raw column codes listed below with their
    human-readable labels, and returns the resulting column names. Columns
    not present in the mapping keep their original name.

    Returns:
        The ``columns.values`` array of the renamed feature frame.
    """
    display_labels = {
        'MRS_TX_1': '30-day mRS',
        'discharged_mrs': 'Discharge mRS',
        'Toilet_use': 'Toilet use',
        'Bowel_control': 'Bowel control',
        'Bladder_control': 'Bladder control',
        'TRMNG_FL': 'Nasogastric tube',
        'TRMRE_FL': 'Rehab',
        'OFFDT_ID_1': 'Discharge to Home',
        'NIHS_6aL_out': 'Discharge NIHSS 6aL',
        'NIHS_6aL_in': 'Admission NIHSS 6aL',
        'NIHS_6bR_out': 'Discharge NIHSS 6bR',
        'NIHS_10_out': 'Discharge NIHSS 10',
        'NIHS_5aL_out': 'Discharge NIHSS 5aL',
        'NIHS_5bR_out': 'Discharge NIHSS 5bR',
        'NIHS_1b_out': 'Discharge NIHSS 1b',
        'NIHS_9_out': 'Discharge NIHSS 9',
        'NIHS_5aL_in': 'Admission NIHSS 5aL',
        'NIHS_1b_in': 'Admission NIHSS 1b',
    }
    # Only the feature frame is needed; ids and labels are discarded.
    _, features, _ = data_util.get_poor_god('training_is_0.csv',
                                            sub_class='ischemic')
    labelled = features.rename(columns=display_labels)
    return labelled.columns.values
test_loss_array = [] predict_array = [] # ====== Multi-classes # x_data, y_data = data_util.get_individual('wholeset_Jim_nomissing_validated.csv') # for index, (train, test) in enumerate(kfold.split(x_data, y_data)): # history, model = mlp_multi(data_util.scale(x_data.iloc[train]), # to_categorical(y_data.iloc[train]), # parameter) # history_array.append(history) # loss, acc = model.evaluate(data_util.scale(x_data.iloc[test]), # to_categorical(y_data.iloc[test]), # verbose=0) # test_acc_array.append(acc) # test_loss_array.append(loss) # ====== Binary id_data, x_data, y_data = data_util.get_poor_god( 'wholeset_Jim_nomissing_validated.csv') for index, (train, test) in enumerate(kfold.split(x_data, y_data)): x_train_cnn, x_train_mlp = data_util.split_cnn_mlp_input( x_data.iloc[train]) # x_train = data_util.kera_feature(x_data.iloc[train]) history, model = mlp_binary(data_util.scale(x_train_cnn), to_categorical(y_data.iloc[train]), parameter) history_array.append(history) x_test_cnn, x_test_mlp = data_util.split_cnn_mlp_input( x_data.iloc[test]) # x_test = data_util.kera_feature(x_data.iloc[test]) loss, acc = model.evaluate(data_util.scale(x_test_cnn), to_categorical(y_data.iloc[test]), verbose=0)
import numpy as np
import pandas as pd
import os
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold
from my_utils import data_util

# Rank features by ExtraTrees importance for every hold-out round and CV fold
# of one stroke subtype, printing the ranking for each fold.
if __name__ == '__main__':
    subtype = 'he'
    # The sub-class label depends only on the subtype, so resolve it once.
    sub_class = 'ischemic' if subtype == 'is' else 'hemorrhagic'

    for hold_out_round in range(10):
        training_file = 'training_' + subtype + '_' + str(hold_out_round) + '.csv'
        id_train_all, x_train_all, y_train_all = data_util.get_poor_god(
            training_file, sub_class=sub_class)
        feature_names = x_train_all.columns.values

        forest = ExtraTreesClassifier()
        kfold = StratifiedKFold(n_splits=10, shuffle=True,
                                random_state=hold_out_round)
        fold_splits = kfold.split(x_train_all, y_train_all)
        for index, (train, test) in enumerate(fold_splits):
            # Only the training part of each fold is used for the ranking.
            x_train = data_util.scale(x_train_all.iloc[train])
            y_train = y_train_all.iloc[train]
            forest.fit(x_train, y_train)

            # NOTE(review): the ranking is assumed to be produced once per
            # fold (inside this loop) -- confirm against the original layout.
            importances = forest.feature_importances_
            per_tree_importances = [tree.feature_importances_
                                    for tree in forest.estimators_]
            std = np.std(per_tree_importances, axis=0)
            # Feature positions ordered from most to least important.
            indices = np.argsort(importances)[::-1]

            # Print the feature ranking
            imp = []
            print("Feature ranking:")
            for rank, feature_idx in enumerate(indices, start=1):
                print("%d. feature %d (%f) %s" % (rank,
                                                  feature_idx,
                                                  importances[feature_idx],
                                                  feature_names[feature_idx]))
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt  # Data visualisation libraries
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from my_utils import data_util, performance_util
from sklearn.metrics import roc_auc_score

# Single-feature baseline: fit a logistic regression on the 'MRS_TX_1' column
# of the down-sampled ischemic training set (round 9) and report its ROC AUC
# on the matching hold-out set.
id_train_all, x_train_all, y_train_all = data_util.get_poor_god_downsample(
    'training_is_9.csv', sub_class='ischemic')
id_hold, x_hold, y_hold = data_util.get_poor_god('hold_is_9.csv',
                                                 sub_class='ischemic')

lm = LogisticRegression()
x = x_train_all[['MRS_TX_1']]
y = y_train_all
lm.fit(x, y)

test_x = x_hold[['MRS_TX_1']]
predictions = lm.predict(test_x)

# ROC AUC must be computed from continuous scores. Feeding it the thresholded
# labels from predict() collapses the curve to a single operating point and
# misreports the ranking quality, so use the probability of the greater class
# label (lm.classes_[1]) instead.
positive_scores = lm.predict_proba(test_x)[:, 1]
logit_roc_auc = roc_auc_score(y_hold, positive_scores)
print(logit_roc_auc)
def do_svm(hold_out_round, sub_class, experiment):
    """Run 10-fold CV of a linear SVM for one hold-out round, then score the hold-out set.

    The training data is read with ``data_util.get_poor_god_downsample`` and
    the hold-out data with ``data_util.get_poor_god``; the file prefix is
    ``is`` when ``sub_class == 'ischemic'`` and ``he`` otherwise. For every
    fold the classifier is refitted, class probabilities for the training and
    validation parts are written to CSV, and the fitted model is stored via
    ``performance_util.save_model``. The fold with the highest validation
    accuracy is reloaded and applied to the hold-out set.

    Args:
        hold_out_round: Round number; selects the input files and seeds
            NumPy, the fold splitter and the classifier.
        sub_class: Stroke sub-class forwarded to the ``data_util`` loaders.
        experiment: Feature configuration.
            0 -> all features (results in ``all``),
            1 -> ``data_util.feature_selection`` (``fs``),
            2 -> follow-up columns dropped (``all_nf``),
            anything else -> selection and drop (``fs_nf``).

    Side effects:
        Writes ``<model>_train_cv<i>.csv``, ``<model>_test_cv<i>.csv`` and
        ``<model>_hold.csv`` under ``../result/svm/<variant>/`` and prints
        progress messages.
    """
    np.random.seed(hold_out_round)

    file_tag = 'is' if sub_class == 'ischemic' else 'he'
    round_tag = str(hold_out_round)
    id_train_all, x_train_all, y_train_all = data_util.get_poor_god_downsample(
        'training_' + file_tag + '_' + round_tag + '.csv', sub_class=sub_class)
    id_hold, x_hold, y_hold = data_util.get_poor_god(
        'hold_' + file_tag + '_' + round_tag + '.csv', sub_class=sub_class)

    # Each experiment maps to a result sub-directory, a model-name prefix and
    # the two preprocessing switches.
    if experiment == 0:
        variant, name_prefix, use_selection, drop_nf = 'all', 'svm_', False, False
    elif experiment == 1:
        variant, name_prefix, use_selection, drop_nf = 'fs', 'svm_fs_', True, False
    elif experiment == 2:
        variant, name_prefix, use_selection, drop_nf = 'all_nf', 'svm_nf_', False, True
    else:
        variant, name_prefix, use_selection, drop_nf = 'fs_nf', 'svm_fs_nf_', True, True

    x_train_all = _svm_prepare_features(x_train_all, sub_class, use_selection, drop_nf)
    x_hold = _svm_prepare_features(x_hold, sub_class, use_selection, drop_nf)

    result_dir = os.path.join('..', 'result', 'svm', variant)
    model_name = name_prefix + sub_class + '_h_' + round_tag

    test_acc_array = []
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=hold_out_round)
    classifier = SVC(kernel='linear', probability=True,
                     random_state=hold_out_round, verbose=True)
    for index, (train, test) in enumerate(kfold.split(x_train_all, y_train_all)):
        # Each part of the fold is passed through data_util.scale on its own.
        x_train = data_util.scale(x_train_all.iloc[train])
        y_train = y_train_all.iloc[train]
        x_test = data_util.scale(x_train_all.iloc[test])
        y_test = y_train_all.iloc[test]

        # Train on the 90% part of the fold.
        classifier.fit(x_train, y_train)
        _svm_write_probabilities(
            id_train_all.iloc[train], y_train, classifier.predict_proba(x_train),
            os.path.join(result_dir, model_name + '_train_cv' + str(index) + '.csv'))

        # Evaluate on the remaining 10%.
        _svm_write_probabilities(
            id_train_all.iloc[test], y_test, classifier.predict_proba(x_test),
            os.path.join(result_dir, model_name + '_test_cv' + str(index) + '.csv'))

        test_acc_array.append(accuracy_score(y_test, classifier.predict(x_test)))
        performance_util.save_model(classifier, model_name + '_' + str(index))
    print('10-CV Done')

    # Reload the fold model with the best validation accuracy (first on ties).
    best_model_inx = test_acc_array.index(max(test_acc_array))
    hold_model = performance_util.load_ml_model(model_name, best_model_inx)
    x_hold = data_util.scale(x_hold)
    _svm_write_probabilities(
        id_hold, y_hold, hold_model.predict_proba(x_hold),
        os.path.join(result_dir, model_name + '_hold.csv'))
    print('hold-out Done')


def _svm_prepare_features(frame, sub_class, use_selection, drop_nf):
    """Apply the optional feature selection, then the optional follow-up-column drop."""
    nf_columns = [
        'FLU_ID_1_1.0', 'FLU_ID_1_2.0', 'FLU_ID_1_3.0', 'FLU_ID_1_4.0',
        'FLU_ID_1_5.0', 'FLU_ID_1_6.0', 'VERS_1', 'VEIHD_1', 'MRS_TX_1'
    ]
    if use_selection:
        frame = data_util.feature_selection(frame, sub_class)
    if drop_nf:
        # errors='ignore': columns already removed by the selection are skipped.
        frame = frame.drop(nf_columns, errors='ignore', axis=1)
    return frame


def _svm_write_probabilities(ids, labels, probas, csv_path):
    """Write ids, true labels and the two class-probability columns to ``csv_path``."""
    # Work on an explicit copy: assigning columns to an .iloc slice triggers
    # pandas' SettingWithCopyWarning, and assigning to the caller's frame
    # would mutate it.
    result = ids.copy()
    result['label'] = labels
    result['0'] = probas[:, 0]
    result['1'] = probas[:, 1]
    result.to_csv(csv_path, sep=',', encoding='utf-8')
def do_mlp_cnn(hold_out_round, sub_class, experiment):
    """Run 10-fold CV of the combined MLP/CNN model for one hold-out round, then score the hold-out set.

    The training data is read with ``data_util.get_poor_god_downsample`` and
    the hold-out data with ``data_util.get_poor_god``; the file prefix is
    ``is`` when ``sub_class == 'ischemic'`` and ``he`` otherwise. Every frame
    is split into a CNN part and an MLP part by
    ``data_util.split_cnn_mlp_input``. For each fold a model is trained with
    ``mlp_cnn_binary``, its training history and the class probabilities of
    both fold parts are saved, and the validation loss/accuracy are recorded.
    The fold with the highest validation accuracy is reloaded through
    ``performance_util.load_nn_model`` and applied to the hold-out set.

    Args:
        hold_out_round: Round number; selects the input files and seeds
            NumPy and the fold splitter.
        sub_class: Stroke sub-class forwarded to the ``data_util`` helpers.
        experiment: Feature configuration.
            0 -> all features (results in ``all``),
            1 -> selected features only (``fs``),
            2 -> follow-up columns dropped (``all_nf``),
            anything else -> selection and drop (``fs_nf``).
            Values outside 0-3 used to be labelled ``fs_nf`` while being
            trained on all features; they now get the ``fs_nf`` preprocessing
            their output name announces.

    Side effects:
        Writes training-history, per-fold and hold-out result files under
        ``../result/mlp_cnn/<variant>/`` and prints progress messages.
    """
    np.random.seed(hold_out_round)

    file_tag = 'is' if sub_class == 'ischemic' else 'he'
    round_tag = str(hold_out_round)
    id_train_all, x_train_all, y_train_all = data_util.get_poor_god_downsample(
        'training_' + file_tag + '_' + round_tag + '.csv', sub_class=sub_class)
    id_hold, x_hold, y_hold = data_util.get_poor_god(
        'hold_' + file_tag + '_' + round_tag + '.csv', sub_class=sub_class)

    # Each experiment maps to a result sub-directory, a model-name prefix and
    # the two preprocessing switches.
    if experiment == 0:
        variant, name_prefix, use_selection, drop_nf = 'all', 'mlp_cnn_', False, False
    elif experiment == 1:
        variant, name_prefix, use_selection, drop_nf = 'fs', 'mlp_cnn_fs_', True, False
    elif experiment == 2:
        variant, name_prefix, use_selection, drop_nf = 'all_nf', 'mlp_cnn_nf_', False, True
    else:
        variant, name_prefix, use_selection, drop_nf = 'fs_nf', 'mlp_cnn_fs_nf_', True, True

    # The selected-feature list is only looked up when it will be used.
    selected_features = (data_util.get_selected_feature_name(sub_class)
                         if use_selection else None)
    parameter = {
        'model_name': name_prefix + sub_class + '_h_' + round_tag,
        'size_of_batch': 56,
        'nb_epoch': 150,
        'drop_rate': 0.5
    }
    # Common prefix of every result file: <result dir>/<model name>.
    result_prefix = os.path.join('..', 'result', 'mlp_cnn', variant,
                                 parameter['model_name'])

    test_acc_array = []
    test_loss_array = []
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=hold_out_round)
    for index, (train, test) in enumerate(kfold.split(x_train_all, y_train_all)):
        # Training part. The CNN input stays 2-D here because mlp_cnn_binary
        # receives it unexpanded; it is expanded only for predict() below.
        x_train_cnn, x_train_mlp = _mlp_cnn_prepare_inputs(
            x_train_all.iloc[train], selected_features, drop_nf)
        x_train_cnn = data_util.scale(x_train_cnn)
        x_train_mlp = data_util.scale(x_train_mlp)
        y_train = y_train_all.iloc[train]

        # Validation part; the CNN input gets a trailing axis.
        x_test_cnn, x_test_mlp = _mlp_cnn_prepare_inputs(
            x_train_all.iloc[test], selected_features, drop_nf)
        x_test_cnn = np.expand_dims(data_util.scale(x_test_cnn), 2)
        x_test_mlp = data_util.scale(x_test_mlp)
        y_test = y_train_all.iloc[test]

        # Train on the 90% part of the fold.
        history, model = mlp_cnn_binary(x_train_cnn, x_train_mlp,
                                        to_categorical(y_train), parameter, index)
        performance_util.save_train_validation(result_prefix, history, 'acc',
                                               str(index))

        x_train_cnn = np.expand_dims(x_train_cnn, 2)
        train_probas = model.predict([x_train_cnn, x_train_mlp])
        _mlp_cnn_write_probabilities(
            id_train_all.iloc[train], y_train, train_probas,
            result_prefix + '_train_cv' + str(index) + '.csv')

        # Evaluate on the remaining 10%.
        test_probas = model.predict([x_test_cnn, x_test_mlp])
        _mlp_cnn_write_probabilities(
            id_train_all.iloc[test], y_test, test_probas,
            result_prefix + '_test_cv' + str(index) + '.csv')

        loss, acc = model.evaluate([x_test_cnn, x_test_mlp],
                                   to_categorical(y_test), verbose=0)
        test_acc_array.append(acc)
        test_loss_array.append(loss)

    # NOTE(review): assumed to run once after all folds, since it receives the
    # complete metric lists -- confirm against the original layout.
    performance_util.save_test(result_prefix, test_acc_array, test_loss_array)
    print('10-CV Done')

    # Reload the fold model with the best validation accuracy (first on ties).
    best_model_inx = test_acc_array.index(max(test_acc_array))
    hold_model = performance_util.load_nn_model(parameter['model_name'],
                                                best_model_inx)
    x_hold_cnn, x_hold_mlp = _mlp_cnn_prepare_inputs(x_hold, selected_features,
                                                     drop_nf)
    x_hold_cnn = np.expand_dims(data_util.scale(x_hold_cnn), 2)
    x_hold_mlp = data_util.scale(x_hold_mlp)
    holdout_probas = hold_model.predict([x_hold_cnn, x_hold_mlp])
    _mlp_cnn_write_probabilities(id_hold, y_hold, holdout_probas,
                                 result_prefix + '_hold.csv')
    print('hold-out Done')


def _mlp_cnn_prepare_inputs(frame, selected_features, drop_nf):
    """Split ``frame`` into CNN/MLP inputs, then apply the optional selection and column drop."""
    nf_columns = [
        'FLU_ID_1_1.0', 'FLU_ID_1_2.0', 'FLU_ID_1_3.0', 'FLU_ID_1_4.0',
        'FLU_ID_1_5.0', 'FLU_ID_1_6.0', 'VERS_1', 'VEIHD_1', 'MRS_TX_1'
    ]
    x_cnn, x_mlp = data_util.split_cnn_mlp_input(frame)
    if selected_features is not None:
        x_cnn, x_mlp = data_util.selected_cnn_mlp_input(x_cnn, x_mlp,
                                                        selected_features)
    if drop_nf:
        # errors='ignore': a column missing from one of the two parts is skipped.
        x_cnn = x_cnn.drop(nf_columns, errors='ignore', axis=1)
        x_mlp = x_mlp.drop(nf_columns, errors='ignore', axis=1)
    return x_cnn, x_mlp


def _mlp_cnn_write_probabilities(ids, labels, probas, csv_path):
    """Write ids, true labels and the two class-probability columns to ``csv_path``."""
    # Work on an explicit copy: assigning columns to an .iloc slice triggers
    # pandas' SettingWithCopyWarning, and assigning to the caller's frame
    # would mutate it.
    result = ids.copy()
    result['label'] = labels
    result['0'] = probas[:, 0]
    result['1'] = probas[:, 1]
    result.to_csv(csv_path, sep=',', encoding='utf-8')
else: all_selected_features = np.append(all_selected_features, selected_f_names) feature_dict = Counter(all_selected_features) # use to draw heatmap return list(feature_dict.keys()) if __name__ == '__main__': # Just get the feature names subtype = 'he' if subtype == 'is': sub_class = 'ischemic' else: sub_class = 'hemorrhagic' id_train_all, x_train_all, y_train_all = data_util.get_poor_god( 'training_' + subtype + '_0.csv', sub_class=sub_class) feature_names = x_train_all.columns.values # for hold_out_round in range(0, 10, 1): if subtype == 'is': sub_class = 'ischemic' else: sub_class = 'hemorrhagic' df = pd.read_csv('f_' + subtype + '_' + str(hold_out_round) + '.csv', encoding='utf8') mean_importance = df.drop(['f_index'], axis=1).mean(axis=1) if hold_out_round == 0: robust_f_df = pd.DataFrame(data={'f_index': df['f_index']}) robust_f_df['rf' + str(hold_out_round)] = mean_importance else: robust_f_df['rf' + str(hold_out_round)] = mean_importance