# Local project modules (models, fs, ML, in_out, modelling) are assumed to be
# importable from the repository.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold


def justRF_temp(X, y, feature_list):
    """Quick sanity check: cross-validated accuracy of a plain RF model."""
    model = models.RF(feature_list)
    model1 = models.RF(feature_list)
    model1.fit(X, y)
    X_tr, _ = model1.updateList(X)
    print(X_tr.shape)
    score = cross_val_score(model, X, y, cv=StratifiedKFold(n_splits=5, shuffle=True))
    print("Final score with just RF is {:.2f}%".format(score.mean() * 100))
def execute_with_algorithm(alg, X, y, fname, headers, out_dir, record_id,
                           feature_selection, oversampling, survival, undersampling):
    '''Execute a learning task using the specified algorithm.'''
    # number of features to keep
    # if survival == True and aggregation == True:   k = 150
    # if survival == True and aggregation == False:  k = 220
    # if survival == False and aggregation == True:  k = 150
    # if survival == False and aggregation == False: k = 220
    k = 220

    # perform feature selection
    new_X, best_features, headers = fs.pearson_fs(X, y, headers, k,
                                                  feature_selection, survival)

    # execute algorithm; results stays None if alg matches no branch
    results = None
    if alg == 'DT':
        results, model = ML.CART(new_X, y, best_features,
                                 out_dir + "{}.dot".format(fname), headers,
                                 oversampling, undersampling)
    elif alg == 'RF':
        results, features, model = ML.RF(new_X, y, best_features,
                                         oversampling, undersampling, n_estimators=200)
    elif alg == 'RFsmall':
        results, features, model = ML.RF(new_X, y, best_features,
                                         oversampling, undersampling, n_estimators=100)
    elif alg == 'SVM':
        results, model = ML.SVM(new_X, y, best_features, oversampling, undersampling)
    elif alg == 'LR':
        results, features, model = ML.LR(new_X, y, best_features, oversampling, undersampling)
    elif alg == 'XGBoost':
        results, features, model = ML.XGBoost(new_X, y, best_features, oversampling, undersampling)
    elif alg == 'COX':
        results, features, model = ML.COX(new_X, y, best_features, oversampling, undersampling)
    elif alg == 'survSVM':
        results, features, model = ML.survSVM(new_X, y, best_features, oversampling, undersampling)
    elif alg == 'GBS':
        results, features, model = ML.GradientBoostingSurvival(new_X, y, best_features,
                                                               oversampling, undersampling)

    if not results:
        return

    if not survival:
        in_out.save_results(out_dir + fname + '.csv', ["fpr", "tpr", "auc", "cm"],
                            results, [sum(y), len(y)])
    # else:
    #     in_out.save_results(out_dir + fname + '.csv', ["CI"], results, [sum(y), len(y)])

    # feature importances only exist for the branches that return them
    if 'features' in locals():
        features = features.flatten()
        in_out.save_features(out_dir + "features_" + fname + '.csv',
                             zip(headers[1:-1], features))

    return model, best_features, [fname] + results[0:3]
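# A minimal driver sketch for execute_with_algorithm. Only the function's
# signature comes from the code above; the algorithm subset, record_id value,
# and flag settings below are illustrative assumptions.
def run_all_algorithms(X, y, headers, out_dir='output/'):
    summaries = []
    for alg in ['DT', 'RF', 'RFsmall', 'SVM', 'LR', 'XGBoost']:
        out = execute_with_algorithm(alg, X, y, 'run_' + alg, headers, out_dir,
                                     record_id=None, feature_selection=True,
                                     oversampling=False, survival=False,
                                     undersampling=False)
        if out is not None:  # the function returns None when results are empty
            model, best_features, summary = out
            summaries.append(summary)
    return summaries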
def runPostFiltering(X, y, feature_list):
    print("\nGetting accuracy of dataset after filtering these features\n")
    noRFE(X, y)

    model1 = models.RF(feature_list)
    model2 = models.nestedRFECV(feature_list)

    print("\nGetting the weights of these filtered features and checking the accuracy\n")
    score1 = cross_val_score(model1, X, y, cv=StratifiedKFold(n_splits=5, shuffle=True))
    print("Final score with just RF is {:.2f}%".format(score1.mean() * 100))

    print("\nUsing nested cross-validation on these filtered features and checking the accuracy\n")
    model2.fit(X, y)
    X_tr = model2.transformed()
    clf = RandomForestClassifier(n_estimators=10, max_depth=20)
    score2 = cross_val_score(clf, X_tr, y, cv=StratifiedKFold(n_splits=5, shuffle=True))
    print("Final score with nested CV is {:.2f}%".format(score2.mean() * 100))
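# For reference, a self-contained equivalent of the recursive-elimination step
# using scikit-learn's RFECV directly. This is a sketch on synthetic data, not
# the repository's models.nestedRFECV wrapper, whose internals are not shown here.
def rfecv_demo():
    from sklearn.datasets import make_classification
    from sklearn.feature_selection import RFECV

    X_demo, y_demo = make_classification(n_samples=200, n_features=30, random_state=0)
    # recursively drop features, keeping the subset with the best CV score
    selector = RFECV(RandomForestClassifier(n_estimators=10, max_depth=20),
                     cv=StratifiedKFold(n_splits=5, shuffle=True))
    selector.fit(X_demo, y_demo)
    X_sel = selector.transform(X_demo)
    score = cross_val_score(RandomForestClassifier(n_estimators=10, max_depth=20),
                            X_sel, y_demo,
                            cv=StratifiedKFold(n_splits=5, shuffle=True))
    print("RFECV demo accuracy: {:.2f}%".format(score.mean() * 100))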
parameters_lr = {"C": [1, 3, 6, 9, 15, 20, 25, 30]}
parameters_lin_svm = {"C": [1, 5, 10, 20, 40, 100, 1000]}

""" Upsampling """
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=8)
# fit_resample replaces the fit_sample method removed in recent imbalanced-learn releases
X_res_train, y_res_train = sm.fit_resample(X_train, y_train)
X_res_tr, y_res_tr = sm.fit_resample(X_tr, y_tr)
X_res_val, y_res_val = sm.fit_resample(X_val, y_val)
X_res_test, y_res_test = sm.fit_resample(X_test, y_test)

# class counts should now be balanced
print(pd.Series(y_res_train).value_counts())
print(pd.Series(y_res_val).value_counts())

rf = models.RF()
rf_opt, rf_opt_params = modelling.find_hyperparams(rf, parameters_rf,
                                                   X_res_train, y_res_train,
                                                   search_method="randomized",
                                                   n_iter=50)
rf_score = modelling.evaluation(rf_opt, X_test, y_test, rf_opt.predict(X_test))
rf_res_score = modelling.evaluation(rf_opt, X_res_test, y_res_test,
                                    rf_opt.predict(X_res_test))

from sklearn.ensemble import RandomForestClassifier
ls = RandomForestClassifier(bootstrap=True, criterion="entropy", max_depth=10,
                            max_features="sqrt", min_samples_leaf=2,
                            min_samples_split=4, n_estimators=200)
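# parameters_lr and parameters_lin_svm are defined above but never used in this
# snippet. A sketch of how they could feed the same tuning helper, assuming
# models.LR() returns an sklearn-compatible estimator (an assumption, mirrored
# from the models.RF() call above):
lr = models.LR()
lr_opt, lr_opt_params = modelling.find_hyperparams(lr, parameters_lr,
                                                   X_res_train, y_res_train,
                                                   search_method="randomized",
                                                   n_iter=8)  # 8 = size of the C grid
lr_score = modelling.evaluation(lr_opt, X_test, y_test, lr_opt.predict(X_test))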