def get_pca_model(n_c):
    """Reduce the standardized data to *n_c* PCA components, train
    ``modelAfterPCA`` on the reduced training set, and score it on the test set.

    Relies on module-level state: ``X_train_std``, ``X_test``, ``Y_train``,
    ``Y_test``, the fitted imputer ``imp``, the fitted scaler ``std``, and the
    classifier ``modelAfterPCA``. Publishes intermediates through globals for
    interactive inspection (matching the sibling ``pca_model_smote``).

    Returns the conclusion row produced by ``get_conclusion``.
    """
    global pca, X_train_pca, X_imp_test, X_test_std, X_test_pca, Y_pred, row
    pca = PCA(n_components=n_c, random_state=42)
    X_train_pca = pca.fit_transform(X_train_std)
    X_imp_test = imp.transform(X_test)
    # BUG FIX: use transform(), not fit_transform(), on the test set.
    # Re-fitting the scaler/PCA on test data leaks test statistics and puts the
    # test set in a different feature space than the one the model trained on.
    X_test_std = std.transform(X_imp_test)
    X_test_pca = pca.transform(X_test_std)
    modelAfterPCA.fit(X_train_pca, Y_train)
    Y_pred = modelAfterPCA.predict(X_test_pca)
    # BUG FIX: argument order is (y_true, y_pred, label), matching every other
    # get_conclusion call in this file.
    row = get_conclusion(Y_test, Y_pred, "pca" + str(n_c))
    # Return the row for consistency with pca_model_smote (was implicitly None).
    return row
def pca_model_smote(n_c):
    """Same pipeline as ``get_pca_model`` but with SMOTE oversampling applied
    to the PCA-reduced *training* data before fitting ``modelAfterPCA``.

    Relies on module-level state: ``X_train_std``, ``X_test``, ``Y_train``,
    ``Y_test``, the fitted imputer ``imp``, the fitted scaler ``std``, and the
    classifier ``modelAfterPCA``.

    Returns the conclusion row produced by ``get_conclusion``.
    """
    global pca, X_train_pca, X_imp_test, X_test_std, X_test_pca, Y_pred, row
    pca = PCA(n_components=n_c, random_state=42)
    X_train_pca = pca.fit_transform(X_train_std)
    X_imp_test = imp.transform(X_test)
    # BUG FIX: transform() instead of fit_transform() — the test set must be
    # projected with the scaler/PCA fitted on training data (see get_pca_model).
    X_test_std = std.transform(X_imp_test)
    X_test_pca = pca.transform(X_test_std)
    sm = over_sampling.SMOTE()
    # BUG FIX: fit_sample() was removed in imbalanced-learn >= 0.6; the current
    # API is fit_resample(). Only the training data is oversampled — never the
    # test set.
    X_train_sampled, Y_train_sampled = sm.fit_resample(X_train_pca, Y_train)
    modelAfterPCA.fit(X_train_sampled, Y_train_sampled)
    Y_pred = modelAfterPCA.predict(X_test_pca)
    # BUG FIX: argument order is (y_true, y_pred, label), matching the rest of
    # the file.
    row = get_conclusion(Y_test, Y_pred, "smote-pca" + str(n_c))
    return row
def runByImputer(X_train, Y_train, X_test, Y_test, prefix):
    """Impute missing values with the given strategy, then train and evaluate
    several RandomForest variants.

    Args:
        X_train, Y_train: training features/labels (features may contain NaN).
        X_test, Y_test: held-out features/labels.
        prefix: SimpleImputer strategy name (e.g. 'mean', 'median'); also used
            as the label prefix for the conclusion rows.

    Side effects: appends one conclusion row per model to the module-level
    ``conclusion`` list.
    """
    print("Start ", prefix)
    imp = SimpleImputer(missing_values=np.nan, strategy=prefix)
    # BUG FIX: fit the imputer once, on training data only. The original code
    # re-fitted it inside the loop and then fitted it AGAIN on the test set,
    # which both leaks test statistics and imputes train/test inconsistently.
    imp.fit(X_train)
    X_imp_train = imp.transform(X_train)
    X_imp_test = imp.transform(X_test)

    rf_ = []
    basicRF = RandomForestClassifier(n_estimators=100)
    basicRF.name = "basic"
    rf_.append(basicRF)
    tunnedRF = RandomForestClassifier(max_depth=None, n_estimators=311,
                                      min_samples_split=2, min_samples_leaf=1,
                                      max_features='sqrt', bootstrap=False,
                                      random_state=0)
    tunnedRF.name = "tuned"
    rf_.append(tunnedRF)
    # Best params from a scoring-based search:
    # {'n_estimators': 600, 'min_samples_split': 2, 'min_samples_leaf': 1,
    #  'max_features': 'sqrt', 'max_depth': 30, 'bootstrap': False}
    tunnedRFScoring = RandomForestClassifier(max_depth=30, n_estimators=600,
                                             min_samples_split=2,
                                             min_samples_leaf=1,
                                             max_features='sqrt',
                                             bootstrap=False, random_state=0)
    tunnedRFScoring.name = "tuned-scoring"
    rf_.append(tunnedRFScoring)

    for model in rf_:
        print("Get conclustion for ", model.name)
        model.fit(X_imp_train, Y_train)
        Y_pred = model.predict(X_imp_test)
        print("ask conconclustion")
        row = get_conclusion(Y_test, Y_pred, prefix + '_' + model.name)
        conclusion.append(row)
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from lib.read import cost_confusion_matrix, read_data, get_conclusion

X_train, Y_train, X_test, Y_test = read_data()
conclusions = []

# Baseline: XGBoost with default hyper-parameters.
basic = XGBClassifier()
basic.name = "basic"
basic.fit(X_train, Y_train)
Y_pred = basic.predict(X_test)
cm = confusion_matrix(Y_test, Y_pred)
cost_confusion_matrix(cm)
row = get_conclusion(Y_test, Y_pred, 'basic')
# BUG FIX: the baseline row was computed but never collected, leaving the
# `conclusions` list empty. NOTE(review): if a later (unseen) part of this
# script also appends this row, drop this line.
conclusions.append(row)

# Best parameters found by hyper-parameter search:
# {'subsample': 0.9, 'silent': False, 'reg_lambda': 10.0, 'n_estimators': 100,
#  'min_child_weight': 0.5, 'max_depth': 10, 'learning_rate': 0.2, 'gamma': 0,
#  'colsample_bytree': 0.7, 'colsample_bylevel': 0.4}
bestParamsModel = XGBClassifier(subsample=0.9, silent=False, reg_lambda=10,
                                n_estimators=100, min_child_weight=0.5,
                                max_depth=10, learning_rate=0.2, gamma=0,
                                colsample_bytree=0.7, colsample_bylevel=0.4)
bestParamsModel.name = "bestParamsModel"
bestParamsModel.fit(X_train, Y_train)
Y_pred = bestParamsModel.predict(X_test)