def test_fit_sample_weight():
    """fit() must emit a RuntimeWarning when sample_weight is supplied."""
    model = LogitBoost()
    uniform_weights = np.ones(len(X_simple))

    with pytest.warns(RuntimeWarning):
        model.fit(X_simple, y_simple_binary, sample_weight=uniform_weights)
def test_bad_base_estimator():
    """An unsuitable base estimator must be rejected with a ValueError."""
    # LogitBoost boosts regressors, so a classifier is an invalid base
    # estimator. The constructor accepts it anyway: validation only happens
    # once fit() is called.
    model = LogitBoost(DecisionTreeClassifier())

    with pytest.raises(ValueError):
        model.fit(X_simple, y_simple_binary)
def boost_elasticnet(X_train, X_test, y_train, y_test):
    """Grid-search LogitBoost and tabulate one confusion matrix per setting.

    Despite the function name, no elastic-net or bagging model is fitted
    here: every combination of ``n_estimators`` in (50, 100, 150) and
    ``learning_rate`` in (0.5, 0.75, 1) is used to train a ``LogitBoost``
    model (``random_state=0``) on the training split, which is then
    evaluated on the test split.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels. The confusion matrix is computed with
            ``labels=[0, 1]``, i.e. binary 0/1 labels are expected.

    Returns:
        pandas.DataFrame with one row per hyperparameter combination and the
        columns ``'Estimators'``, ``'Learning Rate'`` and
        ``'Confusion Matrix'``; the last column holds the list
        ``[tn, fp, fn, tp]``. Column dtypes are inferred by pandas.
    """
    columns = ['Estimators', 'Learning Rate', 'Confusion Matrix']
    estimators = [50, 100, 150]
    rates = [0.5, 0.75, 1]

    rows = []
    for n_est in estimators:
        for rate in rates:
            model = LogitBoost(n_estimators=n_est,
                               learning_rate=rate,
                               random_state=0)
            model.fit(X_train, y_train)
            predicted_labels = model.predict(X_test)

            # ravel() flattens the 2x2 matrix row by row: tn, fp, fn, tp.
            tn, fp, fn, tp = confusion_matrix(y_test,
                                              predicted_labels,
                                              labels=[0, 1]).ravel()
            rows.append({
                'Estimators': n_est,
                'Learning Rate': rate,
                'Confusion Matrix': [tn, fp, fn, tp],
            })

    # Build the frame in one go: DataFrame.append copied the whole frame on
    # every call and no longer exists in pandas >= 2.0.
    return pd.DataFrame(rows, columns=columns)
label="malignant", edgecolor="k", alpha=0.7, ) plt.title("t-SNE plot of the training data") plt.xlabel("1st embedding axis") plt.ylabel("2nd embedding axis") plt.legend(loc="best", frameon=True, shadow=True) plt.tight_layout() plt.show() plt.close() lboost = LogitBoost(base_estimator=LogisticRegression(), n_estimators=200, random_state=0) lboost.fit(X_train, y_train) y_pred_train = lboost.predict(X_train) y_pred_test = lboost.predict(X_test) accuracy_train = accuracy_score(y_train, y_pred_train) accuracy_test = accuracy_score(y_test, y_pred_test) print("Training accuracy: %.4f" % accuracy_train) print("Test accuracy: %.4f" % accuracy_test) report_train = classification_report(y_train, y_pred_train) report_test = classification_report(y_test, y_pred_test) print("Training\n%s" % report_train)
def _toy_dataset_test(load_func, test_size=(1. / 3), random_state=0,
                      min_score_train=0.9, min_score_test=0.9):
    """Run the generic LogitBoost checks on a scikit-learn toy dataset.

    The dataset returned by ``load_func`` is split into stratified, shuffled
    training/testing parts. A LogitBoost model is then fitted with and
    without bootstrapping, and for each model the accuracy thresholds, the
    output shapes, the consistency of the staged methods and the
    ``contributions()`` output are asserted.
    """
    dataset = load_func()
    class_names = dataset.target_names
    n_classes = len(class_names)

    # Use the class names (not the integer codes) as labels
    features = dataset.data
    labels = class_names[dataset.target]

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, shuffle=True, stratify=labels,
        random_state=random_state)
    n_test = X_test.shape[0]

    # decision_function() is 1-D for binary problems and has one column per
    # class otherwise
    if n_classes == 2:
        expected_decision_shape = (n_test, )
    else:
        expected_decision_shape = (n_test, n_classes)

    for bootstrap in (True, False):
        model = LogitBoost(bootstrap=bootstrap, random_state=random_state)
        model.fit(X_train, y_train)

        # Minimum accuracy on both splits
        score_train = model.score(X_train, y_train)
        score_test = model.score(X_test, y_test)
        assert score_train >= min_score_train, \
            ("Failed with bootstrap=%s: training score %.3f less than %.3f"
             % (bootstrap, score_train, min_score_train))
        assert score_test >= min_score_test, \
            ("Failed with bootstrap=%s: testing score %.3f less than %.3f"
             % (bootstrap, score_test, min_score_test))

        # predict_proba() always has one column per class
        probabilities = model.predict_proba(X_test)
        decisions = model.decision_function(X_test)
        assert probabilities.shape == (n_test, n_classes)
        assert decisions.shape == expected_decision_shape

        # The final stage of every staged_* generator has to agree with the
        # corresponding non-staged method
        last_stage = {
            "predict":
                np.asarray(list(model.staged_predict(X_test)))[-1],
            "predict_proba":
                np.asarray(list(model.staged_predict_proba(X_test)))[-1],
            "decision_function":
                np.asarray(list(model.staged_decision_function(X_test)))[-1],
            "score":
                np.asarray(list(model.staged_score(X_test, y_test)))[-1],
        }
        np.testing.assert_equal(last_stage["predict"],
                                model.predict(X_test))
        np.testing.assert_almost_equal(last_stage["predict_proba"],
                                       model.predict_proba(X_test))
        np.testing.assert_almost_equal(last_stage["decision_function"],
                                       model.decision_function(X_test))
        np.testing.assert_almost_equal(last_stage["score"],
                                       model.score(X_test, y_test))

        # One non-negative contribution per estimator in the ensemble
        contributions = model.contributions(X_train)
        assert contributions.shape == (model.n_estimators, )
        assert np.all(contributions >= 0)
def test_sklearn_api():
    """Run scikit-learn's estimator checks on a default LogitBoost."""
    default_model = LogitBoost()
    check_estimator(default_model)
def test_feature_importances_():
    """Exercise feature_importances_ for supported and unsupported setups."""
    n_features = np.shape(X_simple)[1]

    # A tree base estimator exposes feature_importances_: the binary case
    # yields one importance per input feature ...
    tree_boost = LogitBoost(DecisionTreeRegressor())
    tree_boost.fit(X_simple, y_simple_binary)
    assert tree_boost.feature_importances_.shape == (n_features, )

    # ... whereas the multiclass case is expected to be unimplemented for now
    tree_boost.fit(X_simple, y_simple_multiclass)
    with pytest.raises(NotImplementedError):
        getattr(tree_boost, "feature_importances_")

    # Ridge has no feature_importances_, so even the binary case must fail
    ridge_boost = LogitBoost(Ridge())
    ridge_boost.fit(X_simple, y_simple_binary)
    with pytest.raises(AttributeError):
        getattr(ridge_boost, "feature_importances_")

    # An appended all-zero column carries no information, so it has to end
    # up with the smallest importance of all features
    X, y = load_breast_cancer(return_X_y=True)
    constant_column = np.zeros(len(X))
    X_augmented = np.column_stack((X, constant_column))

    cancer_boost = LogitBoost(random_state=0)
    cancer_boost.fit(X_augmented, y)
    importances = cancer_boost.feature_importances_
    assert importances[-1] == min(importances)
class_weights = compute_class_weight('balanced', np.unique(training_targets), training_targets[target_label[0]]) class_weights = dict(enumerate(class_weights)) #%% define models and parameters # define models models = { 'ExtraTreesClassifier': ExtraTreesClassifier(), 'RandomForestClassifier': RandomForestClassifier(), 'AdaBoostClassifier': AdaBoostClassifier(), 'GradientBoostingClassifier': GradientBoostingClassifier(), 'SVC': SVC(), 'LogitBoost': LogitBoost(), 'XGBClassifier': XGBClassifier(), 'ComplementNB': ComplementNB() } # define model parameters for parameter search param_extra_trees = { 'n_estimators': [10, 50, 100, 200, 300], 'min_samples_split': [2, 4], 'max_features': ['sqrt', None], 'random_state': [random_state], 'class_weight': [class_weights] } param_random_forest = {
gamma=0.0, # 惩罚项中叶子结点个数前的参数 subsample=1, # 所有样本建立决策树 colsample_btree=1, # 所有特征建立决策树 scale_pos_weight=1, # 解决样本个数不平衡的问题 random_state=27, # 随机数 slient=0, ) xg.fit(feature_train_balance, label_train_balance) xg_pred = xg.predict_proba(feature_test_balance)[:, 1] xg_evaluation = valid.evaluate( label_test_balance, xg_pred, save_path="../data/xg_evaluation.json" ) plot_evaluation(label_test_balance, xg_pred, "../figure", method="XG") #%% lb = LogitBoost(n_estimators=200, random_state=0) # base_estimator=LogisticRegression() lb.fit(feature_train_balance, label_train_balance) lb_pred = lb.predict_proba(feature_test_balance)[:, 1] lb_evaluation = evaluate( label_test_balance, lb_pred, save_path="../data/lb_evaluation.json" ) plot_evaluation(label_test_balance, lb_pred, "../figure", method="LB") #%% from feature import valid #%% # Auto-tunan_column_listne model for pandemic # 1. XGboost # 2. XGboost - additive learning # 3. LogisticBoosting - additive learning # 4. dummy Logistic
# Script cell: plot the malignant samples in the t-SNE embedding, then train a
# LogitBoost classifier and report its accuracy on both splits.

# Scatter the rows selected by mask_malignant (red circles, black edges).
plt.scatter(X_train_tsne[mask_malignant, 0], X_train_tsne[mask_malignant, 1],
            marker='o', c='r', label='Cancer maligno', edgecolor='k', alpha=0.7)
plt.title('Gráfico t-SNE dos dados de treinamento')
plt.xlabel('1st embedding axis')
plt.ylabel('2nd embedding axis')
plt.legend(loc='best', frameon=True, shadow=True)
plt.tight_layout()
plt.show()

# Load our decision-tree-based classification algorithm (per the original
# author's note: decision trees of depth 1 -- presumably the LogitBoost
# default base estimator; confirm against the library).
lboost = LogitBoost(n_estimators=200, random_state=0)

# Train the model on the training data.
lboost.fit(X_train, y_train)

# Predict on the training and test data (tries to predict whether the cancer
# is malignant or benign).
y_pred_train = lboost.predict(X_train)
y_pred_test = lboost.predict(X_test)

# Compute the accuracy as a percentage.
accuracy_train = (accuracy_score(y_train, y_pred_train) * 100)
accuracy_test = (accuracy_score(y_test, y_pred_test) * 100)

print('Training accuracy: %.1f' % accuracy_train)
print('Test accuracy: %.1f' % accuracy_test)
def main(plot=True, M=8, n_fold=10):
    """
    Compare AdaBoost and LogitBoost on the scaled abalone data set.

    For every T in 100, 200, ..., 100*M both classifiers are cross-validated
    on the training file. The T with the lowest mean cross-validation error
    is selected separately for each algorithm, the models are refitted on the
    whole training set with that T, and their train/test errors are printed.

    Both data files are comma-separated; the last column is used as the label
    and all preceding columns as features.

    :param plot: whether to plot the train and test result (the figure is
        saved to ``B.i_cverror.png``)
    :param M: maximum T to search would be 100*M
    :param n_fold: number of folds in cross validation
    :return: None
    :raises ValueError: if M is smaller than 1
    """
    if M < 1:
        # Without at least one candidate T there is nothing to stack/argmin.
        raise ValueError("M must be at least 1, got %r" % (M,))

    train_file, test_file = "datasets/abalone_train_scaled.txt", "datasets/abalone_test_scaled.txt"
    # ndmin=2 keeps the data two-dimensional even for a single-row file.
    train = np.loadtxt(train_file, delimiter=",", ndmin=2)
    test = np.loadtxt(test_file, delimiter=",", ndmin=2)

    x_dim = train.shape[1] - 1  # index of the label column
    x_train, y_train = train[:, :x_dim], train[:, x_dim]
    x_test, y_test = test[:, :x_dim], test[:, x_dim]

    # One array of per-fold errors for every candidate T
    aboost_fold_errors = []
    lboost_fold_errors = []
    for multiple in range(1, M + 1):
        T = multiple * 100
        print("\nT = %s\t" % T)

        # Set AdaBoost parameters
        # decision stump is the default base estimator
        aboost = AdaBoostClassifier(n_estimators=T, random_state=0)
        # Set LogitBoost parameters
        lboost = LogitBoost(n_estimators=T, random_state=0)

        # get n_fold cross validation error
        aboost_cv_results = cross_validate(aboost, x_train, y_train, cv=n_fold)
        lboost_cv_results = cross_validate(lboost, x_train, y_train, cv=n_fold)

        # compute error by 1 - accuracy
        aboost_fold_errors.append(1 - aboost_cv_results['test_score'])
        lboost_fold_errors.append(1 - lboost_cv_results['test_score'])

    # Rows correspond to the candidate T values, columns to the folds
    aboost_train_cverror = np.stack(aboost_fold_errors)
    lboost_train_cverror = np.stack(lboost_fold_errors)
    print(aboost_train_cverror)
    print(lboost_train_cverror)

    # find the T that gives least error (the best cross-validation accuracy)
    a_train_cverror_mean = aboost_train_cverror.mean(axis=1)
    a_train_cverror_std = aboost_train_cverror.std(axis=1)
    best_T_aboost = int(a_train_cverror_mean.argmin() + 1) * 100
    print("----------------------\n",
          "AdaBoost iteration number T = %s\n" % (best_T_aboost),
          "----------------------\n")

    # find the T that gives least error (the best cross-validation accuracy)
    l_train_cverror_mean = lboost_train_cverror.mean(axis=1)
    l_train_cverror_std = lboost_train_cverror.std(axis=1)
    best_T_lboost = int(l_train_cverror_mean.argmin() + 1) * 100
    print("----------------------\n",
          "LogitBoost iteration number T = %s\n" % (best_T_lboost),
          "----------------------\n")

    print('Now train with the best T=T* and eval on the test set\n')
    # Train on the whole train set
    aboost = AdaBoostClassifier(n_estimators=best_T_aboost, random_state=0)
    aboost.fit(x_train, y_train)
    lboost = LogitBoost(n_estimators=best_T_lboost, random_state=0)
    lboost.fit(x_train, y_train)

    # Test on the test set
    a_error_train = 1 - accuracy_score(y_train, aboost.predict(x_train))
    a_error_test = 1 - accuracy_score(y_test, aboost.predict(x_test))
    print("AdaBoost train error: %s test error: %s" % (a_error_train, a_error_test))

    l_error_train = 1 - accuracy_score(y_train, lboost.predict(x_train))
    l_error_test = 1 - accuracy_score(y_test, lboost.predict(x_test))
    print("LogitBoost train error: %s test error: %s" % (l_error_train, l_error_test))

    if plot:
        plt.figure()
        x_values = range(100, M * 100 + 1, 100)

        # Mean CV error with a +/- one standard deviation band per algorithm
        plt.plot(x_values, a_train_cverror_mean, label="AdaBoost")
        plt.fill_between(x_values,
                         a_train_cverror_mean + a_train_cverror_std,
                         a_train_cverror_mean - a_train_cverror_std,
                         alpha=0.5, edgecolor='blue', facecolor='blue')
        plt.plot(x_values, l_train_cverror_mean, label="LogitBoost")
        plt.fill_between(x_values,
                         l_train_cverror_mean + l_train_cverror_std,
                         l_train_cverror_mean - l_train_cverror_std,
                         alpha=0.5, edgecolor='#FF9848', facecolor='#FF9848')

        plt.xlabel('T (number of iterations/classifiers)')
        # Label follows n_fold instead of a hard-coded 10 (same text for the
        # default n_fold=10)
        plt.ylabel('%sfold cross validation train error' % (n_fold,))
        plt.legend()
        plt.ylim(0, 0.5)
        plt.savefig('B.i_cverror.png')
def _build_voting_ensembles():
    """Return the four unfitted soft-voting ensembles compared by ensemble_comp.

    The result is the tuple ``(complete, new, new_without_logit, original)``.
    """
    clf1 = XGBClassifier(learning_rate=0.01,
                         max_depth=3,
                         n_estimators=700,
                         random_state=8)
    clf2 = GradientBoostingClassifier(learning_rate=0.01,
                                      max_depth=4,
                                      max_features='log2',
                                      min_samples_leaf=4,
                                      n_estimators=280,
                                      subsample=0.25,
                                      random_state=8)
    clf3 = RandomForestClassifier(n_estimators=300,
                                  max_depth=3,
                                  verbose=1,
                                  random_state=8)
    clf4 = SVC(kernel='poly', probability=True, verbose=1, random_state=8)
    clf5 = KNeighborsClassifier(n_neighbors=3)
    clf6 = MLPClassifier(hidden_layer_sizes=(80, ),
                         activation='logistic',
                         learning_rate_init=0.01,
                         verbose=1)
    clf7 = AdaBoostClassifier(n_estimators=300,
                              learning_rate=0.01,
                              random_state=8)
    clf8 = LogitBoost(n_estimators=300, learning_rate=0.01, random_state=8)

    original_members = [('xgb', clf1), ('gb', clf2), ('rf', clf3)]
    new_members_without_logit = [('svm', clf4), ('knn', clf5), ('elm', clf6),
                                 ('ada', clf7)]
    new_members = new_members_without_logit + [('logit', clf8)]

    complete = VotingClassifier(estimators=original_members + new_members,
                                voting='soft')
    new = VotingClassifier(estimators=new_members, voting='soft')
    new_without_logit = VotingClassifier(estimators=new_members_without_logit,
                                         voting='soft')
    original = VotingClassifier(estimators=original_members, voting='soft')
    return complete, new, new_without_logit, original


def ensemble_comp(X_train, X_test, y_train, y_test, title):
    """Compare four soft-voting ensembles by log loss and CV accuracy.

    The ensembles are: all eight models ("Total"), the five newer models
    ("New"), the newer models without LogitBoost, and the original trio of
    XGBoost, gradient boosting and random forest.

    Each ensemble is fitted on the training split and scored with log loss
    on the test split. Its mean 5-fold cross-validation accuracy is then
    computed on the concatenation of both splits. The resulting report is
    printed and passed to ``write_to_file``.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        title: Heading placed at the top of the report.
    """
    n_folds = 5
    ensembles = _build_voting_ensembles()

    for ensemble in ensembles:
        ensemble.fit(X_train, y_train)
    probabilities = [ensemble.predict_proba(X_test) for ensemble in ensembles]
    (complete_voting_ll, new_voting_ll, new_voting_without_logit_ll,
     original_voting_ll) = [log_loss(y_test, proba) for proba in probabilities]

    # cross_val_score clones the estimator for every fold, so the fitted
    # state above cannot leak into the cross-validation; rebuilding the
    # ensembles first is not needed.
    X = np.concatenate((X_train, X_test), axis=0)
    y = np.concatenate((y_train, y_test))
    (complete_voting_accuracy, new_voting_accuracy,
     new_voting_without_logit_accuracy, original_voting_accuracy) = [
         np.mean(cross_val_score(ensemble, X, y, cv=n_folds))
         for ensemble in ensembles
     ]

    report_parts = [
        "\n" + title,
        "\nTotal Ensemble Log Loss " + str(complete_voting_ll),
        "\nNew Ensemble Log Loss " + str(new_voting_ll),
        "\nNew Ensemble without LogitBoost Log Loss " +
        str(new_voting_without_logit_ll),
        "\nOriginal Ensemble (Gradient Boost, Random Forest, and XGBoost) Log Loss " +
        str(original_voting_ll),
        "\n\nTotal Ensemble Mean Cross Fold Accuracy " +
        str(complete_voting_accuracy),
        "\nNew Ensemble Mean Cross Fold Accuracy " + str(new_voting_accuracy),
        "\nNew Ensemble without LogitBoost Cross Mean Fold Accuracy " +
        str(new_voting_without_logit_accuracy),
        "\nOriginal Ensemble (Gradient Boost, Random Forest, and XGBoost) Mean Accuracy " +
        str(original_voting_accuracy),
        "\nAverage size of fold: " + str(len(y) / n_folds),
    ]
    printstr = "".join(report_parts)
    print(printstr)
    write_to_file(printstr)
def model_comp(X_train, X_test, y_train, y_test, title=""):
    """Fit eight classifiers and report each one's log loss on the test split.

    Every model is trained on the training split and scored with log loss on
    its predicted test probabilities. The report is printed and handed to
    ``write_to_file``.

    Returns:
        The fitted ``(xgboost, random_forest, adaboost)`` models.
    """
    # Insertion order doubles as construction and fitting order.
    models = {
        "xgboost": XGBClassifier(learning_rate=0.01,
                                 max_depth=3,
                                 n_estimators=700,
                                 random_state=8),
        "gradient_boost": GradientBoostingClassifier(learning_rate=0.01,
                                                     max_depth=4,
                                                     max_features='log2',
                                                     min_samples_leaf=4,
                                                     n_estimators=280,
                                                     subsample=0.25,
                                                     random_state=8),
        "random_forest": RandomForestClassifier(n_estimators=300,
                                                max_depth=3,
                                                verbose=1,
                                                random_state=8),
        "svm": SVC(kernel='poly', probability=True, verbose=1,
                   random_state=8),
        "knn": KNeighborsClassifier(n_neighbors=3),
        "elm": MLPClassifier(hidden_layer_sizes=(80, ),
                             activation='logistic',
                             learning_rate_init=0.01,
                             verbose=1),
        "adaboost": AdaBoostClassifier(n_estimators=300,
                                       learning_rate=0.01,
                                       random_state=8),
        "logitboost": LogitBoost(n_estimators=300,
                                 learning_rate=0.01,
                                 random_state=8),
    }
    for model in models.values():
        model.fit(X_train, y_train)

    # Predictions are made in this fixed order, then scored in the same order.
    scoring_order = ("random_forest", "gradient_boost", "xgboost", "svm",
                     "knn", "elm", "adaboost", "logitboost")
    probabilities = {
        name: models[name].predict_proba(X_test)
        for name in scoring_order
    }
    losses = {
        name: log_loss(y_test, proba)
        for name, proba in probabilities.items()
    }

    separator = "\n------------------"
    report_parts = [
        "\n" + title,
        "\nLength of test data: " + str(len(y_test)),
        separator,
        "\nGradient Boost Log Loss " + str(losses["gradient_boost"]),
        "\nRandom Forest Log Loss " + str(losses["random_forest"]),
        "\nXGBoost Log Loss " + str(losses["xgboost"]),
        separator,
        "\nSVM Log Loss " + str(losses["svm"]),
        "\nKNN Log Loss " + str(losses["knn"]),
        "\nELM Log Loss " + str(losses["elm"]),
        "\nAdaBoost Log Loss " + str(losses["adaboost"]),
        "\nLogitBoost Log Loss " + str(losses["logitboost"]),
    ]
    report = "".join(report_parts)
    print(report)
    write_to_file(report)

    return models["xgboost"], models["random_forest"], models["adaboost"]