def test_fit_sample_weight():
    """Check that a warning is raised if sample_weights is passed to fit()."""
    logitboost = LogitBoost()
    with pytest.warns(RuntimeWarning):
        logitboost.fit(X_simple,
                       y_simple_binary,
                       sample_weight=np.ones(len(X_simple)))
def test_bad_base_estimator():
    """Tests for errors raised when the base estimator is bad."""
    # LogitBoost base estimators should be regressors, not classifiers
    base_estimator = DecisionTreeClassifier()
    # Validation is done at fitting, not at initialization
    logitboost = LogitBoost(base_estimator)
    with pytest.raises(ValueError):
        logitboost.fit(X_simple, y_simple_binary)
Example #3
def boost_elasticnet(X_train, X_test, y_train, y_test):
    """Grid-search LogitBoost over n_estimators and learning_rate.

    Args:
        X_train, X_test, y_train, y_test: train/test features and labels.

    Returns:
        DataFrame: one row per (n_estimators, learning_rate) combination, with
        the test-set confusion matrix flattened as [tn, fp, fn, tp]. (An
        earlier elastic-net SGDClassifier variant over ``alphas`` is left
        commented out below.)
    """
    rows = []
    alphas = [0.0001, 0.001, 0.01]  #,0.1,1]
    estimators = [50, 100, 150]
    rates = [0.5, 0.75, 1]

    # for al in alphas:
    #     estimator = SGDClassifier(loss = 'log',alpha= al,penalty = 'l1',random_state=0)
    for n_est in estimators:
        for rate in rates:
            ada = LogitBoost(n_estimators=n_est,
                             learning_rate=rate,
                             random_state=0)  #algorithm='SAMME',
            ada.fit(X_train, y_train)
            predicted_labels = ada.predict(X_test)
            tn, fp, fn, tp = confusion_matrix(y_test,
                                              predicted_labels,
                                              labels=[0, 1]).ravel()
            convert_matrix = [tn, fp, fn, tp]
            rows.append([n_est, rate, convert_matrix])

    # DataFrame.append was removed in pandas 2.0; build the frame directly
    # from the collected rows instead.
    df = pd.DataFrame(
        rows, columns=['Estimators', 'Learning Rate', 'Confusion Matrix'])

    return df
Example #4
    label="malignant",
    edgecolor="k",
    alpha=0.7,
)

plt.title("t-SNE plot of the training data")
plt.xlabel("1st embedding axis")
plt.ylabel("2nd embedding axis")
plt.legend(loc="best", frameon=True, shadow=True)

plt.tight_layout()
plt.show()
plt.close()

lboost = LogitBoost(base_estimator=LogisticRegression(),
                    n_estimators=200,
                    random_state=0)
lboost.fit(X_train, y_train)

y_pred_train = lboost.predict(X_train)
y_pred_test = lboost.predict(X_test)

accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)

print("Training accuracy: %.4f" % accuracy_train)
print("Test accuracy:     %.4f" % accuracy_test)

report_train = classification_report(y_train, y_pred_train)
report_test = classification_report(y_test, y_pred_test)
print("Training\n%s" % report_train)
def _toy_dataset_test(load_func,
                      test_size=(1. / 3),
                      random_state=0,
                      min_score_train=0.9,
                      min_score_test=0.9):
    """Create a classification unit test from a scikit-learn toy dataset."""
    # Fetch the dataset
    data = load_func()
    X = data.data
    y = data.target_names[data.target]

    # Distinct classes
    classes = data.target_names
    n_classes = len(classes)

    # Binary/multiclass classification indicator
    is_binary = (n_classes == 2)

    # Shuffle data and split it into training/testing samples
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=test_size, shuffle=True, stratify=y,
                         random_state=random_state)

    for bootstrap in (True, False):
        # Fit a LogitBoost model
        logitboost = LogitBoost(bootstrap=bootstrap, random_state=random_state)
        logitboost.fit(X_train, y_train)

        # Compute accuracy scores and assert minimum accuracy
        score_train = logitboost.score(X_train, y_train)
        score_test = logitboost.score(X_test, y_test)
        assert score_train >= min_score_train, \
            ("Failed with bootstrap=%s: training score %.3f less than %.3f"
             % (bootstrap, score_train, min_score_train))
        assert score_test >= min_score_test, \
            ("Failed with bootstrap=%s: testing score %.3f less than %.3f"
             % (bootstrap, score_test, min_score_test))

        # Get probabilities and the decision function
        predict_proba = logitboost.predict_proba(X_test)
        decision_function = logitboost.decision_function(X_test)

        # predict_proba() should always return (n_samples, n_classes)
        assert predict_proba.shape == (X_test.shape[0], n_classes)

        # decision_function() shape depends on the classification task
        if is_binary:
            assert decision_function.shape == (X_test.shape[0], )
        else:
            assert decision_function.shape == (X_test.shape[0], n_classes)

        # Check that the last item of a staged method is the same as a regular
        # method
        staged_predict = np.asarray(list(logitboost.staged_predict(X_test)))
        staged_predict_proba = \
            np.asarray(list(logitboost.staged_predict_proba(X_test)))
        staged_decision_function = \
            np.asarray(list(logitboost.staged_decision_function(X_test)))
        staged_score = \
            np.asarray(list(logitboost.staged_score(X_test, y_test)))

        np.testing.assert_equal(staged_predict[-1], logitboost.predict(X_test))
        np.testing.assert_almost_equal(staged_predict_proba[-1],
                                       logitboost.predict_proba(X_test))
        np.testing.assert_almost_equal(staged_decision_function[-1],
                                       logitboost.decision_function(X_test))
        np.testing.assert_almost_equal(staged_score[-1],
                                       logitboost.score(X_test, y_test))

        # contributions() should return one non-negative number for each
        # estimator in the ensemble
        contrib = logitboost.contributions(X_train)
        assert contrib.shape == (logitboost.n_estimators, )
        assert np.all(contrib >= 0)
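# A minimal usage sketch (not from the original source): the helper above is
# parameterized by a dataset loader, so individual tests presumably just call
# it with scikit-learn's built-in toy loaders. The test names below are
# hypothetical.
from sklearn.datasets import load_breast_cancer, load_iris

def test_breast_cancer_dataset():
    """Binary classification sanity check on the breast cancer data."""
    _toy_dataset_test(load_breast_cancer)

def test_iris_dataset():
    """Multiclass classification sanity check on the iris data."""
    _toy_dataset_test(load_iris)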
def test_sklearn_api():
    """Make sure LogitBoost is minimally compliant with scikit-learn's API."""
    check_estimator(LogitBoost())
def test_feature_importances_():
    """Check that the feature_importances_ attribute behaves as expected."""
    # DecisionTreeRegressor supports feature_importances_
    logitboost = LogitBoost(DecisionTreeRegressor())
    # Binary classification should work
    logitboost.fit(X_simple, y_simple_binary)
    assert logitboost.feature_importances_.shape == (np.shape(X_simple)[1], )

    # Multiclass classification should currently fail
    logitboost.fit(X_simple, y_simple_multiclass)
    with pytest.raises(NotImplementedError):
        _ = logitboost.feature_importances_

    # Ridge doesn't support feature_importances_
    logitboost = LogitBoost(Ridge())
    # Even binary classification shouldn't work
    logitboost.fit(X_simple, y_simple_binary)
    with pytest.raises(AttributeError):
        _ = logitboost.feature_importances_

    # Check that the feature_importances_ attribute identifies bad features
    X, y = load_breast_cancer(return_X_y=True)

    # Add a useless constant feature column to X: it should be the least
    # important
    X = np.column_stack((X, np.zeros(len(X))))

    logitboost = LogitBoost(random_state=0)
    logitboost.fit(X, y)

    feature_importances = logitboost.feature_importances_
    dummy_importance = feature_importances[-1]
    assert dummy_importance == min(feature_importances)
Example #8
class_weights = compute_class_weight('balanced',
                                     classes=np.unique(training_targets),
                                     y=training_targets[target_label[0]])
class_weights = dict(enumerate(class_weights))

#%% define models and parameters

# define models

models =    {
            'ExtraTreesClassifier': ExtraTreesClassifier(),
            'RandomForestClassifier': RandomForestClassifier(),
            'AdaBoostClassifier': AdaBoostClassifier(),
            'GradientBoostingClassifier': GradientBoostingClassifier(),
            'SVC': SVC(),
            'LogitBoost': LogitBoost(),
            'XGBClassifier': XGBClassifier(),
            'ComplementNB': ComplementNB()
            }

# define model parameters for parameter search

param_extra_trees =     {
                        'n_estimators': [10, 50, 100, 200, 300],
                        'min_samples_split': [2, 4],
                        'max_features': ['sqrt', None],
                        'random_state': [random_state],
                        'class_weight': [class_weights]
                        }

param_random_forest =   {
Example #9
    gamma=0.0,  # coefficient on the number of leaves in the regularization term
    subsample=1,  # use all samples to build each tree
    colsample_bytree=1,  # use all features to build each tree
    scale_pos_weight=1,  # compensates for class imbalance
    random_state=27,  # random seed
    silent=0,
)
xg.fit(feature_train_balance, label_train_balance)
xg_pred = xg.predict_proba(feature_test_balance)[:, 1]
xg_evaluation = valid.evaluate(
    label_test_balance, xg_pred, save_path="../data/xg_evaluation.json"
)
plot_evaluation(label_test_balance, xg_pred, "../figure", method="XG")
#%%

lb = LogitBoost(n_estimators=200, random_state=0)  # base_estimator=LogisticRegression()
lb.fit(feature_train_balance, label_train_balance)
lb_pred = lb.predict_proba(feature_test_balance)[:, 1]
lb_evaluation = evaluate(
    label_test_balance, lb_pred, save_path="../data/lb_evaluation.json"
)
plot_evaluation(label_test_balance, lb_pred, "../figure", method="LB")
#%%
from feature import valid

#%%
# Auto-tune models for pandemic
# 1. XGBoost
# 2. XGBoost - additive learning
# 3. LogitBoost - additive learning
# 4. dummy Logistic (a minimal sketch of this baseline follows below)
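#%%
# A minimal sketch (not part of the original script) of item 4 above, the plain
# "dummy Logistic" baseline, evaluated the same way as the XGBoost and
# LogitBoost models. It assumes the same feature_*/label_* arrays and the
# evaluate/plot_evaluation helpers used earlier; the save path below just
# follows the earlier pattern and is hypothetical.
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=1000, random_state=0)
lr.fit(feature_train_balance, label_train_balance)
lr_pred = lr.predict_proba(feature_test_balance)[:, 1]
lr_evaluation = evaluate(
    label_test_balance, lr_pred, save_path="../data/lr_evaluation.json"
)
plot_evaluation(label_test_balance, lr_pred, "../figure", method="LR")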
Example #10
plt.scatter(X_train_tsne[mask_malignant, 0],
            X_train_tsne[mask_malignant, 1],
            marker='o',
            c='r',
            label='malignant cancer',
            edgecolor='k',
            alpha=0.7)
plt.title('t-SNE plot of the training data')
plt.xlabel('1st embedding axis')
plt.ylabel('2nd embedding axis')
plt.legend(loc='best', frameon=True, shadow=True)
plt.tight_layout()
plt.show()

# Load our decision-tree-based classification algorithm (decision trees of depth 1, i.e. stumps, are the default base estimators)
lboost = LogitBoost(n_estimators=200, random_state=0)

# train on the training data
lboost.fit(X_train, y_train)

# predict on the training and test data (whether the cancer is malignant or benign)
y_pred_train = lboost.predict(X_train)
y_pred_test = lboost.predict(X_test)

# compute the accuracy percentages
accuracy_train = (accuracy_score(y_train, y_pred_train) * 100)
accuracy_test = (accuracy_score(y_test, y_pred_test) * 100)

print('Training accuracy: %.1f' % accuracy_train)
print('Test accuracy:     %.1f' % accuracy_test)
Example #11
def main(plot=True, M=8, n_fold=10):
	"""Cross-validate AdaBoost and LogitBoost on the abalone data and report test errors.

	:param plot: whether to plot the cross-validation error curves
	:param M: the maximum T searched is 100*M
	:param n_fold: number of folds in cross-validation
	:return: None
	"""

	train_file, test_file = "datasets/abalone_train_scaled.txt", "datasets/abalone_test_scaled.txt"

	train = sparse.csr_matrix(np.loadtxt(train_file, delimiter=","))
	test = sparse.csr_matrix(np.loadtxt(test_file, delimiter=","))

	m = train.shape[0]  #3133
	x_dim = train.shape[1] - 1  #10
	x_train, y_train = train[:, :x_dim].toarray(), train[:, x_dim].toarray().squeeze()
	x_test, y_test = test[:, :x_dim].toarray(), test[:, x_dim].toarray().squeeze()
	#print(x_train.shape, y_train.shape, x_test.shape)
	#print(x_train)


	aboost_train_cverror = list(np.ones(M))
	lboost_train_cverror = list(np.ones(M))
	for multiple in range(1, M+1):
		T = multiple * 100
		print("\nT = %s\t" % T)
		# Set AdaBoost parameters
		# decision stump is the default base estimator
		aboost = AdaBoostClassifier(n_estimators=T, random_state=0)
		# Set LogitBoost parameters
		lboost = LogitBoost(n_estimators=T, random_state=0)
		# get 10-fold cross validation error
		aboost_cv_results = cross_validate(aboost, x_train, y_train, cv=n_fold)
		lboost_cv_results = cross_validate(lboost, x_train, y_train, cv=n_fold)
		# compute error by 1 - accuracy
		aboost_train_cverror[multiple-1] = 1 - aboost_cv_results['test_score']
		lboost_train_cverror[multiple-1] = 1 - lboost_cv_results['test_score']
	aboost_train_cverror = np.stack(aboost_train_cverror)
	lboost_train_cverror = np.stack(lboost_train_cverror)
	print(aboost_train_cverror)
	print(lboost_train_cverror)

	# find the T that gives least error (the best cross-validation accuracy)
	a_train_cverror_mean, a_train_cverror_std = aboost_train_cverror.mean(axis=1), aboost_train_cverror.std(axis=1)
	argmin = a_train_cverror_mean.flatten().argmin()
	best_T_aboost = int(argmin+1) * 100
	print("----------------------\n",\
		  "AdaBoost iteration number T = %s\n"%(best_T_aboost), \
		  "----------------------\n")
	# find the T that gives least error (the best cross-validation accuracy)
	l_train_cverror_mean, l_train_cverror_std = lboost_train_cverror.mean(axis=1), lboost_train_cverror.std(axis=1)
	argmin = l_train_cverror_mean.flatten().argmin()
	best_T_lboost = int(argmin + 1) * 100
	print("----------------------\n", \
		  "LogitBoost iteration number T = %s\n" % (best_T_lboost), \
		  "----------------------\n")

	print('Now train with the best T=T* and eval on the test set\n')

	# Train on the whole train set
	aboost = AdaBoostClassifier(n_estimators=best_T_aboost, random_state=0)
	aboost.fit(x_train, y_train)
	lboost = LogitBoost(n_estimators=best_T_lboost, random_state=0)
	lboost.fit(x_train, y_train)
	# Test on the test set
	y_pred_train = aboost.predict(x_train)
	y_pred_test = aboost.predict(x_test)
	a_error_train = 1-accuracy_score(y_train, y_pred_train)
	a_error_test = 1-accuracy_score(y_test, y_pred_test)
	print("AdaBoost train error: %s test error: %s" % (a_error_train, a_error_test))

	y_pred_train = lboost.predict(x_train)
	y_pred_test = lboost.predict(x_test)
	l_error_train = 1-accuracy_score(y_train, y_pred_train)
	l_error_test = 1-accuracy_score(y_test, y_pred_test)
	print("LogitBoost train error: %s test error: %s"%(l_error_train, l_error_test))


	if plot:
		plt.figure()
		x_values = range(100, M*100+1, 100)
		plt.plot(x_values, a_train_cverror_mean, label="AdaBoost")
		plt.fill_between(x_values,
						a_train_cverror_mean + a_train_cverror_std,
						a_train_cverror_mean - a_train_cverror_std,
						alpha=0.5, edgecolor='blue', facecolor='blue')
		plt.plot(x_values, l_train_cverror_mean, label="LogitBoost")
		plt.fill_between(x_values,
						l_train_cverror_mean + l_train_cverror_std,
						l_train_cverror_mean - l_train_cverror_std,
						alpha=0.5, edgecolor='#FF9848', facecolor='#FF9848')
		plt.xlabel('T (number of iterations/classifiers)')
		plt.ylabel('10-fold cross-validation train error')
		plt.legend()
		plt.ylim(0, 0.5)
		plt.savefig('B.i_cverror.png')
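# Hypothetical entry point (not in the original snippet): run the experiment
# with the defaults documented in the docstring above.
if __name__ == "__main__":
	main(plot=True, M=8, n_fold=10)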
Example #12
def ensemble_comp(X_train, X_test, y_train, y_test, title):
    clf1 = XGBClassifier(learning_rate=0.01,
                         max_depth=3,
                         n_estimators=700,
                         random_state=8)
    clf2 = GradientBoostingClassifier(learning_rate=0.01,
                                      max_depth=4,
                                      max_features='log2',
                                      min_samples_leaf=4,
                                      n_estimators=280,
                                      subsample=0.25,
                                      random_state=8)
    clf3 = RandomForestClassifier(n_estimators=300,
                                  max_depth=3,
                                  verbose=1,
                                  random_state=8)
    clf4 = SVC(kernel='poly', probability=True, verbose=1, random_state=8)
    clf5 = KNeighborsClassifier(n_neighbors=3)
    clf6 = MLPClassifier(hidden_layer_sizes=(80, ),
                         activation='logistic',
                         learning_rate_init=0.01,
                         verbose=1)
    clf7 = AdaBoostClassifier(n_estimators=300,
                              learning_rate=0.01,
                              random_state=8)
    clf8 = LogitBoost(n_estimators=300, learning_rate=0.01, random_state=8)
    complete_voting_model = VotingClassifier(estimators=[('xgb', clf1),
                                                         ('gb', clf2),
                                                         ('rf', clf3),
                                                         ('svm', clf4),
                                                         ('knn', clf5),
                                                         ('elm', clf6),
                                                         ('ada', clf7),
                                                         ('logit', clf8)],
                                             voting='soft')
    new_voting_model = VotingClassifier(estimators=[
        ('svm', clf4), ('knn', clf5), ('elm', clf6), ('ada', clf7),
        ('logit', clf8)
    ],
                                        voting='soft')
    new_voting_model_without_logit = VotingClassifier(estimators=[
        ('svm', clf4), ('knn', clf5), ('elm', clf6), ('ada', clf7)
    ],
                                                      voting='soft')
    original_voting_model = VotingClassifier(estimators=[('xgb', clf1),
                                                         ('gb', clf2),
                                                         ('rf', clf3)],
                                             voting='soft')

    complete_voting_model.fit(X_train, y_train)
    new_voting_model.fit(X_train, y_train)
    new_voting_model_without_logit.fit(X_train, y_train)
    original_voting_model.fit(X_train, y_train)

    p_complete_voting = complete_voting_model.predict_proba(X_test)
    p_new_voting = new_voting_model.predict_proba(X_test)
    p_new_voting_without_logit = new_voting_model_without_logit.predict_proba(
        X_test)
    p_original_voting = original_voting_model.predict_proba(X_test)

    complete_voting_ll = log_loss(y_test, p_complete_voting)
    new_voting_ll = log_loss(y_test, p_new_voting)
    new_voting_without_logit_ll = log_loss(y_test, p_new_voting_without_logit)
    original_voting_ll = log_loss(y_test, p_original_voting)

    # Re-create the unfitted ensembles so the cross-validation below does not reuse the models fitted above
    complete_voting_model = VotingClassifier(estimators=[('xgb', clf1),
                                                         ('gb', clf2),
                                                         ('rf', clf3),
                                                         ('svm', clf4),
                                                         ('knn', clf5),
                                                         ('elm', clf6),
                                                         ('ada', clf7),
                                                         ('logit', clf8)],
                                             voting='soft')
    new_voting_model = VotingClassifier(estimators=[
        ('svm', clf4), ('knn', clf5), ('elm', clf6), ('ada', clf7),
        ('logit', clf8)
    ],
                                        voting='soft')
    new_voting_model_without_logit = VotingClassifier(estimators=[
        ('svm', clf4), ('knn', clf5), ('elm', clf6), ('ada', clf7)
    ],
                                                      voting='soft')
    original_voting_model = VotingClassifier(estimators=[('xgb', clf1),
                                                         ('gb', clf2),
                                                         ('rf', clf3)],
                                             voting='soft')

    X = np.concatenate((X_train, X_test), axis=0)
    y = np.concatenate((y_train, y_test))
    complete_voting_accuracy = np.mean(
        cross_val_score(complete_voting_model, X, y, cv=5))
    new_voting_accuracy = np.mean(cross_val_score(new_voting_model, X, y,
                                                  cv=5))
    new_voting_without_logit_accuracy = np.mean(
        cross_val_score(new_voting_model_without_logit, X, y, cv=5))
    original_voting_accuracy = np.mean(
        cross_val_score(original_voting_model, X, y, cv=5))

    titlestr = "\n" + title
    str2 = "\nTotal Ensemble Log Loss " + str(complete_voting_ll)
    str3 = "\nNew Ensemble Log Loss " + str(new_voting_ll)
    str4 = "\nNew Ensemble without LogitBoost Log Loss " + str(
        new_voting_without_logit_ll)
    str5 = "\nOriginal Ensemble (Gradient Boost, Random Forest, and XGBoost) Log Loss " + str(
        original_voting_ll)
    str6 = "\n\nTotal Ensemble Mean Cross Fold Accuracy " + str(
        complete_voting_accuracy)
    str7 = "\nNew Ensemble Mean Cross Fold Accuracy " + str(
        new_voting_accuracy)
    str8 = "\nNew Ensemble without LogitBoost Mean Cross Fold Accuracy " + str(
        new_voting_without_logit_accuracy)
    str9 = "\nOriginal Ensemble (Gradient Boost, Random Forest, and XGBoost) Mean Accuracy " + str(
        original_voting_accuracy)
    lenstr = "\nAverage size of fold: " + str(len(y) / 5)
    printstr = titlestr + str2 + str3 + str4 + str5 + str6 + str7 + str8 + str9 + lenstr
    print(printstr)
    write_to_file(printstr)
Example #13
def model_comp(X_train, X_test, y_train, y_test, title=""):
    xgboost_model = XGBClassifier(learning_rate=0.01,
                                  max_depth=3,
                                  n_estimators=700,
                                  random_state=8)
    gradient_boost_model = GradientBoostingClassifier(learning_rate=0.01,
                                                      max_depth=4,
                                                      max_features='log2',
                                                      min_samples_leaf=4,
                                                      n_estimators=280,
                                                      subsample=0.25,
                                                      random_state=8)
    random_forest_model = RandomForestClassifier(n_estimators=300,
                                                 max_depth=3,
                                                 verbose=1,
                                                 random_state=8)
    svm_model = SVC(kernel='poly', probability=True, verbose=1, random_state=8)
    knn_model = KNeighborsClassifier(n_neighbors=3)
    elm_model = MLPClassifier(hidden_layer_sizes=(80, ),
                              activation='logistic',
                              learning_rate_init=0.01,
                              verbose=1)
    adaboost_model = AdaBoostClassifier(n_estimators=300,
                                        learning_rate=0.01,
                                        random_state=8)
    logitboost_model = LogitBoost(n_estimators=300,
                                  learning_rate=0.01,
                                  random_state=8)

    xgboost_model.fit(X_train, y_train)
    gradient_boost_model.fit(X_train, y_train)
    random_forest_model.fit(X_train, y_train)
    svm_model.fit(X_train, y_train)
    knn_model.fit(X_train, y_train)
    elm_model.fit(X_train, y_train)
    adaboost_model.fit(X_train, y_train)
    logitboost_model.fit(X_train, y_train)

    p_random_forest = random_forest_model.predict_proba(X_test)
    p_gradient_boost = gradient_boost_model.predict_proba(X_test)
    p_xgboost = xgboost_model.predict_proba(X_test)
    p_svm = svm_model.predict_proba(X_test)
    p_knn = knn_model.predict_proba(X_test)
    p_elm = elm_model.predict_proba(X_test)
    p_adaboost = adaboost_model.predict_proba(X_test)
    p_logitboost = logitboost_model.predict_proba(X_test)

    random_forest_ll = log_loss(y_test, p_random_forest)
    gradient_boost_ll = log_loss(y_test, p_gradient_boost)
    xgboost_ll = log_loss(y_test, p_xgboost)
    svm_ll = log_loss(y_test, p_svm)
    knn_ll = log_loss(y_test, p_knn)
    elm_ll = log_loss(y_test, p_elm)
    adaboost_ll = log_loss(y_test, p_adaboost)
    logitboost_ll = log_loss(y_test, p_logitboost)

    strng0 = "\n" + title
    strtest = "\nLength of test data: " + str(len(y_test))
    strng2 = "\n------------------"
    strng4 = "\nGradient Boost Log Loss " + str(gradient_boost_ll)
    strng5 = "\nRandom Forest Log Loss " + str(random_forest_ll)
    strng6 = "\nXGBoost Log Loss " + str(xgboost_ll)
    strng7 = "\n------------------"
    strng9 = "\nSVM Log Loss " + str(svm_ll)
    strng10 = "\nKNN Log Loss " + str(knn_ll)
    strng11 = "\nELM Log Loss " + str(elm_ll)
    strng12 = "\nAdaBoost Log Loss " + str(adaboost_ll)
    strng13 = "\nLogitBoost Log Loss " + str(logitboost_ll)
    prntstr = strng0 + strtest + strng2 + strng4 + strng5 + strng6 + strng7 + strng9 + strng10 + strng11 + strng12 + strng13
    print(prntstr)
    write_to_file(prntstr)

    return xgboost_model, random_forest_model, adaboost_model