Example #1
import math

from sklearn.ensemble import AdaBoostClassifier as ada
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV


def train_and_test(X_train, y_train, X_test, y_test):
    '''
    Grid-search an AdaBoost classifier and report test-set metrics.
    :param X_train: training features
    :param y_train: training labels
    :param X_test:  testing features
    :param y_test:  testing labels
    :return: None
    '''
    ''' Refs: https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html
        https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV '''
    # Leftover from an earlier LinearSVC grid search; unused in this AdaBoost version
    C, max_iter = [math.pow(2, i) for i in range(-4, 8)], [1e6]

    # Parameter tuning search scheme
    param_grid = [{"n_estimators": [10, 50, 100, 300, 500, 1000]}]
    ''' Note: the reference below concerns LinearSVC and is leftover from an
        earlier SVM version of this script:
        https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC
        LinearSVC implements a "one-vs-the-rest" multi-class strategy
        (preferred to "one-vs-one" because of significantly lower runtime for
        similar results). '''

    # Search for optimized parameters and fit the model
    clf = GridSearchCV(
        ada(),  # n_estimators is supplied by param_grid
        param_grid,
        scoring='accuracy',
        cv=3)  # the iid parameter was deprecated and removed in scikit-learn 0.24

    clf.fit(X_train, y_train)
    print('Best parameter set found on the training set:\n',
          clf.best_params_)

    print('\nGrid (mean accuracy) scores on training set:\n')  # Print score
    means = clf.cv_results_['mean_test_score']
    for mean, params in zip(means, clf.cv_results_['params']):
        print("%0.3f for %r" % (mean, params))

    print('\nDetailed classification report:\n')
    y_pred = clf.predict(X_test)
    # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html#sklearn.metrics.classification_report
    print(classification_report(y_test, y_pred))
    # https://scikit-learn.org/stable/modules/classes.html#classification-metrics
    print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
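
A quick way to exercise this function is on a synthetic dataset. The driver below is a hypothetical sketch, not part of the original example; it relies only on stock scikit-learn utilities:

# Hypothetical smoke test for train_and_test on synthetic data
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=2000, n_features=20, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)
train_and_test(X_train, y_train, X_test, y_test)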
Example #2
File: test.py Project: Beathmart/ml
import numpy as np
import pandas as pd


def adaboost_test():
    n_samples = 5000
    n_features = 10
    n_informative = 8
    random_state = 19
    n_clusters_per_class = 1
    max_depth = 3

    from sklearn.datasets import make_classification
    X, y = make_classification(n_samples=n_samples,
                               n_features=n_features,
                               n_informative=n_informative,
                               n_redundant=0,
                               n_clusters_per_class=n_clusters_per_class,
                               n_classes=2,
                               random_state=random_state,
                               class_sep=1.0)
    columns = []
    for i in range(1, n_features + 1):
        columns.append('f%d' % i)
    columns.append('y')
    df = pd.DataFrame(np.concatenate((X, y.reshape(-1, 1)), axis=1),
                      columns=columns)
    df['y'] = df['y'].astype('int')
    X = df.drop('y', axis=1)
    y = df['y']

    # negatives = df[df['y'] == 0].drop('y', axis=1).values
    # positives = df[df['y'] == 1].drop('y', axis=1).values
    #
    # plt.scatter(negatives[:, 0], negatives[:, 1], c='blue')
    # plt.scatter(positives[:, 0], positives[:, 1], c='red')
    # plt.show()

    from sklearn.model_selection import train_test_split
    X_train, X_vali, y_train, y_vali = train_test_split(
        X, y, test_size=0.3, random_state=random_state)

    from sklearn.tree import DecisionTreeClassifier
    dtc = DecisionTreeClassifier(max_depth=max_depth)
    dtc.fit(X_train, y_train)
    print('DecisionTreeClassifier score: ', dtc.score(X_vali, y_vali))

    from ml.ensemble import AdaBoostClassifier
    abc = AdaBoostClassifier(DecisionTreeClassifier(max_depth=max_depth),
                             n_estimators=50)
    abc.fit(X_train, y_train)
    print('AdaBoostClassifier score: ', abc.score(X_vali, y_vali))

    from sklearn.ensemble import AdaBoostClassifier as ada
    ad = ada(DecisionTreeClassifier(max_depth=max_depth), n_estimators=50)
    ad.fit(X_train, y_train)
    print('sklearn adaboost score: ', ad.score(X_vali, y_vali))
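
To run this comparison as a standalone script, a minimal entry point (a hypothetical addition, assuming the project-local ml.ensemble module is importable) is:

# Hypothetical entry point for the comparison above
if __name__ == '__main__':
    adaboost_test()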
Example #3
import math

from sklearn.ensemble import AdaBoostClassifier as ada
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV


def train_and_test(X_train, y_train, X_test, y_test):
    ''' https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html
        https://stats.stackexchange.com/questions/31066/what-is-the-influence-of-c-in-svms-with-linear-kernel
        https://stats.stackexchange.com/questions/43943/which-search-range-for-determining-svm-optimal-c-and-gamma-parameters
        https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV '''
    # Leftover from an earlier LinearSVC grid search; unused in this AdaBoost version
    C, max_iter = [math.pow(2, i) for i in range(-4, 8)], [1e6]
    #param_grid = [{}]
    #param_grid = [{'loss': ['squared_hinge'], 'dual': [False],
    #'C': C, 'max_iter': max_iter},
    #{'loss': ['hinge'], 'C': C, 'max_iter': max_iter}]
    param_grid = [{
        "n_estimators": [10]
    }, {
        "n_estimators": [50]
    }, {
        "n_estimators": [100]
    }, {
        "n_estimators": [300]
    }, {
        "n_estimators": [500]
    }, {
        "n_estimators": [1000]
    }]
    #param_grid = [{"n_estimators": [10,50,100,300,500,1000]}]
    ''' Note: the reference below concerns LinearSVC and is leftover from an
        earlier SVM version of this script:
        https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC
        LinearSVC implements a "one-vs-the-rest" multi-class strategy
        (preferred to "one-vs-one" because of significantly lower runtime for
        similar results). '''
    # Search for optimized parameters and fit the model
    clf = GridSearchCV(
        ada(),  # n_estimators is supplied by param_grid
        param_grid,
        scoring='accuracy',
        cv=3)  # the iid parameter was deprecated and removed in scikit-learn 0.24

    clf.fit(X_train, y_train)
    print('Best parameter set found on the training set:\n', clf.best_params_)

    print('\nGrid (mean accuracy) scores on training set:\n')
    means = clf.cv_results_['mean_test_score']
    for mean, params in zip(means, clf.cv_results_['params']):
        print("%0.3f for %r" % (mean, params))

    print('\nDetailed classification report:\n')
    y_pred = clf.predict(X_test)
    # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html#sklearn.metrics.classification_report
    print(classification_report(y_test, y_pred))
    # https://scikit-learn.org/stable/modules/classes.html#classification-metrics
    print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
Example #4
    # Use every column except the label as a feature
    allData_X = allData.drop('Class', axis=1)

    result_file.write("\nUsing features:\n" + str(allData_X.axes[1].tolist()))
    result_file.write("\n\nTrain Split: " + str(1 - test_split))
    result_file.write("\nTest Split: " + str(test_split))
    allData_y = allData['Class']

    train_X, test_X, train_y, test_y = train_test_split(allData_X,
                                                        allData_y,
                                                        test_size=test_split,
                                                        random_state=0)

    # Up-weight the positive class ('weight' is defined earlier in the script)
    weight_list = (train_y * weight + 1)
    lr = LogisticRegression(penalty='l1', solver='liblinear')  # liblinear supports L1
    ab_clf = ada(DecisionTreeClassifier(max_depth=1), n_estimators=50)
    rf_clf = rfc(n_estimators=50)

    print("Starting Logistic Regression")
    lr.fit(train_X, train_y, sample_weight=weight_list)
    predictions = lr.predict(test_X)

    lr_recall_score = recall_score(test_y, predictions)
    lr_report = classification_report(test_y, predictions)

    lr_f1 = f1_score(test_y, predictions)
    result_file.write("\n\nLogistic Regression:")

    print(lr_report)
    result_file.write("\n" + lr_report)
    print("Finished Logistic Regression")
Example #5
final_model = grid_search.best_estimator_

#Evaluate the best model on the test data
final_model.fit(train_features, train_labels)
preds = final_model.predict_proba(test_features)[:, 1]
baseline_auc88 = roc_auc_score(test_labels, preds)
print(
    'The final tuned KNN_model scores {:.5f} ROC AUC on the test set.'.format(
        baseline_auc88))

# In[ ]:

#------<Model 9: AdaBoost>----------
from sklearn.ensemble import AdaBoostClassifier as ada
#Establish a baseline model
base_ada = ada()

# Default hyperparameters
hyperparameters = base_ada.get_params()
print(hyperparameters)

ada_scores = cross_val_score(base_ada,
                             train_features,
                             train_labels,
                             scoring='roc_auc',
                             cv=10)
print('The mean AUC for AdaBoost is:', ada_scores.mean())
base_ada.fit(train_features, train_labels)

# Actual class predictions
ada_predictions = base_ada.predict(test_features)
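
The fragment stops at hard class predictions. To score the AdaBoost baseline on the same ROC AUC metric used for the KNN model above, one hypothetical follow-up (reusing the variables already defined in this script) is:

# Hypothetical follow-up: ROC AUC for the AdaBoost baseline, mirroring the KNN cell
from sklearn.metrics import roc_auc_score

ada_probs = base_ada.predict_proba(test_features)[:, 1]
print('The baseline AdaBoost model scores {:.5f} ROC AUC on the test set.'.format(
    roc_auc_score(test_labels, ada_probs)))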
Example #6
y = np.load('data/y_boston.npy')  # X is loaded earlier in the script
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
regressors = [
    lr(),
    bay(),
    rr(alpha=.5, random_state=0),
    l(alpha=0.1, random_state=0),
    ll(),
    knn(),
    ard(),
    rfr(random_state=0, n_estimators=100),
    SVR(gamma='scale', kernel='rbf'),
    rcv(fit_intercept=False),
    en(random_state=0),
    dtr(random_state=0),
    ada(random_state=0),
    gbr(random_state=0)
]
print('unscaled:', br)  # 'br' is a separator string defined earlier in the script
for reg in regressors:
    reg.fit(X_train, y_train)
    rmse, name = get_error(reg, X_test, y_test)
    name = reg.__class__.__name__  # overrides the name returned by get_error
    print(name + '(rmse):', end=' ')
    print(rmse)
print()
print('scaled:', br)
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)  # transform only: do not refit the scaler on test data
for reg in regressors:
    reg.fit(X_train_std, y_train)
    rmse, name = get_error(reg, X_test_std, y_test)
    name = reg.__class__.__name__
    print(name + '(rmse):', end=' ')
    print(rmse)
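
The helper get_error comes from elsewhere in this project. A plausible minimal stand-in, assuming it returns the RMSE and the regressor's name, is the hypothetical sketch below:

# Hypothetical stand-in for the project's get_error helper
import numpy as np
from sklearn.metrics import mean_squared_error

def get_error(reg, X_test, y_test):
    # Root mean squared error of the fitted regressor on the test split
    rmse = np.sqrt(mean_squared_error(y_test, reg.predict(X_test)))
    return rmse, reg.__class__.__name__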