Пример #1
0
def bids_work(fit, validate, tune=False):
    """Fit and cross-validate bid-level classifiers plus a soft-voting ensemble.

    Parameters
    ----------
    fit : training split; converted to features via bids_to_features and
        forwarded to Model.standard_prepare and cv_fit.
    validate : validation split forwarded to Model.standard_prepare and cv_fit.
    tune : bool, forwarded to cv_fit (presumably enables hyper-parameter
        tuning there -- confirm against cv_fit).

    Returns
    -------
    (model, eclf) : the prepared Model and the ensemble classifier.
    """
    print("Fit/predict bids")
    model = Model()
    X, y, bidder_ids = bids_to_features(fit)
    model.standard_prepare(X, y, fit, validate)

    clf1 = linear_model.LogisticRegression(penalty='l2', C=100.0)
    clf2 = RandomForestClassifier(n_estimators=20)
    clf3 = svm.SVC(probability=True, kernel='rbf', C=1, gamma=0.0001)
    clf4 = GradientBoostingClassifier(loss='exponential')
    # NOTE: the original also built an ElasticNet and a KNeighborsClassifier
    # here, but neither was ever passed to cv_fit; they have been removed.
    clf7 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                              algorithm="SAMME.R",
                              n_estimators=2000,
                              learning_rate=0.01)

    # Soft-voting ensemble over the logistic-regression and random-forest models.
    eclf = EnsembleClassifier(clfs=[clf1, clf2], voting='soft')

    cv_fit(model.X_scaled, model.y_scaled, fit, validate,
           [clf1, clf2, clf3, clf4, clf7, eclf], [
               'LogisticRegression', 'RandomForest', 'SVC', 'GradientBoosting',
               'AdaBoost', 'EnsembleClassifier'
           ], tune)

    return model, eclf
Пример #2
0
def test_EnsembleClassifier_weights():
    """A weighted soft-voting ensemble reaches a mean CV accuracy of 0.93."""
    np.random.seed(123)
    base_estimators = [LogisticRegression(),
                       RandomForestClassifier(),
                       GaussianNB()]
    ensemble = EnsembleClassifier(clfs=base_estimators,
                                  voting='soft',
                                  weights=[1, 2, 10])
    cv_scores = cross_validation.cross_val_score(
        ensemble, X, y, cv=5, scoring='accuracy')
    assert round(cv_scores.mean(), 2) == 0.93
Пример #3
0
def test_EnsembleClassifier_gridsearch_enumerate_names():
    """Grid search accepts '-1'/'-2' suffixed names for duplicated estimator types."""
    lr = LogisticRegression(random_state=1)
    rf = RandomForestClassifier(random_state=1)
    # The logistic regression appears twice, so its parameters must be
    # addressed with enumerated names in the grid.
    ensemble = EnsembleClassifier(clfs=[lr, lr, rf], voting='soft')

    param_grid = {
        'logisticregression-1__C': [1.0, 100.0],
        'logisticregression-2__C': [1.0, 100.0],
        'randomforestclassifier__n_estimators': [5, 20],
    }

    searcher = GridSearchCV(estimator=ensemble, param_grid=param_grid, cv=5)
    searcher.fit(iris.data, iris.target)
Пример #4
0
def test_EnsembleClassifier_gridsearch():
    """Grid search over ensemble hyper-parameters reproduces known CV means."""
    lr = LogisticRegression(random_state=1)
    rf = RandomForestClassifier(random_state=1)
    nb = GaussianNB()
    ensemble = EnsembleClassifier(clfs=[lr, rf, nb], voting='soft')

    param_grid = {
        'logisticregression__C': [1.0, 100.0],
        'randomforestclassifier__n_estimators': [20, 200],
    }

    searcher = GridSearchCV(estimator=ensemble, param_grid=param_grid, cv=5)
    searcher.fit(iris.data, iris.target)

    # grid_scores_ yields (params, mean_score, cv_scores) triples
    # (pre-0.18 scikit-learn API).
    mean_scores = [round(mean_score, 2)
                   for _, mean_score, _ in searcher.grid_scores_]
    assert mean_scores == [0.95, 0.96, 0.96, 0.95]
Пример #5
0
def work(fit, validate, tune=False):
    """Fit and cross-validate bidder-level classifiers plus a soft-voting ensemble.

    Parameters
    ----------
    fit : training split; converted to features via bidder_to_features and
        forwarded to Model.prepare and cv_fit.
    validate : validation split forwarded to Model.prepare and cv_fit.
    tune : bool, forwarded to cv_fit (presumably enables hyper-parameter
        tuning there -- confirm against cv_fit).

    Returns
    -------
    (model, eclf) : the prepared Model and the ensemble classifier.
    """
    print("Fit/predict bidders")

    model = Model()
    X, y, ids = bidder_to_features(fit)
    model.prepare(X, y, ids, fit, validate)

    clf1 = linear_model.LogisticRegression(penalty='l2', C=100.0)
    clf2 = RandomForestClassifier(n_estimators=20)
    clf3 = svm.SVC(probability=True, kernel='rbf', C=1, gamma=0.0001)
    clf4 = GradientBoostingClassifier(loss='exponential',
                                      subsample=1,
                                      max_features='log2',
                                      learning_rate=0.1)
    # NOTE: the original also built an ElasticNet and a BaggingClassifier
    # here, but neither was ever passed to cv_fit; they have been removed
    # along with a stale commented-out per-bidder-set loop.
    clf7 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                              algorithm="SAMME.R",
                              n_estimators=2000,
                              learning_rate=0.01)

    # Soft-voting ensemble over the logistic-regression and random-forest models.
    eclf = EnsembleClassifier(clfs=[clf1, clf2], voting='soft')

    cv_fit(model.X_scaled, model.y_scaled, fit, validate,
           [clf1, clf2, clf3, clf4, clf7, eclf], [
               'LogisticRegression', 'RandomForest', 'SVC', 'GradientBoosting',
               'AdaBoost', 'EnsembleClassifier'
           ], tune)

    return model, eclf
Пример #6
0
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    vectors, labels, test_size=0.2, random_state=0)

# Pre-trained base classifiers restored from disk.
svm = joblib.load("../classifier/svm.classifier")
lg = joblib.load("../classifier/lg.classifier")
gnb = joblib.load("../classifier/naive_bayes.classifier")

# One row per weight combination: the three weights and the CV accuracy stats.
df = pandas.DataFrame(columns=('w1', 'w2', 'w3', 'mean', 'std'))
i = 0
for w1 in range(1, 4):
    for w2 in range(1, 4):
        for w3 in range(1, 4):
            # Skip combinations where all three weights are equal -- they
            # are equivalent to an unweighted vote.
            if len(set((w1, w2, w3))) == 1:
                continue

            eclf = EnsembleClassifier(clfs=[svm, lg, gnb],
                                      voting='soft',
                                      weights=[w1, w2, w3])
            scores = cross_validation.cross_val_score(estimator=eclf,
                                                      X=X_test,
                                                      y=y_test,
                                                      cv=5,
                                                      scoring='accuracy',
                                                      n_jobs=1)
            df.loc[i] = [w1, w2, w3, scores.mean(), scores.std()]
            i += 1

# BUG FIX: DataFrame.sort returns a new sorted frame; the original discarded
# that return value and printed the unsorted table. Assign it back so the
# best weight combinations appear first.
df = df.sort(columns=['mean', 'std'], ascending=False)
print(df)
print("Preparing data")
# Each training row carries the label in column 1 and the feature vector
# in column 2 (row[0] unused here).
vectors = [row[2] for row in trainingData.data]
labels = [row[1] for row in trainingData.data]

X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    vectors, labels, test_size=0.2, random_state=0)

# Pre-trained base classifiers restored from disk.
svm = joblib.load("../classifier/svm.classifier")
lg = joblib.load("../classifier/lg.classifier")
gnb = joblib.load("../classifier/naive_bayes.classifier")

# Unweighted soft-voting ensemble over the three loaded models.
eclf = EnsembleClassifier(clfs=[svm, lg, gnb], voting='soft')

# Report 5-fold CV accuracy for each base model and for the ensemble.
labelled = zip([svm, lg, gnb, eclf],
               ['SVM', 'Logistic Regression', 'Naive Bayes', 'Ensemble'])
for clf, label in labelled:
    scores = cross_validation.cross_val_score(clf,
                                              X_test,
                                              y_test,
                                              cv=5,
                                              scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" %
          (scores.mean(), scores.std(), label))

print(classification_report(y_test, eclf.predict(X_test)))