def bids_work(fit, validate, tune=False):
    print "Fit/predict bids"
    model = Model()
    X, y, bidder_ids = bids_to_features(fit)
    model.standard_prepare(X, y, fit, validate)

    # Candidate classifiers (clf5 and clf6 are set up but not cross-validated below).
    clf1 = linear_model.LogisticRegression(penalty='l2', C=100.0)
    clf2 = RandomForestClassifier(n_estimators=20)
    clf3 = svm.SVC(probability=True, kernel='rbf', C=1, gamma=0.0001)
    clf4 = GradientBoostingClassifier(loss='exponential')
    clf5 = linear_model.ElasticNet(alpha=0.1, l1_ratio=0.7)
    clf6 = KNeighborsClassifier()
    clf7 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                              algorithm="SAMME.R",
                              n_estimators=2000,
                              learning_rate=0.01)
    eclf = EnsembleClassifier(clfs=[clf1, clf2], voting='soft')

    # Cross-validate each candidate plus the soft-voting ensemble.
    cv_fit(model.X_scaled, model.y_scaled, fit, validate,
           [clf1, clf2, clf3, clf4, clf7, eclf],
           ['LogisticRegression', 'RandomForest', 'SVC',
            'GradientBoosting', 'AdaBoost', 'EnsembleClassifier'],
           tune)
    return model, eclf
def test_EnsembleClassifier_weights():
    np.random.seed(123)
    clf1 = LogisticRegression()
    clf2 = RandomForestClassifier()
    clf3 = GaussianNB()
    eclf = EnsembleClassifier(clfs=[clf1, clf2, clf3], voting='soft',
                              weights=[1, 2, 10])
    scores = cross_validation.cross_val_score(eclf, X, y, cv=5,
                                              scoring='accuracy')
    scores_mean = round(scores.mean(), 2)
    assert(scores_mean == 0.93)
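# A minimal sketch, not the EnsembleClassifier implementation itself, of what
# 'soft' voting with weights such as [1, 2, 10] is assumed to compute: the
# weighted average of each classifier's predict_proba output, followed by an
# argmax over classes. The names `soft_vote`, `fitted_clfs` and `weights` are
# illustrative only.
import numpy as np

def soft_vote(fitted_clfs, weights, X):
    # Per-classifier class probabilities, shape (n_clfs, n_samples, n_classes).
    probas = np.asarray([clf.predict_proba(X) for clf in fitted_clfs])
    # Weighted average across classifiers, then pick the most probable class.
    avg = np.average(probas, axis=0, weights=weights)
    return np.argmax(avg, axis=1)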
def test_EnsembleClassifier_gridsearch_enumerate_names():
    clf1 = LogisticRegression(random_state=1)
    clf2 = RandomForestClassifier(random_state=1)
    eclf = EnsembleClassifier(clfs=[clf1, clf1, clf2], voting='soft')

    params = {'logisticregression-1__C': [1.0, 100.0],
              'logisticregression-2__C': [1.0, 100.0],
              'randomforestclassifier__n_estimators': [5, 20]}

    grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
    gs = grid.fit(iris.data, iris.target)
def test_EnsembleClassifier_gridsearch():
    clf1 = LogisticRegression(random_state=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    eclf = EnsembleClassifier(clfs=[clf1, clf2, clf3], voting='soft')

    params = {'logisticregression__C': [1.0, 100.0],
              'randomforestclassifier__n_estimators': [20, 200]}

    grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
    grid.fit(iris.data, iris.target)

    mean_scores = []
    for params, mean_score, scores in grid.grid_scores_:
        mean_scores.append(round(mean_score, 2))
    assert(mean_scores == [0.95, 0.96, 0.96, 0.95])
def work(fit, validate, tune=False):
    print "Fit/predict bidders"
    model = Model()
    X, y, ids = bidder_to_features(fit)
    model.prepare(X, y, ids, fit, validate)

    # Candidate classifiers (clf5 and clf6 are set up but not cross-validated below).
    clf1 = linear_model.LogisticRegression(penalty='l2', C=100.0)
    clf2 = RandomForestClassifier(n_estimators=20)
    clf3 = svm.SVC(probability=True, kernel='rbf', C=1, gamma=0.0001)
    clf4 = GradientBoostingClassifier(loss='exponential', subsample=1,
                                      max_features='log2', learning_rate=0.1)
    clf5 = linear_model.ElasticNet(alpha=0.1, l1_ratio=0.7)
    clf6 = BaggingClassifier(n_jobs=7)
    clf7 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                              algorithm="SAMME.R",
                              n_estimators=2000,
                              learning_rate=0.01)
    eclf = EnsembleClassifier(clfs=[clf1, clf2], voting='soft')

    # bidder_sets = split_bidders()
    # for bs in bidder_sets:
    #     model.apply_set(bs)
    # cv_fit(model.uX_scaled, model.uy_scaled, fit, validate, clf1, clf2, clf3, clf4, eclf)

    cv_fit(model.X_scaled, model.y_scaled, fit, validate,
           [clf1, clf2, clf3, clf4, clf7, eclf],
           ['LogisticRegression', 'RandomForest', 'SVC',
            'GradientBoosting', 'AdaBoost', 'EnsembleClassifier'],
           tune)
    # cv_fit(model.X_scaled, model.y_scaled, fit, validate, [eclf],
    #        ['BaggingClassifier'], tune)
    return model, eclf
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    vectors, labels, test_size=0.2, random_state=0)

svm = joblib.load("../classifier/svm.classifier")
lg = joblib.load("../classifier/lg.classifier")
gnb = joblib.load("../classifier/naive_bayes.classifier")

df = pandas.DataFrame(columns=('w1', 'w2', 'w3', 'mean', 'std'))

i = 0
for w1 in range(1, 4):
    for w2 in range(1, 4):
        for w3 in range(1, 4):
            # Skip combinations where all three weights are equal;
            # they are equivalent to the unweighted ensemble.
            if len(set((w1, w2, w3))) == 1:
                continue
            eclf = EnsembleClassifier(clfs=[svm, lg, gnb], voting='soft',
                                      weights=[w1, w2, w3])
            scores = cross_validation.cross_val_score(estimator=eclf,
                                                      X=X_test,
                                                      y=y_test,
                                                      cv=5,
                                                      scoring='accuracy',
                                                      n_jobs=1)
            df.loc[i] = [w1, w2, w3, scores.mean(), scores.std()]
            i += 1

# DataFrame.sort returns a new frame, so keep the sorted result before printing.
df = df.sort(columns=['mean', 'std'], ascending=False)
print df
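# A small follow-up sketch, assuming the sorted DataFrame `df` built above:
# take the best-scoring weight combination and rebuild the ensemble with it.
# The names `best` and `best_eclf` are illustrative only, not part of the
# original script.
best = df.iloc[0]
best_eclf = EnsembleClassifier(clfs=[svm, lg, gnb], voting='soft',
                               weights=[int(best['w1']), int(best['w2']),
                                        int(best['w3'])])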
print "Preparing data" vectors = [] labels = [] for row in trainingData.data: vectors.append(row[2]) labels.append(row[1]) X_train, X_test, y_train, y_test = cross_validation.train_test_split( vectors, labels, test_size=0.2, random_state=0) svm = joblib.load("../classifier/svm.classifier") lg = joblib.load("../classifier/lg.classifier") gnb = joblib.load("../classifier/naive_bayes.classifier") eclf = EnsembleClassifier(clfs=[svm, lg, gnb], voting='soft') for clf, label in zip( [svm, lg, gnb, eclf], ['SVM', 'Logistic Regression', 'Naive Bayes', 'Ensemble']): scores = cross_validation.cross_val_score(clf, X_test, y_test, cv=5, scoring='accuracy') print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label)) print classification_report(y_test, eclf.predict(X_test))