Example #1
def test_staged_predict():
    """Check staged predictions."""
    # AdaBoost classification
    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg, n_estimators=10)
        clf.fit(iris.data, iris.target)

        predictions = clf.predict(iris.data)
        staged_predictions = [p for p in clf.staged_predict(iris.data)]
        proba = clf.predict_proba(iris.data)
        staged_probas = [p for p in clf.staged_predict_proba(iris.data)]
        score = clf.score(iris.data, iris.target)
        staged_scores = [s for s in clf.staged_score(iris.data, iris.target)]

        assert_equal(len(staged_predictions), 10)
        assert_array_almost_equal(predictions, staged_predictions[-1])
        assert_equal(len(staged_probas), 10)
        assert_array_almost_equal(proba, staged_probas[-1])
        assert_equal(len(staged_scores), 10)
        assert_array_almost_equal(score, staged_scores[-1])

    # AdaBoost regression
    clf = AdaBoostRegressor(n_estimators=10)
    clf.fit(boston.data, boston.target)

    predictions = clf.predict(boston.data)
    staged_predictions = [p for p in clf.staged_predict(boston.data)]
    score = clf.score(boston.data, boston.target)
    staged_scores = [s for s in clf.staged_score(boston.data, boston.target)]

    assert_equal(len(staged_predictions), 10)
    assert_array_almost_equal(predictions, staged_predictions[-1])
    assert_equal(len(staged_scores), 10)
    assert_array_almost_equal(score, staged_scores[-1])
Example #2
File: cook.py, Project: wangchr/eMeriL
def cook():
    x, y, weights = load_data()
    n_components = 200
    svd = TruncatedSVD(n_components, random_state=42)
    x_unweighted = svd.fit_transform(x)
    x_weighted = svd.fit_transform(weighted(x, weights))

    for i in range(9):
        frac = 1 - (i * 0.01 + 0.01)
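        # frac is the held-out fraction (0.99 down to 0.91), so only 1%-9% of the data is used for training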
        print(frac)

        x_train, x_test, y_train, y_test = train_test_split(x_unweighted, y, test_size=frac)
        classifier = AdaBoostClassifier(n_estimators=100)
        classifier.fit(x_train, y_train)
        print("Unweighted: ", classifier.score(x_test, y_test))

        x_train, x_test, y_train, y_test = train_test_split(x_weighted, y, test_size=frac)
        classifier = AdaBoostClassifier(n_estimators=100)
        classifier.fit(x_train, y_train)
        print("Weighted: ", classifier.score(x_test, y_test))

        print('--------------------------')


Example #3
class AdaBoost:
    def __init__(self, data, n_estimators=50, learning_rate=1.0):
        features, weights, labels = data
        self.clf = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
        self.predictions, self.trnaccuracy, self.tstaccuracy = None, None, None
        self.dataset = split_dataset(features, weights, labels)

    def train(self):
        """
        Train AdaBoost on the Higgs dataset.
        """
        self.clf = self.clf.fit(self.dataset['training']['features'], self.dataset['training']['labels'])

    def predict(self):
        """
        Predict labels using AdaBoost.
        :return:
        """
        self.predictions = self.clf.predict(self.dataset['test']['features'])

    def evaluate(self):
        self.trnaccuracy = self.clf.score(self.dataset['training']['features'],
                                          self.dataset['training']['labels'],
                                          sample_weight=self.dataset['training']['weights'])
        self.tstaccuracy = self.clf.score(self.dataset['test']['features'],
                                          self.dataset['test']['labels'],
                                          sample_weight=self.dataset['test']['weights'])
Example #4
def cvalidate():
    targetset = np.genfromtxt(open('trainLabels.csv','r'), dtype='f16')
    y = [x for x in targetset]

    trainset = np.genfromtxt(open('train.csv','r'), delimiter=',', dtype='f16')
    X = np.array([x for x in trainset])
    
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.3, random_state = 0)

    X_train, X_test = decomposition_pca(X_train, X_test)

    #SVM

    c_range = 10.0 ** np.arange(6.5,7.5,.25)
    gamma_range = 10.0 ** np.arange(-2.5,0.5,.25)
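    # C and gamma for the RBF SVC are searched on log-spaced grids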
    parameters = {'kernel':['rbf'], 'C':c_range,  'gamma':gamma_range} 
    svr = SVC()

    clf = grid_search.GridSearchCV(svr, parameters)
    

    clf.fit(X_train, y_train)
    bdt = AdaBoostClassifier(base_estimator=clf.best_estimator_,
                             algorithm="SAMME",
                             n_estimators=100)

    
    #bdt = AdaBoostClassifier(base_estimator = KNeighborsClassifier(n_neighbors=10))
    bdt.fit(X_train, y_train)
    

    print(bdt.score(X_test, y_test))
def test_pickle():
    # Check picklability.
    import pickle

    # Adaboost classifier
    for alg in ['SAMME', 'SAMME.R']:
        obj = AdaBoostClassifier(algorithm=alg)
        obj.fit(iris.data, iris.target)
        score = obj.score(iris.data, iris.target)
        s = pickle.dumps(obj)

        obj2 = pickle.loads(s)
        assert_equal(type(obj2), obj.__class__)
        score2 = obj2.score(iris.data, iris.target)
        assert_equal(score, score2)

    # Adaboost regressor
    obj = AdaBoostRegressor(random_state=0)
    obj.fit(boston.data, boston.target)
    score = obj.score(boston.data, boston.target)
    s = pickle.dumps(obj)

    obj2 = pickle.loads(s)
    assert_equal(type(obj2), obj.__class__)
    score2 = obj2.score(boston.data, boston.target)
    assert_equal(score, score2)
Example #6
def cvalidate():
    from sklearn import cross_validation

    trainset = np.genfromtxt(open('train.csv','r'), delimiter=',')[1:]
    X = np.array([x[1:8] for x in trainset])
    y = np.array([x[8] for x in trainset])
    #print X,y
    import math
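    # crude imputation: replace missing values (NaN) with a fixed filler value of 26.6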
    for i, x in enumerate(X):
        for j, xx in enumerate(x):
            if(math.isnan(xx)):
                X[i][j] = 26.6
   
    #print X[0:3]
    #print y[0:3]
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.3, random_state = 0)

    X_train, X_test = decomposition_pca(X_train, X_test)
    
    bdt = AdaBoostClassifier(base_estimator = KNeighborsClassifier(n_neighbors=20, algorithm = 'auto'), algorithm="SAMME", n_estimators = 200)
    bdt.fit(X_train, y_train)
    
    

    print(bdt.score(X_test, y_test))
Example #7
class Model_Adaboost(object):
    def __init__(self,model,parameter = {"n_estimators" : 50, "CV_size": 0}):
        self.train = model.train
        self.test = model.test
        self.CVsize = float(parameter["CV_size"].get())
        train = np.array(self.train)
        self.X_train = train[:, :-1]
        self.y_train = train[:, -1]
        self.X_train,self.X_CV,self.y_train,self.y_CV = train_test_split(self.X_train, self.y_train, test_size=self.CVsize)
        if self.CVsize == 0:
            self.clf = AdaBoostClassifier(n_estimators = int(parameter["n_estimators"].get()))
        self.model = model

    def fit(self):
        self.clf.fit(self.X_train,self.y_train)

    def score(self):
        pre = self.clf.predict(self.X_train)
        truth = self.y_train
        print ("score: " + str(self.clf.score(self.X_train,truth)))
        print ("f1: " + str(f1_score(truth,pre, average=None)))
        print ("AUC score: " + str(roc_auc_score(truth,pre)))

    def save_results(self):
        pre = self.model.clf.predict(self.model.test)
        df = pd.DataFrame({"predict":pre})
        fileName = tkFileDialog.asksaveasfilename()
        df.to_csv(fileName)

    def crossValidation(self):
        estimatorList = [3,5,7,10,13,15,20,25,30,50]
        bestScore = [0,0] #score,n_estimator
        bestF1ScoreNeg = [0,0]
        bestF1ScorePos = [0,0]
        #bestAUCScore = [0,0]
        for e in estimatorList:
            self.clf = AdaBoostClassifier(n_estimators = e)
            self.clf.fit(self.X_train,self.y_train)
            pre = self.clf.predict(self.X_CV)
            truth = self.y_CV
            score = self.clf.score(self.X_CV,truth)
            if score > bestScore[0]:
                bestScore[0] = score
                bestScore[1] = e

            f1pos = f1_score(truth,pre, average=None)[1]
            if f1pos > bestF1ScorePos[0]:
                bestF1ScorePos[0] = f1pos
                bestF1ScorePos[1] = e

            f1neg = f1_score(truth,pre, average=None)[0]
            if f1neg > bestF1ScoreNeg[0]:
                bestF1ScoreNeg[0] = f1neg
                bestF1ScoreNeg[1] = e

        print ("Adaboost:")
        print ("Best [score,n_estimators] on Cross Validation set: " + str(bestScore))
        print ("Best [f1(pos),n_estimators] on Cross Validation set: " + str(bestF1ScorePos))
        print ("Best [f1(neg),n_estimators] on Cross Validation set" + str(bestF1ScoreNeg))
def adaboost(df,label_name,feature_names,features_len,ifeat,n_estimators=100):
    # TODO: just copied from RF, needs real code
    from sklearn.ensemble import AdaBoostClassifier
    print('---------------------------------------------------')
    print(ifeat,features_len,'Adaboost, features:',feature_names)
    df_train_Y = df[label_name]
    train_Y = df_train_Y.values.ravel()  # turn from 2D to 1D

    df_train_X = df[feature_names]
    train_X = df_train_X.values

    clf = AdaBoostClassifier(n_estimators=n_estimators)
    clf = clf.fit(train_X,train_Y)
    # output = clf.predict(train_X)
    E_in = round(1.-clf.score(train_X, train_Y),5) # 'in sample' error
    #print('\tE_in :',E_in)

    # -----
    # Kfold as estimator for 'out of sample' error
    kf=skl.cross_validation.KFold(n=len(train_X), n_folds=5)
    cv_scores=skl.cross_validation.cross_val_score(clf, train_X, y=train_Y, cv=kf)
    E_out = round(1.-np.mean(cv_scores),5)
    #print("\tE_out:",E_out)

    return E_in,E_out
def test_iris():
    # Check consistency on dataset iris.
    classes = np.unique(iris.target)
    clf_samme = prob_samme = None

    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg)
        clf.fit(iris.data, iris.target)

        assert_array_equal(classes, clf.classes_)
        proba = clf.predict_proba(iris.data)
        if alg == "SAMME":
            clf_samme = clf
            prob_samme = proba
        assert_equal(proba.shape[1], len(classes))
        assert_equal(clf.decision_function(iris.data).shape[1], len(classes))

        score = clf.score(iris.data, iris.target)
        assert score > 0.9, "Failed with algorithm %s and score = %f" % \
            (alg, score)

    # Somewhat hacky regression test: prior to
    # ae7adc880d624615a34bafdb1d75ef67051b8200,
    # predict_proba returned SAMME.R values for SAMME.
    clf_samme.algorithm = "SAMME.R"
    assert_array_less(0,
                      np.abs(clf_samme.predict_proba(iris.data) - prob_samme))
Example #10
def prediction(feat,label):
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(feat, label, test_size = 0.25, random_state = 0)
    num_leaves = []
    accuracy_score = []
    auc_score = []
    # for depth in range(1,10):
    #     clf = tree.DecisionTreeClassifier(max_depth = depth)
    #     clf.fit(x_train,y_train)
    #     predictions = clf.predict(x_test)
    #     accuracy = clf.score(x_test,y_test)
    #     auc = metrics.roc_auc_score(y_test,predictions)
    #     num_leaves.append(depth)
    #     accuracy_score.append(accuracy)
    #     auc_score.append(auc)

    for depth in range(1,10):
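        # boost decision trees with max_depth 1..9 and record held-out accuracy and AUC for each depth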
        clf = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth = depth), n_estimators = 100)
        clf.fit(x_train,y_train)
        predictions = clf.predict(x_test)
        accuracy = clf.score(x_test,y_test)
        auc = metrics.roc_auc_score(y_test,predictions)
        num_leaves.append(depth)
        accuracy_score.append(accuracy)
        auc_score.append(auc)


    return num_leaves,accuracy_score,auc_score
Example #11
def ADA_Classifier(X_train, X_cv, X_test, Y_train,Y_cv,Y_test, Actual_DS):
    print("***************Starting  AdaBoost Classifier***************")
    t0 = time()
    clf = AdaBoostClassifier(n_estimators=300)
    clf.fit(X_train, Y_train)
    preds = clf.predict(X_cv)
    score = clf.score(X_cv,Y_cv)

    print("AdaBoost Classifier - {0:.2f}%".format(100 * score))
    Summary = pd.crosstab(label_enc.inverse_transform(Y_cv), label_enc.inverse_transform(preds),
                      rownames=['actual'], colnames=['preds'])
    Summary['pct'] = (Summary.divide(Summary.sum(axis=1), axis=1)).max(axis=1)*100
    print(Summary)

    #Check with log loss function
    epsilon = 1e-15
    #ll_output = log_loss_func(Y_cv, preds, epsilon)
    preds2 = clf.predict_proba(X_cv)
    ll_output2= log_loss(Y_cv, preds2, eps=1e-15, normalize=True)
    print(ll_output2)
    print("done in %0.3fs" % (time() - t0))

    preds3 = clf.predict_proba(X_test)
    #preds4 = clf.predict_proba((Actual_DS.ix[:,'feat_1':]))
    preds4 = clf.predict_proba(Actual_DS)

    print("***************Ending AdaBoost Classifier***************")
    return pd.DataFrame(preds2) , pd.DataFrame(preds3),pd.DataFrame(preds4)
Example #12
File: q5.py, Project: oryband/homework
def adaboost_skin(X_train, y_train, X_test, y_test):
    """Learn the skin data sets with AdaBoost.

    X_*: Samples.
    y_*: labels.
    """
    print('AdaBoost')

    min_iter = 1
    max_iter = 200
    steps = 30
    diff = (max_iter - min_iter) // steps
    iterations = [min_iter + diff * step for step in range(steps + 1)]
    scores = []
    for T in iterations:
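        # T is the number of boosting rounds (decision stumps via max_depth=1)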

        clf = AdaBoostClassifier(
            base_estimator=DecisionTreeClassifier(max_depth=1),
            algorithm="SAMME",
            n_estimators=T)

        clf.fit(X_train.toarray(), y_train)
        scores.append(100 * clf.score(X_test.toarray(), y_test))

        print('\t%d Iterations: %.2f%%' % (T, scores[-1]))

    return iterations, scores
Example #13
def boost_report():
  svm_train_features = list()
  svm_train_classes = list()
  svm_test_features = list()
  svm_test_classes = list()

  for record in mit_records:
    svm_train_features.append(list(record.features.values()))
    svm_train_classes.append(record.my_class)
  for record in mim_records:
    svm_test_features.append(list(record.features.values()))
    svm_test_classes.append(record.my_class)

  svm_classifier = svm.SVC(kernel="linear", C=0.1)
  svm_classifier.fit(svm_train_features, svm_train_classes)
  print("linear kernel svm accuracy: " +
        str(svm_classifier.score(svm_test_features, svm_test_classes)))

  classifier = AdaBoostClassifier(
    base_estimator=svm_classifier,
    n_estimators=100,
    algorithm='SAMME')
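  # discrete SAMME is used because this SVC base estimator does not expose predict_proba by default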
  classifier.fit(svm_train_features, svm_train_classes)
  print("adaboost accuracy: " +
        str(classifier.score(svm_test_features, svm_test_classes)))
Example #14
class AdaBoostcls(object):
    """docstring for ClassName"""
    def __init__(self):
        self.adaboost_cls = AdaBoostClassifier()
        self.prediction = None
        self.train_x = None
        self.train_y = None

    def train_model(self, train_x, train_y):
        try:
            self.train_x = train_x
            self.train_y = train_y
            self.adaboost_cls.fit(train_x, train_y)
        except:
            print(traceback.format_exc())

    def predict(self, test_x):
        try:
            self.test_x = test_x
            self.prediction = self.adaboost_cls.predict(test_x)
            return self.prediction
        except:
            print(traceback.format_exc())

    def accuracy_score(self, test_y):
        try:
            # return r2_score(test_y, self.prediction)
            return self.adaboost_cls.score(self.test_x, test_y)
        except:
            print(traceback.format_exc())
def AB_results():  # AdaBoostClassifier
    print("--------------AdaBoostClassifier-----------------")
    rang = [60, 80]

    # print("--------------With HOG-----------------")
    # ans = []
    # print("n_estimators\tAccuracy")
    # for i in rang:
    #     clf = AdaBoostClassifier(n_estimators=i)
    #     clf.fit(X_train_hog, y_train)
    #     mean_accuracy = clf.score(X_test_hog, y_test)
    #     print(i, "\t", mean_accuracy)
    #     ans.append('(' + str(i) + ", " + str(mean_accuracy) + ')')
    # print(ans)

    # plt.plot(rang, ans, linewidth=2.0)
    # plt.xlabel("n_estimators")
    # plt.ylabel("mean_accuracy")
    # plt.savefig("temp_hog.png")

    print("\n--------------Without HOG-----------------")
    ans = []
    print("n_estimators\tAccuracy")
    for i in rang:
        clf = AdaBoostClassifier(n_estimators=i)
        clf.fit(X_train, y_train)
        mean_accuracy = clf.score(X_test, y_test)
        print(i, "\t", mean_accuracy)
        ans.append('(' + str(i) + ", " + str(mean_accuracy) + ')')
    print(ans)
    plt.plot(rang, ans, linewidth=2.0)
    plt.xlabel("n_estimators")
    plt.ylabel("mean_accuracy")
    plt.savefig("temp_plain.png")
Example #16
def boost_report(test_split_size):
  scd_count = 0
  for record in records:
    if (record.my_class == "SCD"):
      scd_count += 1
  print(scd_count)

  shuffle(records)
  split = int(len(records) * (1 / test_split_size))
  print(len(records))
  train_set = records[:(len(records) - split)]
  test_set = records[split:]
  print("split:", test_split_size, "train:", len(train_set), "test:", split)

  svm_train_features = list()
  svm_train_classes = list()
  svm_test_features = list()
  svm_test_classes = list()

  for record in train_set:
    svm_train_features.append(list(record.features.values()))
    svm_train_classes.append(record.my_class)
  for record in test_set:
    svm_test_features.append(list(record.features.values()))
    svm_test_classes.append(record.my_class)

  svm_classifier = svm.SVC(kernel="linear", C=0.1)
  svm_classifier.fit(svm_train_features, svm_train_classes)
  print("linear kernel svm accuracy: " +
        str(svm_classifier.score(svm_test_features, svm_test_classes)))

  classifier = AdaBoostClassifier(
    base_estimator=svm_classifier,
    n_estimators=50,
    algorithm='SAMME'
  )
  classifier.fit(svm_train_features, svm_train_classes)
  print("adaboost accuracy: " +
        str(classifier.score(svm_test_features, svm_test_classes)))
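  # for comparison, the next classifier boosts the default decision-stump base estimator instead of the SVC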

  classifier2 = AdaBoostClassifier(
    n_estimators=50,
    algorithm='SAMME'
  )
  classifier2.fit(svm_train_features, svm_train_classes)
  print("adaboost2 accuracy: " +
        str(classifier2.score(svm_test_features, svm_test_classes)))
Example #17
def test_iris():
    """Check consistency on dataset iris."""
    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg)
        clf.fit(iris.data, iris.target)
        score = clf.score(iris.data, iris.target)
        assert score > 0.9, "Failed with algorithm %s and score = %f" % \
            (alg, score)
Example #18
    def run_cv_model(self, max_depth=3, criterion='entropy', learning_rate=1., n_estimators=300, do_plot=True):
        
        # use k-fold cross validation
                
        # Supported criteria are gini for the Gini impurity and entropy for the information gain.
        tree = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth, random_state=0)
        
        clf = AdaBoostClassifier(base_estimator=tree, n_estimators=n_estimators, 
                                 learning_rate=learning_rate,
                                 random_state=0)
        
        # Resample the data without replacement, so that each data point is part of a test and a
        # training set only once (paraphrased from Raschka p.176). In StratifiedKFold, the classes are
        # evenly distributed so that each test and training set is an accurate representation of the whole.
    
        # this is the 0.17 version
        #kfold = StratifiedKFold(y=self.y_train, n_folds=self.cv, random_state=0)
        
        # this is the 0.18dev version
        skf = StratifiedKFold(n_splits=self.cv, random_state=0)

        
        # do the cross validation
        train_scores = []
        test_scores = []
        #for k, (train, test) in enumerate(kfold):
        for k, (train, test) in enumerate(skf.split(X=self.x_train, y=self.y_train)):
            
            # run the learning algorithm
            clf.fit(self.x_train[train], self.y_train[train])
            train_score = clf.score(self.x_train[test], self.y_train[test])
            train_scores.append(train_score)
            test_score = clf.score(self.x_test, self.y_test)
            test_scores.append(test_score)
            print('Fold:', k+1, ', Training score:', train_score, ', Test score:', test_score)
        
        train_score = np.mean(train_scores)
        print('Training score is', train_score)
        
        test_score = np.mean(test_scores)
        print('Test score is', test_score)
        
        if do_plot:
            self.__plot_learning_curve(clf)
            
        return train_score, test_score  
Example #19
def model_design(run_as_main=False):

    from skimage.data import imread
    from skimage.filters import threshold_adaptive
    from skimage.restoration import denoise_tv_bregman

    from sklearn.cross_validation import train_test_split, StratifiedKFold

    from sklearn.svm import SVC
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
    from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier

    labels = pd.read_csv('../data/trainLabels.csv', sep=',')
    X, y = [], np.array(labels.Class)


    for ID in labels.ID:
        original = imread('../data/trainResized/' + str(ID) +'.Bmp', as_grey=True)
        denoised = denoise_tv_bregman(original, 3)
        binarilized = threshold_adaptive(denoised, block_size=13, method='gaussian')
        feature = binarilized.reshape(1,400)[0]
        X.append(feature)
    X = np.array(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
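    # boost a large ExtraTrees forest: each AdaBoost iteration fits a 500-tree ensemble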

    clf = AdaBoostClassifier(base_estimator=
                             ExtraTreesClassifier(
                                 n_estimators=500,
                                 criterion='entropy',
                                 class_weight='auto',
                                 n_jobs=-1
                             ), n_estimators=50)

    # clf = AdaBoostClassifier(base_estimator=
    #                          RandomForestClassifier(
    #                              n_estimators=500,
    #                              criterion='entropy',
    #                              class_weight='auto',
    #                              n_jobs=-1
    #                          ), n_estimators=20)

    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))
def AdaBoost(X, y, tst_size, n_est):
	X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = tst_size, random_state = 0)
	clf = AdaBoostClassifier(n_estimators = n_est)
	score = 0
	for i in range(100):
		clf.fit(X_train, y_train)
		score += clf.score(X_test, y_test)
	score = score/100
	return score
def test_sample_weight_elm():
    """Smoke test - AdaBoostClassifier should work with ELMClassifer."""
    X = Xdigits_binary[:50]
    y = ydigits_binary[:50]

    elm = ELMClassifier(n_hidden=20)
    clf = AdaBoostClassifier(n_estimators=3, base_estimator=elm)
    clf.fit(X, y)
    assert_greater(clf.score(X, y), 0.9)
Example #22
def performAdaBoostClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
	"""
	Ada boosting binary classification
	"""
	clf = AdaBoostClassifier()
	clf.fit(X_train, y_train)
	accuracy = clf.score(X_test, y_test)

	return accuracy
Example #23
def ab_classify(self):
    print("Adaboost")
    clf = AdaBoostClassifier()
    clf.fit(self.descr, self.target)
    mean = clf.score(self.test_descr, self.test_target)
    pred = clf.predict(self.test_descr)

    print("Pred ", pred)
    print("Mean : %.3f" % mean)
    print("Feature Importances ", clf.feature_importances_)
def test_staged_predict():
    # Check staged predictions.
    rng = np.random.RandomState(0)
    iris_weights = rng.randint(10, size=iris.target.shape)
    boston_weights = rng.randint(10, size=boston.target.shape)
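    # random integer sample weights exercise the weighted fit/score code paths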

    # AdaBoost classification
    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg, n_estimators=10)
        clf.fit(iris.data, iris.target, sample_weight=iris_weights)

        predictions = clf.predict(iris.data)
        staged_predictions = [p for p in clf.staged_predict(iris.data)]
        proba = clf.predict_proba(iris.data)
        staged_probas = [p for p in clf.staged_predict_proba(iris.data)]
        score = clf.score(iris.data, iris.target, sample_weight=iris_weights)
        staged_scores = [
            s for s in clf.staged_score(
                iris.data, iris.target, sample_weight=iris_weights)]

        assert_equal(len(staged_predictions), 10)
        assert_array_almost_equal(predictions, staged_predictions[-1])
        assert_equal(len(staged_probas), 10)
        assert_array_almost_equal(proba, staged_probas[-1])
        assert_equal(len(staged_scores), 10)
        assert_array_almost_equal(score, staged_scores[-1])

    # AdaBoost regression
    clf = AdaBoostRegressor(n_estimators=10, random_state=0)
    clf.fit(boston.data, boston.target, sample_weight=boston_weights)

    predictions = clf.predict(boston.data)
    staged_predictions = [p for p in clf.staged_predict(boston.data)]
    score = clf.score(boston.data, boston.target, sample_weight=boston_weights)
    staged_scores = [
        s for s in clf.staged_score(
            boston.data, boston.target, sample_weight=boston_weights)]

    assert_equal(len(staged_predictions), 10)
    assert_array_almost_equal(predictions, staged_predictions[-1])
    assert_equal(len(staged_scores), 10)
    assert_array_almost_equal(score, staged_scores[-1])
Example #25
def classify(features_train, labels_train, features_test, labels_test):
    
    adaBoost = AdaBoostClassifier()
    adaBoost.fit(features_train, labels_train)
    aBScore = adaBoost.score(features_test, labels_test)
    #aBScore = 0
    #print("Ada Boost:     ", aBScore)
    #%timeit adaBoost.fit(features_train, labels_train)
    
    adaBoostCust = AdaBoostCustom()
    adaBoostCust.fit(features_train, labels_train)
    aBCScore = adaBoostCust.score(features_test, labels_test)
    #aBCScore = 0
    #print("AdaBoost Custom: ", aBCScore)
    #%timeit adaBoostCust.fit(features_train, labels_train)

    decisionTree = DecisionTreeClassifier(random_state=0)
    decisionTree.fit(features_train, labels_train)
    dTScore = decisionTree.score(features_test, labels_test)
    #dTScore = 0
    #print("decision Tree:  ", dTScore)
    #%timeit decisionTree.fit(features_train, labels_train)
    
    logReg = LogisticRegression()
    logReg.fit(features_train, labels_train)
    logRegScore = logReg.score(features_test, labels_test)
    #logRegScore = 0
    #print("logReg Score: ", logRegScore)
    #%timeit logReg.fit(features_train, labels_train)
    
    logRegCust = LogisticRegressionCustom()
    logRegCust.fitMulticlassOneVsOne(addOnesCol(features_train), labels_train, alpha = 0.1, nrIt = 800)
    logRegCustScore = logRegCust.scoreMulticlassOneVsOne(addOnesCol(features_test), labels_test)
    #logRegCustScore = 0
    #print("LogRegCust Score: ", logRegCustScore)
    #%timeit logRegCust.fitMulticlass(features_train, labels_train)
    
    linReg = LinearRegression()
    linReg.fit(features_train, labels_train)
    pred = linReg.predict(features_test)
    linRegScore = scoreForLinReg(pred, labels_test)
    #linRegScore = linReg.score(features_test, labels_test)
    #linRegScore = 0
    
    linRegCust = LinearLeastSquares(features_train, number_iteration=800, feature_normalizer=True)
    linRegCust.fit(labels_train)
    linRegCustScore = linRegCust.score(features_test, labels_test)
    #linRegCustScore = 0
    
    locWeigRegCust = LocalWeightedRegressionCustom()
    locWeigRegCustScore = locWeigRegCust.score(features_train, labels_train, features_test, labels_test, 1)
    #locWeigRegCustScore = 0

    return aBScore, aBCScore, dTScore, logRegScore, logRegCustScore, linRegScore, linRegCustScore, locWeigRegCustScore
Example #26
def performAdaBoostClass(X_train, y_train, X_test, y_test, parameters):
    """
    Ada Boosting binary Classification
    """
    n = parameters[0]
    l =  parameters[1]
    clf = AdaBoostClassifier(n_estimators = n, learning_rate = l)
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)
    #auc = roc_auc_score(y_test, clf.predict(X_test))
    return accuracy
Example #27
File: voting.py, Project: pigeatshit/Kaggle
def voting():
    X, y = preprocess()
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    clfs = []
    clfs.append(RandomForestClassifier(n_estimators=500, criterion="entropy", class_weight="auto", n_jobs=-1))
    clfs.append(RandomForestClassifier(n_estimators=200, criterion="entropy", class_weight="auto", n_jobs=-1))
    clfs.append(RandomForestClassifier(n_estimators=500, criterion="gini", class_weight="auto", n_jobs=-1))
    clfs.append(RandomForestClassifier(n_estimators=200, criterion="gini", class_weight="auto", n_jobs=-1))
    clfs.append(ExtraTreesClassifier(n_estimators=500, criterion="entropy", class_weight="auto", n_jobs=-1))
    clfs.append(ExtraTreesClassifier(n_estimators=200, criterion="entropy", class_weight="auto", n_jobs=-1))
    clfs.append(ExtraTreesClassifier(n_estimators=500, criterion="gini", class_weight="auto", n_jobs=-1))
    clfs.append(ExtraTreesClassifier(n_estimators=200, criterion="gini", class_weight="auto", n_jobs=-1))

    ab = AdaBoostClassifier(
        base_estimator=ExtraTreesClassifier(n_estimators=500, criterion="entropy", class_weight="auto", n_jobs=-1),
        n_estimators=10,
    )
    ab.fit(X_train, y_train)
    print(ab.score(X_test, y_test))
    sys.exit()

    for clf in clfs:
        clf.fit(X_train, y_train)
    for clf in clfs:
        print(clf.score(X_train, y_train), clf.score(X_test, y_test))

    y_pred = []
    test_num = len(y_test) // 10

    pbar_cnt = 0
    widgets = ["Predicting...", Percentage(), " ", Bar(marker=RotatingMarker()), " ", ETA()]
    pbar = ProgressBar(widgets=widgets, maxval=test_num).start()
    for i in range(test_num):
        pbar_cnt += 1
        pbar.update(pbar_cnt)
        prediction = [clf.predict(X_test[i])[0] for clf in clfs]
        y_pred.append(most_freq_term(prediction))
    pbar.finish()

    print(accuracy_score(y_test[:test_num], y_pred))
Example #28
    def run_model(self, max_depth=3, criterion='entropy', learning_rate=1., n_estimators=300, do_plot=True):

        # Supported criteria for the base tree are gini for the Gini impurity and entropy for the information gain.
        tree = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth, random_state=0)
        clf = AdaBoostClassifier(base_estimator=tree, n_estimators=n_estimators,
                                 learning_rate=learning_rate,
                                 random_state=0)

        clf.fit(self.x_train, self.y_train)

        # check model accuracy
        train_score = clf.score(self.x_train, self.y_train)
        print('Training score is', train_score)

        test_score = clf.score(self.x_test, self.y_test)
        print('Test score is', test_score)

        if do_plot:
            self.__plot_learning_curve(clf)

        return train_score, test_score
Example #29
def test_classifiers2(data, ind):
    from sklearn.ensemble import AdaBoostClassifier
    clf = AdaBoostClassifier(n_estimators=100)
    clf.fit(data[ind[:1000], :-1], data[ind[:1000], -1])
    print(clf.score(data[ind[1000:], :-1], data[ind[1000:], -1]))
    out = clf.predict(data[ind[1000:], :-1])
    print(confusion_matrix(data[ind[1000:], -1], out))

    from sklearn.ensemble import GradientBoostingClassifier
    clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
    clf.fit(data[ind[:1000], :-1], data[ind[:1000], -1])
    print(clf.score(data[ind[1000:], :-1], data[ind[1000:], -1]))
    out = clf.predict(data[ind[1000:], :-1])
    print(confusion_matrix(data[ind[1000:], -1], out))

    from sklearn.neural_network import MLPClassifier
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(10, 10), random_state=1)
    clf.fit(data[ind[:1000], :-1], data[ind[:1000], -1])
    print(clf.score(data[ind[1000:], :-1], data[ind[1000:], -1]))
    out = clf.predict(data[ind[1000:], :-1])
    print(confusion_matrix(data[ind[1000:], -1], out))

    import xgboost as xgb
    xgb_model = xgb.XGBClassifier().fit(data[ind[:1000], :-1], data[ind[:1000], -1])
    out = xgb_model.predict(data[ind[1000:], :-1])
    a = confusion_matrix(data[ind[1000:], -1], out)
    print(float(a[0, 0] + a[1, 1]) / np.sum(a))
    print(a)
Example #30
def boosting(train_x, train_y, test_x, test_y, n_estimators, iterations):
    name = (
        "Results/adaboost_"
        + str(n_estimators)
        + "_results"
        + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        + ".csv"
    )
    file = open(name, "w")
    file.write(
        "AdaBoost w/ n_estimators = "
        + str(n_estimators)
        + " Analysis Started on "
        + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    )
    file.write("Iteration, Instances, Train Time, Test Time, Training Accuracy, Testing Accuracy")

    logging.info("Starting Boosting Analysis")
    outer_time = datetime.datetime.now()
    boost = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=19), n_estimators=n_estimators)
    for i in range(iterations):
        sample_size = int(random.uniform(0.001, 1.0) * train_y.shape[0])
        index = random.sample(range(0, train_y.shape[0]), sample_size)
        start = datetime.datetime.now()
        boost.fit(train_x[index], train_y[index])
        end = datetime.datetime.now()
        train_time = end - start
        train_score = boost.score(train_x, train_y)
        start = datetime.datetime.now()
        test_score = boost.score(test_x, test_y)
        test_time = datetime.datetime.now() - start

        file.write(
            "%4d, %4d, %s, %s, %2.6f, %2.6f \n" % (i, len(index), train_time, test_time, train_score, test_score)
        )
    logging.info("Analysis completed in %s" % (datetime.datetime.now() - outer_time))
    file.close()
Example #31
bagging_clf3.fit(X, y)
# print(bagging_clf3.oob_score_)# 0.856

from sklearn.ensemble import RandomForestClassifier
tf_clf = RandomForestClassifier(n_estimators=500,
                                max_leaf_nodes=16,
                                oob_score=True,
                                random_state=666,
                                n_jobs=-1)
tf_clf.fit(X, y)
# print('tf',tf_clf.oob_score_) #0.92

from sklearn.ensemble import ExtraTreesClassifier
et_clf = ExtraTreesClassifier(n_estimators=500,
                              bootstrap=True,
                              oob_score=True,
                              random_state=666,
                              n_jobs=-1)
et_clf.fit(X, y)
# print('et',et_clf.oob_score_) #0.892

from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                             n_estimators=500)
ada_clf.fit(X_train, y_train)
print(ada_clf.score(X_test, y_test))  # 0.832

from sklearn.ensemble import GradientBoostingClassifier
gb_clf = GradientBoostingClassifier(max_depth=2, n_estimators=30)
gb_clf.fit(X_train, y_train)
print(gb_clf.score(X_test, y_test))  # 0.912
        training_classes = input_data.loc[training_indices, 'class'].values

        testing_features = input_data.loc[testing_indices].drop('class',
                                                                axis=1).values
        testing_classes = input_data.loc[testing_indices, 'class'].values

        ss = StandardScaler()
        training_features = ss.fit_transform(training_features.astype(float))
        testing_features = ss.transform(testing_features.astype(float))

        # Create and fit the model on the training data
        try:
            clf = AdaBoostClassifier(learning_rate=learning_rate,
                                     n_estimators=n_estimators)
            clf.fit(training_features, training_classes)
            testing_score = clf.score(testing_features, testing_classes)
        except KeyboardInterrupt:
            sys.exit(1)
        except:
            continue

        param_string = ''
        param_string += 'learning_rate={},'.format(learning_rate)
        param_string += 'n_estimators={}'.format(n_estimators)

        out_text = '\t'.join([
            dataset.split('/')[-1][:-7], 'AdaBoostClassifier', param_string,
            str(testing_score)
        ])

        print(out_text)
    if data < 10000.0:
        data = 1
    elif data < 50000.0:
        data = 2
    elif data < 250000.0:
        data = 3
    elif data < 1000000.0:
        data = 4
    else:
        data = 5

    testTarget[x] = data
    x += 1

viewPredict = AdaBoostClassifier(n_estimators=400, learning_rate=.7)

viewPredict.fit(trainData, trainTarget)
prediction = viewPredict.predict(testData)

sampleTest = []

print(viewPredict.score(testData, testTarget))
disp = plot_confusion_matrix(viewPredict,
                             testData,
                             testTarget,
                             normalize='true')

print("Confusion Matrix")
print(disp.confusion_matrix)

dump(viewPredict, 'Prediction.joblib')
Example #34
estimators = bags.estimators_

tree.export_graphviz(estimators[0], out_file='tree_balanced_1.dot')
tree.export_graphviz(estimators[9], out_file='tree_balanced_10.dot')
## the trees can be visualized by copying and pasting the contents of the .dot file into http://www.webgraphviz.com/

## predictions and probabilities on the training sample
pred_train = bags.predict(X_train)  # prediction
probs_train = bags.predict_proba(X_train)  # probability

# compute the model's accuracy on train and on test

probs2 = bags.predict_proba(X_test)
# once the predictions are available we can compute the confusion matrix, accuracy, recall, precision, AUC, etc.

score_train = bags.score(X_train, y_train)
score_testing = bags.score(X_test, y_test)

# to compute other metrics (precision, recall, etc.) we also need predictions on the test set

pred_testing = bags.predict(X_test)  # predict on the test set

# compute recall, precision, AUC, fpr, tpr, f1, etc. on both sets
fpr_train, tpr_train, thresholds_train = metrics.roc_curve(y_train,
                                                           pred_train,
                                                           pos_label=1)
fpr_test, tpr_test, thresholds_test = metrics.roc_curve(y_test,
                                                        pred_testing,
                                                        pos_label=1)

auc_train = metrics.auc(fpr_train, tpr_train)
        aggClassEst = np.zeros([m, 1])
        for i in range(len(self.weakclass)):
            classEst = self.stumpClassify(X,self.weakclass[i]['dim'],\
                                 self.weakclass[i]['thresh'],\
                                 self.weakclass[i]['ineq'])
            aggClassEst += self.weakclass[i]['alpha'] * classEst
        return np.sign(aggClassEst)

    def score(self, X_test, y_test):
        y_pred = self.predict(X_test)
        m = X_test.shape[0]
        y_test = y_test.reshape(-1, 1)
        count = np.ones([m, 1])
        count[y_pred != y_test] = 0
        accuracy = np.sum(count) / m
        return accuracy


#%%
X = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).reshape(-1, 1)
y = np.array([1, 1, 1, -1, -1, -1, 1, 1, 1, -1])
#%%
clf1 = AdaBoost(30, 1)
clf1.fit(X, y)
accuracy1 = clf1.score(X, y)
#%%
from sklearn.ensemble import AdaBoostClassifier
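# scikit-learn's AdaBoostClassifier as a reference point for the hand-rolled AdaBoost above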
clf2 = AdaBoostClassifier(n_estimators=100, random_state=0)
clf2.fit(X, y)
accuracy2 = clf2.score(X, y)
Example #36
y = train["class"]
clh = tree.DecisionTreeClassifier(max_depth=7)
clf = AdaBoostClassifier(base_estimator=clh, n_estimators=10)
clf.fit(x, y)

print(np.mean(cross_val_score(clf, x, y, cv=8)))
print(confusion_matrix(y, clf.predict(x)))
#test_x = test[["meanfreq","sd","median","Q25","Q75","IQR","skew","kurt","sp.ent","sfm","mode",
#               "centroid","peakf","meanfun","minfun",
#       "maxfun","meandom","mindom","maxdom","dfrange","modindx"]]
test_x = test[[
    "meanfreq", "sd", "freq.median", "freq.Q25", "freq.Q75", "freq.IQR",
    "skew", "kurt", "sp.ent", "sfm", "meanfun", "minfun", "maxfun", "meandom",
    "mindom", "maxdom", "dfrange", "modindx", "dfslope", "meanpeakf"
]]
print(clf.score(test_x, test["class"]))

#---------------------------------------------------------------------------#

svcfit = SVC(C=0.01, kernel='linear')
x = preprocessing.scale(x)

svcfit.fit(x, y)

print(np.mean(cross_val_score(svcfit, x, y, cv=8)))
print(confusion_matrix(y, svcfit.predict(x)))
test_x = preprocessing.scale(test_x)
print(svcfit.score(test_x, test["class"]))

#---------------------------------------------------------------------------#
X_train, X_test = X[:n_split], X[n_split:]
y_train, y_test = y[:n_split], y[n_split:]

bdt_real = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                              n_estimators=600,
                              learning_rate=1)

bdt_discrete = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                                  n_estimators=600,
                                  learning_rate=1.5,
                                  algorithm="SAMME")

bdt_real.fit(X_train, y_train)
bdt_discrete.fit(X_train, y_train)
print(bdt_real.score(X_test, y_test))

real_test_errors = []
discrete_test_errors = []

for real_test_predict, discrete_train_predict in zip(
        bdt_real.staged_predict(X_test), bdt_discrete.staged_predict(X_test)):
    real_test_errors.append(1. - accuracy_score(real_test_predict, y_test))
    discrete_test_errors.append(1. -
                                accuracy_score(discrete_train_predict, y_test))

n_trees_discrete = len(bdt_discrete)
n_trees_real = len(bdt_real)

# Boosting might terminate early, but the following arrays are always
# n_estimators long. We crop them to the actual number of trees here:
Example #38
    # df_train = pd.read_csv("data/training_set_VU_DM.csv")
    df_test = pd.read_csv("data/test_short.csv")

    data, df_test = prep_data(df_train, df_test)

    predictors = [
        c for c in data.columns if c not in
        ["booking_bool", "click_bool", "gross_bookings_usd", "position"]
    ]

    X = data[predictors]
    y = data.booking_bool.astype(int)

    clf = AdaBoostClassifier(n_estimators=100)
    training = clf.fit(X, y)
    score = clf.score(X, y)
    print(score, "score")
    scores = cross_val_score(clf, X, y, cv=5)
    print("mean score: ", scores.mean())
    print("ada scores:")
    print(scores)

    prediction_test_set = clf.predict(df_test)
    predictions = pd.DataFrame({
        'hotel_id': df_test.prop_id,
        'search_id': df_test.srch_id,
        'booking_prob': prediction_test_set
    })
    predictions.to_csv('wattt.csv', index=False)

    clf = RandomForestClassifier(n_estimators=10, min_weight_fraction_leaf=0.1)
Example #39
            pickle.dump(training_labels, f)

    if not os.path.isfile(DataFile_test):
        with open(DataFile_test, "wb") as f:
            pickle.dump(test_data, f)
    if not os.path.isfile(LabelFile_test):
        with open(LabelFile_test, "wb") as f:
            pickle.dump(test_labels, f)
else:
    if os.path.isfile(DataFile) and os.path.isfile(LabelFile):
        with open(DataFile, "rb") as f:
            training_data = pickle.load(f)
        with open(LabelFile, "rb") as f:
            training_labels = pickle.load(f)
        with open(DataFile_test, "rb") as f:
            test_data = pickle.load(f)
        with open(LabelFile_test, "rb") as f:
            test_labels = pickle.load(f)

X_train, X_val, y_train, y_val = model_selection.train_test_split(training_data, training_labels, train_size=.99, test_size=.01)


print("training!")
#boosting = GradientBoostingClassifier(n_estimators=250, loss='exponential', learning_rate=0.2)
#boosting = AdaBoostClassifier(RandomForestClassifier(n_estimators=600, max_depth=15, n_jobs=7), n_estimators=25, learning_rate=.1)
boosting = AdaBoostClassifier(SVC(C=50, gamma=2), n_estimators=25, learning_rate=.1)
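# caveat: a plain SVC has no predict_proba, so AdaBoost's default SAMME.R algorithm needs SVC(probability=True) or algorithm="SAMME"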
boosting.fit(X_train, y_train[:,1])

print("Classifier has a score of %0.4f"
      % (boosting.score(test_data, test_labels[:,1])))
Example #40
def main():
    df = pd.read_csv("Dataset/winequality-white.csv", delimiter=";")

    n, bins, patches = plt.hist(x=np.array(df.iloc[:, -1]),
                                bins='auto',
                                color='#0504aa',
                                alpha=0.7)
    plt.grid(axis='y', alpha=0.75)
    plt.xlabel('Quality')
    plt.ylabel('Count')
    plt.xticks(np.arange(10),
               ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9'))
    plt.title('Class Distribution')
    plt.ylim(ymax=2200)
    plt.savefig('WinePlots/wineClassDistributionOriginal.png')
    plt.close()

    lowquality = df.loc[df['quality'] <= 6].index
    highquality = df.loc[df['quality'] > 6].index
    df.iloc[lowquality, df.columns.get_loc('quality')] = 0
    df.iloc[highquality, df.columns.get_loc('quality')] = 1

    seed = 200
    np.random.seed(seed)

    X = np.array(df.iloc[:, 0:-1])
    Y = np.array(df.iloc[:, -1])

    n, bins, patches = plt.hist(x=Y, bins='auto', color='#0504aa', alpha=0.7)
    plt.grid(axis='y', alpha=0.75)
    plt.xlabel('Quality')
    plt.ylabel('Count')
    plt.xticks(np.arange(2), ('Low', 'High'))
    plt.title('Class Distribution')
    plt.ylim(ymax=4000)
    plt.savefig('WinePlots/wineClassDistribution.png')
    plt.close()

    training_x1, testing_x1, training_y, testing_y = train_test_split(
        X, Y, test_size=0.3, random_state=seed, shuffle=True, stratify=Y)

    standardScalerX = StandardScaler()
    training_x = standardScalerX.fit_transform(training_x1)
    testing_x = standardScalerX.fit_transform(testing_x1)

    # DT Max Depth Gini
    max_depth_array = []
    training_depth_array = []
    testing_depth_array = []
    cross_val_score_array = []

    print('DT Max Depth Gini')
    for i in range(1, 50):
        max_depth_array.append(i)
        learner = DecisionTreeClassifier(criterion='gini',
                                         max_depth=i + 1,
                                         random_state=seed)
        cross_val_score_array.append(
            cross_val_score(learner, training_x, training_y, cv=10).mean())

        learner.fit(training_x, training_y)
        training_depth_array.append(learner.score(training_x, training_y))
        testing_depth_array.append(learner.score(testing_x, testing_y))

    plot_validation_curve(max_depth_array, training_depth_array,
                          cross_val_score_array,
                          "Cross Validation Score vs Max Depth Gini", 'Score',
                          'Max Depth', [1, 50],
                          'WinePlots/winemaxdepthGini.png')

    # DT Max Depth Entropy

    max_depth_array = []
    training_depth_array = []
    testing_depth_array = []
    cross_val_score_array = []

    print('DT Max Depth Entropy')
    for i in range(1, 50):
        max_depth_array.append(i)
        learner = DecisionTreeClassifier(criterion='entropy',
                                         max_depth=i + 1,
                                         random_state=seed)
        cross_val_score_array.append(
            cross_val_score(learner, training_x, training_y, cv=10).mean())

        learner.fit(training_x, training_y)
        training_depth_array.append(learner.score(training_x, training_y))
        testing_depth_array.append(learner.score(testing_x, testing_y))

    plot_validation_curve(max_depth_array, training_depth_array,
                          cross_val_score_array,
                          "Cross Validation Score vs Max Depth Entropy",
                          'Score', 'Max Depth', [1, 50],
                          'WinePlots/winemaxdepthEntropy.png')

    # DT Random Search & Learning Curve

    max_depths = np.arange(1, 20, 1)
    params = {'criterion': ['gini', 'entropy'], 'max_depth': max_depths}
    learner = DecisionTreeClassifier(random_state=seed)
    start = time.clock()
    dt_cv = RandomizedSearchCV(learner,
                               n_jobs=1,
                               param_distributions=params,
                               refit=True,
                               n_iter=40)
    dt_cv.fit(training_x, training_y)
    print(dt_cv.best_params_)
    dt_train_time = time.clock() - start
    print('Time to Train: ' + str(dt_train_time))
    print('Training Accuracy: ' + str(dt_cv.score(training_x, training_y)))
    print('Testing Accuracy: ' + str(dt_cv.score(testing_x, testing_y)))
    print(dt_cv.best_params_)  # entropy, max depth 11
    start = time.clock()
    test_y_predicted = dt_cv.predict(testing_x)
    dt_query_time = time.clock() - start
    print('Time to Query: ' + str(dt_query_time))
    y_true = pd.Series(testing_y)
    y_pred = pd.Series(test_y_predicted)
    print(
        pd.crosstab(y_true,
                    y_pred,
                    rownames=['True'],
                    colnames=['Predicted'],
                    margins=True))

    train_sizes, train_scores, test_scores = learning_curve(
        dt_cv,
        training_x,
        training_y,
        n_jobs=-1,
        cv=10,
        train_sizes=np.linspace(.1, 1.0, 10),
        random_state=seed)

    plot_learning_curve(train_scores, test_scores, train_sizes,
                        'WinePlots/wineDTLearningCurve.png',
                        "Learning Curve DT")

    # Adaboost Max Depth

    max_depth_array = []
    training_depth_array = []
    testing_depth_array = []
    cross_val_score_array = []

    print('Adaboost Max Depth')
    for i in range(1, 20, 2):
        max_depth_array.append(i)
        learner2 = DecisionTreeClassifier(max_depth=i,
                                          criterion='gini',
                                          random_state=seed)
        boosted_learner2 = AdaBoostClassifier(base_estimator=learner2,
                                              random_state=seed,
                                              algorithm='SAMME')
        cross_val_score_array.append(
            cross_val_score(boosted_learner2, training_x, training_y,
                            cv=10).mean())

        boosted_learner2.fit(training_x, training_y)
        training_depth_array.append(
            boosted_learner2.score(training_x, training_y))

    plot_validation_curve(
        max_depth_array, training_depth_array, cross_val_score_array,
        "Cross Validation Score vs Max Depth of Base Estimator", 'Score',
        'Max Depth', [1, 20], 'WinePlots/wineboostedmaxdepth.png')

    # Adaboost Estimators

    estimator_array = []
    training_depth_array = []
    testing_depth_array = []
    cross_val_score_array = []
    n_estimators = range(1, 55, 5)

    print('Adaboost Estimators')
    for i in n_estimators:
        estimator_array.append(i)
        boosted_learner2 = AdaBoostClassifier(
            base_estimator=DecisionTreeClassifier(criterion='gini',
                                                  max_depth=1),
            algorithm='SAMME',
            random_state=seed,
            n_estimators=i)
        cross_val_score_array.append(
            cross_val_score(boosted_learner2, training_x, training_y,
                            cv=10).mean())

        boosted_learner2.fit(training_x, training_y)
        training_depth_array.append(
            boosted_learner2.score(training_x, training_y))

    plot_validation_curve(estimator_array, training_depth_array,
                          cross_val_score_array,
                          "Cross Validation Score vs Number of Estimators",
                          'Score', 'Number of Estimators', [1, 50],
                          'WinePlots/wineboostedestimators.png')

    # Adaboost Learning Rate

    learning_rate_array = []
    training_depth_array = []
    testing_depth_array = []
    cross_val_score_array = []
    learning_rates = [0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64, 1, 1.5, 2]

    print('Adaboost Learning Rate')
    for i in learning_rates:
        learning_rate_array.append(i)
        boosted_learner2 = AdaBoostClassifier(
            base_estimator=DecisionTreeClassifier(criterion='gini',
                                                  max_depth=1),
            algorithm='SAMME',
            random_state=seed,
            learning_rate=i)
        cross_val_score_array.append(
            cross_val_score(boosted_learner2, training_x, training_y,
                            cv=10).mean())

        boosted_learner2.fit(training_x, training_y)
        training_depth_array.append(
            boosted_learner2.score(training_x, training_y))

    plot_validation_curve(learning_rate_array, training_depth_array,
                          cross_val_score_array,
                          "Cross Validation Score vs Learning Rates", 'Score',
                          'Learning Rate', [0, 2],
                          'WinePlots/wineboostedLearningRate.png')

    # Adaboost Random Search & Learning Curve

    max_depths = np.arange(1, 20, 1)
    params = {
        'n_estimators': [10, 15, 20, 25, 30, 40, 50],
        'learning_rate': [0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64, 1],
        'base_estimator__max_depth': max_depths
    }
    learner = AdaBoostClassifier(
        base_estimator=DecisionTreeClassifier(criterion='gini'),
        random_state=seed)

    print('starting random search')
    boost_cv = RandomizedSearchCV(learner,
                                  n_jobs=1,
                                  param_distributions=params,
                                  refit=True,
                                  n_iter=50)
    start = time.clock()
    boost_cv.fit(training_x, training_y)
    dt_train_time = time.clock() - start
    print('Time to Train: ' + str(dt_train_time))
    print('Training Accuracy: ' + str(boost_cv.score(training_x, training_y)))
    print('Testing Accuracy: ' + str(boost_cv.score(testing_x, testing_y)))
    print(boost_cv.best_params_)
    start = time.clock()
    test_y_predicted = boost_cv.predict(testing_x)
    dt_query_time = time.clock() - start
    print('Time to Query: ' + str(dt_query_time))
    y_true = pd.Series(testing_y)
    y_pred = pd.Series(test_y_predicted)
    print(
        pd.crosstab(y_true,
                    y_pred,
                    rownames=['True'],
                    colnames=['Predicted'],
                    margins=True))

    train_sizes, train_scores, test_scores = learning_curve(
        boost_cv,
        training_x,
        training_y,
        n_jobs=-1,
        cv=10,
        train_sizes=np.linspace(.1, 1.0, 10),
        random_state=seed)

    plot_learning_curve(train_scores, test_scores, train_sizes,
                        'WinePlots/wineboostedLearningCurve.png')

    # KNN Number of Neighbors

    knn_array = []
    training_depth_array = []
    testing_depth_array = []
    cross_val_score_array = []

    print('KNN Number of Neighbors with Manhattan Distance')
    for i in range(1, 50, 2):
        knn_array.append(i)
        learner = KNeighborsClassifier(n_neighbors=i, p=1)
        cross_val_score_array.append(
            cross_val_score(learner, training_x, training_y, cv=10).mean())

        learner.fit(training_x, training_y)
        training_depth_array.append(learner.score(training_x, training_y))

    plot_validation_curve(knn_array, training_depth_array,
                          cross_val_score_array,
                          "Cross Validation Score vs k Neighbors Manhattan",
                          'Score', 'k Neighbors', [1, 50],
                          'WinePlots/wineManhattanKNN.png')

    knn_array = []
    training_depth_array = []
    testing_depth_array = []
    cross_val_score_array = []

    print('KNN Number of Neighbors with Euclidean Distance')
    for i in range(1, 50, 2):
        knn_array.append(i)
        learner = KNeighborsClassifier(n_neighbors=i, p=2)
        cross_val_score_array.append(
            cross_val_score(learner, training_x, training_y, cv=10).mean())

        learner.fit(training_x, training_y)
        training_depth_array.append(learner.score(training_x, training_y))

    plot_validation_curve(knn_array, training_depth_array,
                          cross_val_score_array,
                          "Cross Validation Score vs k Neighbors Euclidean",
                          'Score', 'k Neighbors', [1, 50],
                          'WinePlots/wineEuclideanKNN.png')

    params = {'p': [1, 2], 'n_neighbors': np.arange(2, 50, 1)}
    learner = KNeighborsClassifier()

    print('starting random search')
    knn_cv = RandomizedSearchCV(learner,
                                n_jobs=1,
                                param_distributions=params,
                                refit=True,
                                n_iter=100)
    start = time.clock()
    knn_cv.fit(training_x, training_y)
    dt_train_time = time.clock() - start
    print('Time to Train: ' + str(dt_train_time))
    print('Training Accuracy: ' + str(knn_cv.score(training_x, training_y)))
    print('Testing Accuracy: ' + str(knn_cv.score(testing_x, testing_y)))
    print(knn_cv.best_params_)
    start = time.clock()
    test_y_predicted = knn_cv.predict(testing_x)
    dt_query_time = time.clock() - start
    print('Time to Query: ' + str(dt_query_time))
    y_true = pd.Series(testing_y)
    y_pred = pd.Series(test_y_predicted)
    print(
        pd.crosstab(y_true,
                    y_pred,
                    rownames=['True'],
                    colnames=['Predicted'],
                    margins=True))

    train_sizes, train_scores, test_scores = learning_curve(
        knn_cv,
        training_x,
        training_y,
        n_jobs=-1,
        cv=10,
        train_sizes=np.linspace(.1, 1.0, 10),
        random_state=seed)

    plot_learning_curve(train_scores, test_scores, train_sizes,
                        'WinePlots/wineKNNLearningCurve.png')

    # ANN 1 Layer with different number of neurons

    ann_array = []
    training_depth_array = []
    testing_depth_array = []
    cross_val_score_array = []

    print('ANN Number of Neurons')
    for i in [
            1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 70, 80, 90, 100
    ]:
        print('------hey we are on ' + str(i))
        ann_array.append(i)
        learner = MLPClassifier(hidden_layer_sizes=([i]))
        cross_val_score_array.append(
            cross_val_score(learner, training_x, training_y, cv=10).mean())
        learner.fit(training_x, training_y)
        training_depth_array.append(learner.score(training_x, training_y))

    plot_validation_curve(
        ann_array, training_depth_array, cross_val_score_array,
        "Cross Validation Score vs Neurons in One Hidden Layer", 'Score',
        'Number of Neurons', [1, 100], 'WinePlots/wineANNNeurons.png')

    # ANN Neurons per Layers

    ann_array = []
    training_depth_array = []
    testing_depth_array = []
    cross_val_score_array = []

    print('ANN Number of Layers')
    for i in [1, 3, 5, 8, 10, 11, 13, 15, 17, 20, 23, 25]:
        print('------hey we are on ' + str(i))
        hidden_layers = []
        for x in range(i):
            hidden_layers.append(22)
        ann_array.append(i)
        learner = MLPClassifier(hidden_layer_sizes=(hidden_layers),
                                activation='relu',
                                alpha=0.0051)
        cross_val_score_array.append(
            cross_val_score(learner, training_x, training_y, cv=10).mean())
        learner.fit(training_x, training_y)
        training_depth_array.append(learner.score(training_x, training_y))

    plot_validation_curve(ann_array, training_depth_array,
                          cross_val_score_array,
                          "Cross Validation Score vs # of Hidden Layers",
                          'Score', 'Number of Hidden Layers', [1, 25],
                          'WinePlots/wineANNLayers.png')

    # ANN Learning Curve

    params = {
        'hidden_layer_sizes': [(11, 11), (5, 5), (11, ), (5, ),
                               (22, ), (22, 22), (5, 5, 5), (11, 11, 11),
                               (22, 22, 22)],
        'alpha':
        np.arange(0.0001, 0.01, 0.005),
        'activation': ['relu', 'logistic']
    }
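    # RandomizedSearchCV below draws 20 of these combinations (n_iter=20); max_iter=500 on the
    # MLP keeps each candidate fit bounded so the search finishes in reasonable time.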
    learner = MLPClassifier(max_iter=500, random_state=seed)
    ##### best params {'hidden_layer_sizes': (11,11), 'alpha': 0.0001, 'activation': 'relu'}
    # ann_cv = MLPClassifier(max_iter=3000,hidden_layer_sizes=(22,22,22), alpha=0.0051, activation='relu', random_state=seed)
    print('starting randomized search')
    ann_cv = RandomizedSearchCV(learner,
                                n_jobs=1,
                                param_distributions=params,
                                refit=True,
                                n_iter=20,
                                verbose=1000)
    ann_cv.fit(training_x, training_y)
    print(ann_cv.best_params_)

    final_ann = MLPClassifier(**ann_cv.best_params_)

    start = time.clock()
    final_ann.fit(training_x, training_y)

    dt_train_time = time.clock() - start
    # print('refit time: ' + str(final_ann.refit_time_))
    # print(final_ann.best_params_)
    print('Time to Train: ' + str(dt_train_time))
    print('Training Accuracy: ' + str(final_ann.score(training_x, training_y)))
    print('Testing Accuracy: ' + str(final_ann.score(testing_x, testing_y)))
    # print(final_ann.best_params_)
    start = time.clock()
    test_y_predicted = final_ann.predict(testing_x)
    dt_query_time = time.clock() - start
    print('Time to Query: ' + str(dt_query_time))
    y_true = pd.Series(testing_y)
    y_pred = pd.Series(test_y_predicted)
    print(
        pd.crosstab(y_true,
                    y_pred,
                    rownames=['True'],
                    colnames=['Predicted'],
                    margins=True))

    train_sizes, train_scores, test_scores = learning_curve(
        final_ann,
        training_x,
        training_y,
        n_jobs=-1,
        cv=5,
        train_sizes=np.linspace(.1, 1.0, 10),
        random_state=seed)

    plot_learning_curve(train_scores, test_scores, train_sizes,
                        'WinePlots/wineANNLearningCurve.png')

    # ANN over epochs

    ann_array = []
    training_depth_array = []
    cross_val_score_array = []
    testing_depth_array = []

    learner = MLPClassifier(hidden_layer_sizes=(22, ),
                            alpha=0.0051,
                            activation='relu',
                            max_iter=1,
                            random_state=seed,
                            verbose=10,
                            warm_start=True)
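    # With max_iter=1 and warm_start=True, every fit() call below runs one additional epoch from
    # the current weights, so the loop traces training and test accuracy epoch by epoch.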
    for i in np.arange(3000):
        ann_array.append(i)
        learner = learner.fit(training_x, training_y)
        score = learner.score(training_x, training_y)
        print(score)
        training_depth_array.append(score)
        cross_score = learner.score(testing_x, testing_y)
        cross_val_score_array.append(cross_score)
        print(cross_score)

    plot_validation_curve(ann_array, training_depth_array,
                          cross_val_score_array,
                          "Cross Validation Score vs. Epochs", 'Score',
                          'Epochs', [0, 3000], 'WinePlots/wineANNEpochs.png')

    # SVM Kernels Sigmoid vs RBF

    svm_array = []
    training_depth_array = []
    testing_depth_array = []
    cross_val_score_array = []

    print('SVM Kernels Sigmoid Different Gamma Values')
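    # gamma scales the sigmoid kernel, k(x, z) = tanh(gamma * <x, z> + coef0).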
    for i in np.arange(0.01, 2, 0.1):
        print('------hey we are on ' + str(i))
        svm_array.append(i)
        learner = svm.SVC(kernel='sigmoid', gamma=i)
        cross_val_score_array.append(
            cross_val_score(learner, training_x, training_y, cv=10).mean())
        learner.fit(training_x, training_y)
        training_depth_array.append(learner.score(training_x, training_y))

    plt.plot(svm_array, training_depth_array, label='Training')
    plt.plot(svm_array, cross_val_score_array, label='Cross Validation')
    plt.legend(loc=4, fontsize=8)
    plt.title("Cross Validation Score vs. Gamma Values - Sigmoid Kernel")
    plt.ylabel('Score')
    plt.xlabel('Gamma Values')
    plt.xlim([0.00, 2.0])
    plt.savefig('WinePlots/wineGammaSigmoid.png')
    plt.close()

    svm_array = []
    training_depth_array = []
    testing_depth_array = []
    cross_val_score_array = []

    print('SVM Kernels RBF Different Gamma Values')
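    # For the RBF kernel gamma is the inverse kernel width, k(x, z) = exp(-gamma * ||x - z||^2);
    # larger gamma produces tighter, more complex decision boundaries.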
    for i in np.arange(0.01, 2, 0.1):
        print('------hey we are on ' + str(i))
        svm_array.append(i)
        learner = svm.SVC(kernel='rbf', gamma=i)
        cross_score = cross_val_score(learner, training_x, training_y,
                                      cv=10).mean()
        print(cross_score)
        cross_val_score_array.append(cross_score)
        learner.fit(training_x, training_y)
        training_depth_array.append(learner.score(training_x, training_y))

    plt.plot(svm_array, training_depth_array, label='Training')
    plt.plot(svm_array, cross_val_score_array, label='Cross Validation')
    plt.legend(loc=4, fontsize=8)
    plt.title("Cross Validation Score vs. Gamma Values - RBF Kernel")
    plt.ylabel('Score')
    plt.xlabel('Gamma Values')
    plt.xlim([0.00, 2.0])
    plt.savefig('WinePlots/wineGammaRBF.png')
    plt.close()

    # SVM C Values

    svm_array = []
    training_depth_array = []
    testing_depth_array = []
    cross_val_score_array = []

    print('SVM Kernels Sigmoid Different C Values')
    for i in np.arange(0.01, 2, 0.1):
        print('------hey we are on ' + str(i))
        svm_array.append(i)
        learner = svm.SVC(kernel='sigmoid', C=i)
        cross_val_score_array.append(
            cross_val_score(learner, training_x, training_y, cv=10).mean())
        learner.fit(training_x, training_y)
        training_depth_array.append(learner.score(training_x, training_y))

    plot_validation_curve(
        svm_array, training_depth_array, cross_val_score_array,
        "Cross Validation Score vs. C Values - Sigmoid Kernel", 'Score',
        'C Values', [0.00, 2.0], 'WinePlots/wineCSigmoid.png')

    svm_array = []
    training_depth_array = []
    testing_depth_array = []
    cross_val_score_array = []

    print('SVM Kernels RBF Different C Values')
    for i in np.arange(0.01, 2, 0.1):
        print('------hey we are on ' + str(i))
        svm_array.append(i)
        learner = svm.SVC(kernel='rbf', C=i)
        cross_val_score_array.append(
            cross_val_score(learner, training_x, training_y, cv=10).mean())
        learner.fit(training_x, training_y)
        training_depth_array.append(learner.score(training_x, training_y))

    plot_validation_curve(svm_array, training_depth_array,
                          cross_val_score_array,
                          "Cross Validation Score vs. C Values - RBF Kernel",
                          'Score', 'C Values', [0.00, 2.0],
                          'WinePlots/wineCRBF.png')

    #Learning Curve Sigmoid

    params = {'gamma': np.arange(0.01, 2, 0.1), 'C': np.arange(0.01, 1, 0.1)}
    learner = svm.SVC(kernel='sigmoid')

    print('starting randomized search')
    svc_cv = RandomizedSearchCV(learner,
                                n_jobs=1,
                                param_distributions=params,
                                refit=True,
                                n_iter=50)
    svc_cv.fit(training_x, training_y)
    best_params = svc_cv.best_params_  #{'gamma': 0.51, 'C': 0.01}
    print(best_params)
    final_svc = svm.SVC(kernel='sigmoid', **best_params)
    final_svc.fit(training_x, training_y)
    print(final_svc.score(testing_x, testing_y))
    print(final_svc.score(training_x, training_y))
    test_y_predicted = final_svc.predict(testing_x)
    y_true = pd.Series(testing_y)
    y_pred = pd.Series(test_y_predicted)
    print(
        pd.crosstab(y_true,
                    y_pred,
                    rownames=['True'],
                    colnames=['Predicted'],
                    margins=True))

    train_sizes, train_scores, test_scores = learning_curve(
        final_svc,
        training_x,
        training_y,
        n_jobs=-1,
        cv=10,
        train_sizes=np.linspace(.1, 1.0, 10),
        random_state=seed)

    plot_learning_curve(train_scores, test_scores, train_sizes,
                        'WinePlots/wineSVCLearningCurveSigmoid.png')

    # Learning Curve RBF

    params = {'gamma': np.arange(0.01, 2, 0.1), 'C': np.arange(0.01, 1, 0.1)}
    learner = svm.SVC(kernel='rbf')

    print('starting randomized search')
    svc_cv = RandomizedSearchCV(learner,
                                n_jobs=1,
                                param_distributions=params,
                                refit=True,
                                n_iter=50)
    svc_cv.fit(training_x, training_y)
    best_params = svc_cv.best_params_  #{'gamma': 1.31, 'C': 0.91}
    print(best_params)
    final_svc = svm.SVC(kernel='rbf', **best_params)
    final_svc.fit(training_x, training_y)
    print(final_svc.score(testing_x, testing_y))
    print(final_svc.score(training_x, training_y))
    test_y_predicted = final_svc.predict(testing_x)
    y_true = pd.Series(testing_y)
    y_pred = pd.Series(test_y_predicted)
    print(
        pd.crosstab(y_true,
                    y_pred,
                    rownames=['True'],
                    colnames=['Predicted'],
                    margins=True))

    train_sizes, train_scores, test_scores = learning_curve(
        final_svc,
        training_x,
        training_y,
        n_jobs=-1,
        cv=10,
        train_sizes=np.linspace(.1, 1.0, 10),
        random_state=seed)

    plot_learning_curve(train_scores, test_scores, train_sizes,
                        'WinePlots/wineSVCLearningCurveRBF.png')

    # SVM over Epochs Sigmoid

    svm_array = []
    training_depth_array = []
    testing_depth_array = []
    cross_val_score_array = []

    print('SVM Different Epochs Sigmoid')
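    # max_iter=i caps the libsvm solver iterations and each model is retrained from scratch;
    # the second curve recorded below is held-out test accuracy, not a cross-validation score.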
    for i in np.arange(1000):
        svm_array.append(i)
        learner = svm.SVC(kernel='sigmoid', verbose=100, max_iter=i)
        learner = learner.fit(training_x, training_y)
        score = learner.score(training_x, training_y)
        print(score)
        training_depth_array.append(score)
        cross_score = learner.score(testing_x, testing_y)
        cross_val_score_array.append(cross_score)

    plot_validation_curve(svm_array, training_depth_array,
                          cross_val_score_array,
                          "Cross Validation Score vs. Epochs", 'Score',
                          'Epochs', [0, 1000],
                          'WinePlots/wineSVMEpochsSigmoid.png')

    # SVM over Epochs RBF
    svm_array = []
    training_depth_array = []
    cross_val_score_array = []

    print('SVM Different Epochs RBF')
    for i in np.arange(1000):
        svm_array.append(i)
        learner = svm.SVC(kernel='rbf', verbose=100, max_iter=i)
        learner = learner.fit(training_x, training_y)
        score = learner.score(training_x, training_y)
        print(score)
        training_depth_array.append(score)
        cross_score = learner.score(testing_x, testing_y)
        cross_val_score_array.append(cross_score)

    plot_validation_curve(svm_array, training_depth_array,
                          cross_val_score_array,
                          "Cross Validation Score vs. Epochs", 'Score',
                          'Epochs', [0, 1000],
                          'WinePlots/wineSVMEpochsRBF.png')

    # Timing Wine

    # Training Time
    dt_clf = DecisionTreeClassifier(max_depth=19, criterion='gini')
    ada_clf = AdaBoostClassifier(
        base_estimator=DecisionTreeClassifier(max_depth=11),
        n_estimators=40,
        learning_rate=1)
    knn_clf = KNeighborsClassifier(p=1, n_neighbors=2)
    ann_clf = MLPClassifier(hidden_layer_sizes=(22, ),
                            alpha=0.0051,
                            activation='relu')
    svm_rbf_clf = svm.SVC(kernel='rbf', gamma=1.31, C=0.91)
    svm_sigmoid_clf = svm.SVC(kernel='sigmoid', gamma=0.51, C=0.01)
    labels = [
        "Decision Tree", "Adaboost", "KNN", "ANN", "SVM_RBF", "SVM_Sigmoid"
    ]
    count = 0
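    # Training-time benchmark: each learner is refit on 10%-90% slices of the training data and
    # the wall-clock fit time is recorded; the ANN and SVM models are re-instantiated every pass.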

    for clf in [
            dt_clf, ada_clf, knn_clf, ann_clf, svm_rbf_clf, svm_sigmoid_clf
    ]:
        iteration_array = []
        train_array = []
        query_array = []
        for i in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
            if count == 3:
                clf = MLPClassifier(hidden_layer_sizes=(22, ),
                                    alpha=0.0051,
                                    activation='relu')
            if count == 4:
                clf = svm.SVC(kernel='rbf', gamma=1.31, C=0.91)
            if count == 5:
                clf = svm.SVC(kernel='sigmoid', gamma=0.51, C=0.01)
            X_train = training_x[:int(training_x.shape[0] * i), :]
            Y_train = training_y[:int(training_y.shape[0] * i)]
            iteration_array.append(X_train.shape[0])
            st = time.clock()
            clf.fit(X_train, Y_train)
            train_time = time.clock() - st
            train_array.append(train_time)
        plt.plot(iteration_array, train_array, label=labels[count])
        plt.legend(loc=4, fontsize=8)
        plt.title("Training Times for Learners", fontdict={'size': 16})
        plt.ylabel("Time")
        plt.xlabel("Iteration Size")
        count = count + 1
    plt.savefig("WineTrainingTimes.png")
    plt.close()

    # Query Time

    dt_clf = DecisionTreeClassifier(max_depth=19, criterion='gini')
    ada_clf = AdaBoostClassifier(
        base_estimator=DecisionTreeClassifier(max_depth=11),
        n_estimators=40,
        learning_rate=1)
    knn_clf = KNeighborsClassifier(p=1, n_neighbors=2)
    ann_clf = MLPClassifier(hidden_layer_sizes=(22, ),
                            alpha=0.0051,
                            activation='relu')
    svm_rbf_clf = svm.SVC(kernel='rbf', gamma=1.31, C=0.91)
    svm_sigmoid_clf = svm.SVC(kernel='sigmoid', gamma=0.51, C=0.01)
    labels = [
        "Decision Tree", "Adaboost", "KNN", "ANN", "SVM_RBF", "SVM_Sigmoid"
    ]
    count = 0
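    # Query-time benchmark: each learner is fit once on the full training set, then timed while
    # predicting on growing slices of the test set.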
    for clf in [
            dt_clf, ada_clf, knn_clf, ann_clf, svm_rbf_clf, svm_sigmoid_clf
    ]:
        iteration_array = []
        query_array = []
        clf.fit(training_x, training_y)
        for i in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
            X_test = testing_x[:int(testing_x.shape[0] * i), :]
            iteration_array.append(X_test.shape[0])
            st = time.clock()
            clf.predict(X_test)
            query_time = time.clock() - st
            query_array.append(query_time)
        plt.plot(iteration_array, query_array, label=labels[count])
        plt.legend(loc=4, fontsize=8)
        plt.title("Query Times for Learners", fontdict={'size': 16})
        plt.ylabel("Time")
        plt.xlabel("Iteration Size")
        count = count + 1
    plt.savefig("WineQueryTimes.png")
    plt.close()
예제 #41
0
    random_forest_list = []
    for i in range(len(size_list)):
        np.random.shuffle(x_y_set)
        RandomForestClassifierModel.fit(x_y_set[:size_list[i], :2],
                                        x_y_set[:size_list[i], 2])
        random_forest_list.append(
            RandomForestClassifierModel.score(X_test, Y_test))

    # AdaBoost
    AdaBoostClassifierModel = AdaBoostClassifier()
    ada_boost_list = []
    for i in range(len(size_list)):
        np.random.shuffle(x_y_set)
        AdaBoostClassifierModel.fit(x_y_set[:size_list[i], :2],
                                    x_y_set[:size_list[i], 2])
        ada_boost_list.append(AdaBoostClassifierModel.score(X_test, Y_test))

    # LogisticRegression
    LogisticRegressionModel = LogisticRegression()
    logistic_regression_list = []
    for i in range(len(size_list)):
        np.random.shuffle(x_y_set)
        LogisticRegressionModel.fit(x_y_set[:size_list[i], :2],
                                    x_y_set[:size_list[i], 2])
        logistic_regression_list.append(
            LogisticRegressionModel.score(X_test, Y_test))

    # MLPClassifier
    MLPClassifierModel = MLPClassifier()
    neural_network_list = []
    for i in range(len(size_list)):
예제 #42
0
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)
"""
0.10030303030303031 {'n_neighbors': 5, 'p': 1}
0.10242424242424245 {'n_neighbors': 5, 'p': 2}
0.12121212121212119 {'n_neighbors': 15, 'p': 1}
0.11787878787878787 {'n_neighbors': 15, 'p': 2}
0.11424242424242426 {'n_neighbors': 30, 'p': 1}
0.11363636363636362 {'n_neighbors': 30, 'p': 2}
"""
# %%
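# AdaBoost baseline: besides plain accuracy, UAR (unweighted average recall) is computed with
# recall_score(average='macro'), which weights every class equally regardless of class frequency.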
ada_clf = AdaBoostClassifier(n_estimators=100)
ada_clf.fit(X_train, y_train)

pred_train, pred_dev = ada_clf.predict(X_train), ada_clf.predict(X_dev)
train_acc = ada_clf.score(X_train, y_train)
dev_acc = ada_clf.score(X_dev, y_dev)
train_uar = recall_score(y_train, pred_train, average='macro')
dev_uar = recall_score(y_dev, pred_dev, average='macro')

print(f"train_acc = {train_acc:.2f}, dev_acc = {dev_acc:.2f}")
print(f"train_uar = {train_uar:.2f}, dev_uar = {dev_uar:.2f}")
"""train_acc = 0.29, dev_acc = 0.24
train_uar = 0.29, dev_uar = 0.25"""

# %%
gboost_clf = GradientBoostingClassifier(n_estimators=100)
gboost_clf.fit(X_train, y_train)

pred_train, pred_dev = gboost_clf.predict(X_train), gboost_clf.predict(X_dev)
train_acc = gboost_clf.score(X_train, y_train)
예제 #43
0
model_gnb = GaussianNB()
model_gnb.fit(xtr, ytr)
bayes_score = model_gnb.score(xte, yte)

model_lr = LogisticRegression()
model_lr.fit(xtr, ytr)
logreg_score = model_lr.score(xte, yte)

model_rc = RidgeClassifier()
model_rc.fit(xtr, ytr)
ridge_score = model_rc.score(xte, yte)

#model_sgd = SGDClassifier();model_sgd.fit(xtr,ytr)
#sgd_score = model_sgd.score(xte,yte)
#%%
xtr, xte, ytr, yte = train_test_split(X, y, train_size=0.75)
model_gbc = GradientBoostingClassifier(learning_rate=0.01, n_estimators=256)
model_gbc.fit(xtr, ytr)
gbc_score = model_gbc.score(xte, yte)
model_ada = AdaBoostClassifier(learning_rate=0.01)
model_ada.fit(xtr, ytr)
ada_score = model_ada.score(xte, yte)
model_rf = RandomForestClassifier(n_estimators=256)
model_rf.fit(xtr, ytr)
rf_score = model_rf.score(xte, yte)
avg_guess = (model_gbc.predict(xte) + model_ada.predict(xte) +
             model_rf.predict(xte)) / 3

avg_score = np.mean(
    [bayes_score, logreg_score, ridge_score, gbc_score, ada_score, rf_score])
예제 #44
0
for i, line in enumerate(data.splitlines()):
    line = line.split(",")
    X[i, :], y[i] = line[:4], line[4]

y -= (y == 0)

# In[13]:

X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.4,
                                                    random_state=42)

clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
print(f" score = {score} ")

plt.figure(1)
plt.rcParams["figure.figsize"] = [20, 10]
fig, axs = plt.subplots(1, 2)
axs[0].scatter(X_train[:, 0],
               X_train[:, 1],
               c=y_train,
               cmap=plt.cm.viridis,
               marker='o')
axs[0].set_title('Training data', fontsize=14)
axs[1].scatter(X_test[:, 0],
               X_test[:, 1],
               c=y_test,
               cmap=plt.cm.viridis,
예제 #45
0
    'VisitTeamstartupDef', 'VisitTeamstartupPer'
]]
trainY = file2016['WoL']
testX = testfile[[
    'HomeTeamBackupDef', 'HomeTeambackupPer', 'HomeTeamstartupDef',
    'HomeTeamstartupPer', 'VisitBackupDef', 'VisitTeambackupPer',
    'VisitTeamstartupDef', 'VisitTeamstartupPer'
]]
testY = testfile['WoL']
##plt.scatter(X[:,0],Y[:,0],marker='o',c=Y)
#bdt = AdaBoostClassifier(base_estimator=linear_model.LogisticRegression(),algorithm='SAMME',n_estimators=2000,learning_rate=0.02)

bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=5,
                                                min_samples_split=5,
                                                min_samples_leaf=5),
                         algorithm='SAMME.R',
                         n_estimators=500,
                         learning_rate=0.2)

bdt.fit(trainX, trainY)
predictions = bdt.predict(testX)
print(bdt.estimator_weights_)
print("score : ", bdt.score(testX, testY))
print("\n", predictions)
'''
for i,row in testfile.iterrows():
    if (row['WoL'] == )
'''
cm = confusion_matrix(testY, predictions)  # avoid shadowing the imported confusion_matrix function
print(cm)
예제 #46
0
print('X type is {Xtype}'.format(Xtype=type(X)))

Xdf = pd.DataFrame(X)
print('Xdf head()---------------------------')
print(Xdf.head())
ydf = pd.DataFrame(y)
print('ydf head()---------------------------')
print(ydf.head())

clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(X, y)
print clf.feature_importances_

print clf.predict([[0, 0, 0, 0]])

print clf.score(X, y)

dfclf = AdaBoostClassifier(n_estimators=100, random_state=0)
dfclf.fit(Xdf, ydf)
print dfclf.feature_importances_

# print dfclf.predict([[0, 0, 0, 0]])

print dfclf.predict_proba(Xdf)

print dfclf.predict(Xdf)

print dfclf.score(Xdf, ydf)

print ydf.head(10)
예제 #47
0
        yTest = workARR[test_index]

        print("start random forrest")
        if cnt < 2:
            randForrC.fit(trainX, yTrain)
            tmpSCR = randForrC.score(testX, yTest)
            scores['rand Forest'][label].append(tmpSCR)
        else:
            randForrR.fit(trainX, yTrain)
            tmpSCR = randForrR.score(testX, yTest)
            scores['rand Forest'][label].append(tmpSCR)

        print("start adaBoost")
        if cnt < 2:
            adaBoostC.fit(trainX, yTrain)
            tmpSCR = adaBoostC.score(testX, yTest)
            scores['adaBoost'][label].append(tmpSCR)
        else:
            adaBoostR.fit(trainX, yTrain)
            tmpSCR = adaBoostR.score(testX, yTest)
            scores['adaBoost'][label].append(tmpSCR)

        print("start bagging withOUT out-of-bag")
        if cnt < 2:
            bagCoobN.fit(trainX, yTrain)
            tmpSCR = bagCoobN.score(testX, yTest)
            scores['bagging (NO out of bag)'][label].append(tmpSCR)
        else:
            bagRoobN.fit(trainX, yTrain)
            tmpSCR = bagRoobN.score(testX, yTest)
            scores['bagging (NO out of bag)'][label].append(tmpSCR)
예제 #48
0
        placeToTakeForTest = int(random() * len(usedData))
        x = usedData.pop(placeToTakeForTest)
        y = usedValue.pop(placeToTakeForTest)
        testSetX.append(x)
        testSetY.append(y)

    for n_estimators in [1800]:

        usedData = np.array(usedData)
        usedValue = np.array(usedValue)
        testSetX = np.array(testSetX)
        testSetY = np.array(testSetY)

        clf = AdaBoostClassifier(n_estimators=n_estimators)
        clf = clf.fit(usedData, usedValue)
        value1 = clf.score(testSetX, testSetY)

        if value1 > best:
            best = value1
            nbBest = n_estimators

    print("best nb : ")
    print(nbBest)
    print(best)
    tot += best
    if nbBest in result:
        result[nbBest] += 1
    else:
        result[nbBest] = 1

print("\n")
예제 #49
0
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)



ab = AdaBoostClassifier()
ab.fit(x_train, y_train)

y_pred_train = ab.predict(x_train)
y_pred_test = ab.predict(x_test)

print("classifier", ab)
print ("Accuracy on Train Set")
print (ab.score(x_train, y_train))
print ("MLP Classifier")
print ("Accuracy on Test Set")
print (ab.score(x_test, y_test))
print ("Report")
print (classification_report(y_test,ab.predict(x_test)))


param_grid = {

    'n_estimators': [1000,2000,3000],
    'learning_rate':[1.0,5.0,10.0],
    'algorithm':['SAMME.R','SAMME']

}
예제 #50
0
if __name__ == '__main__':

    np.set_printoptions(edgeitems=5)
    trainingdata = sio.loadmat('mnist_dataset/mnist_train.mat')
    traininglabeldata = sio.loadmat('mnist_dataset/mnist_train_labels.mat')
    testdata = sio.loadmat('mnist_dataset/mnist_test.mat')
    testlabeldata = sio.loadmat('mnist_dataset/mnist_test_labels.mat')

    trainingImg = trainingdata["mnist_train"]
    trainingLabel = traininglabeldata["mnist_train_labels"]
    testImg = testdata["mnist_test"]
    testLabel = testlabeldata["mnist_test_labels"]

    trainingImg = trainingImg / 255.0  # out-of-place division: safe even if the loaded arrays are integer-typed
    testImg = testImg / 255.0
    num_TestData = testImg.shape[0]

    bdt = AdaBoostClassifier(DecisionTreeClassifier(criterion="entropy"))
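    # Weak learners are unpruned entropy-criterion trees (no max_depth set); the labels are
    # flattened with ravel() because scikit-learn expects a 1-D target array.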
    bdt.fit(trainingImg, trainingLabel.ravel())

    predict = bdt.predict(testImg).reshape(num_TestData, 1)
    score = bdt.score(testImg, testLabel)

    wrongCount = num_TestData - np.count_nonzero(predict == testLabel)

    with open('adaboost_result.csv', 'w') as file:
        file.write("Error Rate: %f\n" %(wrongCount / 10000))
        file.write("Score: %f\n" %(score))
        for i in range(0, num_TestData):
            file.write("%d %d\n" %(predict[i], testLabel[i]))
예제 #51
0
Xb_test = X_basic.iloc[train_inds:, :]
yb_test = y_basic.iloc[train_inds:, :]

# null accuracy rates for basic dataset (dict may not be great for this)
null_rates_basic = list(y_basic.mean())

##########################
# AdaBoost on basic data #
##########################
ADB_Scores = list(np.zeros(y_basic.shape[1]))

for i in range(y_basic.shape[1]):
    from sklearn.ensemble import AdaBoostClassifier
    adb = AdaBoostClassifier(n_estimators=100)
    adb.fit(Xb_train, yb_train.iloc[:, i])
    ADB_Scores[i] = adb.score(Xb_test, yb_test.iloc[:, i])

# compare to null accuracy rates; difference in accuracy:
for i in range(len(ADB_Scores)):
    print str(list(y_basic.mean())[i]) + '\t' + str(
        ADB_Scores[i]) + '\t' + str(ADB_Scores[i] - null_rates_basic[i])

###############################
# Random Forest on basic data #
###############################
RF_Scores = list(np.zeros(y_basic.shape[1]))

for i in range(y_basic.shape[1]):
    from sklearn.ensemble import RandomForestClassifier
    rf = RandomForestClassifier(n_estimators=20)
    rf.fit(Xb_train, yb_train.iloc[:, i])
예제 #52
0
data = pd.read_csv("spambase.data", sep=",", header=None)

Shuffle_data = shuffle(data)

inputs = Shuffle_data.iloc[:, :57]
labels = Shuffle_data.iloc[:, -1]

x_train, x_test, y_train, y_test = train_test_split(inputs,
                                                    labels,
                                                    test_size=0.3)

model_01 = MultinomialNB()
model_01.fit(x_train, y_train)

score_01 = model_01.score(x_test, y_test)

model_02 = AdaBoostClassifier()

model_02.fit(x_train, y_train)

score_02 = model_02.score(x_test, y_test)

print("Using NB : ", score_01, " Using AdaBoost : ", score_02)

prediction01 = model_01.predict(x_test)
prediction02 = model_02.predict(x_test)

for i in range(5):
    print("NB : ", prediction01[i], " Ada: ", prediction02[i], " Actual",
          list(y_test)[i])
예제 #53
0
# In[5]:

help(AdaBoostClassifier)

# In[6]:

# Parameter explanation:
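# base_estimator=None   -> default weak learner, a depth-1 decision tree (stump)
# n_estimators          -> maximum number of boosting rounds
# learning_rate         -> shrinks each round's contribution; trades off against n_estimators
# algorithm='SAMME.R'   -> real boosting on class probabilities (usually converges faster than 'SAMME')
# random_state          -> seed for reproducible results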
clf = AdaBoostClassifier(
    base_estimator=None,
    n_estimators=1000,
    learning_rate=0.01,
    algorithm='SAMME.R',
    random_state=2020,
)

t1 = time.time()
clf = clf.fit(x_train, y_train)
pred = clf.predict(x_test)
print('predict:', pred)
score = clf.score(x_test, y_test)
print('score:', score)
t2 = time.time()

# # Time cost

# In[7]:

print('time:', t2 - t1)

# In[ ]:
예제 #54
0
X_train, X_test, y_train, y_test = train_test_split(tfidf,
                                                    label,
                                                    test_size=0.7,
                                                    random_state=75)

classifier_nb = MultinomialNB(class_prior=None,
                              fit_prior=False).fit(X_train, y_train)

filename = 'finalized_model.sav'
joblib.dump(classifier_nb, filename)

score = classifier_nb.score(X_test, y_test)

classifier_en = AdaBoostClassifier(n_estimators=100)
classifier_en.fit(X_train, y_train)  # fit on the training split only, not the full tf-idf matrix
score2 = classifier_en.score(X_test, y_test)
print "Score for Naive--- "
print score
print "For ADAboost is "
print score2

classifier_svm = SVC(probability=True, kernel='sigmoid')
classifier_svm.fit(X_train, y_train)  # train on the training split; fitting on the test set would leak the evaluation data
score3 = classifier_svm.score(X_test, y_test)

print "Score for SVC is--- "
print score3

sentiments = pd.DataFrame(columns=['text', 'class', 'prob'])

i = 0
예제 #55
0
                                                min_samples_split=20,
                                                min_samples_leaf=5),
                         algorithm="SAMME",
                         n_estimators=21,
                         learning_rate=0.8)

print(Ada)
epoch = 5
scores = []

# Train model
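# Each of the 5 passes refits the same AdaBoost configuration on the full training set and
# appends the resulting test-set accuracy to `scores`.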
start = datetime.now()

for i in range(epoch):
    Ada.fit(feature_train_scaled, y_train_decode)
    scores.append(Ada.score(feature_test_scaled, y_test_decode))  # accuracy

print("This took ", datetime.now() - start)
print(u'The accuracy of the model is: ')
display_scores(scores)  # accuracy

params = {
    "learning_rate": uniform(0.1, 0.9),  # default 0.1
    "n_estimators": randint(10, 120)  # default 5
    #     "max_depth": randint(2, 6)            # default 3
}
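# uniform(0.1, 0.9) samples learning_rate from [0.1, 1.0) and randint(10, 120) samples
# n_estimators from 10..119 (scipy.stats distributions, assumed imported earlier).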

search = RandomizedSearchCV(Ada,
                            param_distributions=params,
                            random_state=42,
                            n_iter=30,
예제 #56
0
from kullback_leibner_divergence_criterion import KullbackLeibnerCriterion
kldc = KullbackLeibnerCriterion(1, np.array([2], dtype='int64'))

#Create the tree
dt = DecisionTreeClassifier(max_depth=2, criterion=kldc)

# Create and fit an AdaBoosted decision tree
bdt = AdaBoostClassifier(dt, algorithm="SAMME", n_estimators=200)
bdt.fit(X, y, w)
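# The third positional argument to fit() is the per-sample weight vector; algorithm="SAMME"
# relies only on the weak learners' hard class predictions, whereas "SAMME.R" would use predict_proba.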

#from sklearn.ensemble import RandomForestClassifier
#bdt = RandomForestClassifier(criterion=kldc, max_depth=2, n_estimators=100)
#bdt.fit(X, y)

print('distance score: ', bdt.score(X, y))

plot_colors = "br"
plot_step = 0.02
class_names = "AB"

plt.figure(figsize=(10, 5))

# Plot the decision boundaries
plt.subplot(121)
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                     np.arange(y_min, y_max, plot_step))

Z = bdt.predict(np.c_[xx.ravel(), yy.ravel()])
예제 #57
0
print('Accuracy (a decision tree):', dt.score(test_data, test_labels))

#http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

rfc = RandomForestClassifier(n_estimators=1000)
rfc.fit(train_data, train_labels)

print('Accuracy (a random forest):', rfc.score(test_data, test_labels))

#http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html

abc = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), n_estimators=1000, learning_rate=0.1)

abc.fit(train_data, train_labels)
print('Accuracy (adaboost with decision trees):', abc.score(test_data, test_labels))


###Own implementation of Bagging:

np.random.seed(1)

B = 1000
n = train_data.shape[0]
sn = int(n*2.0/3.0)   # nr of training data in subset for each tree
nf = train_data.shape[1]
all_preds = np.zeros((B,test_data.shape[0]))

for b in range(B):
    bs_sample_index = np.random.choice(range(n), size=sn, replace=True)
    subt = train_data[bs_sample_index,]
예제 #58
0
dtree.fit(x_train, y_train)
dtree.score(x_test, y_test)
# 0.625

dtree.score(x_train, y_train)
# 100%
# model is overfitting
################ Now With BaggingClassifier ############
bg = BaggingClassifier(DecisionTreeClassifier(),
                       max_samples=0.5,
                       max_features=1.0,
                       n_estimators=20)
bg.fit(x_train, y_train)  #fitting the model
bg.score(x_test, y_test)
# 66.25
bg.score(x_train, y_train)
# 94.375
# still overfits (94.4% train vs 66.3% test), though less severely than the single tree
################ Now With BoostingClassifier ###########
ada = AdaBoostClassifier(DecisionTreeClassifier(),
                         n_estimators=10,
                         learning_rate=1)
ada.fit(x_train, y_train)
ada.score(x_test, y_test)

ada.score(x_train, y_train)
# 100%
# model is overfitting

# Comparing the models above, the bagging model gives the best result.
예제 #59
0
    score_time = time() - start  # get the score time
    print("{:<15}| score = {:.3f} | time = {:,.3f}s/{:,.3f}s".format(
        name, score, train_time, score_time))

# Fitting AdaBoost Classification to the Training set
classifier = AdaBoostClassifier(DecisionTreeClassifier(max_depth=7,
                                                       min_samples_split=20,
                                                       min_samples_leaf=5),
                                algorithm="SAMME",
                                n_estimators=200,
                                learning_rate=0.8)
start = time()
classifier.fit(X_train, y_train)
train_time = time() - start
start = time()
score = classifier.score(X_test, y_test)
score_time = time() - start
print("score = {:.3f} | time = {:,.3f}s/{:,.3f}s".format(
    score, train_time, score_time))

# Calculating feature inportance
feature_name = cv.get_feature_names()
feature_name = np.array(feature_name)
feature_name = np.insert(feature_name, 0, "avg_star_rating", axis=0)
importances = classifier.feature_importances_
indices = np.argsort(importances)[::-1]
feature_name = np.array(feature_name)
for f in range(100):
    print("%2d) %-*s %f" %
          (f + 1, 30, feature_name[indices[f]], importances[indices[f]]))
# 1) dryer                          0.155159
    testSetX1 = np.array(testSetX1)
    testSetY1 = np.array(testSetY1)

    scaler1 = StandardScaler()
    scaler1.fit(usedData)
    usedDataScale2 = scaler1.transform(usedData)
    testSetXScale2 = scaler1.transform(testSetX)

    clf1 = AdaBoostClassifier(n_estimators=200)
    clf1 = clf1.fit(usedDataScale2, usedValue)
    #print("\n importance 1:")

    print(clf1.predict_proba(testSetXScale2))
    print(clf1.predict(testSetXScale2))
    value1 = clf1.predict(testSetXScale2)
    value1Score = clf1.score(testSetXScale2, testSetY)

    clf2 = MLPClassifier()
    clf2 = clf2.fit(usedData, usedValue)
    #print("\n importance 2:")
    print(clf2.predict(testSetX))
    print(clf2.predict_proba(testSetX))
    value2 = clf2.predict(testSetX)
    value2Score = clf2.score(testSetX, testSetY)

    clf3 = KNeighborsClassifier(n_neighbors=nbNeighbors, weights=weightValue)
    clf3 = clf3.fit(usedData1, usedValue1)
    #print("\n importance 3:")
    #print(clf3.predict_proba(testSetX1))
    print(clf3.predict(testSetX1))
    value3 = clf3.predict(testSetX1)