예제 #1
1
class Ensemble:
	'''
	Majority-vote ensemble of a random forest, LDA, a decision tree and AdaBoost.

	Constructing an instance immediately trains all four models on `data`
	and leaves the voted test-set predictions in `self.pred`.
	'''

	def __init__(self, data):
		self.rf = RandomForestClassifier(n_estimators=80, n_jobs=-1, min_samples_split=45, criterion='entropy')
		self.lda = LDA()
		self.dec = DecisionTreeClassifier(criterion='entropy')
		self.ada = AdaBoostClassifier(n_estimators=500, learning_rate=0.25)

		self.make_prediction(data)

	def make_prediction(self, data):
		'''
		Fit every member on data.features_train / data.labels_train, then
		label each row of data.features_test with the most frequent vote.

		When several labels share the highest count, the one cast first in
		the order rf, lda, dec, ada wins.
		'''
		for member in (self.rf, self.lda, self.dec, self.ada):
			member.fit(data.features_train, data.labels_train)

		self.pred = []

		ada_votes = self.ada.predict(data.features_test)
		rf_votes = self.rf.predict(data.features_test)
		lda_votes = self.lda.predict(data.features_test)
		dec_votes = self.dec.predict(data.features_test)

		ballots = []
		for row in range(len(rf_votes)):
			ballots.append([rf_votes[row], lda_votes[row], dec_votes[row], ada_votes[row]])

		for ballot in ballots:
			# max() keeps the first ballot entry that reaches the top count.
			self.pred.append(max(ballot, key=ballot.count))
예제 #2
0
def boost_report():
  """Print test accuracy of a linear SVM and of AdaBoost built on that SVM.

  Training rows come from the module-level `mit_records`, test rows from
  `mim_records`; each record supplies `features` (a mapping) and `my_class`.
  """
  def _features_and_classes(records):
    # Walk the records once, collecting feature vectors and labels side by side.
    features, classes = [], []
    for rec in records:
      features.append(list(rec.features.values()))
      classes.append(rec.my_class)
    return features, classes

  svm_train_features, svm_train_classes = _features_and_classes(mit_records)
  svm_test_features, svm_test_classes = _features_and_classes(mim_records)

  svm_classifier = svm.SVC(kernel="linear", C=0.1)
  svm_classifier.fit(svm_train_features, svm_train_classes)
  svm_accuracy = svm_classifier.score(svm_test_features, svm_test_classes)
  print("linear kernel svm accuracy: " + str(svm_accuracy))

  # SAMME only needs class predictions from the base estimator.
  classifier = AdaBoostClassifier(
    base_estimator=svm_classifier,
    n_estimators=100,
    algorithm='SAMME')
  classifier.fit(svm_train_features, svm_train_classes)
  boosted_accuracy = classifier.score(svm_test_features, svm_test_classes)
  print("adaboost accuracy: " + str(boosted_accuracy))
def training(baseclassparameters, adaparameters, queue):
    treeclassifier = DecisionTreeClassifier(**baseclassparameters)
    adaclassifier = AdaBoostClassifier(treeclassifier, **adaparameters)

    print "\nBegin calculation with {0} and {1}".format(str(baseclassparameters), str(adaparameters))
    adaclassifier.fit(Xtrain, ytrain)

    #Predict with the model
    prob_predict_test = adaclassifier.predict_proba(Xtest)[:,1]

    #Calculate maximal significance
    True_Signal_test = prob_predict_test[ytest==1]
    True_Bkg_test = prob_predict_test[ytest==0]
    best_significance = 0
    for x in np.linspace(0, 1, 1000):
        S = float(len(True_Signal_test[True_Signal_test>x]))
        B = float(len(True_Bkg_test[True_Bkg_test>x]))

        significance = S/np.sqrt(S+B)
        if significance > best_significance:
            best_significance = significance
            best_x = x
            best_S = S
            best_B = B

    print "\nCalculation with {} and {} done ".format(str(baseclassparameters), str(adaparameters))
    print "Best significance of {0:.2f} archived when cutting at {1:.3f}".format(best_significance, best_x)
    print "Signal efficiency: {0:.2f}%".format(100.*best_S/len(True_Signal_test))
    print "Background efficiency: {0:.2f}%".format(100.*best_B/len(True_Bkg_test))
    print "Purity: {0:.2f}%".format(100.*best_S/(best_S+best_B))

    queue.put( (best_significance, baseclassparameters, adaparameters) )
예제 #4
0
def cvalidate():
    from sklearn import cross_validation

    trainset = np.genfromtxt(open('train.csv','r'), delimiter=',')[1:]
    X = np.array([x[1:8] for x in trainset])
    y = np.array([x[8] for x in trainset])
    #print X,y
    import math
    for i, x in enumerate(X):
        for j, xx in enumerate(x):
            if(math.isnan(xx)):
                X[i][j] = 26.6
   
    #print X[0:3]
    #print y[0:3]
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.3, random_state = 0)

    X_train, X_test = decomposition_pca(X_train, X_test)
    
    bdt = AdaBoostClassifier(base_estimator = KNeighborsClassifier(n_neighbors=20, algorithm = 'auto'), algorithm="SAMME", n_estimators = 200)
    bdt.fit(X_train, y_train)
    
    

    print bdt.score(X_test, y_test)
예제 #5
0
def experiment_estimators_AdaBoostRandomForest():
    avgError = []
    x_learners = []
    rf = RandomForestClassifier(n_estimators=maxLearners, max_depth = maxDepth, warm_start = False)
    for k_estimators in range(10,150,10):
        k = 10
        skf = StratifiedKFold(labels,n_folds=k)
        averageError = 0.0
        for train_index, test_index in skf:
            X_train, X_test = mfcc[:,train_index], mfcc[:,test_index]
            y_train, y_test = labels[train_index], labels[test_index]
            adb = AdaBoostClassifier(base_estimator=rf, n_estimators=k_estimators, learning_rate=0.01)
            adb.fit(X_train.T,y_train)
            y_pred = adb.predict(X_test.T)
            error = zero_one_loss(y_pred,y_test)
            print error
            averageError += (1./k) * error
        print "Average error: %4.2f%s" % (100 * averageError,'%')
        avgError.append(averageError)
        x_learners.append(k_estimators)
    # graph the errors now.
    plt.plot(x_learners, avgError)
    plt.ylabel('Average Error (k=10)')
    plt.xlabel('Number of Estimators')
    plt.title('Error as a function of the number of estimators')
    plt.show()
예제 #6
0
def main(sc, spark):
    """Fit AdaBoost on a 10% sample of the corpus and score it cluster-wide.

    The corpus is loaded and vectorized, a sample is collected to the driver
    to train a scikit-learn model, the model is broadcast, and an accuracy
    closure tallies correct/incorrect predictions over every partition via
    two accumulators. The resulting global accuracy is printed.
    """
    # Load and vectorize the corpus
    corpus = load_corpus(sc, spark)
    vector = make_vectorizer().fit(corpus)
    corpus = vector.transform(corpus)

    # Get the sample from the dataset
    sample = corpus.sample(False, 0.1).collect()
    X = [row['tfidf'] for row in sample]
    y = [row['label'] for row in sample]

    # Train a Scikit-Learn Model
    clf = AdaBoostClassifier()
    clf.fit(X, y)

    # Broadcast the Scikit-Learn Model to the cluster
    clf = sc.broadcast(clf)

    # Both tallies start at zero; starting `incorrect` at 1 counted a
    # misclassification that never happened.
    correct = sc.accumulator(0)
    incorrect = sc.accumulator(0)

    # NOTE(review): the accumulators are passed as (incorrect, correct);
    # confirm this matches the parameter order of make_accuracy_closure.
    tally_partition = make_accuracy_closure(clf, incorrect, correct)

    # Compute the number incorrect and correct
    corpus.foreachPartition(tally_partition)

    total = correct.value + incorrect.value
    # An empty corpus yields no predictions; report 0.0 rather than divide by 0.
    accuracy = float(correct.value) / float(total) if total else 0.0
    print("Global accuracy of model was {}".format(accuracy))
def test_iris():
    # Check consistency on dataset iris.
    expected_classes = np.unique(iris.target)
    n_classes = len(expected_classes)
    samme_model = samme_proba = None

    for alg in ('SAMME', 'SAMME.R'):
        model = AdaBoostClassifier(algorithm=alg)
        model.fit(iris.data, iris.target)

        assert_array_equal(expected_classes, model.classes_)
        proba = model.predict_proba(iris.data)
        if alg == "SAMME":
            # Kept for the regression check after the loop.
            samme_model, samme_proba = model, proba
        assert_equal(proba.shape[1], n_classes)
        assert_equal(model.decision_function(iris.data).shape[1], n_classes)

        accuracy = model.score(iris.data, iris.target)
        assert accuracy > 0.9, "Failed with algorithm %s and score = %f" % (alg, accuracy)

    # Somewhat hacky regression test: prior to
    # ae7adc880d624615a34bafdb1d75ef67051b8200,
    # predict_proba returned SAMME.R values for SAMME.
    samme_model.algorithm = "SAMME.R"
    proba_shift = np.abs(samme_model.predict_proba(iris.data) - samme_proba)
    assert_array_less(0, proba_shift)
예제 #8
0
def main():

    trainset = np.genfromtxt(open('train.csv','r'), delimiter=',')[1:]
    X = np.array([x[1:8] for x in trainset])
    y = np.array([x[8] for x in trainset])
    #print X,y
    import math
    for i, x in enumerate(X):
        for j, xx in enumerate(x):
            if(math.isnan(xx)):
                X[i][j] = 26.6
   
    
    testset = np.genfromtxt(open('test.csv','r'), delimiter = ',')[1:]

    test = np.array([x[1:8] for x in testset])
    for i, x in enumerate(test):
        for j, xx in enumerate(x):
            if(math.isnan(xx)):
                test[i][j] = 26.6
   

    X, test = decomposition_pca(X, test)

    bdt = AdaBoostClassifier(base_estimator = KNeighborsClassifier(n_neighbors=20, algorithm = 'auto'), algorithm="SAMME", n_estimators = 200)
    bdt.fit(X, y)
    


    print 'PassengerId,Survived'
    for i, t in enumerate(test):
        print '%d,%d' % (i + 892, int(bdt.predict(t)[0]))
예제 #9
0
def test_staged_predict():
    """Check that the last staged result equals the final estimator output."""
    n_stages = 10

    # AdaBoost classification
    for alg in ('SAMME', 'SAMME.R'):
        booster = AdaBoostClassifier(algorithm=alg, n_estimators=n_stages)
        booster.fit(iris.data, iris.target)

        final_pred = booster.predict(iris.data)
        pred_stages = list(booster.staged_predict(iris.data))
        final_proba = booster.predict_proba(iris.data)
        proba_stages = list(booster.staged_predict_proba(iris.data))
        final_score = booster.score(iris.data, iris.target)
        score_stages = list(booster.staged_score(iris.data, iris.target))

        assert_equal(len(pred_stages), n_stages)
        assert_array_almost_equal(final_pred, pred_stages[-1])
        assert_equal(len(proba_stages), n_stages)
        assert_array_almost_equal(final_proba, proba_stages[-1])
        assert_equal(len(score_stages), n_stages)
        assert_array_almost_equal(final_score, score_stages[-1])

    # AdaBoost regression
    regressor = AdaBoostRegressor(n_estimators=n_stages)
    regressor.fit(boston.data, boston.target)

    final_pred = regressor.predict(boston.data)
    pred_stages = list(regressor.staged_predict(boston.data))
    final_score = regressor.score(boston.data, boston.target)
    score_stages = list(regressor.staged_score(boston.data, boston.target))

    assert_equal(len(pred_stages), n_stages)
    assert_array_almost_equal(final_pred, pred_stages[-1])
    assert_equal(len(score_stages), n_stages)
    assert_array_almost_equal(final_score, score_stages[-1])
def test_pickle():
    # Check pickability.
    import pickle

    def _assert_survives_pickle(estimator, X, y):
        # Fit, round-trip through pickle, and require same type and same score.
        estimator.fit(X, y)
        score_before = estimator.score(X, y)
        payload = pickle.dumps(estimator)

        restored = pickle.loads(payload)
        assert_equal(type(restored), estimator.__class__)
        score_after = restored.score(X, y)
        assert_equal(score_before, score_after)

    # Adaboost classifier
    for alg in ('SAMME', 'SAMME.R'):
        _assert_survives_pickle(AdaBoostClassifier(algorithm=alg),
                                iris.data, iris.target)

    # Adaboost regressor
    _assert_survives_pickle(AdaBoostRegressor(random_state=0),
                            boston.data, boston.target)
예제 #11
0
def adaBoost(n,x,t,x_test,t_test):
    """Return the test misclassification rate of AdaBoost with `n` estimators.

    The classifier is fitted on (x, t) and evaluated on (x_test, t_test);
    the result is 1 minus the share of test samples on the diagonal of the
    confusion matrix.
    """
    clf = AdaBoostClassifier(n_estimators = n)
    clf.fit(x, t)
    predictions = clf.predict(x_test)
    X = confusion_matrix(t_test,predictions)
    # The trace counts correct predictions for any number of classes (the
    # original summed only the first two diagonal cells), and the float
    # conversion avoids integer division on Python 2.
    classificationRate = float(X.trace()) / float(X.sum())
    return(1-classificationRate)
예제 #12
0
파일: cook.py 프로젝트: wangchr/eMeriL
def cook():
    x, y, weights = load_data()
    n_components = 200
    svd = TruncatedSVD(n_components, random_state=42)
    x_unweighted = svd.fit_transform(x)
    x_weighted = svd.fit_transform(weighted(x, weights))

    for i in range(9):
        frac = 1 - (i * 0.01 + 0.01)
        print frac

        x_train, x_test, y_train, y_test = train_test_split(x_unweighted, y, test_size=frac)
        classifier = AdaBoostClassifier(n_estimators=100)
        classifier.fit(x_train, y_train)
        print "Unweighted: ", classifier.score(x_test, y_test)

        x_train, x_test, y_train, y_test = train_test_split(x_weighted, y, test_size=frac)
        classifier = AdaBoostClassifier(n_estimators=100)
        classifier.fit(x_train, y_train)
        print "Weighted: ", classifier.score(x_test, y_test)

        print '--------------------------'


    '''
def train_classifiers(X_data, y_data):
    """Fit seven classifiers on the same data and return them.

    Returns the fitted models in the order: linear SVM, multinomial naive
    Bayes, random forest, extra trees, AdaBoost, RBF SVM, gradient boosting.
    The figure in each section header is the score noted by the original
    author.
    """
    ############ Linear SVM: 0.908 #############
    clf_LSVM = svm.SVC(kernel = 'linear')
    clf_LSVM.fit(X_data, y_data)

    ############ MultinomialNB: 0.875 #############
    clf_MNB = MultinomialNB()
    clf_MNB.fit(X_data, y_data)

    ############ Random Forest: 0.910 #############
    clf_RF = RandomForestClassifier(n_estimators=200, criterion='entropy')
    clf_RF.fit(X_data, y_data)

    ############ Extra Tree: 0.915 ##################
    # min_samples_split must be at least 2 in current scikit-learn; a node
    # holding a single sample cannot be split, so 2 grows the same trees as
    # the former value of 1.
    clf_ETC = ExtraTreesClassifier(n_estimators=500, max_depth=None, min_samples_split=2, random_state=0)
    clf_ETC.fit(X_data, y_data)

    ############ AdaBoost: 0.88 ##################
    clf_Ada = AdaBoostClassifier()
    clf_Ada.fit(X_data, y_data)

    ############ rbf SVM: 0.895 #############
    clf_rbf = svm.SVC(C=200, gamma=0.06, kernel='rbf')
    clf_rbf.fit(X_data, y_data)

    ############ GradientBoosting: 0.88 #############
    clf_GBC = GradientBoostingClassifier()
    clf_GBC.fit(X_data, y_data)

    return clf_LSVM, clf_MNB, clf_RF, clf_ETC, clf_Ada, clf_rbf, clf_GBC
def createAdaBoostClassifier(trainingVectors, targetValues):
    """Return a 50-estimator SAMME.R AdaBoost model fitted on the given data.

    Each sample is weighted by its own target value multiplied by 10000.
    """
    booster = AdaBoostClassifier(
        base_estimator=None,
        n_estimators=50,
        learning_rate=1.0,
        algorithm='SAMME.R',
        random_state=None,
    )
    # NOTE(review): samples whose target is 0 receive zero weight; confirm
    # this weighting is intended.
    weights = targetValues*10000
    booster.fit(trainingVectors, targetValues, sample_weight=weights)

    return booster
예제 #15
0
class DomainTypeClassifier(object):
    """AdaBoost (depth-2 trees, SAMME) voting over encoded regions of a sequence.

    In window mode every region is a slice of 2 * radius + 1 items;
    otherwise regions come from create_region with size `radius`.
    """

    def __init__(self, radius, window_mode=False):
        weak_learner = DecisionTreeClassifier(max_depth=2)
        self.classifier = AdaBoostClassifier(
            weak_learner,
            n_estimators=20,
            learning_rate=1,
            algorithm="SAMME")
        self.radius = radius
        self.window_mode = window_mode

    def _region_size(self):
        # Full window width in window mode, plain radius otherwise.
        if self.window_mode:
            return 2 * self.radius + 1
        return self.radius

    def train(self, dataset):
        """Fit the classifier on the inputs/outputs produced by dataset.getData."""
        inputs, outputs = dataset.getData(self._region_size(), self.window_mode)
        print("fitting", len(inputs))
        self.classifier.fit(np.asarray(inputs, float), np.asarray(outputs, float))

    def predict(self, ns):
        """Return the most common predicted label over all regions of `ns`, as an int."""
        k = self._region_size()
        if self.window_mode:
            # Only start positions i with i <= len(ns) - k are used.
            last_start = min(len(ns), len(ns) - k + 1)
            regions = [encode(ns[i:i+k]) for i in range(last_start)]
        else:
            regions = [encode(create_region(ns, i, k)) for i in range(len(ns))]
        labels = self.classifier.predict(np.asarray(regions, float))
        winner, _count = Counter(labels).most_common(1)[0]
        return int(winner)
예제 #16
0
def cvalidate():
    targetset = np.genfromtxt(open('trainLabels.csv','r'), dtype='f16')
    y = [x for x in targetset]

    trainset = np.genfromtxt(open('train.csv','r'), delimiter=',', dtype='f16')
    X = np.array([x for x in trainset])
    
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.3, random_state = 0)

    X_train, X_test = decomposition_pca(X_train, X_test)

    #SVM

    c_range = 10.0 ** np.arange(6.5,7.5,.25)
    gamma_range = 10.0 ** np.arange(-2.5,0.5,.25)
    parameters = {'kernel':['rbf'], 'C':c_range,  'gamma':gamma_range} 
    svr = SVC()

    clf = grid_search.GridSearchCV(svr, parameters)
    

    clf.fit(X_train, y_train)
    bdt = AdaBoostClassifier(base_estimator = clf.best_estimator_,
                         algorithm="SAMME",
                         n_estimators=100)

    
    #bdt = AdaBoostClassifier(base_estimator = KNeighborsClassifier(n_neighbors=10))
    bdt.fit(X_train, y_train)
    

    print bdt.score(X_test, y_test)
예제 #17
0
def ANGEL_training(cds_filename, utr_filename, output_pickle, num_workers=3):
    """Train an AdaBoost classifier separating coding from UTR sequences.

    Both FASTA files are read in full, added to a shared CDSWindowFeat
    background, and turned into feature rows by get_data_parallel (frame 0
    for coding records, frames 0-2 for UTR records). UTR rows are labelled 0
    and coding rows 1. The fitted classifier and the background object are
    dumped to `output_pickle`.

    Returns:
        (data, target, bdt): the feature array, the label list and the
        fitted classifier.
    """
    # Read inside `with` so the FASTA handles are closed once parsed.
    with open(cds_filename) as cds_handle:
        coding = list(SeqIO.parse(cds_handle, 'fasta'))
    with open(utr_filename) as utr_handle:
        utr = list(SeqIO.parse(utr_handle, 'fasta'))

    o_all = c_ORFscores.CDSWindowFeat()
    add_to_background(o_all, coding)
    add_to_background(o_all, utr)

    data_pos = get_data_parallel(o_all, coding, [0], num_workers)
    data_neg = get_data_parallel(o_all, utr, [0, 1, 2], num_workers)

    # Negatives first, then positives; `target` follows the same order.
    data = data_neg + data_pos
    target = [0]*len(data_neg) + [1]*len(data_pos)
    data = np.array(data)

    print >> sys.stderr, "data prep done, running classifier...."
    bdt = AdaBoostClassifier(n_estimators=50)
    bdt.fit(data, target)

    print >> sys.stderr, "classifier trained. putting pickle to", output_pickle

    with open(output_pickle, 'wb') as f:
        dump({'bdt':bdt, 'o_all':o_all}, f)

    return data, target, bdt
예제 #18
0
class AdaBoostcls(object):
    """Best-effort wrapper around a default AdaBoostClassifier.

    Every method prints the traceback and returns None when the underlying
    call fails, instead of propagating the error.
    """
    def __init__(self):
        self.adaboost_cls = AdaBoostClassifier()
        self.prediction = None
        self.train_x = None
        self.train_y = None
        # Defined up front so accuracy_score() can be called before predict().
        self.test_x = None

    def train_model(self, train_x, train_y):
        """Remember the training data and fit the classifier on it."""
        try:
            self.train_x = train_x
            self.train_y = train_y
            self.adaboost_cls.fit(train_x, train_y)
        except Exception:
            # Exception (not a bare except) lets Ctrl-C and SystemExit through.
            print(traceback.format_exc())

    def predict(self, test_x):
        """Predict labels for `test_x`, store and return them."""
        try:
            self.test_x = test_x
            self.prediction = self.adaboost_cls.predict(test_x)
            return self.prediction
        except Exception:
            print(traceback.format_exc())

    def accuracy_score(self, test_y):
        """Return the classifier's score on the last predicted inputs vs `test_y`."""
        try:
            return self.adaboost_cls.score(self.test_x, test_y)
        except Exception:
            print(traceback.format_exc())
예제 #19
0
def some(X, Y, X_test, Y_test):
    ada = AdaBoostClassifier()
    print "Train Model ---"
    t1 = time()
    ada.fit(X, Y)
    t2 = time()
    print "Model Trained ----------", t2 - t1
    test_errors = []
    cur = 1
    Y_test2 = []
    for k in Y_test:
        Y_test2.append(k[0])
    print "Testing: "
    print  Y_test2
    pred =  ada.predict(X_test)
    print pred
    accu =  1. - accuracy_score(y_true= Y_test2, y_pred= pred)
    print accu
    print "STAGED _____________"
    for test_predict in (
        ada.staged_predict(X_test)):


            test_errors.append(
            1. - accuracy_score(test_predict, Y_test2))


    print  "errorss : "
    print test_errors
예제 #20
0
def do_all_study(X,y):
    """Plot learning curves for four classifiers, then tune and score two of them.

    Gradient boosting and AdaBoost each get a validation curve over
    n_estimators, are fitted, and have their test ROC AUC printed.

    NOTE(review): the X and y arguments are never used; the function reads
    the module-level X_train, y_train, X_test and y_test instead.
    """
    learners = [
        ("Decision Tree", DecisionTreeClassifier(max_depth=10)),
        ("Gradient Boosting", GradientBoostingClassifier(max_depth=10, n_estimators=20, max_features=1)),
        ("Random Forest", RandomForestClassifier(max_depth=10, n_estimators=20, max_features=1)),
        ("AdaBoost", AdaBoostClassifier()),
    ]
    for _name, learner in learners:
        _estimator, _score = plot_learning_curve(learner, X_train, y_train, scoring='roc_auc')

    def _tune_and_report(learner, report_format):
        # Validation curve over n_estimators, then fit and print test ROC AUC.
        plot_validation_curve(learner, X_train, y_train,
                              'n_estimators', [1, 5, 10, 20,40], scoring='roc_auc')
        learner.fit(X_train,y_train)
        positive_proba = learner.predict_proba(X_test)[:,1]
        print(report_format % roc_auc_score(y_test, positive_proba))

    _tune_and_report(
        GradientBoostingClassifier(max_depth=10, n_estimators=20, max_features=1),
        "ROC AUC GradientBoostingClassifier: %0.4f")

    _tune_and_report(AdaBoostClassifier(), "ROC AUC AdaBoost: %0.4f")
예제 #21
0
def ada(xtrain, ytrain, train_weight, tests, test_weight):
    """Fit AdaBoost and return the weighted misclassification cost on `tests`.

    `tests[i]` holds the vectors of test group i and `test_weight[i][j]` the
    weight of its j-th vector. Each prediction p for group i costs
    weight * pen[i][p], where `pen` is a module-level penalty table; a
    positive penalty also counts as a mistake. A one-line accuracy summary
    is printed.

    NOTE(review): `train_weight` is accepted but not used.
    """
    #Initiate the training model
    clf = AdaBoostClassifier()
    mistakes = 0
    cost = 0
    #Fit the model
    clf.fit(xtrain, ytrain)
    vector_count = 0
    #Iterate over the tests
    for i in range(len(tests)):
        #Get the number of elements in each test
        vector_count += len(tests[i])
        # enumerate keeps the weight index in step with the vector; the old
        # counter advanced only on mistakes, so later vectors were charged
        # with the wrong weight.
        for test_count, vector in enumerate(tests[i]):
            #Predict based on each feature
            prediction = clf.predict(vector)
            penalty = pen[i][prediction[0]]
            #Determine the cost
            cost += test_weight[i][test_count] * penalty
            #Count the number of mistakes
            if penalty > 0:
                mistakes += 1

    # With no test vectors there is nothing to rate; avoid dividing by zero.
    if vector_count:
        accuracy = (1.-float(mistakes)/float(vector_count))*100
    else:
        accuracy = 0.0

    print("Number of mistakes: " + str(mistakes) + " of " + \
            str(vector_count) + ", " + \
            str(accuracy) + \
            "% accurate")

    return cost
예제 #22
0
def ada_boost_dt():
    """
    Cross-validate, fit and dump a 300-estimator AdaBoost on scaled features.

    Results recorded by the original author:

    Submission: ada_boost_dt_0707_03.csv
    E_val: 0.854350
    E_in: 0.889561
    E_out: 0.8832315976033993
    """
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.cross_validation import cross_val_score
    from sklearn.pipeline import Pipeline

    X, y = dataset.load_train()

    # Standardise the raw features; the fitted scaler is reused in the
    # submission pipeline below.
    raw_scaler = StandardScaler()
    X_scaled = raw_scaler.fit_transform(X)

    ab = AdaBoostClassifier(n_estimators=300)

    fold_scores = cross_val_score(ab, X_scaled, y, cv=5, n_jobs=-1)
    logger.debug('CV: %s', fold_scores)
    logger.debug('E_val: %f', sum(fold_scores) / len(fold_scores))

    ab.fit(X_scaled, y)

    logger.debug('E_in: %f', Util.auc_score(ab, X_scaled, y))

    submission_model = Pipeline([('scale_raw', raw_scaler),
                                 ('ab', ab)])
    IO.dump_submission(submission_model, 'ada_boost_dt_0707_03')
예제 #23
0
def prediction(feat,label):
    """Score AdaBoost over decision trees of depth 1-9 on a 25% hold-out.

    Returns three parallel lists: the tree depths tried, the test accuracy
    at each depth, and the ROC AUC computed from the predicted labels.
    """
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(feat, label, test_size = 0.25, random_state = 0)
    depths, accuracies, aucs = [], [], []

    # An earlier variant (kept commented out in the original) ran the same
    # loop with a plain tree.DecisionTreeClassifier instead of AdaBoost.
    for depth in range(1,10):
        booster = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth = depth), n_estimators = 100)
        booster.fit(x_train,y_train)
        predicted_labels = booster.predict(x_test)
        test_accuracy = booster.score(x_test,y_test)
        test_auc = metrics.roc_auc_score(y_test,predicted_labels)

        depths.append(depth)
        accuracies.append(test_accuracy)
        aucs.append(test_auc)

    return depths,accuracies,aucs
예제 #24
0
def ab_predictedValue():
    print '----------AdaBoost----------'
    ab_clf = AdaBoostClassifier(n_estimators = NoOfEstimators)
    ab_clf.fit(train_df[features], train_df['SeriousDlqin2yrs'])
    ab_predictedValue = ab_clf.predict_proba(test_df[features])
    print 'Feature Importance = %s' % ab_clf.feature_importances_
    return ab_predictedValue[:,1]
def AB_results(): # AdaBoostClassifier
	print "--------------AdaBoostClassifier-----------------"
	rang = [60, 80]
	
	# print "--------------With HOG-----------------"
	# ans = []
	# print "n_estimators	Accuracy"
	# for i in rang:
	# 	clf = AdaBoostClassifier(n_estimators=i)
	# 	clf.fit(X_train_hog, y_train)
	# 	mean_accuracy = clf.score(X_test_hog, y_test)
	# 	print i, "	", mean_accuracy
	# 	ans.append('('+str(i)+", "+str(mean_accuracy)+')')
	# print ans

	# plt.plot(rang, ans, linewidth=2.0)
	# plt.xlabel("n_estimators")
	# plt.ylabel("mean_accuracy")
	# plt.savefig("temp_hog.png")

	
	print "\n--------------Without HOG-----------------"
	ans = []
	print "n_estimators	Accuracy"
	for i in rang:
		clf = AdaBoostClassifier(n_estimators=i)
		clf.fit(X_train, y_train)
		mean_accuracy = clf.score(X_test, y_test)
		print i, "	", mean_accuracy
		ans.append('('+str(i)+", "+str(mean_accuracy)+')')
	print ans
	plt.plot(rang, ans, linewidth=2.0)
	plt.xlabel("n_estimators")
	plt.ylabel("mean_accuracy")
	plt.savefig("temp_plain.png")
예제 #26
0
def ADA_Classifier(X_train, X_cv, X_test, Y_train,Y_cv,Y_test, Actual_DS):
    """Fit a 300-estimator AdaBoost and return class probabilities for three sets.

    Prints the validation accuracy, a crosstab of actual vs. predicted
    labels (decoded with the module-level `label_enc`), the validation log
    loss and the elapsed time.

    Returns:
        Three DataFrames with predict_proba output for X_cv, X_test and
        Actual_DS, in that order.

    NOTE(review): Y_test is accepted but not used.
    """
    print("***************Starting  AdaBoost Classifier***************")
    t0 = time()
    clf = AdaBoostClassifier(n_estimators=300)
    clf.fit(X_train, Y_train)
    preds = clf.predict(X_cv)
    score = clf.score(X_cv,Y_cv)

    print("AdaBoost Classifier - {0:.2f}%".format(100 * score))
    Summary = pd.crosstab(label_enc.inverse_transform(Y_cv), label_enc.inverse_transform(preds),
                      rownames=['actual'], colnames=['preds'])
    # Divide every row by its own total (axis=0 aligns the totals with the
    # row index), so 'pct' is the largest share of predictions within each
    # actual class. axis=1 matched row totals against column labels instead.
    Summary['pct'] = (Summary.divide(Summary.sum(axis=1), axis=0)).max(axis=1)*100
    print(Summary)

    #Check with log loss function
    epsilon = 1e-15
    preds2 = clf.predict_proba(X_cv)
    ll_output2= log_loss(Y_cv, preds2, eps=epsilon, normalize=True)
    print(ll_output2)
    print("done in %0.3fs" % (time() - t0))

    preds3 = clf.predict_proba(X_test)
    preds4 = clf.predict_proba(Actual_DS)

    print("***************Ending AdaBoost Classifier***************")
    return pd.DataFrame(preds2) , pd.DataFrame(preds3),pd.DataFrame(preds4)
예제 #27
0
def runAdaBoost(arr):#depth, n_est,  lrn_rate=1.0): # removing filename for the scipy optimise thing '''filename,'''
    #ada = AdaBoostClassifier(n_estimators=100)
    global file_dir, nEvents, solutionFile, counter
    print 'iteration number ' + str(counter)
    counter+=1
    depth = int(arr[0]*100)
    n_est = int(arr[1]*100)
    lrn_rate = arr[2]
    if depth <= 0 or n_est <= 0 or lrn_rate <= 0:
        return 100

    fname = 'ada_dep'+str(depth)+'_est'+str(n_est)+'_lrn'+str(lrn_rate)
    filename = fname
    ada = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=depth),
                             algorithm="SAMME",
                             n_estimators=n_est)#,n_jobs=4)
    print "AdaBoost training"
    ada.fit(sigtr[train_input].values,sigtr['Label'].values)
    print "AdaBoost testing"
    ada_pred = ada.predict(sigtest[train_input].values)
    solnFile(filename,ada_pred,sigtest['EventId'].values)#
    print "AdaBoost finished"
    # added for teh scipy optimise thing
    ams_score = ams.AMS_metric(solutionFile, file_dir+fname+'.out', nEvents)
    print ams_score
    logfile.write(fname + ': ' + str(ams_score)+'\n')
    return -1.0*float(ams_score) # since we are minimising
예제 #28
0
def runAdaReal(arr):#depth, n_est, filename, lrn_rate=1.0):
    global file_dir, nEvents, solutionFile, counter
    depth = int(arr[0]*100)
    n_est = int(arr[1]*100)
    lrn_rate = arr[2]
    print 'iteration number ' + str(counter)
    counter+=1
    if depth <= 0 or n_est <= 0 or lrn_rate <= 0:
        print 'return 100'
        return 100
    filename =  'adar_dep'+str(depth)+'_est'+str(n_est)+'_lrn'+str(lrn_rate) # low
    bdt_real = AdaBoostClassifier(
        tree.DecisionTreeClassifier(max_depth=depth),
        n_estimators=n_est,
        learning_rate=lrn_rate)
    print "AdaBoostReal training"
    bdt_real.fit(sigtr[train_input].values,sigtr['Label'].values)
    print "AdaBoostReal testing"
    bdt_real_pred = bdt_real.predict(sigtest[train_input].values)
    solnFile(filename,bdt_real_pred,sigtest['EventId'].values)#
    print "AdaBoostReal finished"
    ams_score = ams.AMS_metric(solutionFile, file_dir+filename+'.out', nEvents)
    print ams_score
    logfile.write(filename+': ' + str(ams_score)+'\n')
    return -1.0*float(ams_score)
예제 #29
0
파일: q5.py 프로젝트: oryband/homework
def adaboost_skin(X_train, y_train, X_test, y_test):
    """Learn the skin data sets with AdaBoost.

    X_*: Samples.
    y_*: labels.
    """
    print 'AdaBoost'

    min_iter = 1
    max_iter = 200
    steps = 30
    diff = (max_iter - min_iter) / steps
    iterations = [min_iter + diff * step for step in xrange(steps+1)]
    scores = []
    for T in iterations:

        clf = AdaBoostClassifier(
            base_estimator=DecisionTreeClassifier(max_depth=1),
            algorithm="SAMME",
            n_estimators=T)

        clf.fit(X_train.toarray(), y_train)
        scores.append(100 * clf.score(X_test.toarray(), y_test))

        print '\t%d Iterations: %.2f%%' % (T, scores[-1])

    return iterations, scores
예제 #30
0
class Model_Adaboost(object):
    """AdaBoost wrapper around a `model` object that exposes `train` and `test`.

    The last column of `model.train` is the label and the remaining columns
    are features. A share of `CV_size` of the rows is split off as a
    validation set used by crossValidation().
    """

    def __init__(self, model, parameter=None):
        """Split the training data and, when CV_size is 0, build the classifier.

        parameter: mapping with "n_estimators" and "CV_size"; each value may
            be a plain number or an object with a zero-argument get() (the
            original callers pass such objects). Defaults to
            {"n_estimators": 50, "CV_size": 0}.
        """
        # A fresh dict per call avoids a shared mutable default argument.
        if parameter is None:
            parameter = {"n_estimators": 50, "CV_size": 0}
        self.train = model.train
        self.test = model.test
        self.CVsize = float(self._param_value(parameter["CV_size"]))
        train = np.array(self.train)
        self.X_train = train[:, :-1]
        self.y_train = train[:, -1]
        self.X_train,self.X_CV,self.y_train,self.y_CV = train_test_split(self.X_train, self.y_train, test_size=self.CVsize)
        # Always define the attribute; crossValidation() replaces it.
        self.clf = None
        if self.CVsize == 0:
            self.clf = AdaBoostClassifier(n_estimators = int(self._param_value(parameter["n_estimators"])))
        self.model = model

    @staticmethod
    def _param_value(value):
        # Unwrap objects offering get(); pass plain numbers through unchanged
        # (the former default dict of ints crashed on .get()).
        getter = getattr(value, "get", None)
        return getter() if callable(getter) else value

    def fit(self):
        """Fit the current classifier on the training split."""
        self.clf.fit(self.X_train,self.y_train)

    def score(self):
        """Print accuracy, per-class F1 and ROC AUC on the training split."""
        pre = self.clf.predict(self.X_train)
        truth = self.y_train
        print ("score: " + str(self.clf.score(self.X_train,truth)))
        print ("f1: " + str(f1_score(truth,pre, average=None)))
        print ("AUC score: " + str(roc_auc_score(truth,pre)))

    def save_results(self):
        """Write test-set predictions to a CSV file chosen in a save dialog."""
        # NOTE(review): this predicts with self.model.clf rather than
        # self.clf; confirm which classifier is meant.
        pre = self.model.clf.predict(self.model.test)
        df = pd.DataFrame({"predict":pre})
        fileName = tkFileDialog.asksaveasfilename()
        df.to_csv(fileName)

    def crossValidation(self):
        """Try several n_estimators values and print the best validation results.

        Tracks the best accuracy and the best F1 at index 1 ("pos") and
        index 0 ("neg") of the per-class F1 array. The classifier fitted
        last stays in self.clf.
        """
        estimatorList = [3,5,7,10,13,15,20,25,30,50]
        bestScore = [0,0] #score,n_estimator
        bestF1ScoreNeg = [0,0]
        bestF1ScorePos = [0,0]
        for e in estimatorList:
            self.clf = AdaBoostClassifier(n_estimators = e)
            self.clf.fit(self.X_train,self.y_train)
            pre = self.clf.predict(self.X_CV)
            truth = self.y_CV
            score = self.clf.score(self.X_CV,truth)
            if score > bestScore[0]:
                bestScore[0] = score
                bestScore[1] = e

            # Per-class F1 is computed once and indexed for both classes.
            f1_per_class = f1_score(truth,pre, average=None)

            f1pos = f1_per_class[1]
            if f1pos > bestF1ScorePos[0]:
                bestF1ScorePos[0] = f1pos
                bestF1ScorePos[1] = e

            f1neg = f1_per_class[0]
            if f1neg > bestF1ScoreNeg[0]:
                bestF1ScoreNeg[0] = f1neg
                bestF1ScoreNeg[1] = e

        print ("Adaboost:")
        print ("Best [score,n_estimators] on Cross Validation set: " + str(bestScore))
        print ("Best [f1(pos),n_estimators] on Cross Validation set: " + str(bestF1ScorePos))
        print ("Best [f1(neg),n_estimators] on Cross Validation set" + str(bestF1ScoreNeg))
 def selectAdaBoostClassifier(self, fit=0):
     """Fit AdaBoost with self.estimatorSize estimators and classify the test data.

     The model is trained on self.sparse_matrix / self.training_labels_list
     and handed to self.classify_test_data with a label naming the size.
     NOTE(review): the `fit` argument is not used.
     """
     booster = AdaBoostClassifier(n_estimators=self.estimatorSize)
     # A cross_val_score report was commented out here in the original.
     booster.fit(self.sparse_matrix, self.training_labels_list)
     run_label = 'AdaBoost_%s' % (self.estimatorSize)
     self.classify_test_data(booster, run_label)
예제 #32
0
                rfModel = RandomForestClassifier(n_estimators=300,
                                                 max_depth=12,
                                                 random_state=0)
                rfModel.fit(np.array(pcaTrainData), np.array(trainLabels))
                predictionsMade = rfModel.predict(
                    np.array(pcaTestData)).tolist()

                tempAcc = met.accuracy(predictionsMade, testLabels)
                #tempF1 = met.precision_score(predictionsMade, testLabels)
                tempF1 = f1_score(testLabels, predictionsMade, average=None)
                rfAccuracy += tempAcc
                rfF1Score += tempF1

            if (useADABoost):
                adaModel = AdaBoostClassifier(n_estimators=300)
                adaModel.fit(np.array(pcaTrainData), np.array(trainLabels))
                predictionsMade = adaModel.predict(
                    np.array(pcaTestData)).tolist()

                tempAcc = met.accuracy(predictionsMade, testLabels)
                #tempF1 = met.precision_score(predictionsMade, testLabels)
                tempF1 = f1_score(testLabels, predictionsMade, average=None)
                adaAccuracy += tempAcc
                adaF1Score += tempF1

            if (useGradBoost):
                gradBoostModel = GradientBoostingClassifier(n_estimators=240,
                                                            learning_rate=0.5,
                                                            max_depth=21)
                gradBoostModel.fit(np.array(pcaTrainData),
                                   np.array(trainLabels))
예제 #33
0
y_test = y_test.replace("No", 0)
y_test = y_test.replace("Yes", 1)
print(X_train["TotalCharges"].dtypes)
print(X_test["TotalCharges"].dtypes)

# --------------
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Code starts here
print(X_train.head())
print(X_test.head())
print(y_train.head())
print(y_test.head())
# Fit a default AdaBoost model (fixed seed) and evaluate it on the test split.
ada_model = AdaBoostClassifier(random_state=0)
ada_model.fit(X_train, y_train)
y_pred = ada_model.predict(X_test)
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, but with
# the arguments swapped the confusion matrix comes out transposed and the
# classification report exchanges precision and recall.
ada_score = accuracy_score(y_test, y_pred)
print(ada_score)
ada_cm = confusion_matrix(y_test, y_pred)
print(ada_cm)
ada_cr = classification_report(y_test, y_pred)
print(ada_cr)

# --------------
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

#Parameter list
parameters = {
    'learning_rate': [0.1, 0.15, 0.2, 0.25, 0.3],
plt.show()

#BOOSTING ALGORITHMS - ADABOOST
# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

# Import AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier

# Instantiate dt
dt = DecisionTreeClassifier(max_depth=2, random_state=1)

# Instantiate ada
ada = AdaBoostClassifier(base_estimator=dt, n_estimators=180, random_state=1)
# Fit ada to the training set
ada.fit(X_train, y_train)

# Compute the probabilities of obtaining the positive class
y_pred_proba = ada.predict_proba(X_test)[:, 1]

# Import roc_auc_score
from sklearn.metrics import roc_auc_score

# Evaluate test-set roc_auc_score
ada_roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print roc_auc_score
print('ROC AUC score: {:.2f}'.format(ada_roc_auc))

#GRADIENT BOOSTING ENSEMBLE
# Import GradientBoostingRegressor
x_train, x_test, y_train, y_test = train_test_split(predictors_df,target_df, test_size = 0.2,random_state=7)
                                            

#decision tree
dt = DecisionTreeClassifier() #storing the classifer in dt

dt.fit(x_train, y_train) #fitting te model 

dt.score(x_test, y_test) #checking the score like accuracy

dt.score(x_train, y_train)
#so our model is overfitting 

                                      # Ada boosting 
ada = AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=10, learning_rate=7)
ada.fit(x_train,y_train)

ada.score(x_test,y_test)

ada.score(x_train,y_train)
       

                                   #building  voting model 
                                   
# Splitting data into training and testing data set

x_train, x_test, y_train, y_test = train_test_split(predictors_df,target_df, test_size = 0.2,random_state=7)
                                          
from sklearn.ensemble import VotingClassifier
# Voting Classifier 
from sklearn.linear_model import LogisticRegression # importing logistc regression
예제 #36
0
    # per la stessa feature, normalizzo i dati del dataset di training.csv
    singola_feature_training = pd.DataFrame(
        label_encoder.transform(singola_feature_training))
    dataframe_training = pd.concat(
        [dataframe_training, singola_feature_training], axis=1)
#
classi_target = pd.read_csv(
    r'C:\\Users\\FauxL\\Desktop\\Data Science\\Project\\training.csv',
    usecols=['Name'])

classificatore = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                                    n_estimators=150,
                                    algorithm="SAMME.R",
                                    learning_rate=0.5)
classificatore.fit(
    dataframe_training,
    classi_target.values.ravel())  #fitto con adaboost tutto dataset training

predict = classificatore.predict(dataframe_training)

# Write one predicted label per row. csv.writer.writerows expects an iterable
# of rows (each row itself a sequence), so every label is wrapped in a list;
# passing the 1-D prediction array directly would split string labels into
# single characters. newline='' is what the csv module requires of the file
# object so that no extra blank lines are emitted on Windows.
with open('resultAda.csv', 'w', newline='') as csvFile:
    writer = csv.writer(csvFile, delimiter=' ')
    writer.writerows([label] for label in predict)
# The with-block has already closed the file; no explicit close() is needed.
dfAda = predict

uniciAda, counteggioAda = np.unique(dfAda, return_counts=True)

print(uniciAda, counteggioAda)
print("\nRilevanza attributi Ada")
for nameAda, scoreAda in zip(COLUMNS, classificatore.feature_importances_):
def perform_learning(train_features,
                     test_features,
                     f_output,
                     local,
                     gglobal,
                     deep=False):
    """Train classifiers on a slice of the feature columns and evaluate them.

    Every row of ``train_features`` / ``test_features`` is sliced by position:
    column 0 is always skipped and the last column is taken as the tag to
    predict. ``local`` selects columns 1-4, ``gglobal`` selects columns 5 up to
    (excluding) the last one, and both together select columns 1 up to
    (excluding) the last one.

    Args:
        train_features: sequence of training rows (features followed by tag).
        test_features: sequence of test rows with the same layout.
        f_output: writable file-like object; algorithm names and random-forest
            feature importances are written to it and it is passed on to
            ``evaluate_auc``.
        local: include the "local" column group (columns 1-4).
        gglobal: include the "global" column group (columns 5..-1).
        deep: when true, train a small Keras network instead of the
            scikit-learn classifiers.

    Raises:
        ValueError: if neither ``local`` nor ``gglobal`` is set, because no
            feature columns would be selected.
    """
    print(train_features[0])
    if local and gglobal:
        first, last = 1, -1
    elif local:
        first, last = 1, 5
    elif gglobal:
        first, last = 5, -1
    else:
        # Previously this fell through and failed later on an unbound ``train``.
        raise ValueError(
            "perform_learning needs at least one of 'local' or 'gglobal' set")

    train = [x[first:last] for x in train_features]
    test = [x[first:last] for x in test_features]

    train_tags = [x[-1] for x in train_features]
    test_tags = [x[-1] for x in test_features]

    # presumably standardises each feature column; verify against z_scoring
    train = z_scoring(train)
    test = z_scoring(test)
    print(len(train[0]))
    print(train[0])
    if not deep:
        algos = ['adaBoost', 'RF', 'L-SVM', 'RBF-SVM', 'SGD']
        for algo in algos:
            print(algo)
            f_output.writelines(algo + '\n')

            if algo == 'adaBoost':
                clf = AdaBoostClassifier(n_estimators=100)
            elif algo == 'RF':
                clf = RandomForestClassifier(n_estimators=1000,
                                             criterion="gini",
                                             min_samples_split=15,
                                             oob_score=True,
                                             class_weight='balanced',
                                             max_depth=3)
            elif algo == 'L-SVM':
                clf = SVC(kernel='linear',
                          class_weight="balanced",
                          C=0.01,
                          probability=True)
            elif algo == 'RBF-SVM':
                # SVC's default kernel is RBF
                clf = SVC(class_weight="balanced", C=0.01, probability=True)
            elif algo == 'SGD':
                # NOTE(review): ``n_iter`` was replaced by ``max_iter`` in newer
                # scikit-learn releases; left unchanged because the installed
                # version is not visible from here.
                clf = SGDClassifier(alpha=0.0001,
                                    average=False,
                                    class_weight=None,
                                    epsilon=0.1,
                                    eta0=0.0,
                                    fit_intercept=True,
                                    l1_ratio=0.15,
                                    learning_rate='optimal',
                                    loss='hinge',
                                    n_iter=5,
                                    n_jobs=1,
                                    penalty='l2',
                                    power_t=0.5,
                                    random_state=None,
                                    shuffle=True,
                                    verbose=0,
                                    warm_start=False)

            clf.fit(train, train_tags)
            if algo == 'RF':
                # Only the random forest's feature importances are reported.
                print(len(clf.feature_importances_))
                print(clf.feature_importances_)
                f_output.writelines(str(clf.feature_importances_) + '\n')
            evaluate_auc(clf, test, test_tags, train, train_tags, f_output)

    else:
        print(train[0])

        # Imported lazily so Keras is only required for the deep path.
        from keras.models import Sequential
        from keras.layers import Dense, Dropout
        from keras.regularizers import l1_l2

        clf = Sequential()
        clf.add(
            Dense(100,
                  activation="relu",
                  kernel_initializer="he_normal",
                  # assumes z_scoring returns an array with a .shape attribute
                  input_dim=train.shape[1]))
        clf.add(Dropout(0.1))
        # Keras 2 keyword names, consistent with the first layer; the Keras 1
        # spellings ``init`` / ``W_regularizer`` are not accepted by current
        # releases.
        clf.add(
            Dense(1,
                  kernel_initializer='uniform',
                  activation='sigmoid',
                  kernel_regularizer=l1_l2(0.2)))
        clf.compile(loss='binary_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])
        clf.fit(train,
                train_tags,
                validation_data=(test, test_tags),
                epochs=100,
                batch_size=10,
                verbose=2)

        evaluate_auc(clf, test, test_tags, train, train_tags, f_output)
예제 #38
0
# Exhaustive search for ExtraTrees over max_depth 1..10 and n_estimators 1..30.
# Parameters is a flat list laid out as
# [max_depth, n_estimators, score, max_depth, n_estimators, score, ...].
Parameters = []
for max_depth in range(1,11):
    for n_estimators in range(1,31):
        ExtraTrees_model = ExtraTreesClassifier(n_estimators=n_estimators, max_depth=max_depth)
        ExtraTrees_model.fit(train_x, train_y)
        # Mean cross-validation score on the training data for this combination
        ExtraTrees_scores = np.mean(cross_val_score(ExtraTrees_model, train_x, train_y))
        Parameters.append(max_depth)
        Parameters.append(n_estimators)
        Parameters.append(ExtraTrees_scores)
# Fold the flat list into rows of three and keep the 10 best-scoring rows.
score_table = pd.DataFrame(np.array(Parameters).reshape([-1,3]),columns=['max_depth','n_estimators','scores'])
score_table = score_table.sort_values(['scores'],ascending=False)[0:10]

# AdaBoost baseline: depth-3 decision trees with the default number of estimators
DecisionTree_model = DecisionTreeClassifier(max_depth=3)
AdaBoost_model = AdaBoostClassifier(base_estimator=DecisionTree_model)
AdaBoost_model.fit(train_x, train_y)
AdaBoost_scores = np.mean(cross_val_score(AdaBoost_model, train_x, train_y))

# Same grid as above, this time for AdaBoost over decision trees.
# NOTE: Parameters and score_table are reused, so the ExtraTrees results
# computed earlier are overwritten here.
Parameters = []
for max_depth in range(1,11):
    for n_estimators in range(1,31):
        DecisionTree_model = DecisionTreeClassifier(max_depth=max_depth)
        AdaBoost_model = AdaBoostClassifier(base_estimator=DecisionTree_model,n_estimators=n_estimators)
        AdaBoost_model.fit(train_x, train_y)
        AdaBoost_scores = np.mean(cross_val_score(AdaBoost_model, train_x, train_y))
        Parameters.append(max_depth)
        Parameters.append(n_estimators)
        Parameters.append(AdaBoost_scores)
# Fold the flat list into rows of three and keep the 10 best-scoring rows.
score_table = pd.DataFrame(np.array(Parameters).reshape([-1,3]),columns=['max_depth','n_estimators','scores'])
score_table = score_table.sort_values(['scores'],ascending=False)[0:10]
예제 #39
0
X_test, y_test = X[2000:], y[2000:]
X_train, y_train = X[:2000], y[:2000]

dt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)
dt_stump.fit(X_train, y_train)
dt_stump_err = 1.0 - dt_stump.score(X_test, y_test)

dt = DecisionTreeClassifier(max_depth=9, min_samples_leaf=1)
dt.fit(X_train, y_train)
dt_err = 1.0 - dt.score(X_test, y_test)

ada_discrete = AdaBoostClassifier(base_estimator=dt_stump,
                                  learning_rate=learning_rate,
                                  n_estimators=n_estimators,
                                  algorithm="SAMME")
ada_discrete.fit(X_train, y_train)

ada_real = AdaBoostClassifier(base_estimator=dt_stump,
                              learning_rate=learning_rate,
                              n_estimators=n_estimators,
                              algorithm="SAMME.R")
ada_real.fit(X_train, y_train)

fig = plt.figure()
ax = fig.add_subplot(111)

ax.plot([1, n_estimators], [dt_stump_err] * 2,
        'k-',
        label='Decision Stump Error')
ax.plot([1, n_estimators], [dt_err] * 2, 'k--', label='Decision Tree Error')
"""
예제 #40
0
=====================
AdaBoost算法属于ensemble算法的boosting分支
其核心思想就是将一些偏差比较大(比较容易欠拟合)的分类器进行组合
用随机的方式消除偏差同时减小偏差。
"""
print(__doc__)

from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_moons, make_circles, make_classification
# Load the training data
#X, y = make_circles(noise=0.2, factor=0.5, random_state=1)
X, y = make_moons(noise=0.1, random_state=1)
# Define the AdaBoost classifier (default settings)
adb = AdaBoostClassifier()
# Train it
adb.fit(X, y)
# Import the plotting libraries
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
# Set the plot style
mpl.style.use('fivethirtyeight')
# Build an x/y mesh grid (step 0.1, padded by 0.5 around the data) for the contour plot
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))
# Predicted probability (column 1 of predict_proba) at every grid point,
# reshaped back to the grid's shape
Z = adb.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
Z = Z.reshape(xx.shape)
예제 #41
0
def AdaBoost(X_train, X_test, y_train, y_test):
    """Fit a default AdaBoostClassifier on the training split and report on it.

    The fitted model and both splits are handed to ``print_report`` under the
    label 'AdaBoost'; nothing is returned.
    """
    from sklearn.ensemble import AdaBoostClassifier

    booster = AdaBoostClassifier()
    booster.fit(X_train, y_train)
    print_report(booster, 'AdaBoost', X_train, X_test, y_train, y_test)
예제 #42
0
# Use scikit-learn's AdaBoostClassifier class. This class provides the functions to define and fit the model to your data.
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier()
model.fit(x_train, y_train)
model.predict(x_test)
# Hyperparameters:
# we can specify the hyperparameters. most common are:
# base_estimator: The model utilized for the weak learners
# (Warning: Don't forget to import the model that you decide to use for the weak learner).
# n_estimators: The maximum number of weak learners used.

# exemple, we define a model which uses decision trees of max_depth 2 as the weak learners, and it allows a maximum of 4 of them.
from sklearn.tree import DecisionTreeClassifier
model = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2),
                           n_estimators=4)
예제 #43
0
def main():
    """Run the diabetes classification experiments end to end.

    In order: a decision-tree learning curve, a second decision tree scored on
    the train/test split, an AdaBoost ensemble (SAMME) with timing and
    classification reports, a gradient-boosting feature-importance bar chart,
    a second AdaBoost fit, and finally SVM learning curves and accuracies on a
    fresh 70/30 split of the CSV file. Results are printed and plotted;
    nothing is returned.
    """
    data_percentage_array = np.linspace(0.1, 1, 10)
    full_data = load_data()
    x_train, x_test, y_train, y_test, X, y = split(full_data)

    # --- Decision tree: learning curve ---
    dt = tree.DecisionTreeClassifier(criterion="entropy", min_samples_split=20, max_depth=5,
                                     min_samples_leaf=10, random_state=0)

    train_sizes, average_train_scores, average_test_scores = plot_learning_curve(
        dt, x_train, y_train, x_test, y_test, cv=10, train_sizes=data_percentage_array)

    err_plot(np.linspace(0.1, 1, 10), average_train_scores, average_test_scores,
             "Decision Tree Classifier - Learning Curve")

    # --- Decision tree: fit and score on the held-out split ---
    dt = tree.DecisionTreeClassifier(criterion="entropy", min_samples_split=4, max_leaf_nodes=18, max_depth=13,
                                     min_samples_leaf=14, random_state=1)

    dt = dt.fit(x_train, y_train)
    y_pred = dt.predict(x_test)
    print(metrics.classification_report(y_test, y_pred))
    print('train accuracy: {}'.format(dt.score(x_train, y_train)))
    print('test accuracy: {}'.format(dt.score(x_test, y_test)))

    decision_Tree_matrix = metrics.confusion_matrix(y_test, y_pred)
    print(decision_Tree_matrix)

    # --- AdaBoost over decision trees (timed fit) ---
    start_time = time.time()
    bdt_real = AdaBoostClassifier(
        DecisionTreeClassifier(criterion="entropy", min_samples_split=11, max_depth=19,
                               min_samples_leaf=14, max_leaf_nodes=18, random_state=1),
        algorithm="SAMME",
        learning_rate=1)
    bdt_real.fit(x_train, y_train)
    print("--- %s seconds ---" % (time.time() - start_time))
    _, average_train_scores, average_test_scores = plot_learning_curve(
        bdt_real, x_train, y_train, x_test, y_test, cv=10, train_sizes=data_percentage_array)
    err_plot(np.linspace(0.1, 1, 10), average_train_scores, average_test_scores,
             "Decision Tree Not Pruned - Learning Curve")

    print('train accuracy: {}'.format(bdt_real.score(x_train, y_train)))
    print('test accuracy: {}'.format(bdt_real.score(x_test, y_test)))
    _pred = bdt_real.predict(x_test)

    print(metrics.confusion_matrix(y_test, _pred))

    y_train_pred = bdt_real.predict(x_train)

    print("test")
    print(metrics.classification_report(y_test, _pred, target_names=['No Diabetes', 'Diabetes']))
    print("train")
    print(metrics.classification_report(y_train, y_train_pred, target_names=['No Diabetes', 'Diabetes']))

    # The raw CSV is loaded here only to obtain column names for the plot below.
    column_names = ["preg","plas","pres","skin","insu","mass","pedi","age","class"]
    with open('data/pima-indians-diabetes copy.csv') as f:
        data = pandas.read_csv(f, sep=',', names=column_names)

    # --- Feature importance ---
    # kNN is sensitive to irrelevant features; after seeing those results the
    # feature importances are inspected to check whether that was a factor.
    gbc = ensemble.GradientBoostingClassifier()
    gbc.fit(X, y)
    # Get the feature importances from the classifier
    feature_importance = gbc.feature_importances_
    # Rescale so the most important feature reads 100
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    plt.figure(figsize=(16, 12))
    plt.barh(pos, feature_importance[sorted_idx], align='center', color='#7A68A6')
    plt.yticks(pos, np.asanyarray(data.columns.tolist())[sorted_idx])
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance')
    plt.show()

    # --- Second AdaBoost configuration (10 estimators, learning rate 0.3) ---
    bdt_discrete = AdaBoostClassifier(
        DecisionTreeClassifier(criterion="entropy", min_samples_split=11, max_depth=19,
                               min_samples_leaf=14, max_leaf_nodes=18, random_state=1),
        learning_rate=.3,
        algorithm="SAMME", n_estimators=10)
    bdt_discrete.fit(x_train, y_train)
    y_pred = bdt_discrete.predict(x_test)

    # --- SVM experiments on a fresh 70/30 split (timed) ---
    start_time = time.time()

    column_names = ["preg","plas","pres","skin","insu","mass","pedi","age","class"]
    with open('data/pima-indians-diabetes copy.csv') as f:
        data = pandas.read_csv(f, sep=',', names=column_names)

    # All columns but the last are features; the last column is the target.
    X, y = data.iloc[:, :-1], data.iloc[:, -1]
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)
    clf = svm.SVC(C=.1, kernel='linear', gamma=.001)
    clf.fit(x_train, y_train)
    learning_curve(clf, x_train, y_train)

    estimator = svm.SVC(degree=1, C=1, kernel='poly')

    plt.show()
    _, average_train_scores, average_test_scores = plot_learning_curve(
        estimator, x_train, y_train, x_test, y_test, cv=10,
        train_sizes=data_percentage_array)

    err_plot(np.linspace(0.1, 1, 10), average_train_scores, average_test_scores, 'SVM Linear Kernel')
    train_sizes, average_train_scores, average_test_scores = get_learning_curve(
        estimator, x_train, y_train, x_test, y_test)
    err_plot(train_sizes, average_train_scores, average_test_scores)

    estimator.fit(x_train, y_train)

    # print() calls, like the rest of this function; the bare Python 2 print
    # statements used here before do not parse under Python 3.
    print('train accuracy: {}'.format(estimator.score(x_train, y_train)))
    print('test accuracy: {}'.format(estimator.score(x_test, y_test)))
    print("--- %s seconds ---" % (time.time() - start_time))
def build_model():
    """Return an AdaBoostClassifier fitted on a 70% training split.

    The module-level ``X`` and ``y`` are split with ``test_size=0.3`` and
    ``random_state=100``; only the training part is used.
    """
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=0.3, random_state=100)
    classifier = AdaBoostClassifier()
    classifier.fit(X_train, y_train)
    return classifier
예제 #45
0
    ncells_missclass = []

    abscissa.append(j)

    print(j)

    for i in range(10):
        training_features = training_features_vector[i]
        testing_features = testing_features_vector[i]
        training_target = training_target_vector[i]
        testing_target = testing_target_vector[i]

        # Training with AdaBoost (boosting over depth-10 decision trees)
        boosted = AdaBoostClassifier(DecisionTreeClassifier(max_depth=10),
                                     n_estimators=10)
        boosted.fit(training_features, training_target)

        # Comparing prediction with testing values
        prediction = boosted.predict(testing_features)

        # Number of missclassified cells
        lst = [item[j] for item in prediction]

        testing_target = np.array(testing_target)
        lst3t = [item[j] for item in testing_target]
        ncells_missclass.append(int(150 * mean_squared_error(lst, lst3t)))

    location.append(np.argmin(ncells_missclass[0:10]))
    minimum.append(min(ncells_missclass))

print(location)
예제 #46
0
# Labels for the 702 training e-mails: the first 351 get label 0 and the
# remaining 351 get label 1. An open-ended slice is used so the last element
# (index 701) is labelled too; a stop index of 701 would leave it at 0.
# NOTE(review): assumes the corpus is ordered as 351 class-0 mails followed by
# 351 class-1 mails — confirm against the contents of train_dir.
train_labels = np.zeros(702)
train_labels[351:] = 1
train_matrix = extract_features(train_dir)

#training SVM, Ensemble and Naive Bayes classifiers
model1 = LinearSVC()
model2 = MultinomialNB()
model3 = RandomForestClassifier()
model4 = AdaBoostClassifier()

model1.fit(train_matrix, train_labels)
model2.fit(train_matrix, train_labels)
model3.fit(train_matrix, train_labels)
model4.fit(train_matrix, train_labels)

#test unseen mail for spam
test_dir = 'C:\\Users\\100557540\\Google Drive\\Pranav\\IITROPAR\\Dr. Puneet Goyal\\Main papers to implement\\text mining for phising email detetcion\\test-mails'
test_matrix = extract_features(test_dir)
test_labels = np.zeros(260)
test_labels[130:260] = 1

result1 = model1.predict(test_matrix)
result2 = model2.predict(test_matrix)
result3 = model3.predict(test_matrix)
result4 = model4.predict(test_matrix)

#confusion matrix for SVM and Naive Bayes models
print(confusion_matrix(test_labels, result1))
print(confusion_matrix(test_labels, result2))
예제 #47
0
testX = [r[1:] for r in testset]
testY = [r[0] for r in testset]

for max_depth in x_list:
    newOut = []
    for max_leaf_nodes in y_list:
        fileout = "MaxDepth" + str(max_depth)+"MaxLeaves" + str(max_leaf_nodes)
        clf = tree.DecisionTreeClassifier(criterion="gini", max_depth=max_depth, max_leaf_nodes=max_leaf_nodes)
        clf = AdaBoostClassifier(base_estimator=clf, n_estimators=1000, learning_rate=1.0, algorithm='SAMME.R')
        # clf = MLPClassifier(solver='sgd', alpha=1e-5, hidden_layer_sizes=(), random_state=1, max_iter=1000)
        # clf = svm.SVC(gamma='scale')

        newTrainY = []
        for r in trainY:
            newTrainY = newTrainY + [[r]]
        clf = clf.fit((trainX), np.array(newTrainY).ravel())

        newOut += [accuracy_score(testY, clf.predict(testX))]
        csvout += [[str(max_depth), str(max_leaf_nodes), str(accuracy_score(testY, clf.predict(testX)))]]

        print(accuracy_score(testY, clf.predict(testX)))
        print(str(accuracy_score(testY, clf.predict(testX), normalize=False)) + " correct of " + str(len(testX)))
        filename = "../letter-classification/boosting/" + fileout + "/output.txt"
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, "w") as f:
            f.write(str(accuracy_score(testY, clf.predict(testX))) + "\n")
            f.write(str(accuracy_score(testY, clf.predict(testX), normalize=False)) + " correct of " + str(len(testX)))

        def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                                n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
            """
예제 #48
0
knn_opt.fit(train_x, train_y)
test_y_knn = knn_opt.predict_proba(test_x)

knn_out = submission
knn_out['target'] = test_y_knn

knn_out['target'] = 1 - knn_out['target']
knn_out.to_csv('knn_predictions1.csv', index=False, float_format='%.4f')

ada_opt = AdaBoostClassifier(algorithm='SAMME.R',
                             base_estimator=None,
                             learning_rate=1.0,
                             n_estimators=200,
                             random_state=None)

ada_opt.fit(train_x, train_y)
test_y_ada = ada_opt.predict_proba(test_x)

ada_out = submission
ada_out['target'] = test_y_ada
ada_out['target'] = 1 - ada_out['target']

ada_out.to_csv('ada_predictions1.csv', index=False, float_format='%.4f')

gb_opt = GradientBoostingClassifier(criterion='friedman_mse',
                                    init=None,
                                    learning_rate=0.1,
                                    loss='deviance',
                                    max_depth=3,
                                    max_features=None,
                                    max_leaf_nodes=None,
예제 #49
0
clf_svm.fit(train_X, train_Y)
output_svm = clf_svm.predict(test_X)

print('GradientBoosting model prediction')
clf_GB = GradientBoostingClassifier()
clf_GB.fit(train_X, train_Y)
output_GB = clf_GB.predict(test_X)

print('RandomForest model prediction')
clf_RF = RandomForestClassifier()
clf_RF.fit(train_X, train_Y)
output_RF = clf_RF.predict(test_X)

print('AdaBoost model prediction')
clf_Ada = AdaBoostClassifier()
clf_Ada.fit(train_X, train_Y)
output_Ada = clf_Ada.predict(test_X)

True_Positive = 0
True_Negative = 0
False_Positive = 0
False_Negative = 0

for i in range(len(output_svm)):
    ensemble_output = float(output_svm[i] + output_Ada[i] + output_RF[i] +
                            output_GB[i]) / 4
    if (ensemble_output >= 0.5):
        ensemble_output = 1
    else:
        ensemble_output = 0
# 10-fold cross-validation of AdaBoost. KFold only uses random_state when
# shuffle=True; without shuffling it has no effect, and recent scikit-learn
# releases raise ValueError for that combination, so it is not passed here.
# The folds are the same consecutive, unshuffled splits as before.
kfold = model_selection.KFold(n_splits=10)
model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print(results.mean())


# In[7]:


# Features are every column except the target category; the target is the
# electricity consumption category.
X=df.drop('electricity_consumption_category', axis=1)
y=df['electricity_consumption_category']

# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module has been removed from scikit-learn.
from sklearn.model_selection import train_test_split
# Split into training and test sets (30% held out, fixed seed)
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.3, random_state=101)

from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(n_estimators=100)
clf.fit(X_train, y_train)
y_pred= clf.predict(X_test)
print(y_pred)


# In[8]:


for estimator in clf.estimators_:
    print(estimator.predict(X_test))
    print(clf.estimator_errors_)

예제 #51
0
        X_train, X_test, y_train, y_test = train_test_split(df_all.drop(
            "label", axis=1),
                                                            df_all["label"],
                                                            test_size=0.2)

        rf_model = RandomForestClassifier()
        adb_model = AdaBoostClassifier()
        et_model = ExtraTreesClassifier()
        lgb_model = lgb.LGBMClassifier()
        lr_model = LogisticRegression()
        gbdt_model = GradientBoostingClassifier()
        Dt_model = DecisionTreeClassifier()

        rf_model.fit(X_train, y_train)
        adb_model.fit(X_train, y_train)
        et_model.fit(X_train, y_train)
        lgb_model.fit(X_train, y_train)
        lr_model.fit(X_train, y_train)
        gbdt_model.fit(X_train, y_train)
        Dt_model.fit(X_train, y_train)
        print(
            'rf_model:',
            round(
                metrics.f1_score(y_test,
                                 rf_model.predict(X_test),
                                 average='weighted'), 4))
        print(
            'adb_model:',
            round(
                metrics.f1_score(y_test,
# Concatenate the arrays
X = np.concatenate((X1, X2)) # 500*2
# print(np.shape(X1), np.shape(X2), np.shape(X))
y = np.concatenate((y1, - y2 + 1))

# Create and fit an AdaBoosted decision tree
# SAMME vs. SAMME.R: the main difference is how the weak learners are weighted.
# SAMME extends the binary AdaBoost algorithm and weights each weak learner by
# how well it classifies the sample set, whereas SAMME.R weights it using the
# predicted class probabilities on the sample set.
# Because SAMME.R works with continuous probability values it generally needs
# fewer iterations than SAMME, which is why it was AdaBoostClassifier's default
# algorithm (per the original author's note).
# The default SAMME.R is usually sufficient, but note that with SAMME.R the
# base_estimator must be a classifier that supports probability prediction.
# SAMME has no such restriction.
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                         algorithm="SAMME",
                         n_estimators=200)

bdt.fit(X, y)

plot_colors = "br"
plot_step = 0.02
class_names = "AB"

plt.figure(figsize=(10, 5))

# Plot the decision boundaries
plt.subplot(121)
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 # X shape 500 * 2
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                     np.arange(y_min, y_max, plot_step))

# ravel() and flatten() both reduce a multi-dimensional array to one dimension; ravel returns a view, flatten returns a copy
print("\n\n\n")

for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)

#%%     AdaBoost

from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                             n_estimators=200,
                             algorithm="SAMME.R",
                             learning_rate=0.5)

ada_clf.fit(X_train, y_train)

y_pred = ada_clf.predict(X_test)

print("\n\nAccuracy: ", ada_clf.__class__.__name__, " : ",
      accuracy_score(y_test, y_pred))

#%%     Gradient Boosting

from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2,
                                 n_estimators=3,
                                 learning_rate=1.0)
gbrt.fit(X, y)
예제 #54
0
X=df_wine[['Alcohol', 'OD280/OD315 of diluted wines']].values

le=LabelEncoder()
y=le.fit_transform(y)
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

tree=DecisionTreeClassifier(criterion='entropy', random_state=1, max_depth=1)
ada=AdaBoostClassifier(base_estimator=tree, n_estimators=500, learning_rate=0.1, random_state=1)
tree=tree.fit(X_train, y_train)
y_train_pred=tree.predict(X_train)
y_test_pred=tree.predict(X_test)
tree_train=accuracy_score(y_train, y_train_pred)
tree_test=accuracy_score(y_test, y_test_pred)
print('Decision tree train/test accuracies %.3f/%.3f' % (tree_train, tree_test))

ada=ada.fit(X_train, y_train)
y_train_pred=ada.predict(X_train)
y_test_pred=ada.predict(X_test)
ada_train=accuracy_score(y_train, y_train_pred)
ada_test=accuracy_score(y_test, y_test_pred)
print('AdaBoost train/test accuracies %.3f/%.3f' % (ada_train, ada_test))

# plotting the decision regions
x_min=X_train[:,0].min()-1
x_max=X_train[:,0].max()+1
y_min=X_train[:,1].min()-1
y_max=X_train[:,1].max()+1
xx, yy=np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
f, axarr = plt.subplots(1, 2, sharex='col', sharey='row', figsize=(8,3))
for idx, clf, tt in zip([0,1], [tree,ada], ['Decision Tree', 'AdaBoost']):
    clf.fit(X_train, y_train)
        dct_cr_zigzag.append(diag_cr)
        flip = not flip
    return np.concatenate([
        np.concatenate(dct_y_zigzag),
        np.concatenate(dct_cb_zigzag),
        np.concatenate(dct_cr_zigzag)
    ])


actual = [
    "Cubism", "Impressionism", "Pop Art", "Pop Art", "Impressionism", "Cubism",
    "Cubism", "Cubism", "Impressionism", "Pop Art", "Pop Art", "Impressionism",
    "Realism", "Realism", "Realism", "Realism"
]
model = AdaBoostClassifier(n_estimators=200)
model.fit(training_data, responses)
i = 0
image_lbp = []
image_csd = []
data = []
labels = []

# Load every JPEG test image twice: once with flag 0 (grayscale) and once with
# OpenCV's default flags. The grayscale copies go to image_lbp and the default
# copies to image_csd. The path is a raw string because in a normal Python 3
# string literal the "\U" of "\Users" starts a unicode escape and the file
# would not even parse.
for filename in glob.glob(
        r'C:\Users\dutta\Desktop\Project\Images\Test\T/*.jpg'):
    image = cv2.imread(filename, 0)
    image_lbp.append(image)
    img = cv2.imread(filename)
    image_csd.append(img)

for img1, img2 in zip(image_lbp, image_csd):
    h = localbinarypattern(img1)
예제 #56
0
# print different accuracy measurements
print 'accuracy score: ', logit.score(x_test, y_test)
print 'precision:', precision_score(y_test,
                                    logit.predict(x_test),
                                    average='weighted')
print 'recall:', recall_score(y_test,
                              logit.predict(x_test),
                              average='weighted')
print 'mean cross validation score:', np.mean(
    cross_val_score(logit, pd.concat([x_train, x_test]),
                    pd.concat([y_train, y_test])))

# apply AdaBoost with 1 to 19 estimators (range(1, 20)) and print the accuracies
for i in range(1, 20):
    clf = AdaBoostClassifier(n_estimators=i, base_estimator=logit)
    clf.fit(x_train, y_train)
    score = clf.score(x_test, y_test)
    precision = precision_score(y_test,
                                clf.predict(x_test),
                                average='weighted')
    recall = recall_score(y_test, clf.predict(x_test), average='weighted')
    cross_val_mean = np.mean(
        cross_val_score(clf, pd.concat([x_train, x_test]),
                        pd.concat([y_train, y_test])))
    print i, score, precision, recall, cross_val_mean

# initial values accuracy, precision, recall, cross validation mean
# 0.741935483871 0.814616332408 0.741935483871 0.724832214765

# after boosting
예제 #57
0
plt.scatter(grade_slow, bumpy_slow, color = "r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()
################################################################################


### your code here!  name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from time import time

# AdaBoost with scikit-learn's default settings.
clf = AdaBoostClassifier()

# Time the training step.  The prints use a single parenthesised argument so
# they produce the same output under Python 2 and Python 3.
t_train = time()
clf.fit(features_train, labels_train)
print("Training time: %f s." % round(time() - t_train, 3))

# Time the prediction step on the test features.
t_pred = time()
pred = clf.predict(features_test)
print("Predicting time: %f s." % round(time() - t_pred, 3))

# accuracy_score's signature is (y_true, y_pred): the true labels go first.
acc = accuracy_score(labels_test, pred)
print("Accuracy: %f. " % acc)

# Drawing the decision boundary is optional: when prettyPicture is not
# defined the resulting NameError is ignored on purpose.
try:
    prettyPicture(clf, features_test, labels_test)
except NameError:
    pass
              'GaussianNB',
              'KNeighborsClassifier', 
              'LogisticRegression']

# %%
# Fit every candidate model and report its test accuracy as a percentage.
# A model that fails to fit or score is skipped (best effort), but only
# ordinary errors are caught -- KeyboardInterrupt / SystemExit still
# propagate -- and the reason for the skip is printed instead of being lost.
for index, model in enumerate(models):
    name = modelnames[index]
    try:
        model.fit(trainX_tf, trainY_tf)
        accuracy_pct = round(model.score(testX_tf, testY_tf) * 100, 2)
    except Exception as exc:
        print("Skipped", name, "-", repr(exc))
    else:
        print(name, "Accuracy =", accuracy_pct, "%")

# %%
#  AdaBoost over decision trees (base_estimator=DecisionTreeClassifier)
tree_base = DecisionTreeClassifier()
ab = AdaBoostClassifier(base_estimator=tree_base, n_estimators=1000)
ab.fit(trainX_tf, trainY_tf)
tree_pct = ab.score(testX_tf, testY_tf) * 100
print('AdaBoost Accuracy with Decision Tree (Scaled Data)= ', tree_pct)

# %%
#  AdaBoost over random forests (base_estimator=RandomForestClassifier)
forest_base = RandomForestClassifier(n_estimators=1000, random_state=10)
ab = AdaBoostClassifier(base_estimator=forest_base, n_estimators=1000)
ab.fit(trainX_tf, trainY_tf)
forest_pct = ab.score(testX_tf, testY_tf) * 100
print('AdaBoost Accuracy with Random Forest (Scaled Data)= ', forest_pct)

# %%
#  AdaBoost over logistic regression (base_estimator=LogisticRegression)
logreg_base = LogisticRegression(max_iter=1000, solver='lbfgs')
ab = AdaBoostClassifier(base_estimator=logreg_base, n_estimators=1000)
ab.fit(trainX_tf, trainY_tf)
logreg_pct = ab.score(testX_tf, testY_tf) * 100
print('AdaBoost Accuracy with Logistic Reg (Scaled Data)= ', logreg_pct)

# %%
예제 #59
0
# K-nearest-neighbours classifier with default settings, evaluated on the
# validation split: prints log loss (from the class probabilities) and then
# accuracy (from the hard predictions).
from sklearn.neighbors import KNeighborsClassifier

rf6 = KNeighborsClassifier()
rf6.fit(X_train, y_train)
y_val_pred6, y_val_pred_acc6 = rf6.predict_proba(X_val), rf6.predict(X_val)
for metric, estimate in ((log_loss, y_val_pred6),
                         (accuracy_score, y_val_pred_acc6)):
    print(metric(y_val, estimate))

#AdaBoost
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with 250 boosting rounds, evaluated on the validation split:
# prints log loss (from the class probabilities) and then accuracy.
rf7 = AdaBoostClassifier(n_estimators=250)
rf7.fit(X_train, y_train)
# The probabilities must live in y_val_pred7: that is the name log_loss reads
# below, and it keeps the KNN probabilities in y_val_pred6 from being
# overwritten.
y_val_pred7 = rf7.predict_proba(X_val)
y_val_pred_acc7 = rf7.predict(X_val)
print(log_loss(y_val, y_val_pred7))
print(accuracy_score(y_val, y_val_pred_acc7))

#Compare ROC of each Algorithm
import matplotlib.pyplot as plt
from sklearn import metrics
#RandomForest
# Plot the ROC curve with its AUC in the legend, plus a dashed red diagonal
# as the chance-level reference.
# NOTE(review): roc_curve takes (y_true, y_score).  y_val_pred_acc1 and
# y_val_pred1 are defined outside this excerpt; by analogy with the other
# models they look like predict() labels and predict_proba() output.  If so,
# the true labels (y_val) and the positive-class scores should be passed
# instead -- confirm.
fpr1, tpr1, threshold1 = metrics.roc_curve(y_val_pred_acc1, y_val_pred1)
roc_auc1 = metrics.auc(fpr1, tpr1)
plt.title('ROC of RandomForest')
plt.plot(fpr1, tpr1, 'b', label='AUC = %0.2f' % roc_auc1)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
        result_file.write(f"Data ratio: {DATA_FRAC}\n")
        result_file.write(f"Test ratio: {TEST_SIZE}\n")
        result_file.write(
            f"Sample size (test-train): {len(test_X)}-{len(train_X)}\n")
        result_file.write(f"Threshold: {THRESHOLD}\n")
        result_file.write(f"\n#,parameters,accuracy,finish_time(s)\n")

    #--- ADABOOST PARAMETER SWEEP STARTS ---#
    # Train and evaluate one AdaBoost model (depth-2 decision-tree base
    # estimator) per entry of PARAMETERS; each entry is unpacked as keyword
    # arguments to AdaBoostClassifier.
    i = 0  # running index of the parameter set, used in the log line and result row
    for n in PARAMETERS:
        print(f"{i}:\t{n}\tstarting...")
        clf = AdaBoostClassifier(
            base_estimator=DecisionTreeClassifier(max_depth=2), **n)

        t0 = time.time()
        clf.fit(train_X, train_y)  # training
        dT = time.time() - t0  # wall-clock seconds spent in fit() only
        test_results = clf.predict(test_X)  # test
        # Accuracy: fraction of predictions that equal the matching test_y entry.
        acc = float(np.count_nonzero(test_results == test_y)) / len(test_y)

        if WRITE_FILE:
            # One row per parameter set: index, parameters, accuracy, fit time.
            # Flushed right away, presumably so partial results reach disk
            # while the sweep is still running.
            result_file.write(f"{i},{n},{acc:.5f},{dT:.2f}\n")
            result_file.flush()
        i += 1

        train_data, test_data = model_selection.train_test_split(
            tweets, test_size=TEST_SIZE)

        train_X = np.array(train_data[X_column_names])
        train_y = np.array(train_data[Y_column_name])