Example #1
# imports assumed by this snippet (the original file's import lines were not captured)
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.tree import DecisionTreeClassifier


class Ensemble:

	def __init__(self, data):
		self.rf = RandomForestClassifier(n_estimators=80, n_jobs=-1, min_samples_split=45, criterion='entropy')
		self.lda = LDA()
		self.dec = DecisionTreeClassifier(criterion='entropy')
		self.ada = AdaBoostClassifier(n_estimators=500, learning_rate=0.25)

		self.make_prediction(data)


	def make_prediction(self, data):
		'''
		Make an ensemble prediction
		'''
		self.rf.fit(data.features_train, data.labels_train)
		self.lda.fit(data.features_train, data.labels_train)
		self.dec.fit(data.features_train, data.labels_train)
		self.ada.fit(data.features_train, data.labels_train)

		pre_pred = []
		self.pred = []

		ada_pred = self.ada.predict(data.features_test)
		rf_pred = self.rf.predict(data.features_test)
		lda_pred = self.lda.predict(data.features_test)
		dec_pred = self.dec.predict(data.features_test)

		for i in range(len(rf_pred)):
			pre_pred.append([ rf_pred[i], lda_pred[i], dec_pred[i], ada_pred[i] ])

		for entry in pre_pred:
			pred_list = sorted(entry, key=entry.count, reverse=True)
			self.pred.append(pred_list[0])
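
A note on the voting step above: sorting each row with key=entry.count re-counts every element and is quadratic in the number of models. A minimal equivalent sketch using collections.Counter (majority_vote and rows are illustrative names, not part of the original class):

from collections import Counter

def majority_vote(rows):
    # rows: one list of model predictions per sample, e.g. [rf, lda, dec, ada]
    # most_common(1) returns the (value, count) pair with the highest count
    return [Counter(row).most_common(1)[0][0] for row in rows]

# self.pred = majority_vote(pre_pred)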
Example #2
File: AdaBoost.py  Project: AravindRam/ML
def Adaboost(TrainData,TestData):
    features=['Time','Season','Hour','Minute','District']

    clf = AdaBoostClassifier(tree.DecisionTreeClassifier(),n_estimators=30)

    size=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
    for i in range(0,len(size)):
        train,validation= train_test_split(TrainData, train_size=size[i])

        while len(set(train['Category'])) != len(set(validation['Category'])):
            train,validation= train_test_split(TrainData, train_size=size[i])
        clf = clf.fit(train[features], train['Category'])
        """stop = timeit.default_timer()
        print "Runnin  time adaboost is ", stop-start"""
        predicted=np.array(clf.predict_proba(validation[features]))
        model=clf.predict(train[features])
        model1=clf.predict(validation[features])

        #scores = cross_val_score(clf, validation[features], validation['Category'])
        #print "Scores mean is",scores.mean()
        #accuracy
        print "Training accuracy is", accuracy_score(train['Category'].values.tolist(),model)
        print "Validation accuracy is",accuracy_score(validation['Category'].values.tolist(),model1)
        print "Precision is ",precision_score(validation['Category'].values.tolist(),model1,average='macro')
        print "Recall is ",recall_score(validation['Category'].values.tolist(),model1,average='macro')
        print "Log loss is", log_loss(validation['Category'].values.tolist(),predicted,eps=1e-15, normalize=True, sample_weight=None)


        #writing to file
        """Category_new=[]
Example #3
def test_classifiers2(data, ind):
    from sklearn.ensemble import AdaBoostClassifier
    clf = AdaBoostClassifier(n_estimators=100)
    clf.fit(data[ind[:1000], :-1], data[ind[:1000], -1])
    print(clf.score(data[ind[1000:], :-1], data[ind[1000:], -1]))
    out = clf.predict(data[ind[1000:], :-1])
    print(confusion_matrix(data[ind[1000:], -1], out))

    from sklearn.ensemble import GradientBoostingClassifier
    clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
    clf.fit(data[ind[:1000], :-1], data[ind[:1000], -1])
    print(clf.score(data[ind[1000:], :-1], data[ind[1000:], -1]))
    out = clf.predict(data[ind[1000:], :-1])
    print(confusion_matrix(data[ind[1000:], -1], out))

    from sklearn.neural_network import MLPClassifier
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(10, 10), random_state=1)
    clf.fit(data[ind[:1000], :-1], data[ind[:1000], -1])
    print(clf.score(data[ind[1000:], :-1], data[ind[1000:], -1]))
    out = clf.predict(data[ind[1000:], :-1])
    print(confusion_matrix(data[ind[1000:], -1], out))

    import xgboost as xgb
    xgb_model = xgb.XGBClassifier().fit(data[ind[:1000], :-1], data[ind[:1000], -1])
    out = xgb_model.predict(data[ind[1000:], :-1])
    a = confusion_matrix(data[ind[1000:], -1], out)
    print(float(a[0, 0] + a[1, 1]) / np.sum(a))
    print(a)
Example #4
def test_staged_predict():
    """Check staged predictions."""
    # AdaBoost classification
    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg, n_estimators=10)
        clf.fit(iris.data, iris.target)

        predictions = clf.predict(iris.data)
        staged_predictions = [p for p in clf.staged_predict(iris.data)]
        proba = clf.predict_proba(iris.data)
        staged_probas = [p for p in clf.staged_predict_proba(iris.data)]
        score = clf.score(iris.data, iris.target)
        staged_scores = [s for s in clf.staged_score(iris.data, iris.target)]

        assert_equal(len(staged_predictions), 10)
        assert_array_almost_equal(predictions, staged_predictions[-1])
        assert_equal(len(staged_probas), 10)
        assert_array_almost_equal(proba, staged_probas[-1])
        assert_equal(len(staged_scores), 10)
        assert_array_almost_equal(score, staged_scores[-1])

    # AdaBoost regression
    clf = AdaBoostRegressor(n_estimators=10)
    clf.fit(boston.data, boston.target)

    predictions = clf.predict(boston.data)
    staged_predictions = [p for p in clf.staged_predict(boston.data)]
    score = clf.score(boston.data, boston.target)
    staged_scores = [s for s in clf.staged_score(boston.data, boston.target)]

    assert_equal(len(staged_predictions), 10)
    assert_array_almost_equal(predictions, staged_predictions[-1])
    assert_equal(len(staged_scores), 10)
    assert_array_almost_equal(score, staged_scores[-1])
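
The staged_* methods exercised by this test are also handy for model selection: they score every intermediate ensemble in a single pass instead of refitting once per candidate n_estimators. A minimal sketch on synthetic data (all names here are illustrative):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, random_state=0)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, random_state=0)

clf = AdaBoostClassifier(n_estimators=200).fit(X_tr, y_tr)
val_scores = list(clf.staged_score(X_val, y_val))   # one score per boosting stage
best_n = int(np.argmax(val_scores)) + 1             # stages are 1-indexed
print(best_n, max(val_scores))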
Example #5
class Model_Adaboost(object):
    def __init__(self,model,parameter = {"n_estimators" : 50, "CV_size": 0}):
        self.train = model.train
        self.test = model.test
        self.CVsize = float(parameter["CV_size"].get())
        train = np.array(self.train)
        self.X_train = train[:, :-1]
        self.y_train = train[:, -1]
        self.X_train,self.X_CV,self.y_train,self.y_CV = train_test_split(self.X_train, self.y_train, test_size=self.CVsize)
        if self.CVsize == 0:
            self.clf = AdaBoostClassifier(n_estimators = int(parameter["n_estimators"].get()))
        self.model = model

    def fit(self):
        self.clf.fit(self.X_train,self.y_train)

    def score(self):
        pre = self.clf.predict(self.X_train)
        truth = self.y_train
        print ("score: " + str(self.clf.score(self.X_train,truth)))
        print ("f1: " + str(f1_score(truth,pre, average=None)))
        print ("AUC score: " + str(roc_auc_score(truth,pre)))

    def save_results(self):
        pre = self.model.clf.predict(self.model.test)
        df = pd.DataFrame({"predict":pre})
        fileName = tkFileDialog.asksaveasfilename()
        df.to_csv(fileName)

    def crossValidation(self):
        estimatorList = [3,5,7,10,13,15,20,25,30,50]
        bestScore = [0,0] #score,n_estimator
        bestF1ScoreNeg = [0,0]
        bestF1ScorePos = [0,0]
        #bestAUCScore = [0,0]
        for e in estimatorList:
            self.clf = AdaBoostClassifier(n_estimators = e)
            self.clf.fit(self.X_train,self.y_train)
            pre = self.clf.predict(self.X_CV)
            truth = self.y_CV
            score = self.clf.score(self.X_CV,truth)
            if score > bestScore[0]:
                bestScore[0] = score
                bestScore[1] = e

            f1pos = f1_score(truth,pre, average=None)[1]
            if f1pos > bestF1ScorePos[0]:
                bestF1ScorePos[0] = f1pos
                bestF1ScorePos[1] = e

            f1neg = f1_score(truth,pre, average=None)[0]
            if f1neg > bestF1ScoreNeg[0]:
                bestF1ScoreNeg[0] = f1neg
                bestF1ScoreNeg[1] = e

        print ("Adaboost:")
        print ("Best [score,n_estimators] on Cross Validation set: " + str(bestScore))
        print ("Best [f1(pos),n_estimators] on Cross Validation set: " + str(bestF1ScorePos))
        print ("Best [f1(neg),n_estimators] on Cross Validation set" + str(bestF1ScoreNeg))
Example #6
def AdaBoost(xtrain, xtest, ytrain, ytest):
    depth=75
    model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=depth)
    model.fit(xtrain, ytrain)
    print 'Adaboost with %d estimators' % depth
    print 'Test Performance'
    eval(ytest, model.predict(xtest))
    print 'Train Performance'
    eval(ytrain, model.predict(xtrain))
Example #7
def ada_boost(X,y, nf = 2, ne = 50, lr=1):
    y = y.astype(float)
    Xs = X.astype(float)
    col_names = X.columns
    Xs_t, Xs_holdout, y_t, y_holdout = train_test_split(Xs, y, train_size=.8)
    Xs_t = Xs_t.set_index([range(len(Xs_t))])
    Xs_holdout = Xs_holdout.set_index([range(len(Xs_holdout))])
    y_t = pd.DataFrame(y_t).set_index([range(len(y_t))])
    y_holdout = pd.DataFrame(y_holdout).set_index([range(len(y_holdout))])

    kf = KFold(len(Xs_t), nf)

    output_table = []
    precisions = []
    accuracies = []
    F1s = []
    fold_count = 1
    for train_index, test_index in kf:
        results = []
        Xs_train, Xs_test = Xs_t.iloc[train_index,:], Xs_t.iloc[test_index,:]
        y_train, y_test = y_t.iloc[train_index,:], y_t.iloc[test_index,:]
        y_train = np.array(y_train)
        y_test = np.array(y_test)
        my_ada = AdaBoostClassifier(n_estimators=ne, learning_rate = lr)
        my_ada.fit(Xs_train, y_train)
        pred = my_ada.predict(Xs_test)
        pred = np.array(pred)
        output_table.append(' ')
        output_table.append("Fold "+ str(fold_count) + ':')
        output_table.append("Precision Score: "+str(precision_score(pred, y_test)))
        output_table.append("Accuracy Score: "+ str(accuracy_score(pred, y_test)))
        output_table.append("F1 Score: "+str(f1_score(pred, y_test)))
        precisions.append(precision_score(pred, y_test))
        accuracies.append(accuracy_score(pred, y_test))
        F1s.append(f1_score(pred, y_test))
        fold_count += 1
    pred_holdout = my_ada.predict(Xs_holdout)
    pred_holdout = np.array(pred_holdout)
    cm = confusion_matrix(y_holdout, pred_holdout)
    TN = cm[0][0]
    FP = cm[0][1]
    TP = cm[1][1]
    FN = cm[1][0]
    print "Mean Precision: ", np.mean(precisions)
    print "Mean F1s: ", np.mean(F1s)
    print "True Positive Rate (Sensitivity): ", TP*1./(TP+FN)#cm[1][1]*1./(cm[1][1]+cm[0][1])
    print "True Negative Rate (Specificity): ", TN*1./(TN+FP)#cm[0][0]*1./(cm[0][0]+cm[1][0])
    print "Precision: ", TP*1./(TP+FP), #precision_score(pred_holdout, y_holdout)
    print "Accuracy: ", (TP+TN)*1./(TP+TN+FP+FN), #accuracy_score(pred_holdout, y_holdout)
    indices = np.argsort(my_ada.feature_importances_)
    figure = plt.figure(figsize=(10,7))
    plt.barh(np.arange(len(col_names)), my_ada.feature_importances_[indices],
             align='center', alpha=.5)
    plt.yticks(np.arange(len(col_names)), np.array(col_names)[indices], fontsize=14)
    plt.xticks(fontsize=14)
    _ = plt.xlabel('Relative importance', fontsize=18)
    return my_ada
Example #8
def eval(ds, testNum, p, splitProportion=0.2):
    #testNum=1
    #splitProportion=0.2
    
    allFeaturesF1=[]
    allFeaturesRecall=[]
    allFeaturesPrecision=[]
    
    featureSelctedF1=[]
    featureSelctedRecall = []
    featureSelctedPrecision = []
    
    for _ in range(testNum):
        tstdata, trndata = ds.splitWithProportion( splitProportion )
        X, Y = labanUtil.fromDStoXY(trndata)
        X_test, Y_test = labanUtil.fromDStoXY(tstdata)
        #localF1s = []
        #localRecalls = []
        #localPercisions = []
        for y, y_test in zip(Y, Y_test):
            if all(v == 0 for v in y):
                continue
            #clf = LinearSVC()#fit_intercept=True, C=p)
            #clf.sparsify()
            
            #clf = RandomForestClassifier()#criterion='entropy')
            #clf = tree.DecisionTreeClassifier()#max_depth=p)
            clf = AdaBoostClassifier()
            #clf = GradientBoostingClassifier()#, learning_rate=lr)
            #clf = ExtraTreesClassifier(n_estimators=p)
                        
            #svc = LinearSVC()
            #selector = RFE(estimator=svc, n_features_to_select=p*19, step=0.2)
            selector = SelectPercentile(chooser, percentile=p)
            
            selector.fit(X, y)
            name = str(clf).split()[0].split('(')[0]
            clf.fit(selector.transform(X), y)
            pred = clf.predict(selector.transform(X_test))
            
            featureSelctedF1.append(metrics.f1_score(y_test, pred))
            featureSelctedRecall.append(metrics.recall_score(y_test, pred))
            featureSelctedPrecision.append(metrics.precision_score(y_test, pred)) 
            
            clf.fit(X, y)
            pred = clf.predict(X_test)
            
            allFeaturesF1.append(metrics.f1_score(y_test, pred))
            allFeaturesRecall.append(metrics.recall_score(y_test, pred))
            allFeaturesPrecision.append(metrics.precision_score(y_test, pred))

    return np.mean(allFeaturesF1), np.mean(featureSelctedF1), \
        np.mean(allFeaturesRecall), np.mean(featureSelctedRecall), \
        np.mean(allFeaturesPrecision), np.mean(featureSelctedPrecision), \
        name
Example #9
def AdaBC(train,test,train_target,test_target,weights=None, n=500, lr = 1):
    abc = AdaBoostClassifier(n_estimators = n, learning_rate = lr)
    abc.fit(train, train_target, sample_weight = weights)
    res = abc.predict(train)
    
    print '*************************** AdaBC ****************'
    print classification_report(train_target,res)
    
    res1 = abc.predict(test)
    print classification_report(test_target,res1)
    return abc
Example #10
def test_adaboost_classifier(train_test_sets):
    """ Adaboost Classifier with Decision Tree Stumps. """
    X_train, X_test, y_train, y_test = train_test_sets
    clf = AdaBoostClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_train)
    print "ADABOOST CLASSIFIER RESULTS"
    print "\tTraining accuracy is ", metrics.accuracy_score(y_train, y_pred, normalize=True)

    y_pred = clf.predict(X_test)
    print_metrics(y_test, y_pred)
Example #11
File: main.py  Project: gabrielfarah/Kaggle
def perform_emsamble_model():
    #get data from csv file
    x , y_votes, y_comments, y_views, lat = read_train_data()
    #transform to numpy arrays for easier handling
    y_votes = np.array(y_votes)
    y_comments = np.array(y_comments)
    y_views = np.array(y_views)
    #get test data
    x_test, ids, lat = read_test_data()
    #Change the parameters from the objects with the values from gridsearch
    vec_votes = CountVectorizer(stop_words=None, strip_accents='unicode',analyzer='word',ngram_range=(1, 2), min_df=2)
    vec_comments = CountVectorizer(stop_words=None, strip_accents='unicode',analyzer='word',ngram_range=(1, 2), min_df=2)
    vec_views = CountVectorizer(stop_words=None, strip_accents='unicode',analyzer='word',ngram_range=(1, 2), min_df=2)
    #transform x and x_test into document-term matrices to feed the classifier (CountVectorizer yields raw counts, not TF-IDF)
    x_votes = vec_votes.fit_transform(x)
    x_comments = vec_comments.fit_transform(x)
    x_views = vec_views.fit_transform(x)
    x_test_transformed_votes = vec_votes.transform(x_test)
    x_test_transformed_comments = vec_comments.transform(x_test)
    x_test_transformed_views = vec_views.transform(x_test)
    print "TFIDF Matrixes generated"
    print " LSA transforming"
    lsa_votes = TruncatedSVD(500)
    lsa_comments = TruncatedSVD(500)
    lsa_views = TruncatedSVD(500)
    x_votes = lsa_votes.fit_transform(x_votes)
    print "LSA Votes Done.."
    print
    x_comments = lsa_comments.fit_transform(x_comments)
    print "LSA Comments Done.."
    print
    x_views = lsa_views.fit_transform(x_views)
    print "LSA Views Done.."
    print
    x_test_transformed_votes = lsa_votes.transform(x_test_transformed_votes)
    x_test_transformed_comments = lsa_comments.transform(x_test_transformed_comments)
    x_test_transformed_views = lsa_views.transform(x_test_transformed_views)
    print "SLA Finished.."
    ada_votes = AdaBoostClassifier(base_estimator=RandomForestClassifier())
    ada_comments = AdaBoostClassifier(base_estimator=RandomForestClassifier())
    ada_views = AdaBoostClassifier(base_estimator=RandomForestClassifier())
    ada_votes.fit(x_votes, y_votes)
    ada_comments.fit(x_comments, y_comments)
    ada_views.fit(x_views, y_views)
    print "Fitting done"
    print
    #predict number of votes 
    pred_votes = ada_votes.predict(x_test_transformed_votes)
    pred_comments = ada_comments.predict(x_test_transformed_comments)
    pred_views = ada_views.predict(x_test_transformed_views)
    #generate submission response csv file
    create_csv_response(len(x_test), ids, pred_views, pred_votes, pred_comments)
Example #12
def AdaBoost(X, Y, XTest, YTest):
    print '-----------------------------------------------------'

    # param_grid = {'learning_rate': [0.1, 0.3, 0.6, 1, 3, 6, 10]}

    # tree_grid = GridSearchCV(AdaBoostClassifier(), param_grid)
    tree_grid = AdaBoostClassifier(n_estimators=100, learning_rate=2)
    tree_grid.fit(X, Y)

    # print("The best parameters are %s with a score of %0.2f"
    #       % (tree_grid.best_params_, tree_grid.best_score_))

    print "Computing training statistics"
    dtree_predict_time_training = time.time()
    Ypred_dtree_training = tree_grid.predict(X)
    dtree_predict_time_training = time.time() - dtree_predict_time_training

    dtree_accuracy_training = metrics.accuracy_score(Y, Ypred_dtree_training)
    dt_precision_training = metrics.precision_score(Y, Ypred_dtree_training,
                                                    average='binary')
    dtree_recall_training = metrics.recall_score(Y, Ypred_dtree_training,
                                                 average='binary')

    print "DT training prediction time: " + str(dtree_predict_time_training)
    print "DT training accuracy Score: " + str(dtree_accuracy_training)
    print "DT training precision Score: " + str(dt_precision_training)
    print "DT training recall Score: " + str(dtree_recall_training)

    print "Computing testing statistics"
    dtree_predict_time_test = time.time()
    Ypred_dtree_test = tree_grid.predict(XTest)
    dtree_predict_time_test = time.time() - dtree_predict_time_test

    dtree_accuracy_test = metrics.accuracy_score(YTest, Ypred_dtree_test)
    dt_precision_test = metrics.precision_score(YTest, Ypred_dtree_test,
                                                average='binary')
    dtree_recall_test = metrics.recall_score(YTest, Ypred_dtree_test,
                                             average='binary')

    print "DT test prediction time: " + str(dtree_predict_time_test)
    print "DT test accuracy Score: " + str(dtree_accuracy_test)
    print "DT test precision Score: " + str(dt_precision_test)
    print "DT test recall Score: " + str(dtree_recall_test)

    print "Creating ROC curve"
    y_true = YTest
    y_score = tree_grid.predict_proba(XTest)
    fprSVM, tprSVM, _ = metrics.roc_curve(y_true=y_true,
                                          y_score=y_score[:, 0],
                                          pos_label=0)
    plt.plot(fprSVM, tprSVM, 'c-', label='ADA')
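
If an AUC number is wanted next to the ROC curve above, it can be computed from the same probability column; a sketch under this example's convention that column 0 of predict_proba scores the pos_label=0 class (y_true assumed convertible to a numpy array):

from sklearn import metrics
import numpy as np

auc = metrics.roc_auc_score(np.asarray(y_true) == 0, y_score[:, 0])
print("ADA AUC: " + str(auc))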
Example #13
def experiment_estimators_AdaBoostRandomForest():
    avgError = []
    x_learners = []
    rf = RandomForestClassifier(n_estimators=maxLearners, max_depth = maxDepth, warm_start = False)
    for k_estimators in range(10,150,10):
        k = 10
        skf = StratifiedKFold(labels,n_folds=k)
        averageError = 0.0
        for train_index, test_index in skf:
            X_train, X_test = mfcc[:,train_index], mfcc[:,test_index]
            y_train, y_test = labels[train_index], labels[test_index]
            adb = AdaBoostClassifier(base_estimator=rf, n_estimators=k_estimators, learning_rate=0.01)
            adb.fit(X_train.T,y_train)
            y_pred = adb.predict(X_test.T)
            error = zero_one_loss(y_pred,y_test)
            print error
            averageError += (1./k) * error
        print "Average error: %4.2f%s" % (100 * averageError,'%')
        avgError.append(averageError)
        x_learners.append(k_estimators)
    # graph the errors now.
    plt.plot(x_learners, avgError)
    plt.ylabel('Average Error (k=10)')
    plt.xlabel('Number of Estimators')
    plt.title('Error as a function of the number of estimators')
    plt.show()
Example #14
    def Bootstrap_method(self):
        rs = cross_validation.ShuffleSplit(
            len(self.FeatureSet), 10, 0.25, random_state=0)
        clf = tree.DecisionTreeClassifier()
        for train_index, test_index in rs:
            X_train = []
            X_test = []
            y_train = []
            y_test = []
            for trainid in train_index.tolist():
                X_train.append(self.FeatureSet[trainid])
                y_train.append(self.Label[trainid])

            for testid in test_index.tolist():
                X_test.append(self.FeatureSet[testid])
                y_test.append(self.Label[testid])

            #clf = clf.fit(X_train, y_train)
           # pre_labels = clf.predict(X_test)
            clf = AdaBoostClassifier(n_estimators=100)
            clf = clf.fit(X_train, y_train)
            pre_labels = clf.predict(X_test)
            # Model Evaluation
            ACC = metrics.accuracy_score(y_test, pre_labels)
            MCC = metrics.matthews_corrcoef(y_test, pre_labels)
            SN = self.performance(y_test, pre_labels)
            print ACC,SN
Example #15
    def KFold_method(self):
        
        kf = KFold(n_splits=10)
        for train_index, test_index in kf.split(self.FeatureSet):
            X_train = []
            X_test = []
            y_train = []
            y_test = []
            for trainid in train_index.tolist():
                X_train.append(self.FeatureSet[trainid])
                y_train.append(self.Label[trainid])

            for testid in test_index.tolist():
                X_test.append(self.FeatureSet[testid])
                y_test.append(self.Label[testid])
            #clf = tree.DecisionTreeClassifier()        
            #clf = clf.fit(X_train, y_train)
            #pre_labels = clf.predict(X_test)
            clf = AdaBoostClassifier(n_estimators=100)
            clf = clf.fit(X_train, y_train)
            pre_labels = clf.predict(X_test)
            # Model Evaluation
            ACC = metrics.accuracy_score(y_test, pre_labels)
            MCC = metrics.matthews_corrcoef(y_test, pre_labels)
            SN = self.performance(y_test, pre_labels)
            print ACC, SN
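
Both methods above rebuild the train/test splits element by element; with array inputs, numpy fancy indexing does the same in two lines. A standalone sketch with stand-in data (the random arrays stand in for self.FeatureSet and self.Label):

import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import KFold

X = np.random.rand(200, 8)          # stand-in for self.FeatureSet
y = np.random.randint(0, 2, 200)    # stand-in for self.Label
for train_index, test_index in KFold(n_splits=10).split(X):
    clf = AdaBoostClassifier(n_estimators=100)
    clf.fit(X[train_index], y[train_index])
    print(clf.score(X[test_index], y[test_index]))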
Example #16
def classify(x, y, cv, n_estimator=50):
    acc, prec, recall = [], [], []
    base_clf = DecisionTreeClassifier(
        compute_importances=None,
        criterion="entropy",
        max_depth=1,
        max_features=None,
        max_leaf_nodes=None,
        min_density=None,
        min_samples_leaf=1,
        min_samples_split=2,
        random_state=None,
        splitter="best",
    )

    global clf
    clf = AdaBoostClassifier(base_estimator=base_clf, n_estimators=n_estimator)
    for train, test in cv:
        x_train, x_test, y_train, y_test = x[train], x[test], y[train], y[test]
        clf = clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        acc.append(accuracy_score(y_test, y_pred))
        prec.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
    a = np.mean(acc)
    p = np.mean(prec)
    r = np.mean(recall)
    f = 2 * p * r / (p + r)
    return a, p, r, f
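
Note that f returned above is the harmonic mean of the fold-averaged precision and recall, which is not the same number as the mean of per-fold F1 scores. A sketch of collecting all the metrics per fold with sklearn's cross_validate (synthetic data, illustrative names):

from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_validate

X, y = make_classification(random_state=0)
clf = AdaBoostClassifier(n_estimators=50)
res = cross_validate(clf, X, y, cv=5,
                     scoring=("accuracy", "precision", "recall", "f1"))
print(res["test_f1"].mean())   # mean of per-fold F1, not F1 of mean P/R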
Example #17
def runAdaReal(arr):  # arr packs (depth, n_est, lrn_rate); old signature was (depth, n_est, filename, lrn_rate=1.0)
    global file_dir, nEvents, solutionFile, counter
    depth = int(arr[0]*100)
    n_est = int(arr[1]*100)
    lrn_rate = arr[2]
    print 'iteration number ' + str(counter)
    counter+=1
    if depth <= 0 or n_est <= 0 or lrn_rate <= 0:
        print 'return 100'
        return 100
    filename =  'adar_dep'+str(depth)+'_est'+str(n_est)+'_lrn'+str(lrn_rate) # low
    bdt_real = AdaBoostClassifier(
        tree.DecisionTreeClassifier(max_depth=depth),
        n_estimators=n_est,
        learning_rate=lrn_rate)
    print "AdaBoostReal training"
    bdt_real.fit(sigtr[train_input].values,sigtr['Label'].values)
    print "AdaBoostReal testing"
    bdt_real_pred = bdt_real.predict(sigtest[train_input].values)
    solnFile(filename,bdt_real_pred,sigtest['EventId'].values)#
    print "AdaBoostReal finished"
    ams_score = ams.AMS_metric(solutionFile, file_dir+filename+'.out', nEvents)
    print ams_score
    logfile.write(filename+': ' + str(ams_score)+'\n')
    return -1.0*float(ams_score)
Example #18
def runAdaBoost(arr):  # arr packs (depth, n_est, lrn_rate); filename removed so scipy can optimise this function
    #ada = AdaBoostClassifier(n_estimators=100)
    global file_dir, nEvents, solutionFile, counter
    print 'iteration number ' + str(counter)
    counter+=1
    depth = int(arr[0]*100)
    n_est = int(arr[1]*100)
    lrn_rate = arr[2]
    if depth <= 0 or n_est <= 0 or lrn_rate <= 0:
        return 100

    fname = 'ada_dep'+str(depth)+'_est'+str(n_est)+'_lrn'+str(lrn_rate)
    filename = fname
    ada = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=depth),
                             algorithm="SAMME",
                             n_estimators=n_est)#,n_jobs=4)
    print "AdaBoost training"
    ada.fit(sigtr[train_input].values,sigtr['Label'].values)
    print "AdaBoost testing"
    ada_pred = ada.predict(sigtest[train_input].values)
    solnFile(filename,ada_pred,sigtest['EventId'].values)#
    print "AdaBoost finished"
    # added for the scipy optimise thing
    ams_score = ams.AMS_metric(solutionFile, file_dir+fname+'.out', nEvents)
    print ams_score
    logfile.write(fname + ': ' + str(ams_score)+'\n')
    return -1.0*float(ams_score) # since we are minimising
Example #19
def plot_adaboost():
    X, y = make_moons(noise=0.3, random_state=0)

    # Create and fit an AdaBoosted decision tree
    est = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                             algorithm="SAMME.R",
                             n_estimators=200)

    sample_weight = np.empty(X.shape[0], dtype=np.float)
    sample_weight[:] = 1. / X.shape[0]

    est._validate_estimator()
    est.estimators_ = []
    est.estimator_weights_ = np.zeros(4, dtype=np.float)
    est.estimator_errors_ = np.ones(4, dtype=np.float)

    plot_step = 0.02

    # Plot the decision boundaries
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    fig, axes = plt.subplots(1, 4, figsize=(14, 4), sharey=True)
    colors = ['#d7191c', '#fdae61', '#ffffbf', '#abd9e9', '#2c7bb6']
    c = lambda a, b, c: map(lambda x: x / 254.0, [a, b, c])
    colors = [c(215, 25, 28),
              c(253, 174, 97),
              c(255, 255, 191),
              c(171, 217, 233),
              c(44, 123, 182),
              ]

    for i, ax in enumerate(axes):
        sample_weight, estimator_weight, estimator_error = est._boost(i, X, y, sample_weight)
        est.estimator_weights_[i] = estimator_weight
        est.estimator_errors_[i] = estimator_error
        sample_weight /= np.sum(sample_weight)

        Z = est.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z,
                    cmap=matplotlib.colors.ListedColormap([colors[1], colors[-2]]),
                    alpha=1.0)
        ax.axis("tight")

        # Plot the training points
        ax.scatter(X[:, 0], X[:, 1],
                   c=np.array([colors[0], colors[-1]])[y],
                   s=20 + (200 * sample_weight) ** 2, cmap=plt.cm.Paired)
        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)
        ax.set_xlabel('$x_0$')

        if i == 0:
            ax.set_ylabel('$x_1$')

    plt.tight_layout()
    plt.show()
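
The plot above drives the boosting loop through private members (_validate_estimator, _boost, direct assignment to estimator_weights_), which can break between sklearn versions. When per-stage decision surfaces are all that is needed, the public staged_predict API gives them without touching internals; a rough sketch (not a drop-in replacement for the sample-weight visualisation):

import numpy as np
from sklearn.datasets import make_moons
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = make_moons(noise=0.3, random_state=0)
est = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                         n_estimators=4).fit(X, y)
xx, yy = np.meshgrid(np.linspace(-2, 3, 200), np.linspace(-2, 2, 200))
grid = np.c_[xx.ravel(), yy.ravel()]
for i, Z in enumerate(est.staged_predict(grid), start=1):
    # Z.reshape(xx.shape) can be fed to contourf exactly as above
    print("stage", i, "positive fraction:", Z.mean())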
Example #20
class AdaBoostcls(object):
    """docstring for ClassName"""
    def __init__(self):
        self.adaboost_cls = AdaBoostClassifier()
        self.prediction = None
        self.train_x = None
        self.train_y = None

    def train_model(self, train_x, train_y):
        try:
            self.train_x = train_x
            self.train_y = train_y
            self.adaboost_cls.fit(train_x, train_y)
        except:
            print(traceback.format_exc())

    def predict(self, test_x):
        try:
            self.test_x = test_x
            self.prediction = self.adaboost_cls.predict(test_x)
            return self.prediction
        except:
            print(traceback.format_exc())

    def accuracy_score(self, test_y):
        try:
            # return r2_score(test_y, self.prediction)
            return self.adaboost_cls.score(self.test_x, test_y)
        except:
            print(traceback.format_exc())
Example #21
def main():

    trainset = np.genfromtxt(open('train.csv','r'), delimiter=',')[1:]
    X = np.array([x[1:8] for x in trainset])
    y = np.array([x[8] for x in trainset])
    #print X,y
    import math
    for i, x in enumerate(X):
        for j, xx in enumerate(x):
            if(math.isnan(xx)):
                X[i][j] = 26.6
   
    
    testset = np.genfromtxt(open('test.csv','r'), delimiter = ',')[1:]

    test = np.array([x[1:8] for x in testset])
    for i, x in enumerate(test):
        for j, xx in enumerate(x):
            if(math.isnan(xx)):
                test[i][j] = 26.6
   

    X, test = decomposition_pca(X, test)

    bdt = AdaBoostClassifier(base_estimator = KNeighborsClassifier(n_neighbors=20, algorithm = 'auto'), algorithm="SAMME", n_estimators = 200)
    bdt.fit(X, y)
    


    print 'PassengerId,Survived'
    for i, t in enumerate(test):
        print '%d,%d' % (i + 892, int(bdt.predict([t])[0]))
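
Predicting row by row as above is slow, and recent sklearn versions reject 1-D inputs outright; the whole PCA-transformed test matrix can be predicted at once. A sketch under the same names (bdt and test from this example):

preds = bdt.predict(test)          # test is already 2-D after decomposition_pca
print('PassengerId,Survived')
for i, p in enumerate(preds):
    print('%d,%d' % (i + 892, int(p)))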
Example #22
class DomainTypeClassifier(object):
    def __init__(self, radius, window_mode=False):
        self.classifier = AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=2),
            n_estimators=20,
            learning_rate=1,
            algorithm="SAMME")
        # svm.SVC(kernel='rbf')
        self.radius = radius
        self.window_mode = window_mode

    def train(self, dataset):
        k = self.radius if not self.window_mode else 2 * self.radius + 1
        rin, rout = dataset.getData(k, self.window_mode)
        print("fitting", len(rin))
        self.classifier.fit(np.asarray(rin, float), np.asarray(rout, float))

    def predict(self, ns):
        k = self.radius if not self.window_mode else 2 * self.radius + 1
        to_predict = []
        for i in range(len(ns)):
            if not self.window_mode:
                to_predict.append(encode(create_region(ns, i, k)))
            else:
                if i > len(ns) - k:
                    break
                to_predict.append(encode(ns[i:i+k]))
        return int(Counter(self.classifier.predict(
            np.asarray(to_predict, float))).most_common(1)[0][0])
Example #23
def ADA_Classifier(X_train, X_cv, X_test, Y_train,Y_cv,Y_test, Actual_DS):
    print("***************Starting  AdaBoost Classifier***************")
    t0 = time()
    clf = AdaBoostClassifier(n_estimators=300)
    clf.fit(X_train, Y_train)
    preds = clf.predict(X_cv)
    score = clf.score(X_cv,Y_cv)

    print("AdaBoost Classifier - {0:.2f}%".format(100 * score))
    Summary = pd.crosstab(label_enc.inverse_transform(Y_cv), label_enc.inverse_transform(preds),
                      rownames=['actual'], colnames=['preds'])
    Summary['pct'] = (Summary.divide(Summary.sum(axis=1), axis=1)).max(axis=1)*100
    print(Summary)

    #Check with log loss function
    epsilon = 1e-15
    #ll_output = log_loss_func(Y_cv, preds, epsilon)
    preds2 = clf.predict_proba(X_cv)
    ll_output2= log_loss(Y_cv, preds2, eps=1e-15, normalize=True)
    print(ll_output2)
    print("done in %0.3fs" % (time() - t0))

    preds3 = clf.predict_proba(X_test)
    #preds4 = clf.predict_proba((Actual_DS.ix[:,'feat_1':]))
    preds4 = clf.predict_proba(Actual_DS)

    print("***************Ending AdaBoost Classifier***************")
    return pd.DataFrame(preds2) , pd.DataFrame(preds3),pd.DataFrame(preds4)
Example #24
def prediction(feat,label):
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(feat, label, test_size = 0.25, random_state = 0)
    num_leaves = []
    accuracy_score = []
    auc_score = []
    # for depth in range(1,10):
    #     clf = tree.DecisionTreeClassifier(max_depth = depth)
    #     clf.fit(x_train,y_train)
    #     predictions = clf.predict(x_test)
    #     accuracy = clf.score(x_test,y_test)
    #     auc = metrics.roc_auc_score(y_test,predictions)
    #     num_leaves.append(depth)
    #     accuracy_score.append(accuracy)
    #     auc_score.append(auc)

    for depth in range(1,10):
        clf = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth = depth), n_estimators = 100)
        clf.fit(x_train,y_train)
        predictions = clf.predict(x_test)
        accuracy = clf.score(x_test,y_test)
        auc = metrics.roc_auc_score(y_test,predictions)
        num_leaves.append(depth)
        accuracy_score.append(accuracy)
        auc_score.append(auc)


    return num_leaves,accuracy_score,auc_score
Example #25
def adaBoost(n,x,t,x_test,t_test):
    clf = AdaBoostClassifier(n_estimators = n)
    clf.fit(x, t)
    predictions = clf.predict(x_test)
    X = confusion_matrix(t_test,predictions)
    classificationRate = float(X[1,1] + X[0,0]) / X.sum()  # float() guards against integer division under Python 2
    return(1-classificationRate)
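
The confusion-matrix arithmetic above is exactly one minus accuracy, so the same error rate can come straight from sklearn; an equivalent sketch:

from sklearn.metrics import accuracy_score

def adaBoost_error(clf, x_test, t_test):
    # (X[1,1] + X[0,0]) / X.sum() is accuracy, so the error is its complement
    return 1 - accuracy_score(t_test, clf.predict(x_test))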
Example #26
def ada(xtrain, ytrain, train_weight, tests, test_weight):
    #Initiate the training model
    clf = AdaBoostClassifier()
    mistakes = 0
    cost = 0
    #Fit the model
    clf.fit(xtrain, ytrain)
    vector_count = 0
    #Iterate over the tests
    for i in range(len(tests)):
        #Get the number of elements in each test
        vector_count += len(tests[i])
        test_count = 0
        #Iterate over each feature in the tests
        for vector in tests[i]:
            #Predict based on each feature vector (predict expects a 2-D array)
            prediction = clf.predict([vector])
            #Determine the cost
            cost += test_weight[i][test_count] * pen[i][prediction[0]]
            #Count the number of mistakes
            if pen[i][prediction[0]] > 0:
                #print("Incorrectly Predicted " + str(Segments.reverse_mapping[i]) + " as " + str(Segments.reverse_mapping[prediction[0]]))
                mistakes += 1
            test_count += 1

    print("Number of mistakes: " + str(mistakes) + " of " + \
            str(vector_count) + ", " + \
            str((1.-float(mistakes)/float(vector_count))*100) + \
            "% accurate")

    return cost
Example #27
class AdaBoost:
    def __init__(self, data, n_estimators=50, learning_rate=1.0):
        features, weights, labels = data
        self.clf = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
        self.predictions, self.trnaccuracy, self.tstaccuracy = None, None, None
        self.dataset = split_dataset(features, weights, labels)

    def train(self):
        """
        Train Ada Boost on the higgs dataset
        """
        self.clf = self.clf.fit(self.dataset['training']['features'], self.dataset['training']['labels'])

    def predict(self):
        """
        Predict label using Ada Boost
        :return:
        """
        self.predictions = self.clf.predict(self.dataset['test']['features'])

    def evaluate(self):
        self.trnaccuracy = self.clf.score(self.dataset['training']['features'],
                                          self.dataset['training']['labels'],
                                          sample_weight=self.dataset['training']['weights'])
        self.tstaccuracy = self.clf.score(self.dataset['test']['features'],
                                          self.dataset['test']['labels'],
                                          sample_weight=self.dataset['test']['weights'])
Example #28
def some(X, Y, X_test, Y_test):
    ada = AdaBoostClassifier()
    print "Train Model ---"
    t1 = time()
    ada.fit(X, Y)
    t2 = time()
    print "Model Trained ----------", t2 - t1
    test_errors = []
    cur = 1
    Y_test2 = []
    for k in Y_test:
        Y_test2.append(k[0])
    print "Testing: "
    print  Y_test2
    pred =  ada.predict(X_test)
    print pred
    accu =  1. - accuracy_score(y_true= Y_test2, y_pred= pred)
    print accu
    print "STAGED _____________"
    for test_predict in (
        ada.staged_predict(X_test)):


            test_errors.append(
            1. - accuracy_score(test_predict, Y_test2))


    print  "errorss : "
    print test_errors
Example #29
def trainClassifier(dataDir, trialName, NUMFISH):


    
    ch = circularHOGExtractor(6,4,3) 
    nFeats = ch.getNumFields()+1
    trainData = np.array([])#np.zeros((len(lst0)+len(lst0c)+len(lst1),nFeats))
    targetData = np.array([])#np.hstack((np.zeros(len(lst0)+len(lst0c)),np.ones(len(lst1))))
    for tr in range(NUMFISH):
        directory = dataDir + '/process/' + trialName + '/FR_ID' + str(tr) + '/'
        files = [name for name in os.listdir(directory)]
        thisData = np.zeros((len(files),nFeats))
        thisTarget = tr*np.ones(len(files))
        i = 0
        for imName in files:
            sample = cv2.imread(directory + imName)
            thisIm = cv2.cvtColor(sample, cv2.COLOR_BGR2GRAY)
            
            thisData[i,:] = np.hstack((ch.extract(thisIm), np.mean(thisIm)))
            i = i + 1
        trainData = np.vstack((trainData, thisData)) if trainData.size else thisData
        targetData = np.hstack((targetData, thisTarget)) if targetData.size else thisTarget

    #clf = svm.SVC()
    clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),algorithm="SAMME",n_estimators=50)
    clf.fit(trainData, targetData)
    pickle.dump(clf, open( dataDir + '/process/' + trialName + '/boost' + trialName + '.p',"wb"))
    y_pred = clf.predict(trainData)
    print("Number of mislabeled points out of a total %d points : %d" % (trainData.shape[0],(targetData != y_pred).sum()))
Example #30
File: adaboost.py  Project: kbai/uss
def main():
    print("gradient boosting  classifier!")

    X,Y,Xtest = importdata()
    print(Y.shape)
    param_grid={
            "n_estimators":[10,100,200,2000,20000],
            "base_estimator__n_estimators":[10,20,50,100,200],
            "base_estimator__min_samples_split":[5,10,20,50]
            }

    ab=AdaBoostClassifier(RandomForestClassifier())
    Gridsearch_impl(X,Y,ab,param_grid,5)

#    for i in range(10,11,5):
#        clf = DecisionTreeClassifier(min_samples_split=i)
#        rf = RandomForestClassifier(n_estimators = 100,random_state=0,min_samples_split=i)
#        ab = AdaBoostClassifier(rf,n_estimators = 10)
        #ab = GradientBoostingClassifier(n_estimators = 100)
#        score = cross_validation.cross_val_score(ab,X,Y,cv=3)
      #  print(score)
      #  print("average score %f"%np.mean(score))
      #  print("std %f"%np.std(score))
      #  ab.fit(X,Y)
   


    Ytest = ab.predict(Xtest)
    output(Ytest,'submit3.csv')
Example #31
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(train_notes_set[['Variance','Skewness','Kurtosis','Entropy']], train_notes_set['Class'])  


# In[75]:


clf.feature_importances_


# In[76]:


clf.predict(test_notes_set[['Variance','Skewness','Kurtosis','Entropy']])


# In[60]:


clf.score(test_notes_set[['Variance','Skewness','Kurtosis','Entropy']], test_notes_set['Class'])


# In[61]:


confusion_matrix(test_notes_set['Class'],clf.predict(test_notes_set[['Variance','Skewness','Kurtosis','Entropy']]))


# In[62]:
Example #32
    dummy1.append(get_lexicon_value(text[i], term_Loc[i]))

for i in dummy1:
    val = TextBlob(i).sentiment
    pol1.append(val[0])
    sub1.append(val[1])

df['Pol'] = np.array(pol1).reshape(-1, 1)
df['Sub'] = np.array(sub1).reshape(-1, 1)
# X = df['Pol'].reshape(-1,1)
X1 = df[['Pol', 'Sub']]

Xnew = df[['Pol', 'Sub']]

bdt_discrete.fit(X, Y)
result = bdt_discrete.predict(Xnew)

print(result)

# In[56]:

#outputting the results to the file
#change the file name.

f = open('output.txt', 'a')
id = df.ID

for i in range(len(df['ID'])):
    f.write(str(id[i]) + ";;" + str(result[i]) + '\n')
f.close()
Example #33
File: codigo.py  Project: nataliafm/AA
        auxp.append(i)

print('Loss with a random label vector: ',
      hinge_loss(auxy, auxW))
print('Loss with the obtained values: ', hinge_loss(auxy, auxp))

#Adaboost
input("Pulse una tecla para pasar al siguiente modelo")
clf = AdaBoostClassifier(n_estimators=25)
clf.fit(trainx, trainy)
print('\nADABOOST results: ')
print('Accuracy on the training values: ',
      clf.score(trainx, trainy))
print('Accuracy on the test values: ', clf.score(testx, testy))

pred = clf.predict(testx)

auxp = []
for i in pred:
    if i == 0:
        auxp.append(-1)
    else:
        auxp.append(i)

print('Loss with a random label vector: ',
      hinge_loss(auxy, auxW))
print('Loss with the obtained values: ', hinge_loss(auxy, auxp))

#Random Forest
input("Pulse una tecla para pasar al siguiente modelo")
rf = RandomForestClassifier(n_estimators=1,
Example #34
# random forest
pred_scores = []
for i in range(2, 36):
    rfc = RandomForestClassifier(n_estimators=i, random_state=111)
    rfc.fit(x_train1, y_train1)
    pred = rfc.predict(x_test1)
    pred_scores.append((i, [accuracy_score(y_test1, pred)]))
df = pd.DataFrame.from_items(pred_scores, orient='index', columns=['Score'])
df[df['Score'] == df['Score'].max()]

# AdaBoosting
pred_scores = []
for i in range(25, 76):
    abc = AdaBoostClassifier(n_estimators=i, random_state=111)
    abc.fit(x_train1, y_train1)
    pred = abc.predict(x_test1)
    pred_scores.append((i, [accuracy_score(y_test1, pred)]))
df = pd.DataFrame.from_items(pred_scores, orient='index', columns=['Score'])
df[df['Score'] == df['Score'].max()]

# bagging
pred_scores = []
for i in range(2, 21):
    bc = BaggingClassifier(n_estimators=i, random_state=111)
    bc.fit(x_train1, y_train1)
    pred = bc.predict(x_test1)
    pred_scores.append((i, [accuracy_score(y_test1, pred)]))
df = pd.DataFrame.from_items(pred_scores, orient='index', columns=['Score'])
df[df['Score'] == df['Score'].max()]
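
pandas.DataFrame.from_items, used in all three sweeps above, was deprecated and later removed; on current pandas the same Score frame can be built with from_dict. A sketch over the same pred_scores list of (i, [score]) pairs:

import pandas as pd

df = pd.DataFrame.from_dict(dict(pred_scores), orient='index', columns=['Score'])
df[df['Score'] == df['Score'].max()]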

Example #35
# TfidfVectorizer, tune parameters for each algorithm
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=1,
                                                    stratify=y)

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
tree = DecisionTreeClassifier(criterion='entropy', random_state=1, max_depth=1)
ada = AdaBoostClassifier(base_estimator=tree,
                         n_estimators=500,
                         learning_rate=0.1,
                         random_state=1)

from sklearn.metrics import accuracy_score
tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)
tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print('Decision tree train/test accuracies %.3f/%.3f' %
      (tree_train, tree_test))

# AdaBoost Accuracy
ada = ada.fit(X_train, y_train)
y_train_pred = ada.predict(X_train)
y_test_pred = ada.predict(X_test)
ada_train = accuracy_score(y_train, y_train_pred)
ada_test = accuracy_score(y_test, y_test_pred)
print('AdaBoost train/test accuracies %.3f/%.3f' % (ada_train, ada_test))
Example #36
data = pd.read_csv(filename).dropna()
feature_name = data.columns[2:-1]
data = data.values

seed(0)
name = data[:, 0]
y = data[:, 1] == 'EUROPE'
# make class labels +-1
y = y.astype('int') * 2 - 1
X = data[:, 2:].astype('float')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

algo = LogisticRegression()
model = AdaBoostClassifier(base_estimator=algo, n_estimators=10)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(accuracy_score(y_pred, y_test))

model.estimator_weights_
log_w_i = np.zeros(len(y_train), dtype='float')
for i in range(len(log_w_i)):
    item = np.reshape(X_train[i, :], (1, 18))
    for j in range(len(model.estimators_)):
        y_pred = model.estimators_[j].predict(item)
        alpha = model.estimator_weights_[j]
        log_w_i[i] -= alpha * y_train[i] * y_pred[0]

w_i = np.exp(log_w_i)
w_i = w_i / np.sum(w_i)
not_outliers = w_i < np.mean(
    np.sort(w_i)[:-20]) + 3 * np.std(np.sort(w_i)[:-20])
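
The per-sample weight reconstruction above can be vectorised: the exponent is just minus the label-weighted margin summed over stages. A sketch under the same assumptions (fitted model, ±1 labels, and X_train/y_train as above):

import numpy as np

# one row of ±1 predictions per boosting stage
votes = np.array([est.predict(X_train) for est in model.estimators_])
alphas = model.estimator_weights_[:len(model.estimators_)]
margins = y_train * (alphas @ votes)
w_i = np.exp(-margins)
w_i = w_i / np.sum(w_i)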
Example #37
n = 25
clf1 = ExtraTreesClassifier(n_estimators=n)
#print "Beginning model training."

clf1.fit(X_train, y_train)
#print "Model training completed."

# Use the trained classifier to make predictions on the test data
predictions_etree = clf1.predict(X_test)
#print "Predictions on testing data computed."

# Print the accuracy (percentage of phishing websites correctly predicted)
accuracy = 100.0 * accuracy_score(y_test, predictions_etree)
print "The accuracy of your decision tree on testing data is: " + str(accuracy)
print
print "===============AdaBoost===============\n"
clf2 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=n)
#print "Beginning model training."

clf2.fit(X_train, y_train)
#print "Model training completed."

# Use the trained classifier to make predictions on the test data
predictions = clf2.predict(X_test)
#print "Predictions on testing data computed."

# Print the accuracy (percentage of phishing websites correctly predicted)
accuracy = 100.0 * accuracy_score(y_test, predictions)
print "The accuracy of your decision tree on testing data is: " + str(accuracy)
print
Example #38
def main():

    # load pickle
    arxiv_11 = pickle.load(open("2011_big_pop.p", "rb"))
    arxiv_12 = pickle.load(open("2012_big_pop.p", "rb"))

    print "loaded pickles"

    # build doc set
    doc_set = arxiv_11['astro'] + arxiv_11['cond'] + \
              arxiv_11['cs'] + arxiv_11['hep'] + \
              arxiv_11['math'] + arxiv_11['physics'] + \
              arxiv_11['quant'] + arxiv_11['stat']
    label_set = [1]*len(arxiv_11['astro']) + [2]*len(arxiv_11['cond']) + \
                [3]*len(arxiv_11['cs']) + [4]*len(arxiv_11['hep']) + \
                [5]*len(arxiv_11['math']) + [6]*len(arxiv_11['physics']) + \
                [7]*len(arxiv_11['quant']) + [8]*len(arxiv_11['stat'])

    # list for tokenized documents in loop
    texts = tokenize(doc_set)

    # turn our tokenized documents into a id - term dictionary
    dictionary = corpora.Dictionary(texts)

    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]

    # generate LDA model
    num_topics = 450
    ldamodel = gensim.models.ldamodel.LdaModel(corpus,
                                               num_topics=num_topics,
                                               id2word=dictionary,
                                               passes=20)

    print "LDA built"

    # print(ldamodel.print_topics(num_topics=2, num_words=3))

    # look at topic proportion of one document
    # print ldamodel[dictionary.doc2bow(texts[0])]

    # build topic proportion matrix
    topicPropArray = np.zeros((len(texts), num_topics))
    for i in range(len(texts)):
        text = texts[i]
        textProp = ldamodel[dictionary.doc2bow(text)]
        for pair in textProp:
            topicIdx = pair[0]
            weight = pair[1]
            topicPropArray[i, topicIdx] = weight

    # print topicPropArray

    print "matrix built"
    print "------------------"
    print "testing"

    # test on new data
    test_set = arxiv_12['astro'][0:100] + arxiv_12['cond'][0:100] + \
              arxiv_12['cs'][0:100] + arxiv_12['hep'][0:100] + \
              arxiv_12['math'][0:100] + arxiv_12['physics'][0:100] + \
              arxiv_12['quant'][0:100] + arxiv_12['stat'][0:100]
    print "test_set length : " + str(len(test_set))
    test_label = [1]*100 + [2]*100 + [3]*100 + [4]*100 + [5]*100 + \
                 [6]*100 + [7]*100 + [8]*100
    print "test_label length : " + str(len(test_label))
    test_texts = tokenize(test_set)

    # build test features
    testPropArray = np.zeros((800, num_topics))
    for i in range(len(test_texts)):
        test = test_texts[i]
        testProp = ldamodel[dictionary.doc2bow(test)]
        for pair in testProp:
            topicIdx = pair[0]
            weight = pair[1]
            testPropArray[i, topicIdx] = weight

    # all testing
    X_train, X_test, y_train, y_test = topicPropArray, testPropArray, label_set, test_label

    print "training_array length: " + str(len(topicPropArray))
    print "test_array length: " + str(len(testPropArray))
    print "training_label length: " + str(len(label_set))
    print "test_label length: " + str(len(test_label))
    print '--------------------------------'

    # knn3
    knn3 = KNeighborsClassifier(n_neighbors=3)
    knn3.fit(X_train, y_train)
    predictions = knn3.predict(X_test)
    cm = confusion_matrix(y_test,
                          predictions,
                          labels=[1, 2, 3, 4, 5, 6, 7, 8])
    np.savetxt('knn3pred.csv',
               predictions.astype(int),
               fmt='%i',
               delimiter=",")
    np.savetxt('knn3cm.txt', cm.astype(int), fmt='%i', delimiter=",")
    # print predictions
    print 'knn3'
    print zero_one_loss(predictions, y_test)
    print '--------------------------------'

    # knn5
    knn5 = KNeighborsClassifier(n_neighbors=5)
    knn5.fit(X_train, y_train)
    predictions = knn5.predict(X_test)
    cm = confusion_matrix(y_test,
                          predictions,
                          labels=[1, 2, 3, 4, 5, 6, 7, 8])
    np.savetxt('knn5pred.csv',
               predictions.astype(int),
               fmt='%i',
               delimiter=",")
    np.savetxt('knn5cm.txt', cm.astype(int), fmt='%i', delimiter=",")
    # print predictions
    print 'knn5'
    print zero_one_loss(predictions, y_test)
    print '--------------------------------'

    # svmlin
    svmlin = svm.SVC(kernel='linear')
    svmlin.fit(X_train, y_train)
    predictions = svmlin.predict(X_test)
    cm = confusion_matrix(y_test,
                          predictions,
                          labels=[1, 2, 3, 4, 5, 6, 7, 8])
    np.savetxt('svmlinpred.csv',
               predictions.astype(int),
               fmt='%i',
               delimiter=",")
    np.savetxt('svmlincm.txt', cm.astype(int), fmt='%i', delimiter=",")
    # print predictions
    print 'svmlin'
    print zero_one_loss(predictions, y_test)
    print '--------------------------------'

    # gnb
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    predictions = gnb.predict(X_test)
    cm = confusion_matrix(y_test,
                          predictions,
                          labels=[1, 2, 3, 4, 5, 6, 7, 8])
    np.savetxt('gnbpred.csv', predictions.astype(int), fmt='%i', delimiter=",")
    np.savetxt('gnbcm.txt', cm.astype(int), fmt='%i', delimiter=",")
    # print predictions
    print 'gnb'
    print zero_one_loss(predictions, y_test)
    print '--------------------------------'

    # rf50
    rf50 = RandomForestClassifier(n_estimators=50)
    rf50.fit(X_train, y_train)
    predictions = rf50.predict(X_test)
    cm = confusion_matrix(y_test,
                          predictions,
                          labels=[1, 2, 3, 4, 5, 6, 7, 8])
    np.savetxt('rf50pred.csv',
               predictions.astype(int),
               fmt='%i',
               delimiter=",")
    np.savetxt('rf50cm.txt', cm.astype(int), fmt='%i', delimiter=",")
    # print predictions
    print 'rf50'
    print zero_one_loss(predictions, y_test)
    print '--------------------------------'

    # dtree ada
    ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                             n_estimators=400,
                             learning_rate=1,
                             algorithm="SAMME",
                             random_state=None)
    n_estimators = 400
    ada.fit(X_train, y_train)
    predictions = ada.predict(X_test)
    cm = confusion_matrix(y_test,
                          predictions,
                          labels=[1, 2, 3, 4, 5, 6, 7, 8])
    np.savetxt('adapred.csv', predictions.astype(int), fmt='%i', delimiter=",")
    np.savetxt('adacm.txt', cm.astype(int), fmt='%i', delimiter=",")
    # print predictions
    print 'ada'
    print zero_one_loss(predictions, y_test)
    print '--------------------------------'
Example #39
merge_data = np.concatenate([train_output, train_data], axis=1)
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2, min_samples_split=20, min_samples_leaf=5),
                   algorithm="SAMME",
                   n_estimators=200, learning_rate=0.8)
bdt.fit(merge_data, train_lable)


#---------------------------------
#-------------test----------------
test_data = []
test_lable = []
for i in range(len(test_list)):
    if i not in random_num:
        test_data.append(x_train[test_list[i]])
        test_lable.append(y_train[test_list[i]])

test_data = np.array(test_data)
test_lable = np.array(test_lable)
print('train_data:',train_data.shape)

encode_train_output = K.function([model.layers[0].input], [model.layers[2].output])
test_output = encode_train_output([test_data, 0])[0]
print('encode_output:',test_output.shape)
print(test_output)

merge_data = np.concatenate([test_output, test_data], axis=1)
pred = bdt.predict(merge_data)
print(pred)
Evaluate_Function.Evaluate_Fun(pred, test_lable, merge_data)
Example #40
#mnb.fit(x_train,y_train)
knc.fit(x_train, y_train)
dtc.fit(x_train, y_train)
rfc.fit(x_train, y_train)
gbc.fit(x_train, y_train)
abc.fit(x_train, y_train)
svc.fit(x_train, y_train)
gnb.fit(x_train, y_train)
LR.fit(x_train, y_train)

#y_predict_mnb=mnb.predict(x_test)
y_predict_knc = knc.predict(x_test)
y_predict_dtc = dtc.predict(x_test)
y_predict_rfc = rfc.predict(x_test)
y_predict_gbc = gbc.predict(x_test)
y_predict_abc = abc.predict(x_test)
y_predict_svc = svc.predict(x_test)
y_predict_gnb = gnb.predict(x_test)
y_predict_lr = LR.predict(x_test)

from sklearn.metrics import classification_report

print('\n1:')
print("DTC confusioin_matrix:\n", confusion_matrix(y_test, y_predict_dtc))
print("\nKNC confusioin_matrix:\n", confusion_matrix(y_test, y_predict_knc))
print("\nRFC confusioin_matrix:\n", confusion_matrix(y_test, y_predict_rfc))
print("\nGBC confusioin_matrix:\n", confusion_matrix(y_test, y_predict_gbc))
print("\nAda confusioin_matrix:\n", confusion_matrix(y_test, y_predict_abc))
print("\nSVC confusioin_matrix:\n", confusion_matrix(y_test, y_predict_svc))
print("\nGauNB confusioin_matrix:\n", confusion_matrix(y_test, y_predict_gnb))
print("\nLR confusioin_matrix:\n", confusion_matrix(y_test, y_predict_lr))
Example #41
# --------------
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Code starts here
#print("--"*25,"X_train")
#print(X_train)
#print("--"*25,"X_test")
#print(X_test)
#print("--"*25,"y_train")
#print(y_train)
#print("--"*25,"y_test")
#print(y_test)
ada_model = AdaBoostClassifier(random_state=0)
ada_model.fit(X_train, y_train)
y_pred = ada_model.predict(X_test)
ada_score = accuracy_score(y_test, y_pred)
ada_cm = confusion_matrix(y_test, y_pred)
ada_cr = classification_report(y_test, y_pred)
print("Accuracy score :", ada_score)
print("Cdafusion Matrix :", ada_cm)
print("Classification Report :", ada_cr)

# --------------
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

#Parameter list
parameters = {
    'learning_rate': [0.1, 0.15, 0.2, 0.25, 0.3],
    'max_depth': range(1, 3)
Example #42
def fxn():
    #read in the data
    df = pd.read_csv('data.csv')

    #columns to drop
    df = df.drop(['id'], axis=1)

    df = df.sample(frac=1)  # shuffle rows; without the assignment this line was a no-op
    #gets rid of ? and one hot encoding for all columns that need it
    index = []
    count = 0
    for val in range(len(df.ix[:, 0])):
        flag = False
        for column in df:
            if df[column][val] == '?':
                flag = True
                break
        if flag:
            continue
        if count < 1000:
            index.append(val)
            count += 1
    df = df[df.index.isin(index)]

    #gets all columns which are not ints and integer encodes them
    obj_df = df.select_dtypes(include=['object']).copy()
    for column in obj_df:
        le = preprocessing.LabelEncoder()
        le.fit(df[column])
        df[column] = le.transform(df[column])

    #normalize all points between [0,1]
    x = df.values
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    df = pd.DataFrame(x_scaled)

    # In[589]:

    #make dataset only 1100
    #create 500/500 split between labelled on nonlablled array, 1000 semi-sup data set, and 100 validation dataset
    train, test = np.split(df.sample(frac=1), [int(.8 * len(df))])
    #print(train)
    train = train.values.tolist()
    test = test.values.tolist()

    df_unsupervised = []

    label_nolabels = {}
    for point in train:
        #unlablled 1000 points data
        df_unsupervised.append(point[1:])
        label_nolabels[tuple(point[1:])] = [point[0]]

    # In[590]:

    # kmeans "forest": 50 independently initialised 2-cluster KMeans fits,
    # used below as an ensemble of noisy unsupervised labelers.
    # (Other clusterers -- SpectralClustering, MeanShift, DBSCAN, GaussianMixture,
    # Birch, BayesianGaussianMixture -- were tried here but are disabled.)
    classifiers = [
        KMeans(n_clusters=2, init='random', n_init=10).fit(np.asarray(df_unsupervised))
        for _ in range(50)
    ]
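    # A simple majority vote over the 50 clusterers (a sketch for comparison with
    # the EM aggregation below; each KMeans labels its clusters 0/1 arbitrarily,
    # which is exactly the noise the aggregation step has to resolve):
    votes = np.array([c.predict(np.asarray(df_unsupervised)) for c in classifiers])
    majority_vote = (votes.sum(axis=0) > len(classifiers) // 2).astype(int)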

    # In[591]:

    # make csv in form of rowNumber, clfNumber, clf prediction on that row
    answers = []
    for point in range(len(df_unsupervised)):
        for clf in range(len(classifiers)):
            answers.append([
                point, clf,
                classifiers[clf].predict([df_unsupervised[point]])[0]
            ])

    count = 0
    f = open("answer_file.csv", "w")
    f.write('question,worker,answer;\n')
    for answer in answers:
        count += 1
        f.write(
            str(answer[0]) + ',' + str(answer[1]) + ',' + str(int(answer[2])) +
            '\n')
    f.close()
    p = open("result_file.csv", "w")
    p.close()

    # In[592]:

    #run VI BP
    import subprocess
    subprocess.call([
        "python", "run.py", "methods/c_EM/method.py", "answer_file.csv",
        "result_file.csv", "decision-making"
    ])
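    # (run.py above is assumed to be an external truth-inference tool: it reads the
    # per-clusterer votes in answer_file.csv, runs EM-style aggregation, and writes
    # one inferred label per row to result_file.csv, which is parsed next.)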

    # In[593]:

    # extract results: read the inferred (noisy) labels back in
    filepath = "result_file.csv"
    noisy_labels = []
    with open(filepath) as fp:
        for line in fp:
            questionAnswer = line.split(',')
            noisy_labels.append(questionAnswer)

    # In[594]:

    #assign noisy label to proper row
    df_noise_x = []
    df_noise_y = []
    for question in noisy_labels:
        if question[0].rstrip() == 'question':
            continue
        df_noise_x += [df_unsupervised[int(question[0].rstrip())]]
        df_noise_y.append(int(question[1].rstrip()))
    count_vi = 0
    for el in range(len(df_noise_x)):
        if label_nolabels[tuple(df_noise_x[el])][0] != df_noise_y[el]:
            count_vi += 1
    print(count_vi, len(df_noise_x))

    # In[595]:

    df_noise_y2 = []
    for el in df_noise_y:
        df_noise_y2.append(int(el))

    df_noise = []
    for el in range(len(df_noise_x)):
        new = df_noise_x[el]
        new.append(df_noise_y2[el])
        df_noise.append(new)

    #need to shuffle the data
    random.shuffle(df_noise)

    df_noise_x = []
    df_noise_y = []
    for row in df_noise:
        df_noise_x.append(row[:-1])
        df_noise_y.append(row[-1:][0])

    # In[596]:

    #run AdaBoost from Sklearn on noisy data
    bdt2 = AdaBoostClassifier(DecisionTreeClassifier(),
                              algorithm="SAMME",
                              n_estimators=20)
    bdt2.fit(df_noise_x, df_noise_y)

    # In[597]:

    #Ada boosting on noisy data error rate
    errors = []
    count1 = 0
    for point in test:
        est = bdt2.predict([point[:-1]])
        true = int(point[-1:][0])
        est = int(est[0])
        if est == true:
            errors.append([point[:-1], 0])
        else:
            count1 += 1
            errors.append([point[:-1], 1])

    # error rate of the noisy-label model (the baseline)
    return (count1 / len(test))
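# A sketch of how fxn() might be driven (an assumption, not in the source):
# average the noisy-label error rate over a few shuffled runs.
if __name__ == '__main__':
    rates = [fxn() for _ in range(5)]
    print('mean error rate:', sum(rates) / len(rates))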
Example #43
0
# Random Forest Classifier (bootstrap aggregated decision trees)
RFmodel = RandomForestClassifier(max_depth=4, random_state=0)
RFmodel.fit(X_train, y_train)

# print feature importances, the higher the number the more important
print(RFmodel.feature_importances_)

# print Random Forest prediction accuracy score
RFpred = RFmodel.predict(X_test)
print(accuracy_score(y_test, RFpred) * 100)

cm = pd.DataFrame(confusion_matrix(y_test, RFpred))
print(cm)

# AdaBoost (Boosted Tree)
ABmodel = AdaBoostClassifier()
ABmodel.fit(X_train, y_train)
ABpred = ABmodel.predict(X_test)
print(accuracy_score(y_test, ABpred) * 100)

# Compare Decision Tree, Random Forest, AdaBoost
DTtest = accuracy_score(y_test, pred) * 100
RFtest = accuracy_score(y_test, RFpred) * 100
ABtest = accuracy_score(y_test, ABpred) * 100

print("Prediction Accuracy Scores:")
print("Decision Tree: ", DTtest)
print("Random Forest: ", RFtest)
print("AdaBoost: ", ABtest)
Example #44
0
y = df.Class

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30)

tfidf_vect = TfidfVectorizer(strip_accents=None,
                             lowercase=False,
                             preprocessor=None,
                             tokenizer=tokenizer_porter,
                             use_idf=True,
                             norm='l2',
                             smooth_idf=True,
                             stop_words=spanish_stopwords)

tfidf_train = tfidf_vect.fit_transform(x_train)
tfidf_test = tfidf_vect.transform(x_test)
tfidf_df = pd.DataFrame(tfidf_train.A, columns=tfidf_vect.get_feature_names_out())  # get_feature_names() was removed in scikit-learn 1.2

Adab = AdaBoostClassifier(DecisionTreeClassifier(max_depth=10),
                          n_estimators=5,
                          random_state=1)
Adab.fit(tfidf_train, y_train)

y_pred3 = Adab.predict(tfidf_test)
ABscore = metrics.accuracy_score(y_test, y_pred3)
print("accuracy: %0.3f" % ABscore)

DecTree = open('DecTree.sav', 'wb')
pickle.dump(Adab, DecTree)
DecTree.close()

# Accuracy: 0.777
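# Loading the pickled model back (a sketch; file name as saved above):
with open('DecTree.sav', 'rb') as f:
    loaded_model = pickle.load(f)
print("reloaded accuracy: %0.3f" % loaded_model.score(tfidf_test, y_test))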
Example #45
0
def loadDataSet(fileName):
    # infer the column count from the first line, where the last column is the
    # label (an assumption; the header of this function was cut off in the excerpt)
    numFeat = len(open(fileName).readline().split('\t'))
    dataMat = []
    labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = []
        curLine = line.strip().split('\t')
        for i in range(numFeat - 1):
            lineArr.append(float(curLine[i]))
        dataMat.append(lineArr)
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat


if __name__ == '__main__':
    dataArr, classLabels = loadDataSet(
        r'D:\Project\Machinelearning\Logistic\horseColicTraining.txt')
    testArr, testLabelArr = loadDataSet(
        r'D:\Project\Machinelearning\Logistic\horseColicTest.txt')
    bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                             algorithm="SAMME",
                             n_estimators=10)
    bdt.fit(dataArr, classLabels)
    predictions = bdt.predict(dataArr)
    errArr = np.mat(np.ones((len(dataArr), 1)))
    print('Training set error rate: %.3f%%' %
          float(errArr[predictions != classLabels].sum() / len(dataArr) * 100))
    predictions = bdt.predict(testArr)
    errArr = np.mat(np.ones((len(testArr), 1)))
    print(
        'Test set error rate: %.3f%%' %
        float(errArr[predictions != testLabelArr].sum() / len(testArr) * 100))
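    # Equivalent, more direct scoring via the estimator itself (a sketch):
    print('Training accuracy: %.3f' % bdt.score(dataArr, classLabels))
    print('Test accuracy: %.3f' % bdt.score(testArr, testLabelArr))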
Example #46
0
def train_bdt():
    print("Loading data...")
    if SMALL_DATA:
        signal, bkg2nu, bkg214Bi, bkg208Tl, bkgRn = import_data_small()
    else:
        signal, bkg2nu, bkg214Bi, bkg208Tl, bkgRn = import_data()

    # print("Sampling 10% of the data for training")
    # #Create smaller samples, 10% of the size
    # signal = np.asarray(random.sample(signal, int((len(signal))*0.1)))
    # bkg2nu = np.asarray(random.sample(bkg2nu, int((len(bkg2nu))*0.1)))
    # bkg214Bi = np.asarray(random.sample(bkg214Bi, int((len(bkg214Bi))*0.1)))
    # bkg208Tl = np.asarray(random.sample(bkg208Tl, int((len(bkg208Tl))*0.1)))
    # bkgRn = np.asarray(random.sample(bkgRn, int((len(bkgRn))*0.1)))

    print("Creating arrays...")
    # X = Features (i.e. the data)
    X = np.concatenate((signal, bkg2nu, bkg214Bi, bkg208Tl, bkgRn))

    # y = Labels (i.e. what it is, signal / background)
    y = np.concatenate(
        (np.ones(signal.shape[0]), np.zeros(bkg2nu.shape[0]),
         np.zeros(bkg214Bi.shape[0]), np.zeros(bkg208Tl.shape[0]),
         np.zeros(bkgRn.shape[0])))

    print("Splitting Data...")
    # Split the data
    X_dev, X_eval, y_dev, y_eval = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=48)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42)

    # print("Oversampling...")
    # # Oversample to improve representation of backgrounds
    # ros = RandomOverSampler(random_state=0)
    # X_resampled, y_resampled = ros.fit_sample(X_train, y_train)
    # X_test_resampled, y_test_resampled = ros.fit_sample(X_test, y_test)
    # X_dev_resampled, y_dev_resampled = ros.fit_sample(X_dev, y_dev)
    # X_eval_resampled, y_eval_resampled = ros.fit_sample(X_eval, y_eval)
    # print(sorted(Counter(y_resampled).items()))

    print("Removing weights..")
    # Remove weights on backgrounds (will be passed in to the BDT later)
    # 30/09/19 - removed re sampling
    X_train_weights = X_train[:, 6]
    X_train_new = np.delete(X_train, 6, axis=1)
    X_test_new = np.delete(X_test, 6, axis=1)

    X_dev_weights = X_dev[:, 6]
    X_dev_new = np.delete(X_dev, 6, axis=1)
    X_eval_new = np.delete(X_eval, 6, axis=1)

    print("Creating classifier for DT")
    # Create classifiers
    dt = DecisionTreeClassifier(max_depth=12,
                                min_samples_split=0.5,
                                min_samples_leaf=400)

    print("Creating classifier for BDT")
    bdt = AdaBoostClassifier(dt,
                             algorithm='SAMME',
                             n_estimators=1200,
                             learning_rate=0.5)

    print("Fitting BDT...")
    # Train the classifier - pass in weights from earlier
    fitted_tree = bdt.fit(X_train_new, y_train, sample_weight=X_train_weights)

    print("Predicting on training data...")
    # Use the fitted tree to predict on training data and new test data
    y_predicted_train = bdt.predict(X_train_new)

    print("Predicting on test data...")
    y_predicted_test = bdt.predict(X_test_new)

    print(
        classification_report(y_train,
                              y_predicted_train,
                              target_names=["signal", "background"]))
    print("Area under ROC curve for training data: {0:.4f}".format(
        roc_auc_score(y_train, bdt.decision_function(X_train_new))))

    print(
        classification_report(y_test,
                              y_predicted_test,
                              target_names=["signal", "background"]))
    print("Area under ROC curve for test data: {0:.4f}".format(
        roc_auc_score(y_test, bdt.decision_function(X_test_new))))

    plot_roc_curve(bdt, X_test_new, y_test)
    compare_train_test(bdt, X_train_new, y_train, X_test_new, y_test)

    print("Saving classifier...")
    save_path = BASE_PATH + 'ml_calculated_data/weight/'
    dump(bdt, save_path + 'bdt_classifier.joblib')
    dump(fitted_tree, save_path + 'bdt_fitted_tree.joblib')
    dump(X_train_new, save_path + 'bdt_X_train_new.joblib')
    dump(X_test_new, save_path + 'bdt_X_test_new.joblib')
    dump(X_dev_new, save_path + 'bdt_X_dev_new.joblib')
    dump(X_dev_weights, save_path + 'bdt_X_dev_weights.joblib')
    dump(X_eval_new, save_path + 'bdt_X_eval_new.joblib')
    dump(y_test, save_path + 'bdt_y_test.joblib')
    dump(y_train, save_path + 'bdt_y_train.joblib')
    dump(y_dev, save_path + 'bdt_y_dev.joblib')
    dump(y_eval, save_path + 'bdt_y_eval.joblib')

    print("Finished Training.")
Example #47
0
# clf = KNeighborsClassifier(n_neighbors= 5)

### Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
'''
Best model tested:
clf = AdaBoostClassifier(n_estimators= 8, learning_rate = 0.7, random_state= 1)
features = ['expenses', 'exercised_stock_options', 'other', 'from_ratio']
P: 0.63 | R: 0.45
'''

# Example starting point. Try investigating other evaluation techniques!
from sklearn.model_selection import train_test_split  # cross_validation was removed in scikit-learn 0.20
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

from sklearn.ensemble import AdaBoostClassifier

# Instantiate the best model documented above (clf was otherwise undefined here)
clf = AdaBoostClassifier(n_estimators=8, learning_rate=0.7, random_state=1)
clf.fit(features_train, labels_train)

pred = clf.predict(features_test)
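# A quick sanity check on the held-out split (a sketch, not part of the original template):
from sklearn.metrics import accuracy_score
print("test accuracy:", accuracy_score(labels_test, pred))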

### Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)
Example #48
0
def train_bdt_multiclass():
    print("Loading data...")
    if SMALL_DATA:
        signal, bkg2nu, bkg214Bi, bkg208Tl, bkgRn = import_data_small()
    else:
        signal, bkg2nu, bkg214Bi, bkg208Tl, bkgRn = import_data()

    print("Creating arrays...")
    # X = Features (i.e. the data)
    X = np.concatenate((signal, bkg2nu, bkg214Bi, bkg208Tl, bkgRn))

    # y = Labels (i.e. what it is, signal / background)
    y = np.concatenate((np.ones(signal.shape[0]),
                        np.full(bkg2nu.shape[0], 2),
                        np.full(bkg214Bi.shape[0], 3),
                        np.full(bkg208Tl.shape[0], 4),
                        np.full(bkgRn.shape[0], 5)))

    print("Splitting Data...")
    # Split the data
    X_dev, X_eval, y_dev, y_eval = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=48)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42)

    print("Creating classifier for DT")
    # Create classifiers
    dt = DecisionTreeClassifier(max_depth=12,
                                min_samples_split=0.5,
                                min_samples_leaf=400)

    print("Creating classifier for BDT")
    bdt = AdaBoostClassifier(dt,
                             algorithm='SAMME',
                             n_estimators=1200,
                             learning_rate=0.5)

    print("Fitting BDT...")
    # Train the classifier - not using weights here as it is a multiclassifier
    fitted_tree = bdt.fit(X_train, y_train)

    print("Predicting on training data...")
    # Use the fitted tree to predict on training data and new test data
    y_predicted_train = bdt.predict(X_train)

    print("Predicting on test data...")
    y_predicted_test = bdt.predict(X_test)

    print(
        classification_report(
            y_train,
            y_predicted_train,
            target_names=["signal", "2nu", "214Bi", "208Tl", "Radon"]))
    print("Area under ROC curve for training data: {0:.4f}".format(
        roc_auc_score(y_train,
                      bdt.predict_proba(X_train),
                      average="weighted",
                      multi_class="ovr")))

    print(
        classification_report(
            y_test,
            y_predicted_test,
            target_names=["signal", "2nu", "214Bi", "208Tl", "Radon"]))
    print("Area under ROC curve for test data: {0:.4f}".format(
        roc_auc_score(y_test,
                      bdt.predict_proba(X_test),
                      average="weighted",
                      multi_class="ovr")))

    plot_roc_curve(bdt, X_test, y_test)
    compare_train_test_multi(bdt, X_train, y_train, X_test, y_test)

    print("Saving classifier...")
    save_path = BASE_PATH + 'ml_calculated_data/multiClass/'
    dump(bdt, save_path + 'bdt_classifier.joblib')
    dump(fitted_tree, save_path + 'bdt_fitted_tree.joblib')
    dump(X_train, save_path + 'bdt_X_train.joblib')
    dump(X_test, save_path + 'bdt_X_test.joblib')
    dump(X_dev, save_path + 'bdt_X_dev.joblib')
    dump(X_eval, save_path + 'bdt_X_eval.joblib')
    dump(y_test, save_path + 'bdt_y_test.joblib')
    dump(y_train, save_path + 'bdt_y_train.joblib')
    dump(y_dev, save_path + 'bdt_y_dev.joblib')
    dump(y_eval, save_path + 'bdt_y_eval.joblib')

    print("Finished Training.")
Example #49
0
#Checking the R2_score
print("R2 score of the model is ", r2_score(y_test, y_pred4))
print("Negative value shows the model doesn't follow a linear trend")
#Classification Report
print("Classification report is given as ")
print(classification_report(y_test, y_pred4))
#F1-score
print("F1-score of the model is ", f1_score(y_test, y_pred4))

# Using the AdaBoost classifier
from sklearn.ensemble import AdaBoostClassifier
classifier_ada = AdaBoostClassifier(n_estimators=1000,
                                    learning_rate=1,
                                    random_state=0)
classifier_ada.fit(X_train, y_train)
y_pred_ada = classifier_ada.predict(X_test)
print("accuracy of adaboost classifier is ",
      accuracy_score(y_test, y_pred_ada))

#Hyperparameter tuning of Adaboost Classifier
parameters_ada = [{
    'n_estimators': [100, 200, 250, 1000, 10000],
    'learning_rate': [0.1, 0.2, 0.5, 0.8, 1, 2]
}]

grid_search_ada = GridSearchCV(estimator=classifier_ada,
                               param_grid=parameters_ada,
                               scoring='accuracy',
                               cv=10,
                               n_jobs=-1)
grid_search_ada.fit(X_train, y_train)
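# Inspecting the tuned model (a sketch; standard GridSearchCV attributes):
print("best params:", grid_search_ada.best_params_)
print("best CV accuracy:", grid_search_ada.best_score_)
print("test accuracy with best estimator:",
      accuracy_score(y_test, grid_search_ada.predict(X_test)))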
Example #50
0
# In[74]:

rfc = RandomForestClassifier(n_estimators=100, criterion='entropy')
rfc = grid.fit(xtr, ytr)  # `grid` is presumably a GridSearchCV defined in an earlier notebook cell
print(confusion_matrix(yte, rfc.predict(xte)))

# In[69]:

#lets use adaboost

# In[71]:

from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(xtr, ytr)
print(confusion_matrix(yte, clf.predict(xte)))

# In[72]:

#lets use gradientboost

# In[73]:

from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(xtr, ytr)
print(confusion_matrix(yte, gbc.predict(xte)))

# In[76]:

#svc
Example #51
0
    #     param_grid={"n_estimators": range(500, 1501, 100)}, cv=3)
    # # Fit the training set
    # model.fit(train_X, train_Y)
    # print("The best parameters are: %s, with score: %0.2f" % (model.best_params_, model.best_score_))
    model = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(
        splitter='random',
        max_features=90,
        max_depth=50,
        min_samples_split=6,
        min_samples_leaf=3),
                               n_estimators=1200,
                               learning_rate=0.005)
    # Fit the training set
    model.fit(train_X, train_Y)
    # Predict on the training set
    train_Y_hat = model.predict(train_X[idx])
    print("Training set accuracy: ", accuracy_score(train_Y[idx], train_Y_hat))
    # Predict on the test set
    test_Y_hat = model.predict(test_X)
    print("Test set accuracy: ", accuracy_score(test_Y, test_Y_hat))
    print("Total time: ", time() - t, "seconds")
    # Plot the ROC curve
    n_class = len(np.unique(train_Y))
    roc.drawROC(n_class, test_Y, test_Y_hat)

    # Load the CCPP dataset to test AdaBoost's regression model
    data = pd.read_excel("data/CCPP/Folds5x2_pp.xlsx")
    # AT: temperature, V: pressure, AP: humidity, RH: pressure, PE: output power
    # Feature matrix X
    X = data[['AT', 'V', 'AP', 'RH']]
    # Normalize the data
Example #52
0
accuracies['NB'] = totalScore
logLosses['NB'] = totalLogLoss

#predict category on testData
test = pd.read_csv(testLocation)

#predict clusters for test data
data2 = test.get(['X', 'Y'])
test_cluster_predict = est.predict(data2)
test['cluster_ids'] = test_cluster_predict

test_features, _ = dataMassaging(test)

#RF predict
classifier1 = RandomForestClassifier()
classifier1.set_params(min_samples_split=1000)
classifier1.fit(features, classes)
predictions1 = classifier1.predict(test_features)
visualizePrediction(predictions1)

#AdaBoost predict
classifier2 = AdaBoostClassifier(n_estimators=50)
classifier2.fit(features, classes)
predictions2 = classifier2.predict(test_features)
visualizePrediction(predictions2)

#NB predict
classifier3 = GaussianNB()
classifier3.fit(features, classes)
predictions3 = classifier3.predict(test_features)
visualizePrediction(predictions3)
Example #53
0
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_gaussian_quantiles

X1, y1 = make_gaussian_quantiles(cov=2.0, n_samples=500, n_features=2,
                                 n_classes=2, random_state=1)
X2, y2 = make_gaussian_quantiles(mean=(3, 3), cov=1.5, n_samples=400,
                                 n_features=2, n_classes=2, random_state=1)

# Merge the two sample sets into one dataset
X = np.concatenate((X1, X2))
y = np.concatenate((y1, -y2 + 1))

# plt.plot(X,y)
# plt.show()

# Create the AdaBoost classifier
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2, min_samples_split=20, min_samples_leaf=5),
                         algorithm="SAMME", n_estimators=200, learning_rate=0.8)

bdt.fit(X,y)

x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))

Z = bdt.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
plt.scatter(X[:, 0], X[:, 1], marker='o', c=y)
plt.show()
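# A quick fit-quality check to accompany the plot (a sketch):
print("training accuracy: %.3f" % bdt.score(X, y))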
Example #54
0
                                                    random_state=24)

#NOTE: change classifier here
clf = AdaBoostClassifier(n_estimators=500, algorithm='SAMME')

#training
st = time.time()
print("training started")
clf.fit(x_train, y_train)
print("training ended")
et = time.time()
tt = et - st
print("Training Time = " + str(tt) + "\n")

#predictions
pred = clf.predict(x_test)
#NOTE: change to decision_function or predict_proba depending on the classifier
y_score = clf.predict_proba(x_test)
#y_score = clf.decision_function(x_test)

#################################################################################
pp = PdfPages('results/EXP_Result.pdf')
#PrecisionRecall-plot
precision = dict()
recall = dict()
PR_area = dict()
PR_thresholds = dict()
average_precision = dict()
for i in range(n_classes):
    precision[i], recall[i], PR_thresholds[i] = precision_recall_curve(
        y_test[:, i], y_score[:, i])
Example #55
0
    def abc(self, dataset_array, label_array, data_teste):
        from sklearn.ensemble import AdaBoostClassifier

        clf = AdaBoostClassifier(n_estimators=300)
        clf.fit(dataset_array, label_array)
        return clf.predict(data_teste)
Example #56
0
features = features.transpose()
arr2t = np.array(arr2)
arr2t = arr2t.transpose()
# print(features)
# print(arr2t)
# features.reshape
classLabelX = arr2t

clf = SVC(gamma='auto')
clf.fit(features, classLabelX)

clfAdaBoost = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=1),
                                 algorithm="SAMME",
                                 n_estimators=200)
clfAdaBoost.fit(features, classLabelX)

pred = clf.predict(features)
predAb = clfAdaBoost.predict(features)

# print('Adaboost Predictions : ', predAb)

total = len(arr2)
# print("Total : ",total)

count = 0
for i in range(len(arr2)):
    if arr2[i] == predAb[i]:
        count += 1
print("Adaptive Boost accurate predictions : ", count, "/", total)
print("Adaptive Boost Accuracy : ", (count / total) * 100)
# print("--------------------------")
Example #57
0
def boosting():
    # Build and fit an AdaBoost ensemble of decision stumps
    clf = DecisionTreeClassifier(criterion='entropy', max_depth=1)
    boost = AdaBoostClassifier(base_estimator=clf, n_estimators=500)
    boost.fit(X_train, y_train)

    # make class predictions for the testing set
    y_pred_class = boost.predict(X_test)

    print('########### Boosting ###############')

    accuracy_score = evalClassModel(boost, y_test, y_pred_class, True)

    # Data for the final graph
    methodDict['Boosting'] = accuracy_score * 100

boosting()


# 7. Stacking

def stacking():
    # Build and fit a stacked ensemble with a logistic-regression meta-learner
    clf1 = KNeighborsClassifier(n_neighbors=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    lr = LogisticRegression()
    stack = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr)
    stack.fit(X_train, y_train)

    # make class predictions for the testing set
    y_pred_class = stack.predict(X_test)

    print('########### Stacking ###############')

    accuracy_score = evalClassModel(stack, y_test, y_pred_class, True)

    # Data for the final graph
    methodDict['Stacking'] = accuracy_score * 100

stacking()


# 8. Success-of-methods plot

def plotSuccess():
    s = pd.Series(methodDict)
    s = s.sort_values(ascending=False)
    plt.figure(figsize=(12, 8))
    # Bar chart of accuracy per method, annotated with the values
    ax = s.plot(kind='bar')
    for p in ax.patches:
        ax.annotate(str(round(p.get_height(), 2)), (p.get_x() * 1.005, p.get_height() * 1.005))
    plt.ylim([70.0, 90.0])
    plt.xlabel('Method')
    plt.ylabel('Percentage')
    plt.title('Success of methods')

    plt.show()

plotSuccess()
Example #58
0
Bag = BaggingClassifier(base_estimator=model,
                        n_estimators=100,
                        random_state=10).fit(X_train, Y_train)
BagPred = Bag.score(X_test, Y_test)
BagPrediction = Bag.predict(X_test)
print(metrics.confusion_matrix(Y_test, BagPrediction))
print('Bag Accuracy', BagPred)
'''
predictions = cross_validate(AdaBoost,X_train,Y_train,cv=10)
pred_per=np.mean(predictions['test_score'])
print(predictions)


print('The accuracy is: ',pred_per*100,'%')
'''
AdaPred = AdaBoost.predict(X_test)
print(AdaPred)
print(metrics.confusion_matrix(Y_test, AdaPred))

prediction = AdaBoost.score(X_test, Y_test)
print('The boosting accuracy is: ', prediction * 100, '%')

test_data=pd.read_csv('C:/Users/harsh/Desktop/KaggleComp/test.csv')
print(test_data.shape)
print(test_data.columns)
test_data=test_data.loc[:,features].values
#test_data=test_data[test_data.columns[1:86]]
print(test_data.shape)
test_predict=AdaBoost.predict(test_data)
test_predict=np.reshape(test_predict,(1715,1))
Example #59
0
# gs_tfidf_lsvc = load("./outputs/Pipeline_tfidf_lsvc.pkl")
gs_cv_mlpc = load("./outputs/Pipeline_cv_mlpc.pkl")
gs_tfidf_mlpc = load("./outputs/Pipeline_tfidf_mlpc.pkl")
gs_cv_pac = load("./outputs/Pipeline_cv_pac.pkl")
gs_tfidf_pac = load("./outputs/Pipeline_tfidf_pac.pkl")

# Ensemble
# Reference: https://scikit-learn.org/stable/modules/ensemble.html

# AdaBoost
# Note: AdaBoostClassifier takes a single base estimator whose fit() accepts
# sample_weight, so the original list of KNN pipelines cannot work here; the
# default decision-stump base is substituted (an assumption).
clf = AdaBoostClassifier(n_estimators=100,
                         algorithm='SAMME',
                         random_state=15)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Stacking classifier
# scikit-learn's StackingClassifier expects (name, estimator) tuples:
clf = StackingClassifier(estimators=[('knc', gs_cv_knc), ('knn', gs_tfidf_knn)],
                         final_estimator=LogisticRegression(
                             class_weight='balanced',
                             multi_class='multinomial'))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
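# Scoring the stacked model directly (a sketch):
print("stacking accuracy:", clf.score(X_test, y_test))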


sclf = StackingCVClassifier(classifiers=[gs_cv_knc, gs_tfidf_knn],
                            meta_classifier=LogisticRegression(
                                class_weight='balanced',
                                multi_class='multinomial'))
Example #60
0
target = ["quality"]

# data = shuffle(data)
# data = data.reset_index()

data = data.sample(frac=1).reset_index(drop=True)

X = data[x].values
Y = data[target].values

standard_sc = StandardScaler()
X = standard_sc.fit_transform(X)

X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.1,
                                                    stratify=Y)

# gbc = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=20),n_estimators=200, learning_rate=1)
abc = AdaBoostClassifier(base_estimator=RandomForestClassifier(
    n_estimators=200, max_depth=20, min_samples_split=4),
                         n_estimators=200,
                         learning_rate=1)
abc.fit(X_train, Y_train.ravel())  # ravel: fit expects a 1-D label array
predictions = abc.predict(X_test)
print("AccuracyScore: ", accuracy_score(Y_test, predictions))
print("recall_score: ", recall_score(Y_test, predictions, average='macro'))
print("precision_score: ", precision_score(Y_test, predictions, average='macro'))