예제 #1
0
        def search(self):
            """Exhaustively try every learning_rate / n_estimators pair.

            For each pair an ABC model (random_state=7) is fitted on
            self.X_train / self.y_train, predictions are collected one sample
            at a time, and the pair is scored as
            self.roi_funcs[1](predictions) + self.roi_funcs[3](predictions).
            The best-scoring pair is stored in self.best_params_.

            Returns a new, unfitted ABC configured with the best pair.
            """
            scored = []
            for rate in self.params['learning_rate']:
                for n_trees in self.params['n_estimators']:
                    model = ABC(random_state=7,
                                learning_rate=rate,
                                n_estimators=n_trees)
                    model.fit(self.X_train, self.y_train)

                    # NOTE(review): X_test is looked up as a free name from the
                    # enclosing scope, not as self.X_test -- confirm intended.
                    predictions = np.array(
                        [model.predict([match])[0] for match in X_test])

                    roi = (self.roi_funcs[1](predictions) +
                           self.roi_funcs[3](predictions))
                    scored.append((roi, rate, n_trees))

            # max() keeps the first entry on ties, i.e. the earliest grid point.
            _, best_rate, best_n_trees = max(scored, key=lambda entry: entry[0])
            self.best_params_ = {
                'learning_rate': best_rate,
                'n_estimators': best_n_trees
            }
            return ABC(random_state=7,
                       learning_rate=best_rate,
                       n_estimators=best_n_trees)
예제 #2
0
def get_estimator(algoname, seed=0):
    ''' Build the estimator identified by the short string algoname.

    Recognised names: 'nb', 'dt', 'dte', 'lr', 'rfc', 'bac', 'abc', 'svm'
    (the valid options are outlined in validate_algos()).
    seed is forwarded as random_state to every estimator except 'nb';
    estimators that require a random seed (e.g. dt) should be passed a
    non-zero seed.

    Returns the new estimator, or None (after printing a message) when
    algoname is not recognised.'''

    # Each entry is a zero-argument factory so that only the requested
    # estimator is ever constructed.
    factories = {
        'nb': lambda: GaussianNB(),
        'dt': lambda: tree.DecisionTreeClassifier(random_state=seed),
        'dte': lambda: tree.DecisionTreeClassifier(random_state=seed,
                                                   criterion="entropy"),
        'lr': lambda: LogisticRegression(penalty='l1', class_weight='auto',
                                         random_state=seed),
        'rfc': lambda: RFC(criterion='entropy', random_state=seed),
        'bac': lambda: BAC(random_state=seed),
        'abc': lambda: ABC(random_state=seed),
        # Linear SVM. Note: a nonlinear SVC will take more time, but will
        # likely have slightly higher performance.
        'svm': lambda: svm.LinearSVC(random_state=seed),
    }

    build = factories.get(algoname)
    if build is not None:
        return build()

    # You only get here if the string was invalid
    print("Unrecognized algorithm name")
    return
 def NLMmodelexp1():
     """Run modelExperiment on the NLM in/out-of-sample data with seven
     default-configured classifiers, writing to NLMmodelExperiment1.csv and
     NLMclassifier_plot1.png."""
     classifiers = [LR(), DT(), KNC(), RF(), ABC(), GNB(), QDA()]
     # Display names, in the same order as the classifiers above.
     classifier_names = [
         'LogisticRegression', 'DTree', 'KNN', 'RandomForest',
         'AdaBoosted', 'GaussianNB', 'QuadraticDiscriminantAnalysis'
     ]
     modelExperiment(nlmInsampleData, nlmOutsampleData, 'NLMdata/', fullFV,
                     classifiers, classifier_names,
                     'NLMmodelExperiment1.csv', 'NLMclassifier_plot1.png',
                     True)
예제 #4
0
def adacv(n_estimators, learning_rate, seed=seed):
    """Return the mean 10-fold cross-validated ROC AUC of an ABC model.

    n_estimators, learning_rate and seed are coerced to int, float and int
    respectively before being handed to ABC, so float-valued inputs are
    accepted. The model is evaluated on the module-level X and y.
    """
    model = ABC(n_estimators=int(n_estimators),
                learning_rate=float(learning_rate),
                random_state=int(seed))
    # scoring must be passed by keyword: in sklearn.model_selection the fourth
    # positional slot of cross_val_score is `groups` (and later keyword-only),
    # so a positional 'roc_auc' would not select the metric.
    scores = cross_val_score(model, X, y, scoring='roc_auc', cv=10)
    return scores.mean()
예제 #5
0
def get_models(dataset):
    """Return the list of (estimator, display_label) pairs for a dataset.

    Classification datasets get classifiers; 'news' gets regressors.

    Raises:
        AssertionError: if dataset is not one of the known names.
    """
    if dataset in ["mnist12", "mnist28"]:
        classifiers = [(DTC(max_depth=30, class_weight='balanced'),
                        "Decision Tree (max_depth=30)"),
                       (LRC(solver='lbfgs',
                            n_jobs=2,
                            multi_class="auto",
                            class_weight='balanced',
                            max_iter=50), "Logistic Regression"),
                       (MLPC((100, ), max_iter=50), "MLP (100)")]
        return classifiers
    if dataset in ['adult']:
        # NOTE(review): the tree is built with max_depth=15 but labelled
        # "max_depth=20"; the label is left untouched in case downstream code
        # keys on it -- confirm which value is intended.
        classifiers = [(DTC(max_depth=15, class_weight='balanced'),
                        "Decision Tree (max_depth=20)"),
                       (ABC(), "Adaboost (estimator=50)"),
                       (LRC(solver='lbfgs',
                            n_jobs=2,
                            class_weight='balanced',
                            max_iter=50), "Logistic Regression"),
                       (MLPC((50, ), max_iter=50), "MLP (50)")]
        return classifiers
    if dataset in ['census', 'credit']:
        classifiers = [
            (DTC(max_depth=30,
                 class_weight='balanced'), "Decision Tree (max_depth=30)"),
            (ABC(), "Adaboost (estimator=50)"),
            (MLPC((100, ), max_iter=50), "MLP (100)"),
        ]
        return classifiers
    if dataset in ['intrusion', 'covtype']:
        classifiers = [
            (DTC(max_depth=30,
                 class_weight='balanced'), "Decision Tree (max_depth=30)"),
            (MLPC((100, ), max_iter=50), "MLP (100)"),
        ]
        return classifiers
    if dataset in ['news']:
        regressors = [(LRR(), "Linear Regression"),
                      (MLPR((100, ), max_iter=50), "MLP (100)")]
        return regressors

    # Raised explicitly (rather than `assert 0`) so the check survives
    # `python -O`; the exception type is unchanged for existing callers.
    raise AssertionError("Unknown dataset: %r" % (dataset, ))
 def SOmodelexp1():
     """Run modelExperiment on the StackOverflow in/out-of-sample data with
     seven classifiers (the random forest uses 200 trees), writing to
     SOmodelExperiment1.csv and SOclassifier_plot1.png."""
     classifiers = [
         LR(), DT(), KNC(), RF(n_estimators=200), ABC(), GNB(), QDA()
     ]
     # Display names, in the same order as the classifiers above.
     classifier_names = [
         'LogisticRegression', 'DTree', 'KNN', 'RandomForest',
         'AdaBoosted', 'GaussianNB', 'QuadraticDiscriminantAnalysis'
     ]
     modelExperiment(SOInsampleData, SOOutsampleData, 'stackoverflowdata/',
                     fullFV, classifiers, classifier_names,
                     'SOmodelExperiment1.csv', 'SOclassifier_plot1.png', True)
예제 #7
0
파일: adaboost.py 프로젝트: hiteshishah/IS
def _ingredient_matrix(recipe_rows, column_of):
    """Return a sparse 0/1 matrix with one row per recipe and one column per
    known ingredient; entry (i, j) is 1 when recipe i contains ingredient j."""
    matrix = scipy.sparse.dok_matrix((len(recipe_rows), len(column_of)))
    for row, recipe in enumerate(recipe_rows):
        # Walk only the ingredients the recipe actually lists instead of
        # testing every known ingredient against every recipe.
        for ingredient in recipe:
            matrix[row, column_of[ingredient]] = 1
    return matrix


def main():
    """Train a 300-estimator ABC model to predict a recipe's cuisine from its
    ingredients (read from finaldata.json) and print a classification report
    for a 20% hold-out split."""
    recipes = []  # ingredient collection of every recipe in the dataset
    cuisines = []  # cuisine of every recipe in the dataset
    ingredients = set()  # distinct ingredients used across all recipes
    with open("finaldata.json") as handle:
        data = json.load(handle)
        for d in data:
            for key, value in d.items():
                if key == "ingredients":
                    recipes.append(value)
                    for item in value:
                        ingredients.add(item)
                elif key == "cuisine":
                    cuisines.append(value)

    # Fix one column per ingredient up front so that the training and testing
    # matrices are guaranteed to share the same layout.
    column_of = {
        ingredient: column
        for column, ingredient in enumerate(ingredients)
    }

    # splitting the initial dataset into training and testing datasets
    training_data, testing_data, training_cuisine, testing_cuisine = train_test_split(
        recipes, cuisines, test_size=0.2)

    training_data_matrix = _ingredient_matrix(training_data, column_of)

    pipeline = ABC(n_estimators=300)  # using 300 classifiers
    # building a boosted classifier from the training set
    pipeline.fit(training_data_matrix, training_cuisine)

    testing_data_matrix = _ingredient_matrix(testing_data, column_of)

    # predicted cuisine for every held-out recipe
    result = pipeline.predict(testing_data_matrix)

    print(classification_report(testing_cuisine, result))
def singleExperiment(cfg):
    """
    Run one comparison of RecurrentForest, RFC and ABC on synthetic data.

    cfg supplies 'n_samples', 'n_features', 'T', 'n_trees', 'p_connect',
    'p_feature', 'p_example', 'tree_kwargs', 'random_forest_kwargs' and
    'ada_boost_kwargs'. Returns a (3, 4) array with one row of
    M.binary_metrics output per model.
    """
    # synthetic data, 80/20 train/test split
    features, labels = make_classification(n_samples=cfg['n_samples'],
                                           n_features=cfg['n_features'])
    X_train, X_test, y_train, y_test = TTS(features, labels, test_size=.2)

    # RecurrentForest receives its training data at construction time
    recurrent_forest = RF.RecurrentForest(X_train,
                                          y_train,
                                          cfg['T'],
                                          cfg['n_trees'],
                                          cfg['p_connect'],
                                          cfg['p_feature'],
                                          cfg['p_example'],
                                          cfg['tree_kwargs'])
    random_forest = RFC(**cfg['random_forest_kwargs'])
    ada_boost = ABC(**cfg['ada_boost_kwargs'])
    models = [recurrent_forest, random_forest, ada_boost]

    print("<<< training models >>>")
    for model in tqdm(models):
        # RecurrentForest ignores args - data present at init
        model.fit(X_train, y_train)

    print("<<< testing models >>>")
    y_hats = np.zeros((3, X_test.shape[0]))
    for idx, model in tqdm(enumerate(models)):
        # index 0 is the RecurrentForest, which predicts via predictNew
        predict = model.predictNew if idx == 0 else model.predict
        y_hats[idx, :] = predict(X_test)

    # one row of metrics per model
    measures = np.zeros((3, 4))
    for idx in tqdm(range(3)):
        measures[idx, :] = M.binary_metrics(y_test,
                                            y_hats[idx, :],
                                            model=str(models[idx]))

    return measures
예제 #9
0
def get_classifier(classifier, df_s, j=0, z=0):
    """Build the classifier named by `classifier` (matched case-insensitively).

    j is a hyper-parameter knob whose meaning depends on the classifier:
    the exponent in C = 0.1**j (logistic_*), n_neighbors (knn), alpha
    (ridge_*), a multiplier in n_estimators = 50*j (random_forest_*, abc),
    reg_param (qda) or degree (svc). Not every classifier accepts the
    default j=0 (e.g. n_neighbors=0), so callers should pass a suitable j.

    df_s and z are accepted for interface compatibility but are not used.

    Raises:
        ValueError: if the name is not recognised (previously this surfaced
            as an UnboundLocalError on the return statement).
    """
    name = classifier.lower()

    if name == 'lda':
        clf = LDA(solver='lsqr', shrinkage='auto')

    elif name == 'logistic_bal':
        # max_iter is given as an int; a float such as 1e4 is rejected by
        # scikit-learn versions that validate parameter types.
        clf = LR(class_weight='balanced',
                 random_state=5,
                 max_iter=10000,
                 C=0.1**j,
                 solver='newton-cg')

    elif name == 'logistic_unbal':
        clf = LR(random_state=5, max_iter=10000, C=0.1**j, solver='newton-cg')

    elif name == 'knn':
        clf = KNNc(n_neighbors=j)

    elif name == 'ridge_bal':
        clf = RdC(alpha=j, class_weight='balanced', random_state=5)

    elif name == 'ridge_unbal':
        clf = RdC(alpha=j, random_state=5)

    elif name == 'random_forest_bal':
        clf = RFC(n_estimators=int(50 * j),
                  random_state=5,
                  min_samples_leaf=2,
                  class_weight='balanced')

    elif name == 'random_forest_unbal':
        clf = RFC(n_estimators=int(50 * j),
                  random_state=5,
                  min_samples_leaf=2)

    elif name == 'qda':
        clf = QDA(reg_param=j)

    elif name == 'svc':
        clf = SVC(gamma='scale', random_state=5, probability=True, degree=j)

    elif name == 'abc':
        # AdaBoost over random forests of 50*j trees each
        clf = ABC(base_estimator=RFC(n_estimators=int(50 * j)),
                  random_state=5)

    else:
        raise ValueError("Unknown classifier name: %r" % (classifier, ))

    return clf
예제 #10
0
def AdaBoostpredictor(X_train, y_train, X_test):
    ''' Fit an AdaBoostClassifier (random_state=1) and predict test labels.

    Input: training data, training targets, and test data.
    Output: (y_pred, accuracy) where y_pred holds the predicted class label
    for each test sample and accuracy is measured on the training data.

    Side effect: stores the training accuracy and the test predictions under
    the model's class name in the enclosing-scope mappings accModels and
    predictions.'''
    from sklearn.ensemble import AdaBoostClassifier as ABC

    model = ABC(random_state=1)
    model.fit(X_train, y_train)

    # Accuracy is computed on the data the model was trained on, so it is a
    # fit indicator rather than a generalisation estimate.
    accuracy = metrics.accuracy_score(y_train, model.predict(X_train))

    y_pred = model.predict(X_test)
    modelName = model.__class__.__name__
    accModels[modelName] = accuracy
    predictions[modelName] = y_pred

    return y_pred, accuracy
	def abcScores(self,Xn,y,cv=5,param_name='n_estimators',paramRange=(1,10,1),trainW=1,testW=2,title='Adaboost classifier',clfArg={},plot=False):
		"""
		Sweep one hyper-parameter of an Adaboost classifier (ABC) with validation_curve.
		param_name is the parameter to sweep (default n_estimators) and
		paramRange=(a,b,c) gives its values as np.arange(a,b,c): a start, b end (exclusive), c step.
		cv is the cross validation k-fold. clfArg is a dictionary of extra keyword arguments for the ABC.
		The train/test scores are passed to self.plotTrainTest (set plot=True to see its plot);
		its result is merged with the model title and param_name into a single score record.
		That record is handed to self.scoreModelListDf together with trainW and testW,
		which per the original notes weight the scores as
		weighted_score=(test_score*testW+train_score*trainW)/(testW+trainW).
		Returns whatever self.scoreModelListDf returns.
		"""
		clf=ABC(**clfArg)
		start,stop,step=paramRange[0],paramRange[1],paramRange[2]
		candidates=np.arange(start,stop,step)
		train_sc, test_sc = validation_curve(clf,Xn,y,param_name=param_name,param_range=candidates,cv=cv)
		best=self.plotTrainTest(train_sc,test_sc,candidates,t=title,xlabel=param_name,plot=plot)
		record={'model':title,'param_name':param_name}
		record.update(best)
		# scoreModelListDf expects a list of records; here it holds just one
		return self.scoreModelListDf([record],trainW=trainW,testW=testW)
    def make_feature_graph(self,
                           feature_list,
                           labels_filename="trainingSetLabels.dat"):
        '''
        Plot two graphs for manual inspection of the features.

        1. Decision boundaries (only when each sample has 1 or 2 features):
           one subplot per classifier for
           ['Logistic Regression', 'Random Forest', 'Naive Bayes', 'SVM',
            'AdaBoost'], each fitted on the full feature list.
        2. Scatter plot: the values of one chosen feature, split into the
           "true" and "fake" samples, to visualise how separable they seem.
           No classifier is involved.

        Parameters:
            feature_list: a list of lists containing the features for each
                sample, in the same order as the lines of the label file.
            labels_filename: path to the file with one integer label per
                line; non-zero means "true", zero means "fake".
        '''

        y = []
        x_true_list = []
        x_fake_list = []
        with open(labels_filename) as label_file:
            for idx, label in enumerate(label_file):
                if int(label):
                    y.append(1)
                    x_true_list.append(feature_list[idx])
                else:
                    y.append(0)
                    x_fake_list.append(feature_list[idx])

        y = np.array(y)
        X_plot = feature_list

        #---------------------------- Decision Boundary Plot -----------------------#
        if len(feature_list[0]) in (1, 2):
            print("Now plotting Decision boundary Plot. (Works best for 2 features)")

            classifiers = [
                ('Logistic Regression', LogisticRegression(random_state=1)),
                ('Random Forest', RFC(n_estimators=100, random_state=1)),
                ('Naive Bayes', GNB()),
                ('SVM', SVC()),
                ('AdaBoost', ABC()),
            ]

            # A 3x2 grid has room for all five classifiers; with the former
            # 2x2 grid zip() stopped after four cells and AdaBoost was
            # silently never fitted or drawn.
            n_rows, n_cols = 3, 2
            gs = gridspec.GridSpec(n_rows, n_cols)
            plt.figure(figsize=(10, 12))

            cells = itertools.product(range(n_rows), range(n_cols))
            for (lab, clf), (row, col) in zip(classifiers, cells):
                clf.fit(X_plot, y)
                # make this cell the current axes for plot_decision_regions
                plt.subplot(gs[row, col])
                plot_decision_regions(X=X_plot, y=y, clf=clf, legend=2)
                plt.title(lab)

            plt.show()

        #---------------------------- Individual Scatter Plot -----------------------#
        plot_idx = 0
        if len(feature_list[0]) != 1:
            plot_idx = int(
                raw_input(
                    "Your list has more than 1 feature. Which feature would you like to observe? (Insert Index): "
                ))

        print("Now plotting scatter plot of feature:")
        x_true = np.array([feat[plot_idx] for feat in x_true_list])
        x_fake = np.array([feat[plot_idx] for feat in x_fake_list])
        # shared horizontal positions, long enough for the larger class
        y_plot = np.arange(max(len(x_true), len(x_fake)))

        trace_true = go.Scatter(y=x_true,
                                x=y_plot,
                                mode='markers',
                                text="True")
        trace_fake = go.Scatter(y=x_fake,
                                x=y_plot,
                                mode='markers',
                                text="Fake")

        layout = go.Layout(showlegend=False)
        fig = go.Figure(data=[trace_true, trace_fake], layout=layout)
        offline.plot(fig, filename='text-chart-basic')
    def article_classifier(self):

        train_pos, dev_pos = self.pos_load_features()

        rare_ttr_perplexity_4gram_features = list(
            extractFourGram('featureFour.txt', 'basic.csv'))

        X_dev = list(extractFourGram('featureFour_dev.txt', 'basic_dev.csv'))
        y_dev = self.get_dev_labels()

        X = rare_ttr_perplexity_4gram_features
        y = self.labels

        X.append(train_pos)
        X_dev.append(dev_pos)

        X = np.array(X).T[:, :]
        X_dev = np.array(X_dev).T[:, :]

        # self.make_feature_graph(X[:,1:3],"trainingSetLabels.dat")

        lr_clf = LogisticRegression()
        lr_clf.fit(X, y)
        lr_predicted = lr_clf.predict(X_dev)
        lr_scores = cross_val_score(lr_clf, X, y, cv=5, n_jobs=5)
        print lr_scores, np.mean(lr_scores), np.std(lr_scores)
        # svm_predicted = cross_val_predict(lr_clf, X, y, cv=5)
        print accuracy_score(y_dev, lr_predicted)

        # SVM Parameters:
        # {'C': [0.1,1.0,10.0,100.0], 'gamma':[1.0,2.0,'auto',0.1,0.01,0.001], 'kernel':['rbf','linear']}
        svm_clf = SVC(probability=True)
        svm_clf.fit(X, y)
        svm_predicted = svm_clf.predict(X_dev)
        svm_scores = cross_val_score(svm_clf, X, y, cv=5, n_jobs=5)
        print svm_scores, np.mean(svm_scores), np.std(svm_scores)
        # svm_predicted = cross_val_predict(svm_clf, X, y, cv=5)
        print accuracy_score(y_dev, svm_predicted)

        # RandomForest Parameters:
        # {'n_estimators':[10,20,5,30],'criterion':['gini','entropy']}
        rf_clf = RFC()
        rf_clf.fit(X, y)
        rf_predicted = rf_clf.predict(X_dev)
        rf_scores = cross_val_score(rf_clf, X, y, cv=5, n_jobs=5)
        print rf_scores, np.mean(rf_scores), np.std(rf_scores)
        # rf_predicted = cross_val_predict(rf_clf, X, y, cv=5)
        print accuracy_score(y_dev, rf_predicted)

        # AdaBoost Parameters:
        # {'n_estimators':[10,20,5,30],'learning_rate':[1.0,0.1,0.01,0.001,0.05]}
        ab_clf = ABC()
        ab_clf.fit(X, y)
        ab_predicted = ab_clf.predict(X_dev)
        ab_scores = cross_val_score(ab_clf, X, y, cv=5, n_jobs=5)
        print ab_scores, np.mean(ab_scores), np.std(ab_scores)
        # ab_predicted = cross_val_predict(ab_clf, X, y, cv=5)
        print accuracy_score(y_dev, ab_predicted)

        # Gaussian NB Parameters:
        # {'n_estimators':[10,20,5,30],'learning_rate':[1.0,0.1,0.01,0.001,0.05]}
        nb_clf = GNB()
        nb_clf.fit(X, y)
        nb_predicted = nb_clf.predict(X_dev)
        nb_scores = cross_val_score(nb_clf, X, y, cv=5, n_jobs=5)
        print nb_scores, np.mean(nb_scores), np.std(nb_scores)
        # nb_predicted = cross_val_predict(nb_clf, X, y, cv=5)
        print accuracy_score(y_dev, nb_predicted)
예제 #14
0
# Demo script: compares scikit-learn's AdaBoostClassifier with the local
# AdaboostClassifier implementation on a synthetic binary problem.
if __name__ == "__main__":

    # 10,000 synthetic samples, 10 features (5 informative), 2 classes
    X, y = make_classification(n_samples=10000,
                               n_features=10,
                               n_informative=5,
                               random_state=0,
                               n_classes=2)

    # 70/30 train/test split with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=0)

    # scikit-learn reference: 20 decision stumps, SAMME
    from sklearn.ensemble import AdaBoostClassifier as ABC
    clf = ABC(DecisionTreeClassifier(max_depth=1),
              n_estimators=20,
              algorithm="SAMME")
    clf.fit(X_train, y_train)
    result = clf.predict(X_test)
    # message reads: "validation-set score of SAMME in sklearn is:"
    print("sklearn中SAMME的验证集得分为: ", accuracy_score(y_test, result))

    # local implementation: takes the estimator class, the count and the
    # algorithm name; max_depth is forwarded through fit()
    clf = AdaboostClassifier(DecisionTreeClassifier, 20, "SAMME")
    clf.fit(X_train, y_train, max_depth=1)
    result = clf.predict(X_test)
    # message reads: "validation-set score using the SAMME.R ensemble is:"
    # NOTE(review): the classifier above is built with "SAMME" while the
    # message says SAMME.R -- confirm which one is intended.
    print("使用SAMME.R集成的验证集得分为: ", accuracy_score(y_test, result))

    # scikit-learn reference: 20 decision stumps, SAMME.R
    clf = ABC(DecisionTreeClassifier(max_depth=1),
              n_estimators=20,
              algorithm="SAMME.R")
    clf.fit(X_train, y_train)
    result = clf.predict(X_test)
예제 #15
0
            
    results['acc_train'] = accuracy_score(y_train[:300], predictions_train)
        
    results['acc_test'] = accuracy_score(y_test, predictions_test)
    
    results['f_train'] = fbeta_score(y_train[:300], predictions_train, 0.5)
        
    results['f_test'] = fbeta_score(y_test, predictions_test, 0.5)
       
    # Success
    print "{} trained on {} samples.".format(learner.__class__.__name__, sample_size)
        
    # Return the results
    return results

clf_A = ABC(random_state = 42)
clf_B = DTC(random_state = 42)
clf_C = LinearSVC(random_state = 42)


samples_1 = len(X_train)/100
samples_10 = len(X_train)/10
samples_100 = len(X_train)


# Collect results on the learners
results = {}
for clf in [clf_A, clf_B, clf_C]:
        clf_name = clf.__class__.__name__
        results[clf_name] = {}
        for i, samples in enumerate([samples_1, samples_10, samples_100]):
def main():
    """Entry point for the join experiments.

    Defines several nested experiment runners and then executes only
    NLMexperiments(); the other runners are defined but not called from the
    code visible here.
    """
    # feature-vector components shared by every experiment below
    fullFV = [
        csAbstract, csSentence, jac, jacq3, dice, diceq3, cosM, cosMq3, LVdist,
        sw, nw, jw
    ]
    # NOTE(review): fullModels / fullModelNames are not referenced again in
    # this function; the runners below build their own lists.
    fullModels = [LR(), DT(), KNC(), RF(n_estimators=200), ABC(), GNB(), QDA()]
    fullModelNames = [
        'LogisticRegression', 'DTree', 'KNN', 'RandomForest', 'AdaBoosted',
        'GaussianNB', 'QuadraticDiscriminantAnalysis'
    ]

    #modelExperiment(nlmInsampleData,nlmOutsampleData,'NLMdata/',fullFV,[LR(),DT(),KNC(),RF(),ABC(),GNB(),QDA()],
    #                ['LogisticRegression','DTree','KNN','RandomForest','AdaBoosted','GaussianNB','QuadraticDiscriminantAnalysis'],
    #                'NLMmodelExperiment1.csv','NLMclassifier_plot1.png',True)

    def SOmodelexp1():
        """Seven-classifier model experiment on the StackOverflow data."""
        modelExperiment(
            SOInsampleData, SOOutsampleData, 'stackoverflowdata/', fullFV,
            [LR(),
             DT(),
             KNC(),
             RF(n_estimators=200),
             ABC(),
             GNB(),
             QDA()], [
                 'LogisticRegression', 'DTree', 'KNN', 'RandomForest',
                 'AdaBoosted', 'GaussianNB', 'QuadraticDiscriminantAnalysis'
             ], 'SOmodelExperiment1.csv', 'SOclassifier_plot1.png', True)

    def SOmodelexp2():
        """Three-classifier model experiment on the StackOverflow data."""
        modelExperiment(
            SOInsampleData, SOOutsampleData, 'stackoverflowdata/', fullFV,
            [LR(), RF(n_estimators=200), ABC()],
            ['LogisticRegression', 'RandomForest', 'AdaBoosted'],
            'SOmodelExperiment2.csv', 'SOclassifier_plot2.png', True)

    def NLMmodelexp1():
        """Seven-classifier model experiment on the NLM data."""
        modelExperiment(
            nlmInsampleData, nlmOutsampleData, 'NLMdata/', fullFV,
            [LR(), DT(), KNC(), RF(),
             ABC(), GNB(), QDA()], [
                 'LogisticRegression', 'DTree', 'KNN', 'RandomForest',
                 'AdaBoosted', 'GaussianNB', 'QuadraticDiscriminantAnalysis'
             ], 'NLMmodelExperiment1.csv', 'NLMclassifier_plot1.png', True)

    #featureVectorExperiment(SOInsampleData,SOOutsampleData,'stackoverflowdata/',[[jacq3],[cosM],[cosMq3],[jacq3,cosM],[cosM,cosMq3],[csAbstract]],DT(),
    #                'DTree','SOFVExperiment1.csv','SOFV_plot1.png')

    def NLMexperiments():
        """Sweep the sub-sample proportion from 0.05 up to (not including)
        0.25 in steps of 0.01 on the NLM join and write size / precision /
        recall rows to NLMdata/sizeTest2.csv."""
        j2 = myJoin.join(nlmInsampleData, nlmOutsampleData, 'NLMdata/')
        j2.setComponentList(fullFV)
        j2.loadCachedInsampleFV()
        results = []
        for prop in np.arange(0.05, 0.25, 0.01):
            # the third returned value is not used
            precision, recall, _, size = j2.classifyNIterations(
                subSampleProportion=prop)
            results.append([size, precision, recall])
        writeToCSV('NLMdata/sizeTest2.csv', ['Size', 'Precision', 'Recall'],
                   results)

    def SOexperiments():
        """Set up the StackOverflow join with a 200-tree random forest.

        NOTE(review): the nested threshHoldTest is defined but never called
        here, so this function currently only performs the setup.
        """
        j1 = myJoin.join(SOInsampleData, SOOutsampleData, 'stackoverflowdata/')
        j1.setComponentList(fullFV)
        j1.buildInsampleFV()
        j1.model = RF(n_estimators=200)
        j1.modelName = 'RF'

        def threshHoldTest():
            """Run the threshold test once and then 50 times (averaged),
            over thresholds 0.00..1.00 in steps of 0.01, writing each result
            to CSV, then run thresholdExperiment."""
            singleThreshTest = j1.thresholdTest(np.arange(0.0, 1.01, 0.01))
            writeToCSV('stackoverflowdata/simpleThresholdTest1.csv',
                       ['Threshold', 'Precision', 'Recall'], singleThreshTest)
            print 'simple thing done'
            fiftyThreshTest = [
                j1.thresholdTest(np.arange(0.0, 1.01, 0.01)) for i in range(50)
            ]
            # element-wise mean across the 50 runs
            mean_values = np.mean(fiftyThreshTest, axis=0)
            writeToCSV('stackoverflowdata/fiftyThresholdTest1.csv',
                       ['Threshold', 'Precision', 'Recall'], mean_values)
            print 'fifty thing done'
            thresholdExperiment(SOInsampleData, SOOutsampleData,
                                'stackoverflowdata/', fullFV,
                                RF(n_estimators=200), 'RF',
                                'thresholdExperiment1.csv')

    # only the NLM size sweep is currently enabled
    #SOexperiments()
    NLMexperiments()
예제 #17
0
from sklearn.svm import SVC

model = SVC(random_state=42).fit(X_train, y_train)
prediction = model.predict(X_test)
score = accuracy_score(y_test, prediction)
print(score)

# ADA Boost Classifier
# --------------------

# In[21]:

from sklearn.ensemble import AdaBoostClassifier as ABC

model = ABC(n_estimators=100, random_state=42,
            learning_rate=.80).fit(X_train, y_train)
prediction = model.predict(X_test)
score = accuracy_score(y_test, prediction)
print(score)

# Bagging Classifier
# ----------------

# In[22]:

from sklearn.ensemble import BaggingClassifier as BC

model = BC(n_estimators=100, random_state=42).fit(X_train, y_train)
prediction = model.predict(X_test)
score = accuracy_score(y_test, prediction)
print(score)
예제 #18
0
    val['target'] = val.apply(lambda row: make_target(row), axis=1)
    val['target_num'] = val['target'].map(target_dict)
    val['is_EOH'] = val.apply(lambda row: end_of_half_det(row), axis=1)
    val['pos_leads'] = (val['posteam_score'] >
                        val['defteam_score']).astype(int)
    to_drop = [
        'Unnamed: 0', 'game_date', 'game_id', 'ends_TD', 'ends_FG',
        'ends_punt', 'ends_other', 'target', 'target_num'
    ]
    targets = ['ends_TD', 'ends_FG', 'ends_punt', 'ends_other']
    features = [c for c in start.columns if c not in to_drop]
    train, test = tt_split(start)
    y_train = train['target_num'].values
    X_train = train[features].values
    abc = ABC(base_estimator=DTC(max_depth=2),
              n_estimators=500,
              learning_rate=0.25)
    abc.fit(X_train, y_train)
    y_test = test['target_num'].values
    X_test = test[features].values
    score = abc.score(X_test, y_test)
    print(f'Test: {score:0.3f}')
    X_val = val[features].values
    y_val = val['target_num']
    val_score = abc.score(X_val, y_val)
    print(f'AB Validation: {val_score:0.3f}')

    rf = RFC(n_estimators=500,
             max_depth=40,
             bootstrap=False,
             max_features=5,
예제 #19
0
                          delimiter="|",
                          skip_header=1)

X = training_data[:, :1000]
Y = training_data[:, 1000]

# Various Classifiers
dtc_min_samples_leaf = DTC(min_samples_leaf=15)
etc = ETC()
gbc = GBC()
rfc = RFC()
dtc_max_depth = DTC(max_depth=8)
nb = BernoulliNB()
svc = SVC()
lr = LR()
abc = ABC()
bc = BC()
'''
inv_doc_freq = np.zeros(1000)
for i in range(len(inv_doc_freq)):
    total = sum(X[:, i])
    if total == 0:
        inv_doc_freq[i] = 0
    else:
        inv_doc_freq[i] = math.log(N / sum(X[:, i]))
'''

# Data normalization
for i in range(len(X)):
    max_freq = max(X[i])
    if max_freq == 0:
예제 #20
0
def abc(train_examples, train_labels, test_examples, test_labels, verbose):
    """Fit a 500-estimator ABC model and return its score on the test set.

    The score on the training set is printed with a "CONVERGENCE: " prefix.
    verbose is accepted but not consulted; the line is always printed.
    """
    booster = ABC(n_estimators=500)
    booster.fit(train_examples, train_labels)
    test_score = booster.score(test_examples, test_labels)
    train_score = booster.score(train_examples, train_labels)
    print("CONVERGENCE: ", train_score)
    return test_score
예제 #21
0
# Scatter both speed classes on the same axes: x = bumpiness, y = grade.
plt.scatter(bumpy_fast, grade_fast, color="b", label="fast")
# The slow points must use the same (bumpiness, grade) argument order as the
# fast ones; they were previously passed as (grade, bumpiness), which drew
# them mirrored relative to the axis labels below.
plt.scatter(bumpy_slow, grade_slow, color="r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
# plt.show()
################################################################################

# your code here!  name your classifier object clf if you want the
# visualization code (prettyPicture) to show you the decision boundary
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.tree import DecisionTreeClassifier as DTC

clf_list = [RFC(), ABC(), KNC()]
acc_list = []
# RandomForestClassifier
# n_estimators ~10
# criterion = 'entropy'/ 'gini'
# max_features ~0.4
# Max acc: 0.94

# KNeighborsClassifier
# n_neighbors: 8
# weights: uniform
# algorithm: any
# Max acc: 0.944

# AdaBoostClassifier
# base_estimator:
예제 #22
0
# Gradient Boosting Classifier
gb_cls = GBC(loss='deviance', max_features=None, learning_rate=0.125,
             n_estimators=150, min_samples_split=2, min_samples_leaf=20,
             max_depth=5, min_impurity_decrease=0.20, max_leaf_nodes=10,
             random_state=5)

# Isolation Forest
if_cls = IFc(random_state=5)
if_param = {'n_estimators': [100, 200, 300],
            'contamination': [0.05, 0.1, 0.2],
            'max_features': [0.5, 0.75, 1.0],
            'bootstrap': [True, False],
            'behaviour': ['new']}
#run_func(if_cls, if_param, 7, X, y, 'iso_frst')

# Ada Boost Classifier
ab_cls = ABC(random_state=5, algorithm='SAMME')
ab_param = {'base_estimator': [lr_cls, rd_cls, rf_cls, gb_cls, et_cls],
            'n_estimators': [10, 50, 100],
            'learning_rate': [0.5, 1.0, 2.0]}

# Bagging Classifier
bg_cls = BGC(random_state=5)
bg_param = {'base_estimator': [lr_cls, kn_cls, ld_cls, rd_cls, rf_cls, gb_cls,
                               et_cls],
            'n_estimators': [10, 50, 100],
            'max_features': [0.25, 0.5, 0.75, 1.0],
            'bootstrap': [True, False],
            'bootstrap_features': [True, False]}

예제 #23
0
파일: boosting.py 프로젝트: ndcorc/ut-ece
import SVM as CLF
import numpy as np
import matplotlib.pyplot as plt
# sklearn.cross_validation was removed in scikit-learn 0.20; prefer its
# replacement and fall back only on installations that predate it.
try:
    from sklearn.model_selection import cross_val_score as cvs
except ImportError:
    from sklearn.cross_validation import cross_val_score as cvs
from sklearn.ensemble import AdaBoostClassifier as ABC

df, salary, keys = CLF.clean(CLF.get_data())

# Stage 1: mean cross-validation score for each ensemble size.
estimators = [10, 20, 30, 40, 50, 100, 200, 400]
estimator_scores = []
for estimator in estimators:
    clf = ABC(n_estimators=estimator)
    estimator_scores.append(cvs(clf, df, salary).mean())

# Stage 2: keep the best ensemble size (the first one on ties) and sweep the
# learning rate.
learning_rates = [1, 10, 20, 30, 40, 50, 100, 200]
learning_scores = []
best_estimator = estimators[estimator_scores.index(max(estimator_scores))]
for rate in learning_rates:
    clf = ABC(n_estimators=best_estimator, learning_rate=rate)
    learning_scores.append(cvs(clf, df, salary).mean())

n_estimators = 400
# A learning rate of 1. may not be optimal for both SAMME and SAMME.R
learning_rate = 1.

fig = plt.figure()
ax = fig.add_subplot(111)


예제 #24
0
파일: base.py 프로젝트: djsensei/HiggsClass
def ada_init(n=100):
  """Return a new, unfitted ABC estimator configured with n estimators."""
  return ABC(n_estimators=n)
예제 #25
0
#!/usr/bin/env python
# Script: boost depth-6 decision trees on the HOG training file and report
# validation accuracy after every 10th boosting stage.
if __name__ == '__main__':
    from sklearn.ensemble import AdaBoostClassifier as ABC
    from sklearn.tree import DecisionTreeClassifier as DTC
    import numpy as np
    from sklearn.metrics import accuracy_score
    from final_utils import read_hwfile

    # initialize data: the last fifth is held out for validation
    dat, lab, nDat = read_hwfile('ml14fall_train_align.dat.hog.dat', 169)
    # floor division keeps the split size an integer (usable as a slice
    # bound) on both Python 2 and Python 3
    nVal = nDat // 5
    nTrn = nDat - nVal
    datTrn = dat[:nTrn]
    labTrn = lab[:nTrn]
    datVal = dat[-nVal:]
    labVal = lab[-nVal:]
    print("#trn = {}, #val = {}".format(nTrn, nVal))

    classfier = ABC(DTC(max_depth=6, max_features=1), n_estimators=50000)
    classfier.fit(datTrn, labTrn)

    # staged_predict yields the ensemble's prediction after each boosting
    # stage; report every 10th stage (stages 10, 20, ...)
    for i, labPre in enumerate(classfier.staged_predict(datVal)):
        if i % 10 == 9:
            print(accuracy_score(labVal, labPre))
예제 #26
0
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
'''
from sklearn.grid_search import GridSearchCV

Using param_grid and GridSearchCV to tune the algorithm's parameters
param_grid = {
			'n_estimators': [3, 7, 9, 11, 15, 21, 23, 27],
          }
clf = GridSearchCV(ABC(), param_grid)
'''
#The parameters are hardcoded because when leaving the GridSearchCV and testing with
#tester.py scores are low, but when hardcoded they are high as expected.
clf = ABC(algorithm='SAMME.R',
          base_estimator=None,
          learning_rate=1.0,
          n_estimators=23,
          random_state=None)

clf.fit(X_train, y_train)  #fitting the data
'''
print "Best estimator found by grid search:"
print clf.best_estimator_
'''
pred = clf.predict(X_test)
acc = accuracy_score(pred, y_test)
recall = recall_score(y_test, pred)
precision = precision_score(y_test, pred)

print 'Accuracy:  ', acc
print 'Recall:    ', recall
        if mf == 'None':
            mf = None
        mss = get_input('Please input value for min_samples_split:\t')
        from sklearn.ensemble import RandomForestClassifier as RFC
        clf = RFC(n_estimators=n,
                  criterion=c,
                  max_features=mf,
                  min_samples_split=mss)
        parameters = ('n= ' + str(n)) + (', c= ' + c) + (', mf= ' + temp) + (
            ', mss= ' + str(mss))
        a, b, c, d = run_clf(clf)
        Classifiers_df.loc[counter] = ['RF', parameters, a, b, c, d]
        display(Classifiers_df.loc[counter])
        counter += 1
    elif user_input == 5:
        print 'You chose AdaBoost Classifier'
        n = get_input('Please input value for n_estimators:\t')
        from sklearn.ensemble import AdaBoostClassifier as ABC
        clf = ABC(n_estimators=n)
        parameters = ('n= ' + str(n))
        a, b, c, d = run_clf(clf)
        Classifiers_df.loc[counter] = ['AB', parameters, a, b, c, d]
        display(Classifiers_df.loc[counter])
        counter += 1
    print_in()
    user_input = get_input('Please input a number:\n')

display(Classifiers_df.sort_values(by=['Accuracy', 'Total'], ascending=[0, 1]))

#KNN parameters    nb= 3, w= distance, a= ball_tree, ls= 45, 131.224, 0.976109
#RF, n= 3, c= entropy, mf= None, mss= 15, 41.773, 0.98066
 def SOmodelexp2():
     """Run modelExperiment on the StackOverflow in/out-of-sample data with
     three classifiers (the random forest uses 200 trees), writing to
     SOmodelExperiment2.csv and SOclassifier_plot2.png."""
     classifiers = [LR(), RF(n_estimators=200), ABC()]
     # Display names, in the same order as the classifiers above.
     classifier_names = ['LogisticRegression', 'RandomForest', 'AdaBoosted']
     modelExperiment(SOInsampleData, SOOutsampleData, 'stackoverflowdata/',
                     fullFV, classifiers, classifier_names,
                     'SOmodelExperiment2.csv', 'SOclassifier_plot2.png', True)
예제 #29
0
 def __init__(self, n_estimators, matrix_database):
     """Store the matrix database and create the wrapped ABC estimator.

     n_estimators is forwarded to ABC; the instance starts out marked as
     not yet fitted.
     """
     self._has_fit = False
     self._matrix_database = matrix_database
     self._abc = ABC(n_estimators=n_estimators)
print("Checkpoint I")
#create a model instance of naive-bayes

naive_instance = nb()

print("Checkpoint II")

naive_instance.fit(X_train, Y_train)
print("Classification Score for Naive-Bayes is -:",
      naive_instance.score(X_test, Y_test))

print("Checkpoint III")

from sklearn.ensemble import AdaBoostClassifier as ABC
#create a model instance of AdaBoost
adaboost = ABC()
adaboost.fit(X_train, Y_train)
print("Classification Score for AdaBoost -: ", adaboost.score(X_test, Y_test))

print("Checkpoint IV")

from sklearn.ensemble import RandomForestClassifier as rf
#create a model instance of RandomForest
clf = rf()
clf.fit(X_train, Y_train)
print("Classification Score for RandomForestClassifier -: ",
      clf.score(X_test, Y_test))

from sklearn.ensemble import ExtraTreesClassifier as etc
#create a model instance of ExtraTreesClassifier
extratrees = etc()