Example #1
def test_warm_start_equal_n_estimators():
    # Test that nothing happens when fitting without increasing n_estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    clf = BaggingClassifier(n_estimators=5, warm_start=True, random_state=83)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    # modify X to nonsense values, this should not change anything
    X_train += 1.

    assert_warns_message(UserWarning,
                         "Warm-start fitting without increasing n_estimators does not",
                         clf.fit, X_train, y_train)
    assert_array_equal(y_pred, clf.predict(X_test))
def bagging(X_train, X_test, y_train, y_test, n_est=51):
    estimators = range(1, n_est)
    decision_clf = DecisionTreeClassifier()
    scores1 = []  # test-set accuracies
    scores2 = []  # train-set accuracies

    for est in estimators:
        bagging_clf = BaggingClassifier(decision_clf, n_estimators=est, max_samples=0.67,
                                        max_features=0.67, bootstrap=True, random_state=9)
        bagging_clf.fit(X_train, y_train)
        # test line
        y_pred_bagging1 = bagging_clf.predict(X_test)
        score_bc_dt1 = accuracy_score(y_test, y_pred_bagging1)
        scores1.append(score_bc_dt1)
        # train line
        y_pred_bagging2 = bagging_clf.predict(X_train)
        score_bc_dt2 = accuracy_score(y_train, y_pred_bagging2)
        scores2.append(score_bc_dt2)
    
    plt.figure(figsize=(10, 6))
    plt.title('Bagging Info')
    plt.xlabel('Estimators')
    plt.ylabel('Scores')
    plt.plot(estimators,scores1,'g',label='test line', linewidth=3)
    plt.plot(estimators,scores2,'c',label='train line', linewidth=3)
    plt.legend()
    plt.show()
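For context, a minimal driver for bagging() above, assuming the scikit-learn/matplotlib imports used by the snippet; the iris data and the smaller sweep are illustrative assumptions:

# Hypothetical usage of bagging(); the dataset and n_est value are illustrative.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
bagging(X_train, X_test, y_train, y_test, n_est=20)  # sweeps 1..19 estimators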
Example #3
def main():
    '''main function'''
    bagging = BaggingClassifier(DecisionTreeClassifier())
    iris = load_iris()
    x = iris.data
    y = iris.target
    #train, test, train_, test_ = train_test_split(x, y, test_size=0.2, random_state=42)
    bagging.fit(x, y)
    bagging.predict(x[:2])
    print(bagging.score(x[:2], y[:2]))
def bagging_with_base_estimator(base_estimator, x_train, x_test, y_train,
                                y_test, rands = None):
    """
    Predict the lemons using a Bagging Classifier and a random seed
    both for the number of features, as well as for the size of the
    sample to train the data on

    ARGS:

        - x_train: :class:`pandas.DataFrame` of the x_training data

        - y_train: :class:`pandas.Series` of the y_training data

        - x_test: :class:`pandas.DataFrame` of the x_testing data

        - y_test: :class:`pandas.Series` of the y_testing data

        - rands: a :class:`tuple` of the (rs, rf) to seed the sample
        and features of the BaggingClassifier.  If `None`, then
        rands are generated and provided in the return `Series`

    RETURNS:

        :class:`pandas.Series` of the f1-scores and random seeds
    """
    # create a dictionary for the return values
    ret_d = {'train-f1': None, 'test-f1': None, 'rs': None, 'rf': None}

    # use the randoms provided if there are any, otherwise generate them
    if rands is None:
        rs = numpy.random.rand()
        rf = numpy.random.rand()
        while rf < 0.1:
            rf = numpy.random.rand()
    else:
        rs, rf = rands
    # place them into the dictionary
    ret_d['rs'], ret_d['rf'] = rs, rf
    # create and run the bagging classifier
    bc = BaggingClassifier(base_estimator=base_estimator, n_estimators=300,
                           max_samples=rs, max_features=rf, n_jobs=1)

    bc.fit(x_train, y_train)
    y_hat_train = bc.predict(x_train)
    ret_d['train-f1'] = f1_score(y_train, y_hat_train)
    y_hat_test = bc.predict(x_test)
    ret_d['test-f1'] = f1_score(y_test, y_hat_test)
    return pandas.Series(ret_d)
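A hedged usage sketch for the function above; the DecisionTreeClassifier base estimator and the fixed (rs, rf) pair are assumptions, and the frames are assumed to carry a binary target (f1_score's default):

# Illustrative call; the base estimator and (0.8, 0.5) seeds are assumptions.
from sklearn.tree import DecisionTreeClassifier

res = bagging_with_base_estimator(DecisionTreeClassifier(), x_train, x_test,
                                  y_train, y_test, rands=(0.8, 0.5))
print(res[['train-f1', 'test-f1']])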
def baggedDecisionTree( X_train, y_train, X_test, y_test, nEstimators ):

    print("\n### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###")
    print("baggedDecisionTree()\n")

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myBaggedDecisionTree = BaggingClassifier(
        base_estimator = DecisionTreeClassifier(),
        n_estimators   = nEstimators,
        # max_samples    = X_train.shape[0],
        bootstrap      = True,
        oob_score      = True,
        n_jobs         = -1 # use all available cores
        )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myBaggedDecisionTree.fit(X_train,y_train)
    y_pred = myBaggedDecisionTree.predict(X_test)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    print( "nEstimators: "      + str(nEstimators)                     )
    print( "out-of-bag score: " + str(myBaggedDecisionTree.oob_score_) )
    print( "accuracy score: "   + str(accuracy_score(y_test,y_pred))   )
    print( "out-of-bag decision function:" )
    print( str(myBaggedDecisionTree.oob_decision_function_) )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return( None )
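A minimal invocation sketch for baggedDecisionTree(); the iris data and the split are assumptions made for illustration:

# Hypothetical driver; assumes the imports used by the snippet above.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
baggedDecisionTree(X_train, y_train, X_test, y_test, nEstimators=25)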
Example #6
class ADABoost(Base):

    def train(self, data=None, plugin=None):
        """ Train the model with the given dataframe """
        super(ADABoost, self).train(data, plugin)

        # cl = svm.SVC(gamma=0.001, C=100, kernel='linear', probability=True)

        X = self.X_train.iloc[:, :-1]
        Y = self.X_train.iloc[:, -1]

        self.scaler = StandardScaler().fit(X)
        X = self.scaler.transform(X)

        cl = SGDClassifier(loss='hinge')
        # NOTE: X is already scaled above and the Pipeline's "Scaler" step scales it
        # again; predict() applies the same double transform, so the two stay consistent.
        p = Pipeline([("Scaler", self.scaler), ("svm", cl)])

        self.clf = BaggingClassifier(p, n_estimators=50)
        # self.clf = AdaBoostClassifier(p, n_estimators=10)
        # self.clf = AdaBoostClassifier(SGDClassifier(loss='hinge'), algorithm='SAMME', n_estimators=10)

        self.clf.fit(X, Y)

    def predict(self, file, plugin=None):
        super(ADABoost, self).predict(file, plugin)

        data = file.vector
        X = data[plugin]
        X = self.scaler.transform(X)
        guess = self.clf.predict(X)
        return self.getTag(guess)
Example #7
def test_bagging_classifier_with_missing_inputs():
    # Check that BaggingClassifier can accept X with missing/infinite data
    X = np.array([
        [1, 3, 5],
        [2, None, 6],
        [2, np.nan, 6],
        [2, np.inf, 6],
        [2, np.NINF, 6],
    ])
    y = np.array([3, 6, 6, 6, 6])
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(
        FunctionTransformer(replace, validate=False),
        classifier
    )
    pipeline.fit(X, y).predict(X)
    bagging_classifier = BaggingClassifier(pipeline)
    bagging_classifier.fit(X, y)
    y_hat = bagging_classifier.predict(X)
    assert_equal(y.shape, y_hat.shape)
    bagging_classifier.predict_log_proba(X)
    bagging_classifier.predict_proba(X)

    # Verify that exceptions can be raised by wrapper classifier
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(classifier)
    assert_raises(ValueError, pipeline.fit, X, y)
    bagging_classifier = BaggingClassifier(pipeline)
    assert_raises(ValueError, bagging_classifier.fit, X, y)
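The pipeline in this test references a module-level replace transformer that the excerpt does not show. A plausible sketch of such a helper (an assumption, not the original definition), mapping missing and non-finite entries to a finite sentinel so the tree can fit:

# Assumed helper for FunctionTransformer(replace, validate=False) above;
# the -1 sentinel is an illustrative choice.
def replace(X):
    X = np.array(X, dtype=float)   # None becomes np.nan under a float dtype
    X[~np.isfinite(X)] = -1        # map nan / inf / -inf to a finite value
    return X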
Example #8
def train_classifiers(data):
    train_vars = [
        'X', 'Y',
        'Darkness',
        'Moon',
        'Hour',
        'DayOfWeekInt',
        'Day',
        'Month',
        'Year',
        'PdDistrictInt',
        'TemperatureC',
        'Precipitationmm',
        'InPdDistrict',
        'Conditions',
        'AddressCode',
    ]
    weather_mapping = {
        'Light Drizzle': 1,
        'Drizzle': 2,
        'Light Rain': 3,
        'Rain': 4,
        'Heavy Rain': 5,
        'Thunderstorm': 6,
    }
    data.Precipitationmm = data.Precipitationmm.fillna(-1)
    data.Conditions = data.Conditions.map(weather_mapping).fillna(0)

    train, test = split(data)
    X_train = train[train_vars]
    y_train = train.CategoryInt
    X_test = test[train_vars]
    y_test = test.CategoryInt

    bdt_real_2 = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=8),
        n_estimators=10,
        learning_rate=1
    )

    #bdt_real = DecisionTreeClassifier(max_depth=None, min_samples_split=1,
                                      #random_state=6065)

    bdt_real = BaggingClassifier(base_estimator=bdt_real_2,
                                random_state=6065,
                                n_estimators=100)

    #bdt_real = RandomForestClassifier(random_state=6065,
                                      #n_estimators=200)

    #bdt_real = ExtraTreesClassifier(random_state=6065,
                                    #min_samples_split=5,
                                    #n_estimators=200)

    bdt_real.fit(X_train, y_train)
    # align the prediction Series with y_test's index so the comparison matches rows
    y_predict = pandas.Series(bdt_real.predict(X_test), index=y_test.index)
    print(len(y_predict[y_predict == y_test]))
    print(len(y_predict))
    return bdt_real
Example #9
    def classification(self, x_train, y_train):
        ml = BaggingClassifier(DecisionTreeClassifier())
        ml.fit(x_train, y_train)
        # print(y_train[0])
        # print(x_train[0])
        y_pred = ml.predict(x_train)
        print('y_train ', y_train)
        print('y_pred ', y_pred.tolist())
Example #10
def test_warm_start_equivalence():
    # warm started classifier with 5+5 estimators should be equivalent to
    # one classifier with 10 estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    clf_ws = BaggingClassifier(n_estimators=5, warm_start=True, random_state=3141)
    clf_ws.fit(X_train, y_train)
    clf_ws.set_params(n_estimators=10)
    clf_ws.fit(X_train, y_train)
    y1 = clf_ws.predict(X_test)

    clf = BaggingClassifier(n_estimators=10, warm_start=False, random_state=3141)
    clf.fit(X_train, y_train)
    y2 = clf.predict(X_test)

    assert_array_almost_equal(y1, y2)
Example #11
def test_sparse_classification():
    # Check classification for various parameter settings on sparse input.

    class CustomSVC(SVC):
        """SVC variant that records the nature of the training set"""

        def fit(self, X, y):
            super(CustomSVC, self).fit(X, y)
            self.data_type_ = type(X)
            return self

    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)
    parameter_sets = [
        {"max_samples": 0.5,
         "max_features": 2,
         "bootstrap": True,
         "bootstrap_features": True},
        {"max_samples": 1.0,
         "max_features": 4,
         "bootstrap": True,
         "bootstrap_features": True},
        {"max_features": 2,
         "bootstrap": False,
         "bootstrap_features": True},
        {"max_samples": 0.5,
         "bootstrap": True,
         "bootstrap_features": False},
    ]

    for sparse_format in [csc_matrix, csr_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)
        for params in parameter_sets:

            # Trained on sparse format
            sparse_classifier = BaggingClassifier(
                base_estimator=CustomSVC(),
                random_state=1,
                **params
            ).fit(X_train_sparse, y_train)
            sparse_results = sparse_classifier.predict(X_test_sparse)

            # Trained on dense format
            dense_results = BaggingClassifier(
                base_estimator=CustomSVC(),
                random_state=1,
                **params
            ).fit(X_train, y_train).predict(X_test)

            sparse_type = type(X_train_sparse)
            types = [i.data_type_ for i in sparse_classifier.estimators_]

            assert_array_equal(sparse_results, dense_results)
            assert all([t == sparse_type for t in types])
Example #12
def adaboost_train(train_file,test_file):
    _,x,y = readFile(train_file)
    print('reading done.')
    ts = x.shape[0]
    id,x2 = readFile(test_file)
    
    print(x.shape)
    print(x2.shape)

    x = np.concatenate((x,x2))
    print('concatenate done.')
    from sklearn.preprocessing import scale
    x = scale(x,with_mean=False)
    print('scale done.')

    x2 = x[ts:]
    x = x[0:ts]

    from sklearn.feature_selection import SelectKBest, chi2
    selector = SelectKBest(chi2, k=50000).fit(x, y)
    x = selector.transform(x)
    x2 = selector.transform(x2)  # apply the same feature selection to the held-out test matrix


    from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in 0.20
    tmp_array = np.arange(x.shape[0])
    train_i, test_i = train_test_split(tmp_array, train_size = 0.8, random_state = 500)

    train_x = x[train_i]
    test_x = x[test_i]
    train_y = y[train_i]
    test_y = y[test_i]

    from sklearn.ensemble import BaggingClassifier
    bagging = BaggingClassifier(LR(penalty='l2',dual=True),n_estimators = 10,max_samples=0.6,max_features=0.6)
    bagging.fit(train_x,train_y)
    print('train done.')
    res = bagging.predict(train_x)
    print(res)
    from sklearn.metrics import roc_auc_score
    score = roc_auc_score(train_y, res)

    res = bagging.predict_proba(train_x)
    print(res)
    score = roc_auc_score(train_y, res[:, 1])
    print(score)
    print('-----------------------------------------')

    print(res[:, 1])
    res = bagging.predict_proba(test_x)
    score = roc_auc_score(test_y, res[:, 1])
    print(score)

    y=bagging.predict_proba(x2)
    output = pd.DataFrame( data={"id":id, "sentiment":y[:,1]} )
    output.to_csv( "/home/chuangxin/Bagging_result.csv", index=False, quoting=3 )

    return bagging
Example #13
class BaggingLearner(AbstractLearner):

    def __init__(self):
        self.learner = BaggingClassifier(KNeighborsClassifier())

    def _train(self, x_train, y_train):
        self.learner = self.learner.fit(x_train, y_train)

    def _predict(self, x):
        return self.learner.predict(x)

    def _predict_proba(self, x):
        return self.learner.predict_proba(x)
Example #14
class BaggingDecisionTrees(object):

    def __init__(self, n_estimators):
        self.classifier = BaggingClassifier(n_estimators=n_estimators)

    def fit(self, xs, ys):
        xs = xs.values
        ys = ys['y']
        self.classifier.fit(xs, ys)

    def predict(self, xs):
        xs = xs.values
        ys = self.classifier.predict(xs)
        return ys
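A short usage sketch under the class's apparent contract (xs exposes .values, ys is indexable by 'y'); the toy frame is an assumption:

# Illustrative data for BaggingDecisionTrees; the column names are hypothetical.
import pandas as pd

xs = pd.DataFrame({'f1': [0, 1, 0, 1, 0, 1], 'f2': [1, 1, 0, 0, 1, 0]})
ys = pd.DataFrame({'y': [0, 1, 0, 1, 0, 1]})
model = BaggingDecisionTrees(n_estimators=10)
model.fit(xs, ys)
print(model.predict(xs))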
Example #15
class SVMBag(DMCClassifier):
    classifier = None
    estimators = 10
    max_features = .5
    max_samples = .5

    def __init__(self, X: csr_matrix, Y: np.ndarray, tune_parameters=False):
        super().__init__(X, Y, tune_parameters)
        self.X, self.Y = X.toarray(), Y
        self.classifier = SVC(decision_function_shape='ovo')
        self.clf = BaggingClassifier(self.classifier, n_estimators=self.estimators, n_jobs=8,
                                     max_samples=self.max_samples, max_features=self.max_features)

    def predict(self, X: csr_matrix):
        X = X.toarray()
        return self.clf.predict(X)
class BaggingClassifier(Classifier):
    def __init__(self, matrixdatabase):
        self._matrix_database = matrixdatabase
        self._has_fit = False
        self._bc = BC(n_estimators=10)

    def learn(self, ingredients, cuisine):
        return

    def classify(self, ingredients):
        if not self._has_fit:
            matrix, classes = self._matrix_database.make_train_matrix()
            self._bc = self._bc.fit(matrix, classes)
            print "Fitting complete..."
            self._has_fit = True
        output = self._bc.predict(self._matrix_database.make_row_from_recipe(ingredients))
        return output[0]
Example #17
def make_bagging_test():
    
    from sklearn.ensemble import BaggingClassifier
    
    x, y, dates, movies = load_data()

    x = add_missed_value_indicator(x)

    test_x, train_x, test_y, train_y = create_test_train_set(x, y)

    clf = BaggingClassifier(n_estimators=100, max_features=1.0,
                            max_samples=0.8).fit(train_x, train_y.iloc[:, 0])

    pred = clf.predict(test_x)

    # note: this is the root-mean-squared error, not the MSE
    print("rmse:", np.sqrt(np.mean((pred - test_y.iloc[:, 0]) ** 2)))

    return pred
Example #18
def RandomNbSGD(data_train, labels_train, data_test, labels_test, show_infos, n_estima=10):
    from sklearn.ensemble import BaggingClassifier
    from sklearn.linear_model import SGDClassifier as SGD
    from sklearn import model_selection  # sklearn.cross_validation was removed in 0.20

    t1 = time()
    base_model = SGD(loss='modified_huber')
    # n_estimators = 100 for maximum performance
    clf = BaggingClassifier(base_estimator=base_model, n_estimators=n_estima)
    clf.fit(data_train, labels_train)
    labels_predicted = clf.predict(data_test)
    t2 = time() - t1

    if show_infos:
        print("-------------------Vectorizing and fitting the SGD with a modified_huber loss took %s" % t2, "sec---------------")
        print("classification report")
        print(classification_report(labels_test, labels_predicted))
        print("the accuracy score on the test data is :", accuracy_score(labels_test, labels_predicted))
        scores = model_selection.cross_val_score(clf, data_train, labels_train, cv=5)
        print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
Example #19
def svm_class_and_score(
    X_train, y_train, X_test, y_test, labels, search_type=RandomizedSearchCV,
    parameter_space={
        'kernel': ['linear', 'rbf', 'poly'], 'gamma': ['auto', 1e-3, 1e-4],
        'C': [0.01, .1, 1, 10, 100, 1000],
        'class_weight': [
            {0: 0.01}, {1: 1}, {1: 2}, {1: 10}, {1: 50}, 'balanced']},
        score='recall_weighted', iid=True, bagged=False, svm_results=True):
    """Build an SVM and return its scoring metrics
    """
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # Find the Hyperparameters
    clf = search_type(SVC(C=1), parameter_space, cv=10,
                      scoring=score, iid=iid)

    # Build the SVM
    clf.fit(X_train, y_train)
    print("Hyperparameters found:")
    print(clf.best_params_)

    # Make the predictions
    y_pred = clf.predict(X_test)
    print()
    print()
    print("Results for basic SVM")
    clf_scoring(y_test, y_pred, labels)

    if bagged is True:
        bgg = BaggingClassifier(base_estimator=clf)
        bgg.fit(X_train, y_train)
        y_pred = bgg.predict(X_test)
        print()
        print()
        print("Results for bagging:")
        clf_scoring(y_test, y_pred, labels)
        return clf, bgg
    else:
        return clf
Example #20
                                       ).fit(x_local_train, y_local_train)
            else:
                vprint(verbose, "[-] task not recognized")
                break
            vprint(verbose, "[+] Fitting success, time spent so far %5.2f sec"
                   % (time.time() - start))

            # Make predictions on local validation set
            if task == 'binary.classification':
                y_local_valid_pred = M.predict_proba(x_local_valid)[:, 1]
            elif task == 'multiclass.classification':
                y_local_valid_pred = M.predict_proba(x_local_valid).T
            elif task == 'multilabel.classification':
                y_local_valid_pred = np.array([Ms[i].predict_proba(x_local_valid)[:, 1] for i in range(K)]).T
            elif task == 'regression':
                y_local_valid_pred = M.predict(x_local_valid)

            # Local validation
            # x_local_valid, y_local_valid
            metric_type = D.info['metric']

            if 'f1_metric' == metric_type:
                metric = f1_metric(y_local_valid, y_local_valid_pred)
            elif 'r2_metric' == metric_type:
                metric = r2_metric(y_local_valid, y_local_valid_pred)
            elif 'bac_metric' == metric_type:
                metric = bac_metric(y_local_valid, y_local_valid_pred)
            elif 'auc_metric' == metric_type:
                metric = auc_metric(y_local_valid, y_local_valid_pred)
            elif 'pac_metric' == metric_type:
                metric = pac_metric(y_local_valid, y_local_valid_pred)
Example #21
print(
    classification_report(Test_Y,
                          ada_predictions_valid,
                          target_names=target_names))

plot_confusion_matrix(Test_Y,
                      ada_predictions_valid,
                      class_names,
                      title='Confusion matrix, without normalization')
plt.show()

#bagging
bag = BaggingClassifier(n_estimators=100,
                        base_estimator=clf,
                        max_samples=0.5,
                        max_features=1.0)
bag.fit(Train_X_Count, Train_Y)
bag_predictions_valid = bag.predict(Test_X_Count)

print("Bagging Score -> ", accuracy_score(bag_predictions_valid, Test_Y) * 100)
print(
    classification_report(Test_Y,
                          bag_predictions_valid,
                          target_names=target_names))

plot_confusion_matrix(Test_Y,
                      bag_predictions_valid,
                      class_names,
                      title='Confusion matrix, without normalization')
plt.show()
Example #22
f1 = 2*(precision*recall)/(precision + recall)

print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F1: " + str(f1))

tn, fp, fn, tp = confusion_matrix(y_test, final_pred).ravel()
print(classification_report(y_test, final_pred))

bg = BaggingClassifier(DecisionTreeClassifier(), max_samples=0.5, max_features=1.0, n_estimators=20)
bg.fit(x_train, y_train)
print('train bagging score: ', bg.score(x_train, y_train))
print('test bagging score: ', bg.score(x_test, y_test))
final_pred = bg.predict(x_test)

from sklearn.metrics import roc_curve, classification_report
from sklearn.metrics import auc

false_positive, true_positive, threshold = roc_curve(y_test, final_pred)
roc_auc = auc(false_positive, true_positive)

plt.title('Receiver Operating Characteristic')
plt.plot(false_positive, true_positive, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
Example #23
    def on_click(self):
        df = pd.read_csv('training-data/Well-all.csv')
        df.dropna(inplace=True)
        x = np.array(df.drop(['Lithology'], axis=1))
        y = np.array(df['Lithology'])
        x_train, x_test, y_train, y_test = model_selection.train_test_split(
            x, y, test_size=0.20)  #20% test data

        clf = BaggingClassifier(neighbors.KNeighborsClassifier(),
                                max_samples=0.5,
                                max_features=0.5)
        clf.fit(x_train, y_train)
        KNN = clf.score(x_test, y_test)

        self.textbox.setText(f"{KNN*100:1.4f} %")
        with open("output/KNNprediction.csv", "w") as f:
            f.write("Lithology,DEPTH,SGR,NPHI,RHOB,DT\n")
            df2 = pd.read_csv('10thwell/Well-10_log_data.csv')
            a = np.array(df2.drop(['LITHOLOGY'], axis=1))
            for sample in a:
                example_measures = np.array(
                    [sample[0], sample[1], sample[2], sample[3]])
                example_measures = example_measures.reshape(1, -1)
                prediction = clf.predict(example_measures)
                f.write(
                    f"{prediction[0]},{sample[4]},{sample[0]},{sample[1]},{sample[2]},{sample[3]}\n"
                )
        '''--------------------------------RF------------------------------------'''
        clf = RandomForestClassifier(n_estimators=100)
        clf.fit(x_train, y_train)
        RF = clf.score(x_test, y_test)
        self.textbox1.setText(f"{RF*100:1.4f} %")

        with open("output/RFprediction.csv", "w") as f:
            f.write("Lithology,DEPTH,SGR,NPHI,RHOB,DT\n")
            df2 = pd.read_csv('10thwell/Well-10_log_data.csv')
            a = np.array(df2.drop(['LITHOLOGY'], axis=1))
            for sample in a:
                example_measures = np.array(
                    [sample[0], sample[1], sample[2], sample[3]])
                example_measures = example_measures.reshape(1, -1)
                prediction = clf.predict(example_measures)
                f.write(
                    f"{prediction[0]},{sample[4]},{sample[0]},{sample[1]},{sample[2]},{sample[3]}\n"
                )
        '''----------------------------------GNB--------------------------------------------'''
        clf = GaussianNB()
        clf.fit(x_train, y_train)
        NB = clf.score(x_test, y_test)
        self.textbox2.setText(f"{NB*100:1.4f} %")
        with open("output/NBprediction.csv", "w") as f:
            f.write("Lithology,DEPTH,SGR,NPHI,RHOB,DT\n")
            df2 = pd.read_csv('10thwell/Well-10_log_data.csv')
            a = np.array(df2.drop(['LITHOLOGY'], axis=1))
            for sample in a:
                example_measures = np.array(
                    [sample[0], sample[1], sample[2], sample[3]])
                example_measures = example_measures.reshape(1, -1)
                prediction = clf.predict(example_measures)
                f.write(
                    f"{prediction[0]},{sample[4]},{sample[0]},{sample[1]},{sample[2]},{sample[3]}\n"
                )
        '''---------------------------------------DECISION TREE ---------------------------------------'''
        clf = DecisionTreeClassifier(criterion="gini",
                                     random_state=100,
                                     max_depth=3,
                                     min_samples_leaf=5)
        clf.fit(x_train, y_train)
        DT = clf.score(x_test, y_test)
        self.textbox3.setText(f"{DT*100:1.4f} %")
        with open("output/DecisionTreeprediction.csv", "w") as f:
            f.write("Lithology,DEPTH,SGR,NPHI,RHOB,DT\n")
            df2 = pd.read_csv('10thwell/Well-10_log_data.csv')
            a = np.array(df2.drop(['LITHOLOGY'], axis=1))
            for sample in a:
                example_measures = np.array(
                    [sample[0], sample[1], sample[2], sample[3]])
                example_measures = example_measures.reshape(1, -1)
                prediction = clf.predict(example_measures)
                f.write(
                    f"{prediction[0]},{sample[4]},{sample[0]},{sample[1]},{sample[2]},{sample[3]}\n"
                )
        '''------------------------------------------LR-----------------------------------------------'''
        reg = LogisticRegression()
        reg.fit(x_train, y_train)
        LR = reg.score(x_test, y_test)
        self.textbox4.setText(f"{LR*100:1.4f} %")
        with open("output/LRprediction.csv", "w") as f:
            f.write("Lithology,DEPTH,SGR,NPHI,RHOB,DT\n")
            df2 = pd.read_csv('10thwell/Well-10_log_data.csv')
            a = np.array(df2.drop(['LITHOLOGY'], axis=1))
            for sample in a:
                example_measures = np.array(
                    [sample[0], sample[1], sample[2], sample[3]])
                example_measures = example_measures.reshape(1, -1)
                prediction = reg.predict(example_measures)
                f.write(
                    f"{prediction[0]},{sample[4]},{sample[0]},{sample[1]},{sample[2]},{sample[3]}\n"
                )
        '''------------------------------------------SVM-----------------------------------------------'''
        clf = svm.SVC(gamma='auto')
        clf.fit(x_train, y_train)
        SM = clf.score(x_test, y_test)
        self.textbox5.setText(f"{SM*100:1.4f} %")
        with open("output/SVMprediction.csv", "w") as f:
            f.write("Lithology,DEPTH,SGR,NPHI,RHOB,DT\n")
            df2 = pd.read_csv('10thwell/Well-10_log_data.csv')
            a = np.array(df2.drop(['LITHOLOGY'], axis=1))
            for sample in a:
                example_measures = np.array(
                    [sample[0], sample[1], sample[2], sample[3]])
                example_measures = example_measures.reshape(1, -1)
                prediction = clf.predict(example_measures)
                f.write(
                    f"{prediction[0]},{sample[4]},{sample[0]},{sample[1]},{sample[2]},{sample[3]}\n"
                )
        '''-----------------------------------------Result--------------------------------------------------'''

        best = ""

        if KNN > RF and KNN > LR and KNN > SM and KNN > DT and KNN > NB:
            best = f"K- Nearest Neighbours with Accuracy : {KNN*100:1.4f} %"
        elif RF > KNN and RF > LR and RF > SM and RF > DT and RF > NB:
            best = f"Random Forest with Accuracy : {RF*100:1.4f} %"
        elif LR > RF and LR > KNN and LR > SM and LR > DT and LR > NB:
            best = f"Logistic Regression with Accuracy : {LR*100:1.4f} %"
        elif SM > RF and SM > KNN and SM > LR and SM > DT and SM > NB:
            best = f"Support Vector Machine with Accuracy : {SM*100:1.4f} %"
        elif DT > RF and DT > KNN and DT > SM and DT > LR and DT > NB:
            best = f"Decision Tree with Accuracy : {DT*100:1.4f} %"
        else:
            best = f"Naive Bayes with Accuracy : {NB*100:1.4f} %"

        self.textbox6.setText(best)
Example #24
# In[ ]:

from sklearn.metrics import accuracy_score
tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)
tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print("decision tree train/test accuracies {} / {}".format(
    tree_train, tree_test))

# In[ ]:

bag = bag.fit(X_train, y_train)
y_train_pred = bag.predict(X_train)  # fixed typo: was y_trian_pred, which left y_train_pred holding the tree's predictions
y_test_pred = bag.predict(X_test)
bag_train = accuracy_score(y_train, y_train_pred)
bag_test = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print(bag_test)
print("bagging train/test accuracies {} / {}".format(bag_train, bag_test))

# In[ ]:

X_min = X_train[:, 0].min() - 1
X_max = X_train[:, 0].max() + 1
y_min = X_train[:, 1].min() - 1
y_max = X_train[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(X_min, X_max, 0.1),
                     np.arange(y_min, y_max, 0.1))
f, axarr = plt.subplots(nrows=1,
Example #25
# training scores
print("Training scores...")
print(bdt.score(x_train, y_train))
print(bagged.score(x_train, y_train))
print(rfc.score(x_train, y_train))

# score the classifier on the test set
# print("Scoring...")
# print(bdt.score(x_test, y_test))
# print(bagged.score(x_test, y_test))
# print(rfc.score(x_test, y_test))

# print("Writing predictions...")
predictions1 = bdt.predict(x_test)
predictions2 = bagged.predict(x_test)
predictions3 = rfc.predict(x_test)
predictions = []

for i in range(100):
    if predictions1[i] + predictions2[i] + predictions3[i] > 1:
        predictions.append(1)
    else:
        predictions.append(0)

f = open('/Users/LeiyaMa/Desktop/binary/predictions.csv', 'w')
f.write('SID,Label\n')
for i in range(100):
    f.write('Sbj' + str(i + 1) + ',' + str(int(predictions[i])) + '\n')
f.close()

################################################################################
Example #26
y=data[:,-1]

acc=[]
kf=KFold(n_splits=10)
i=0
tp=[]
tn=[]
fp=[]
fn=[]
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf = linear_model.LogisticRegression()
    model = BaggingClassifier(base_estimator=clf,n_estimators=10,max_features=24)
    model.fit(X_train, y_train)
    y_pred= model.predict(X_test)
    acc=acc+[metrics.accuracy_score(y_test,y_pred)*100]
    # sklearn's confusion_matrix for labels (0, 1) is [[tn, fp], [fn, tp]]
    tn = tn + [metrics.confusion_matrix(y_test, y_pred)[0][0]]
    tp = tp + [metrics.confusion_matrix(y_test, y_pred)[1][1]]
    fp = fp + [metrics.confusion_matrix(y_test, y_pred)[0][1]]
    fn = fn + [metrics.confusion_matrix(y_test, y_pred)[1][0]]
    
acc=np.array(acc)
tp=np.array(tp)
tn=np.array(tn)
fp=np.array(fp)
fn=np.array(fn)
print("Accuracy",acc.mean())
print('tp',tp.mean())
print('tn',tn.mean())
print('fp',fp.mean())
Example #27
softXValScore

# =============================================================================
# =============================================================================
# =============================================================================
# # # Bagging 
# =============================================================================
# =============================================================================
# =============================================================================


bc = BaggingClassifier(base_estimator=vc, n_estimators=300, n_jobs=-1)

bc.fit(X_train, y_train)

y_pred = bc.predict(X_test)


calculateTestAccuracy(bc)
calculateTrainAccuracy(bc)
Example #28
def model(boosting_name, data_name, classifier_name, cv_name, mode):
    """
    模板方法
    :param boosting_name: 集成学习的方法
    :param data_name: 数据集名称
    :param classifier_name: 使用的基分类器
    :param cv_name: 交叉验证模式
    :param mode: 采样模式
    :return:
    """
    # load the data
    if data_name in fetch_datasets().keys():
        dataset = fetch_datasets()[data_name]
        X = dataset.data
        y = dataset.target
        print(Counter(y))
    else:
        # load a custom dataset
        df = pd.read_csv('../imbalanced_data/%s.csv' % data_name, header=None)
        array = df.values.astype(float)
        X = array[:, 0:array.shape[1] - 1]
        y = array[:, -1]
        print(Counter(y))
    base = None
    if classifier_name == 'CART':
        base = tree.DecisionTreeClassifier(max_depth=8,
                                           random_state=42,
                                           min_samples_split=10)
    elif classifier_name == 'svm':
        base = svm.SVC()
    else:
        pass
    # start timing
    start_time = time.time()
    cv = None
    if cv_name == 'StratifiedKFold':
        cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    elif cv_name == 'RepeatedStratifiedKFold':
        cv = RepeatedStratifiedKFold(n_repeats=10,
                                     n_splits=10,
                                     random_state=42)
    else:
        pass
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)  # interpolation grid (so every fold shares the same fpr/tpr points)
    aucs = []
    for train, test in cv.split(X, y):
        # preprocessing
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])
        classifier = None
        if boosting_name == 'CART':
            classifier = base
        elif boosting_name == 'Bagging':
            classifier = BaggingClassifier(base_estimator=base,
                                           n_estimators=40)
        elif boosting_name == 'BalancedBagging':
            classifier = BalancedBaggingClassifier(base_estimator=base,
                                                   ratio='auto',
                                                   replacement=True,
                                                   random_state=42)
        elif boosting_name == 'Adaboost':
            classifier = AdaBoostClassifier(base_estimator=base,
                                            n_estimators=40)
        elif boosting_name == 'Random Forest':
            classifier = RandomForestClassifier(max_depth=8,
                                                min_samples_split=10,
                                                n_estimators=40,
                                                random_state=42)
        elif boosting_name == 'EasyEnsemble':
            model_under(boosting_name, X_train_minmax, y[train], X_test_minmax,
                        y[test])
            continue
        elif boosting_name == 'BalanceCascade':
            model_under(boosting_name, X_train_minmax, y[train], X_test_minmax,
                        y[test])
            continue
        elif boosting_name == 'SMOTEBoost':
            classifier = SMOTEBoost(rate=100,
                                    n_estimators=40,
                                    weak_estimator=base,
                                    random_state=42,
                                    class_dist=False)
        elif boosting_name == 'RUSBoost':
            classifier = RUSBoost(ratio=50,
                                  n_estimators=40,
                                  weak_estimator=base,
                                  random_state=42,
                                  class_dist=False)
        else:
            pass
        classifier.fit(X_train_minmax, y[train])  # sampling (handled inside the resampling ensembles) + fit
        predict = classifier.predict(X_test_minmax)
        probability = classifier.predict_proba(X_test_minmax)[:, 1]
        # compute metrics
        precision = metrics.precision_score(y[test], predict)
        recall = metrics.recall_score(y[test], predict)
        if precision == 0:
            f1 = 0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        auc = metrics.roc_auc_score(y[test], probability)
        gmean = geometric_mean_score(y[test], predict)
        accuracy = metrics.accuracy_score(y[test], predict)
        # ------------- step 6: compute the points on each fold's ROC and PR curves -------------
        fpr, tpr, thresholds = metrics.roc_curve(y[test], probability)
        # interpolate the fold's tpr at the shared mean_fpr grid (np.interp replaces the removed scipy.interp)
        mean_tpr += np.interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0  # force the averaged curve to start at (0, 0)
        roc_auc = metrics.auc(fpr, tpr)
        aucs.append(roc_auc)
        # write2dic
        fill_dic('precision', boosting_name, precision)
        fill_dic('recall', boosting_name, recall)
        fill_dic('f1', boosting_name, f1)
        fill_dic('auc', boosting_name, auc)
        fill_dic('gmean', boosting_name, gmean)

    if boosting_name != 'EasyEnsemble' and boosting_name != 'BalanceCascade':
        # write fpr and tpr to a file
        # mean_tpr was accumulated at the 100 mean_fpr points; average it over the folds
        mean_tpr /= cv.get_n_splits()
        # the curve's last point is (1, 1)
        mean_tpr[-1] = 1.0
        # compute the mean AUC
        mean_auc = metrics.auc(mean_fpr, mean_tpr)

        # stack the averaged fpr and tpr and save them to a file
        filename = './ROC/{data_name}/{mode}/{base_classifier}/{sampler}.csv'. \
            format(data_name=data_name, mode=mode, base_classifier=classifier_name, sampler=boosting_name)
        # split off the directory part of the path
        file_dir = os.path.split(filename)[0]
        # create the (possibly multi-level) directory if it does not exist yet
        if not os.path.isdir(file_dir):
            os.makedirs(file_dir)
        # # then check whether the file itself exists and create it if not
        # if not os.path.exists(filename):
        #     os.system(r'touch %s' % filename)
        # combine the results
        curve = np.c_[mean_fpr, mean_tpr]
        np.savetxt(filename, curve, delimiter=',', fmt='%f')

    print('%s building id transforming took %fs!' %
          (boosting_name, time.time() - start_time))
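A hedged invocation sketch for the template method above; every argument value is illustrative ('ecoli' assumes one of imbalanced-learn's fetch_datasets() keys, and mode only affects the output path):

# Illustrative call of model(); all argument values are assumptions.
model(boosting_name='Bagging',
      data_name='ecoli',
      classifier_name='CART',
      cv_name='StratifiedKFold',
      mode='none')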
                 n_jobs=1,
                 random_state=None,
                 verbose=0):
    """
    algo = BaggingClassifier(base_estimator=dtree,
                             n_estimators=10,
                             oob_score=True)
    # train the model
    algo.fit(X_train, Y_train)
    # evaluate the model
    print('Accuracy on the training set: {}'.format(algo.score(X_train, Y_train)))
    print('Accuracy on the test set: {}'.format(algo.score(X_test, Y_test)))
    # inspect the API attributes
    X_test = [[6.9, 3.1, 5.1, 2.3], [6.1, 2.8, 4.0, 1.3], [5.2, 3.4, 1.4, 0.2]]
    print('Sample predictions:')
    print(algo.predict(X_test))
    print('Sample prediction probabilities:')
    print(algo.predict_proba(X_test))
    print('Log of the sample prediction probabilities:')
    print(algo.predict_log_proba(X_test))
    # print('All trained sub-models: {}'.format(algo.estimators_))

    for index, estimator in enumerate(algo.estimators_):
        print('Predictions of sub-model #{}: {}'.format(index + 1, estimator.predict(X_test)))
    # the data subsets drawn with replacement for each sub-model
    print('Training data of each sub-model:\n{}'.format(algo.estimators_samples_))
    print('Feature attributes used by each sub-model:\n{}'.format(algo.estimators_features_))
    print('Out-of-bag accuracy of the Bagging model:\n{}'.format(algo.oob_score_))

    # visualize all the sub-models
    for index, estimator in enumerate(algo.estimators_):
def machineRun(balancing):
    texts1, labels, pmids1 = _load_data(
        '../output_data/proton-beam-merged.csv')
    classifiers = {}
    labels = []
    texts = []
    pmids = []

    getcrowdvotequestion = crowd_main(
        0)  # change the label with first question label!
    for item in getcrowdvotequestion.keys():
        pmids.append(item)
    for item in pmids:
        labels.append(getcrowdvotequestion[item])
    for item in pmids:
        index = pmids1.index(item)
        texts.append(texts1[index])

    if (balancing > 0):
        Outscope = [i for i, j in list(enumerate(labels))
                    if j == 0]  # get index
        Inscope = [i for i, j in list(enumerate(labels))
                   if j == 1]  # get index
        sample = len(Inscope) * balancing
        candid = random.sample(Outscope, sample)  # random sample from out
        texts = [j for i, j in list(enumerate(texts)) if i in Inscope
                 ] + [j for i, j in list(enumerate(texts)) if i in candid]
        labels = [j for i, j in list(enumerate(labels)) if i in Inscope
                  ] + [j for i, j in list(enumerate(labels)) if i in candid]
        pmids = [j for i, j in list(enumerate(pmids)) if i in Inscope
                 ] + [j for i, j in list(enumerate(pmids)) if i in candid]

    vectorizer = TfidfVectorizer(stop_words="english",
                                 min_df=3,
                                 max_features=50000,
                                 norm='l2')
    X = vectorizer.fit_transform(texts)
    X = X.toarray()
    y = np.array(labels)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        random_state=42,
                                                        stratify=y,
                                                        test_size=0.5)
    result = []

    # Machine 1 DummyClassifier
    print('DummyClassifier_stratified')
    Random_classifier = DummyClassifier(strategy='stratified',
                                        random_state=42).fit(X_train, y_train)
    y_pred = Random_classifier.predict(X_test)
    classifiers['0'] = y_pred
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    accuracy_train = Random_classifier.score(X_train, y_train)
    accuracy_test = Random_classifier.score(X_test, y_test)
    f1score = metrics.f1_score(y_test, y_pred, average='macro')
    roc = metrics.roc_auc_score(y_test, y_pred)
    result.append([
        'DumClassifierStratified', accuracy_train, accuracy_test, f1score, roc,
        precision, recall
    ])
    print('accuracy_train:' + str(accuracy_train))
    print('accuracy_test:' + str(accuracy_test))
    print('f1score:' + str(f1score))
    print('roc_auc_score:' + str(roc))
    print('recall:' + str(recall))
    print('precision:' + str(precision))

    # Machine 1 DummyClassifier (most_frequent)
    print('DummyClassifier_most_frequent')
    Random1_classifier = DummyClassifier(strategy='most_frequent',
                                         random_state=42).fit(
                                             X_train, y_train)
    y_pred = Random1_classifier.predict(X_test)
    classifiers['1'] = y_pred
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    accuracy_train = Random1_classifier.score(X_train, y_train)
    accuracy_test = Random1_classifier.score(X_test, y_test)
    f1score = metrics.f1_score(y_test, y_pred, average='macro')
    roc = metrics.roc_auc_score(y_test, y_pred)
    result.append([
        'DumClassifierMostfrequent', accuracy_train, accuracy_test, f1score,
        roc, precision, recall
    ])
    print('accuracy_train:' + str(accuracy_train))
    print('accuracy_test:' + str(accuracy_test))
    print('f1score:' + str(f1score))
    print('roc_auc_score:' + str(roc))
    print('recall:' + str(recall))
    print('precision:' + str(precision))

    # Machine 1 NaiveBayes
    print('Machine 1 MultinomialNaiveBayes')
    gs_NaiveBase_clf = MultinomialNB().fit(X_train, y_train)
    y_pred = gs_NaiveBase_clf.predict(X_test)
    classifiers['2'] = y_pred
    accuracy_train = gs_NaiveBase_clf.score(X_train, y_train)
    accuracy_test = gs_NaiveBase_clf.score(X_test, y_test)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1score = metrics.f1_score(y_test, y_pred, average='macro')
    roc = metrics.roc_auc_score(y_test, y_pred)
    result.append([
        'MultinomialNB', accuracy_train, accuracy_test, f1score, roc,
        precision, recall
    ])
    print('accuracy_train:' + str(accuracy_train))
    print('accuracy_test:' + str(accuracy_test))
    print('f1score:' + str(f1score))
    print('roc_auc_score:' + str(roc))
    print('recall:' + str(recall))
    print('precision:' + str(precision))

    # Machine 1 BernoulliNB
    print('Machine 1 BernoulliNB')
    gs_NaiveBase_clf = BernoulliNB().fit(X_train, y_train)
    y_pred = gs_NaiveBase_clf.predict(X_test)
    classifiers['3'] = y_pred
    accuracy_train = gs_NaiveBase_clf.score(X_train, y_train)
    accuracy_test = gs_NaiveBase_clf.score(X_test, y_test)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1score = metrics.f1_score(y_test, y_pred, average='macro')
    roc = metrics.roc_auc_score(y_test, y_pred)
    result.append([
        'BernoulliNB', accuracy_train, accuracy_test, f1score, roc, precision,
        recall
    ])
    print('accuracy_train:' + str(accuracy_train))
    print('accuracy_test:' + str(accuracy_test))
    print('f1score:' + str(f1score))
    print('roc_auc_score:' + str(roc))
    print('recall:' + str(recall))
    print('precision:' + str(precision))

    # Machine 2 SGD Norm2
    print('Machine 2 SGD')
    params_d = {"alpha": 10.0**-np.arange(1, 7)}
    sgd = SGDClassifier(class_weight={1: 2}, random_state=42, penalty='l2')
    clfsgd = GridSearchCV(sgd, params_d, scoring='roc_auc', cv=3)
    clfsgd = clfsgd.fit(X_train, y_train)
    y_pred = clfsgd.predict(X_test)
    classifiers['4'] = y_pred
    accuracy_train = clfsgd.score(X_train, y_train)
    accuracy_test = clfsgd.score(X_test, y_test)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1score = metrics.f1_score(y_test, y_pred, average='macro')
    roc = metrics.roc_auc_score(y_test, y_pred)
    result.append([
        'SGDl2{1:2}', accuracy_train, accuracy_test, f1score, roc, precision,
        recall
    ])
    print('accuracy_train:' + str(accuracy_train))
    print('accuracy_test:' + str(accuracy_test))
    print('f1score:' + str(f1score))
    print('roc_auc_score:' + str(roc))
    print('recall:' + str(recall))
    print('precision:' + str(precision))

    # Machine 2 SGD Norm1
    print('Machine 3 SGD')
    sgd = SGDClassifier(class_weight={1: 1}, random_state=42, penalty='l1')
    clfsgd = GridSearchCV(sgd, params_d, scoring='roc_auc', cv=3)
    clfsgd = clfsgd.fit(X_train, y_train)
    y_pred = clfsgd.predict(X_test)
    classifiers['5'] = y_pred
    accuracy_train = clfsgd.score(X_train, y_train)
    accuracy_test = clfsgd.score(X_test, y_test)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1score = metrics.f1_score(y_test, y_pred, average='macro')
    roc = metrics.roc_auc_score(y_test, y_pred)
    result.append([
        'SGDl1{1:1}', accuracy_train, accuracy_test, f1score, roc, precision,
        recall
    ])
    print('accuracy_train:' + str(accuracy_train))
    print('accuracy_test:' + str(accuracy_test))
    print('f1score:' + str(f1score))
    print('roc_auc_score:' + str(roc))
    print('recall:' + str(recall))
    print('precision:' + str(precision))

    # Machine 3 RandomForest

    print('Machine 4 RandomForest')
    RF_clf = RandomForestClassifier(class_weight={1: 5}, random_state=42)
    parameters_RF = {
        'n_estimators': [300],  # 300 is enough
        'max_depth': [20]  # this is good fit
    }

    gs_RF_clf = GridSearchCV(RF_clf,
                             parameters_RF,
                             n_jobs=-1,
                             scoring='roc_auc',
                             cv=3)
    gs_RF_clf = gs_RF_clf.fit(X_train, y_train)
    print('RF fitted!')
    y_pred = gs_RF_clf.predict(X_test)
    classifiers['6'] = y_pred
    accuracy_train = gs_RF_clf.score(X_train, y_train)
    accuracy_test = gs_RF_clf.score(X_test, y_test)
    f1score = metrics.f1_score(y_test, y_pred, average='macro')
    roc = metrics.roc_auc_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    result.append([
        'RF{1:5}', accuracy_train, accuracy_test, f1score, roc, precision,
        recall
    ])
    print('accuracy_train:' + str(accuracy_train))
    print('accuracy_test:' + str(accuracy_test))
    print('f1score:' + str(f1score))
    print('roc_auc_score:' + str(roc))
    print('recall:' + str(recall))
    print('precision:' + str(precision))
    #
    # Machine 4 KNN
    print('Machine 5 KNN')
    knn_clf = KNeighborsClassifier(weights='uniform')

    parameters_knn = {'n_neighbors': [2, 3, 4]}
    gs_knn_clf = GridSearchCV(knn_clf,
                              parameters_knn,
                              scoring='roc_auc',
                              n_jobs=-1,
                              cv=3)
    gs_knn_clf = gs_knn_clf.fit(X_train, y_train)
    y_pred = gs_knn_clf.predict(X_test)
    accuracy_train = gs_knn_clf.score(X_train, y_train)
    accuracy_test = gs_knn_clf.score(X_test, y_test)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1score = metrics.f1_score(y_test, y_pred)
    roc = metrics.roc_auc_score(y_test, y_pred)
    result.append([
        'KNN', accuracy_train, accuracy_test, f1score, roc, precision, recall
    ])
    print('accuracy_train:' + str(accuracy_train))
    print('accuracy_test:' + str(accuracy_test))
    print('f1score:' + str(f1score))
    print('roc_auc_score:' + str(roc))
    print('recall:' + str(recall))
    print('precision:' + str(precision))
    # #
    # # Machine 4 GB
    print('Machine 6 GB')
    GB_clf = GradientBoostingClassifier(random_state=42, max_features=0.1)

    parameters_GB = {'n_estimators': [200], 'learning_rate': [0.1]}

    gb_clf = GridSearchCV(GB_clf,
                          parameters_GB,
                          scoring='roc_auc',
                          n_jobs=-1,
                          cv=3)
    gb_clf = gb_clf.fit(X_train, y_train)
    print('GB fitted!')
    y_pred = gb_clf.predict(X_test)
    classifiers['7'] = y_pred
    accuracy_train = gb_clf.score(X_train, y_train)
    accuracy_test = gb_clf.score(X_test, y_test)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1score = metrics.f1_score(y_test, y_pred, average='macro')
    roc = metrics.roc_auc_score(y_test, y_pred)
    result.append(
        ['GB', accuracy_train, accuracy_test, f1score, roc, precision, recall])
    print('accuracy_train:' + str(accuracy_train))
    print('accuracy_test:' + str(accuracy_test))
    print('f1score:' + str(f1score))
    print('roc_auc_score:' + str(roc))
    print('recall:' + str(recall))
    print('precision:' + str(precision))

    # Machine 6 baggingWithSVC

    print('Machine 6 baggingWithSVC')
    n_estimators = 10
    SVC_clf = BaggingClassifier(base_estimator=SVC(kernel='linear',
                                                   class_weight={1: 10}),
                                n_estimators=n_estimators,
                                max_samples=1.0 / n_estimators,
                                random_state=42,
                                max_features=0.3)

    SVC_clf = SVC_clf.fit(X_train, y_train)
    print('baggingWithSVC fitted!')

    y_pred = SVC_clf.predict(X_test)
    classifiers['8'] = y_pred
    accuracy_train = SVC_clf.score(X_train, y_train)
    accuracy_test = SVC_clf.score(X_test, y_test)
    f1score = metrics.f1_score(y_test, y_pred, average='macro')
    roc = metrics.roc_auc_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    result.append([
        'SVCBagging{1:10}', accuracy_train, accuracy_test, f1score, roc,
        precision, recall
    ])
    print('accuracy_train:' + str(accuracy_train))
    print('accuracy_test:' + str(accuracy_test))
    print('f1score:' + str(f1score))
    print('roc_auc_score:' + str(roc))
    print('recall:' + str(recall))
    print('precision:' + str(precision))
    print('*******************************')

    return result, classifiers, y_test
                            n_estimators=50,
                            max_samples=1.0,
                            max_features=1.0,
                            bootstrap=True,
                            bootstrap_features=False,
                            n_jobs=-1,
                            random_state=42)

# In[ ]:

bag_clf.fit(X_train, y_train.ravel())

# In[ ]:

print_score(bag_clf, X_train, y_train, X_test, y_test, train=True)
print_score(bag_clf, X_train, y_train, X_test, y_test, train=False)

# In[ ]:

Y_pred = bag_clf.predict(test_df.drop('PassengerId', axis=1))

Y_pred

submission = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Survived": Y_pred
})
submission.to_csv('submissions_bag_last.csv', index=False)

# In[ ]:
Example #32
hw5_run_test.py

This program runs the identified best classifier on the test dataset
Bagging w/ Decision Trees (31 estimators)

@author: HyunJae Pi, [email protected]
"""

import numpy as np
import pandas as pd
from sklearn import preprocessing  #from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

# training data
df0 = pd.read_csv("./training2b.csv", header=None)
n_features = df0.shape[1] - 1
X_training = preprocessing.scale(df0.loc[:, 0:n_features - 1].values)
y_training = df0.loc[:, n_features].values

# test data
df1 = pd.read_csv("./test2b.csv", header=None)
X_test = preprocessing.scale(df1.loc[:, 0:n_features - 1].values)

clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                        n_estimators=31).fit(X_training, y_training)
y_test = clf.predict(X_test).astype(int)

# save
np.savetxt('./hw5_prediction.txt', y_test, fmt='%d')
Example #33
X, y = datasets.fetch_covtype(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Scaling data
scaler = StandardScaler() 
scaler.fit(X_train)  
X_train = scaler.transform(X_train)  
X_test = scaler.transform(X_test)  

n_neighbors = 5
modelo = KNeighborsClassifier(n_neighbors)
modelo.fit(X_train,y_train)
print(modelo.predict(X_test))

# Bagging 

modeloB = BaggingClassifier(KNeighborsClassifier(n_neighbors), max_samples=0.3, max_features=0.3)
modeloB.fit(X_train,y_train)
print(modeloB.predict(X_test))

# Bagging 2
modeloB2 = BaggingClassifier(n_estimators=10, max_samples=0.3, max_features=0.3)
modeloB2.fit(X_train,y_train)
print(modeloB2.predict(X_test))

print(y_test)

print(modelo.score(X_test,y_test))
print(modeloB.score(X_test,y_test))
print(modeloB2.score(X_test,y_test))
Example #34
print(n_correct / len(y_pred))

# In[72]:

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# In[73]:

bag_clf = BaggingClassifier(DecisionTreeClassifier(),
                            n_estimators=500,
                            max_samples=100,
                            bootstrap=True,
                            n_jobs=-1)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_train)

# In[74]:

bag_clf = BaggingClassifier(DecisionTreeClassifier(),
                            n_estimators=500,
                            oob_score=True,
                            bootstrap=True,
                            n_jobs=-1)

# In[75]:

bag_clf.fit(X_train, y_train)

# In[76]:
Example #35
			results = defaultdict(float)  # the vote tally must persist across the pool, not reset per classifier
			for clf in best_pool:
				y = clf.predict(x_test.reshape(1, -1))
				results[y[0]] += 1

			y = max(results.items(), key=operator.itemgetter(1))[0]
			y_pred.append(y)

		return y_pred

if __name__ == '__main__':
	X, y = make_classification(n_samples=1000, n_features=20, class_sep=0.7, flip_y=0.03)
	x_test, y_test = X[0], y[0]
	X, y = X[1:], y[1:]


	X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
	X_train, X_test, y_train, y_test = train_test_split(X_train, y_train \
	                                   , test_size=0.2)
	
	bag = BaggingClassifier(n_estimators=200)
	bag.fit(X_train, y_train)
	knora = KNORA(ensemble_clf=bag, knn=8, X_val=X_val, y_val=y_val)
	meta = IbepMlc(ensemble_clf=bag, knn=8, X_val=X_val, y_val=y_val)

	print(accuracy_score(bag.predict(X_test), y_test))
	print(accuracy_score(knora.predict(X_test), y_test))
	print(accuracy_score(meta.predict(X_test), y_test))


Example #36
                       n_estimators=1500)
results = model_selection.cross_val_score(bg, X_final_train, y, cv=5)
print(results.mean())
# print(bg.score(X_final_train, y))
# Timer stops
stop = timeit.default_timer()
print("Time Execution: {}".format(stop - start))
#------------------------------End of Bagging classifier---------------------

#-----------------------------FINAL TEST PURPOSE ONLY-----------------------
X_final_train_cv = stemmed_cv.fit_transform(X)
X_final_train = tfidf_vectorizer.fit_transform(X_final_train_cv)

df_final = pd.read_csv("reddit_test.csv")
X_final_test = df_final["comments"].values
X_final_test_cv = stemmed_cv.transform(X_final_test)
X_final_test = tfidf_vectorizer.transform(X_final_test_cv)

# mnb.fit(X_final_train, y)
# y_final = mnb.predict(X_final_test)
bg.fit(X_final_train, y)
y_final = bg.predict(X_final_test)

predict_arr = np.c_[df_final["id"], y_final]
predict_dataset = pd.DataFrame({
    "Id": predict_arr[:, 0],
    "Category": predict_arr[:, 1]
})
predict_dataset.to_csv("out_mnb2.csv", index=False)
#--------------------------END OF FINAL TEST-----------------------------------
Example #37
print(roc_score)

# Code ends here

# --------------
# Import Bagging Classifier
from sklearn.ensemble import BaggingClassifier

# Code starts here
bagging_clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                n_estimators=100,
                                max_samples=100,
                                random_state=0)

bagging_clf.fit(X_train, y_train)
y_pred = bagging_clf.predict(X_test)

score_bagging = bagging_clf.score(X_test, y_test)
print(score_bagging)

roc_score = roc_auc_score(y_test, y_pred)
print(roc_score)
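# roc_auc_score on hard 0/1 predictions understates the AUC; a
# probability-based variant would look like this (a sketch, assuming
# a binary target):
y_pred_proba = bagging_clf.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, y_pred_proba))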

# Code ends here

# --------------
# Import libraries
from sklearn.ensemble import VotingClassifier

# Various models
clf_1 = LogisticRegression()
Example #38
def test():
    traintokenCnt = [20000000]
    testtokenCnt = [500000,1000000,5000000,10000000,20000000]
    trainFeatDic = dict()
    c7 = set(['bg','ca','de','el','hu','tr','hi'])
    for totaltoken  in traintokenCnt:
        c7Feature = []
        c7Label = []
        c7Word = []
        trainFeatDic[totaltoken] = dict()
        for trainlangKey in __W2cTrainCorpusDic:
            trainfilepath = '../feature/train/' + trainlangKey + '/' + str(totaltoken) + '.txt'
            trainFeatDic[totaltoken][trainlangKey] = dict()
            trainFeatDic[totaltoken][trainlangKey]['feature'] = []
            trainFeatDic[totaltoken][trainlangKey]['wordform'] = []
            trainFeatDic[totaltoken][trainlangKey]['label'] = []
            for line in open(trainfilepath):
                feat = json.loads(line.strip())
                trainFeatDic[totaltoken][trainlangKey]['feature'].append(feat['feature'])
                trainFeatDic[totaltoken][trainlangKey]['wordform'].append(feat['wordform'])
                trainFeatDic[totaltoken][trainlangKey]['label'].append(feat['label'])
                if trainlangKey in c7:
                    c7Feature.append(feat['feature'])
                    c7Label.append(feat['label'])
                    c7Word.append(feat['wordform'])
        trainFeatDic[totaltoken]['c7'] = dict()
        trainFeatDic[totaltoken]['c7']['feature'] = c7Feature
        trainFeatDic[totaltoken]['c7']['wordform'] = c7Word
        trainFeatDic[totaltoken]['c7']['label'] = c7Label

    testFeatDic = dict()
    for totaltoken in testtokenCnt:
        testFeatDic[totaltoken] = dict()
        for testlangKey in __W2cTestCorpusDic:
            testfilepath = '../feature/test/' + testlangKey + '/' + str(totaltoken) + '.txt'
            testFeatDic[totaltoken][testlangKey] = dict()
            testFeatDic[totaltoken][testlangKey]['feature'] = []
            testFeatDic[totaltoken][testlangKey]['wordform'] = []
            testFeatDic[totaltoken][testlangKey]['label'] = []
            for line in open(testfilepath):
                feat = json.loads(line.strip())
                testFeatDic[totaltoken][testlangKey]['feature'].append(feat['feature'])
                testFeatDic[totaltoken][testlangKey]['wordform'].append(feat['wordform'])
                testFeatDic[totaltoken][testlangKey]['label'].append(feat['label'])

    for traintotaltoken in trainFeatDic:
        fb = open(str(traintotaltoken) + 'result','w')
        for trainlangKey in trainFeatDic[traintotaltoken]:
            correctDic = dict()
            correctDic[trainlangKey] = dict()
            trainX = np.array(trainFeatDic[traintotaltoken][trainlangKey]['feature'])[:,:17]
            scaler = preprocessing.StandardScaler().fit(trainX)
            trainX_scaled = scaler.transform(np.array(trainX))
            trainY = trainFeatDic[traintotaltoken][trainlangKey]['label']

            #clf = BaggingClassifier(KNeighborsClassifier(), max_features=9,\
            #    bootstrap_features=True)
            clf = BaggingClassifier(svm.SVC(),max_features = 12, bootstrap_features=True)
            clf = clf.fit(trainX_scaled, trainY)
            for testtotaltoken in testFeatDic:
                predictYDic = dict()
                for testlangKey in testFeatDic[testtotaltoken]:
                    testX = np.array(testFeatDic[testtotaltoken][testlangKey]['feature'])[:,:17]
                    testY = testFeatDic[testtotaltoken][testlangKey]['label']
                    testX_scaled = scaler.transform(np.array(testX))

                    predictY = clf.predict(testX_scaled)
                    correctCnt = 0
                    for index,labelY in enumerate(predictY):
                        if testY[index] == labelY:
                            correctCnt += 1
                    predictfilename = './predictlabel/' + trainlangKey + '_' + testlangKey + '_'\
                            + str(traintotaltoken) + '_' + str(testtotaltoken) +'.txt'
                    tmpfb = open(predictfilename,'w')
                    for index,labelY in enumerate(predictY):
                        tmpfb.write(testFeatDic[testtotaltoken][testlangKey]['wordform'][index]\
                                + '\t' + testY[index] + '\t' + '\t' + labelY + '\n')
                        tmpfb.flush()
                    tmpfb.write('\n\nout of vocabulary word prediction\n')

                    oovfile = '../feature/test/' + testlangKey + '/' + str(testtotaltoken) +'_oov.txt'
                    oovcorrectcnt = 0
                    oovtotalcnt = 0
                    for line in open(oovfile):
                        oovtotalcnt += 1
                        word,pos = line.strip().split('\t')[0],line.strip().split('\t')[1]
                        if __digitPatt.match(word) and pos == 'NUM':
                            oovcorrectcnt += 1
                            tmpfb.write(word + '\t' + pos + '\t' + 'NUM' + '\n')
                            tmpfb.flush()
                            continue
                        elif pos == 'NOUN':
                            oovcorrectcnt += 1
                        tmpfb.write(word + '\t' + pos + '\t' + 'NOUN' + '\n')
                    tmpfb.close()
                    predictYDic[testlangKey] = (correctCnt + oovcorrectcnt) / ((len(testY) + oovtotalcnt) * 1.0)


                correctDic[trainlangKey][testtotaltoken] = predictYDic

            fb.write(json.dumps(correctDic,ensure_ascii=False)+'\n')
            fb.flush()
        fb.close()
Example #39
# fit one LabelEncoder per categorical column so that the test set can be
# encoded consistently with the training set
encoders = {}
for i in range(14):
    encoders[i] = preprocessing.LabelEncoder()
    df.iloc[:, i] = encoders[i].fit_transform(df.iloc[:, i])



Y = df.iloc[:,-1]
X = df.iloc[:,0:14]

bagger = BaggingClassifier(n_estimators=100, bootstrap_features=True)
bagger = bagger.fit(X,Y)

testDF = pd.read_csv(test)


train_predictions = bagger.predict(X)
print(accuracy_score(Y, train_predictions))  # accuracy on the training data


for i in range(1, 15):
    # test column i lines up with training column i-1 (the test file has a
    # leading ID column); reuse the fitted encoder instead of refitting
    testDF.iloc[:, i] = encoders[i - 1].transform(testDF.iloc[:, i])

predictions = bagger.predict(testDF.iloc[:,1:15])

predictionDF = pd.DataFrame(predictions)
predictionDF["ID"] = testDF["ID"].values
predictionDF.to_csv('Predictions_bagging.csv', index=False, header=['Prediction','ID'])
Example #40
    bagging3.fit(df_input3_data, numpy.ravel(df_input3_target))
    pickle.dump(bagging3, open('model_bagging_t3.pkl', 'wb'))

    bagging4 = BaggingClassifier(KNeighborsClassifier(n_neighbors=2),max_samples=0.3, max_features=0.1)
    bagging4.fit(df_input4_data,numpy.ravel(df_input4_target))
    pickle.dump(bagging4, open('model_bagging_t4.pkl', 'wb'))

    bagging5 = BaggingClassifier(KNeighborsClassifier(n_neighbors=2),max_samples=0.3, max_features=0.1)
    bagging5.fit(df_input5_data,numpy.ravel(df_input5_target))
    pickle.dump(bagging5, open('model_bagging_t5.pkl', 'wb'))

    # bagging = KMeans(n_clusters=5, random_state=RandomState(9)
    # bagging.fit(df_input_data,numpy.ravel(df_input_target))
    # pickle.dump(bagging, open('model_bagging_train.pkl', 'wb'))

    predicted1 = bagging1.predict(df_input1_data)
    predicted2 = bagging2.predict(df_input2_data)
    predicted3 = bagging3.predict(df_input3_data)
    predicted4 = bagging4.predict(df_input4_data)
    predicted5 = bagging5.predict(df_input5_data)
    # predicted = bagging.predict(df_input_data)

    matches1 = (predicted1 == [item for sublist in df_input1_target for item in sublist])
    matches2 = (predicted2 == [item for sublist in df_input2_target for item in sublist])
    matches3 = (predicted3 == [item for sublist in df_input3_target for item in sublist])
    matches4 = (predicted4 == [item for sublist in df_input4_target for item in sublist])
    matches5 = (predicted5 == [item for sublist in df_input5_target for item in sublist])
    # matches = (predicted == [item for sublist in df_input_target for item in sublist])

    print('using excess rock & uncats removed')
    print("Accuracy of T1 : ", (matches1.sum() / float(len(matches1))))
Example #41
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("决策树.pdf")  # "决策树" means "decision tree"

# model=Sequential()
# model.add(Dense(2*(X_train.shape[1]),input_shape=((X_train.shape[1]),)))
# model.add(Activation('relu'))
# model.add(Dense(1))
# model.add((Dropout(0.3)))
# model.compile(loss='mean_squared_error', optimizer='adam')
# model.summary()
#
# model.fit(X_train,y_train,epochs=10000,batch_size=50 )
# svmmodel=SVC()
# svmmodel.fit(X_train,y_test)

t=bagging_clf.predict(X_test)
joblib.dump(bagging_clf,'clf.model')

z=treemodel.predict(X_test)
joblib.dump(treemodel,'treemodel.model')

w=randomtree.predict(X_test)
joblib.dump(randomtree,'randomtree.model')

s=sgd.predict(X_test)
joblib.dump(sgd,'sgd.model')

# m=model.predict(X_test)
# model.save('NNmodel.h5')

rate1=0
Example #42
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

seed = 7
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
cart = DecisionTreeClassifier()
num_trees = 100
model = BaggingClassifier(base_estimator=cart,
                          n_estimators=num_trees,
                          random_state=seed).fit(X_train_tfidf, y)
results = model_selection.cross_val_score(model, X_train_tfidf, y, cv=kfold)
print(results.mean() * 100)

url1 = (
    "C:\\Users\\sidharth.m\\Desktop\\Project_sid_35352\\outputkrithika.csv")
documents1 = pd.read_csv(url1)
array1 = documents1.values
#choose tweet column
#x1 = array1[0:, 2]
x2 = (documents1['tweet']).astype(str)

X_test = count_vect.transform(x2)
#print(X_test.shape)

test = tfidf_transformer.transform(X_test)
#print(test.shape)

predicted = model.predict(test)
print(predicted)
Example #43
		return cc

if __name__ == '__main__':
    X, y = make_classification(n_samples=300)
    x_test, y_test = X[0], y[0]
    X, y = X[1:], y[1:]


    X_train, X_val, Y_train, Y_val = train_test_split(X, y, test_size=0.2)
    X_train, X_test, Y_train, Y_test = train_test_split(X_train, Y_train, test_size=0.2)


    bag = BaggingClassifier(n_estimators=30)
    bag.fit(X_train, Y_train)
    Y_bag = bag.predict(X_test)

    desCV = DesCV(ensemble_clf=bag, X_val=X_val, y_val=Y_val)
    #y_pred = desCV.predict_pattern(x_test)

    #print y_pred
    #print y_test
    Y_pred = desCV.predict(X_test)
    print(Y_pred)
    print(Y_test)

    print(accuracy_score(Y_pred, Y_test))
    print(accuracy_score(Y_bag, Y_test))


Example #44
# --
# Fit ASE

A = nx.to_numpy_array(G)
X_hat = AdjacencySpectralEmbed(algorithm='full').fit_transform(A)
X_hat = np.column_stack(X_hat)

# --
# Train classifiers

scores = np.zeros((n_class, args.n_iters))

for label_idx, label in enumerate(tqdm(ulabels)):
    for iter_idx in range(args.n_iters):

        X_train, X_test, y_train, y_test = train_test_split(
            X_hat,
            y == label,
            train_size=args.p_train,
            test_size=1 - args.p_train)

        model = BaggingClassifier(DecisionTreeClassifier())
        model = model.fit(X_train, y_train)
        y_hat = model.predict(X_test)

        scores[label_idx, iter_idx] = metrics.f1_score(y_test,
                                                       y_hat,
                                                       average='binary')

print('f1.mean', scores.mean(axis=-1))
print('f1.std', scores.std(axis=-1))
Example #45
###############################################################################
# Classification using bagging classifier with and without sampling
###############################################################################
# Instead of using a single tree, we will check whether an ensemble of decision
# trees can alleviate the issue induced by class imbalance. First, we will use
# a bagging classifier and its counterpart, which internally uses random
# under-sampling to balance each bootstrap sample.

bagging = BaggingClassifier(n_estimators=50, random_state=0, n_jobs=-1)
balanced_bagging = BalancedBaggingClassifier(n_estimators=50, random_state=0,
                                             n_jobs=-1)

bagging.fit(X_train, y_train)
balanced_bagging.fit(X_train, y_train)

y_pred_bc = bagging.predict(X_test)
y_pred_bbc = balanced_bagging.predict(X_test)

###############################################################################
# Balancing each bootstrap sample significantly increases the balanced
# accuracy and the geometric mean.

print('Bagging classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_bc),
              geometric_mean_score(y_test, y_pred_bc)))
cm_bagging = confusion_matrix(y_test, y_pred_bc)
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_bagging, classes=np.unique(satimage.target), ax=ax[0],
                      title='Bagging')
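# The snippet is truncated here; the matching report for the balanced bagging
# classifier would presumably mirror the block above, e.g.:
print('Balanced Bagging classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_bbc),
              geometric_mean_score(y_test, y_pred_bbc)))
cm_balanced_bagging = confusion_matrix(y_test, y_pred_bbc)
plot_confusion_matrix(cm_balanced_bagging, classes=np.unique(satimage.target),
                      ax=ax[1], title='Balanced bagging')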
Example #46
coeff = np.abs(lr.coef_[0])
names = X_train.columns
coefficients = pd.Series(coeff, index=names)
sorted_coefficients = coefficients.sort_values()
plt.clf()
plt.tight_layout()
sorted_coefficients.plot(kind='barh', color='lightgreen')
plt.show()

#Bagging - decision tree
dt = DecisionTreeClassifier(max_depth=20,
                            min_samples_leaf=0.01,
                            random_state=1)
bc = BaggingClassifier(base_estimator=dt, n_estimators=300, n_jobs=-1)
bc.fit(X_train, y_train)
y_pred = bc.predict(X_test)
y_pred_prob = bc.predict_proba(X_test)[:, 1]
print(accuracy_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred_prob))
print(f1_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

#Random forest
rf = RandomForestClassifier(n_estimators=300,
                            max_depth=30,
                            min_samples_leaf=0.0001,
                            max_features=12,
                            random_state=1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
Example #47
table = []
'''for name, clf in clfs:
    clf.fit(train_[cols], train_["TripType"])
    clf.predict(test_[cols])
    preds = clf.predict_proba(test_[cols])
    #print(confusion_matrix(test['class'], clf.predict(test[cols])))
    print (pd.crosstab(test_['TripType'], clf.predict(test_[cols]), rownames=["Actual"], colnames=["Predicted"]))
    print (classification_report(test_['TripType'], clf.predict(test_[cols])))
    score=accuracy_score(test_['TripType'],clf.predict(test_[cols]))
    table.append([name,score])
print (table)
'''
clf=BaggingClassifier(GradientBoostingClassifier())
clf.fit(train_[cols], train_["TripType"])
clf.predict(test_[cols])
preds = clf.predict_proba(test_[cols])
#print(confusion_matrix(test['class'], clf.predict(test[cols])))
print (pd.crosstab(test_['TripType'], clf.predict(test_[cols]), rownames=["Actual"], colnames=["Predicted"]))
print (classification_report(test_['TripType'], clf.predict(test_[cols])))
score=accuracy_score(test_['TripType'],clf.predict(test_[cols]))
table.append([score])
print (table)

eclf = VotingClassifier(estimators = [('BaggingKNN', BaggingClassifier(KNeighborsClassifier(20))),
    ('RandomForest', RandomForestClassifier(10)),
    ('BaggingCART', BaggingClassifier(DecisionTreeClassifier()))],
    voting='soft', weights=[7,1,1])
eclf.fit(train[cols], train["TripType"])
#use the classifier to predict
predicted=eclf.predict(test[cols])
Example #48
class SearchEngine():
    def __init__(self, label_names, X_train, y_train):
        # k is the number of classes (in this case, specializations)
        self.k = len(set(y_train))
        self.label_names = label_names
        self.X_train, self.y_train = X_train, y_train

    def fit(self):
        # min_df: a term must appear in at least this many documents to be kept
        # max_df: drop terms that occur in more than 40% of all documents
        self.vectorizer = CountVectorizer(
            ngram_range=(1, 1),
            max_features=1500,
            min_df=5,
            max_df=0.4,
            stop_words=stopwords.words('english'))

        X_train_vect = self.vectorizer.fit_transform(self.X_train)
        self.tfidf_transformer = TfidfTransformer()
        X_train_trans = self.tfidf_transformer.fit_transform(X_train_vect)

        # Print TF and TFIDF
        #print(*list(X_train_vect.toarray()), sep = "\n")
        #print(*list(X_train_trans.toarray()), sep = "\n")

        # Uncomment the model to use
        #self.classifier = KNeighborsClassifier(n_neighbors=self.k)
        #self.classifier = RandomForestClassifier(n_estimators=500, max_features=0.25, criterion="entropy", class_weight="balanced")
        self.classifier = BaggingClassifier(n_estimators=25, max_features=0.25)
        #self.classifier = GradientBoostingClassifier(n_estimators =100, learning_rate =0.1, max_depth=6, min_samples_leaf =1, max_features=1.0) clf.fit(X, training_set_y)
        #self.classifier = MultinomialNB()

        self.classifier.fit(X_train_trans, self.y_train)

    def predict(self, X_test):
        X_test_vect = self.vectorizer.transform(X_test)
        X_test_trans = self.tfidf_transformer.transform(X_test_vect)
        y_pred = self.classifier.predict(X_test_trans)
        return y_pred

    def predict_single(self, doc):
        X_test_vect = self.vectorizer.transform([doc])
        X_test_trans = self.tfidf_transformer.transform(X_test_vect)
        y_pred = zip(self.classifier.classes_,
                     self.classifier.predict_proba(X_test_trans)[0])
        y_pred = sorted([(self.label_names[ind], score)
                         for ind, score in y_pred],
                        key=lambda x: -x[1])
        return y_pred

    def report(self, X_test, y_test, y_pred):
        print(
            classification_report(y_test,
                                  y_pred,
                                  target_names=self.label_names,
                                  digits=4))

        total = 0
        same = 0
        for i in range(len(y_test)):
            if y_test[i] == y_pred[i]:
                same += 1
            total += 1
        print(total, same)
Example #49
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import confusion_matrix

l = list()

for i in range(0, 10):
    
    X_train, X_test, y_train, y_test = train_test_split(
            train, y, test_size=0.2)
    
    classifier = BaggingClassifier()  # default base estimator: a decision tree
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    l.append(confusion_matrix(y_test, y_pred))

print(l)
Example #50
voting_clf.fit(X, y)

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

bag_clf = BaggingClassifier(DecisionTreeClassifier(),
                            n_estimators=500,
                            max_samples=1.0,
                            bootstrap=True,
                            n_jobs=1)

bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
print(y_pred)
y_pred_proba = bag_clf.predict_proba(X_test)
print(y_pred_proba)
print(accuracy_score(y_test, y_pred))

#oob
bag_clf = BaggingClassifier(DecisionTreeClassifier(),
                            n_estimators=500,
                            bootstrap=True,
                            n_jobs=1,
                            oob_score=True)
bag_clf.fit(X_train, y_train)
print(bag_clf.oob_score_)
y_pred = bag_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
Example #51
    wine = datasets.load_wine()
    X = wine.data
    y = wine.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)
    tree = DecisionTreeClassifier(criterion='entropy', max_depth=None)
    bag = BaggingClassifier(base_estimator=tree, n_estimators=500, max_samples=1.0, max_features=1.0, bootstrap=True,
                            bootstrap_features=False, n_jobs=-1, random_state=1)

    tree.fit(X_train, y_train)
    y_train_pred = tree.predict(X_train)
    y_test_pred = tree.predict(X_test)
    tree_train = accuracy_score(y_true=y_train, y_pred=y_train_pred)
    tree_test = accuracy_score(y_true=y_test, y_pred=y_test_pred)
    print("Decision tree train/test accuracy {0:.3f}/{1:.3f}".format(tree_train, tree_test))
    bag.fit(X_train, y_train)
    y_train_pred_bag = bag.predict(X_train)
    y_test_pred_bag = bag.predict(X_test)
    bag_train = accuracy_score(y_true=y_train, y_pred=y_train_pred_bag)
    bag_test = accuracy_score(y_true=y_test, y_pred=y_test_pred_bag)
    print("Bagging train/test accuracy {0:.3f}/{1:.3f}".format(bag_train, bag_test))
    x_min = X_train[:, 0].min() - 1
    x_max = X_train[:, 0].max() + 1
    y_min = X_train[:, 1].min() - 1
    y_max = X_train[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
    f, axarr = plt.subplots(nrows=1, ncols=2, sharex='col', sharey='row', figsize=(8, 3))
    for idx, clf, tt in zip([0, 1], [tree, bag], ['Decision Tree', 'Bagging']):
        clf.fit(X_train[:, 0:2], y_train)
        z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
        z = z.reshape(xx.shape)
        axarr[idx].contourf(xx, yy, z, alpha=0.3)
Example #52
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
#print(X_train_tfidf.shape)

seed = 7
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
cart = DecisionTreeClassifier()
num_trees = 100
model = BaggingClassifier(base_estimator=cart,
                          n_estimators=num_trees,
                          random_state=seed).fit(X_train_tfidf, y)
results = model_selection.cross_val_score(model, X_train_tfidf, y, cv=kfold)
print(results.mean() * 100)

predicted = model.predict(X_train_tfidf)
acc = accuracy_score(y, predicted) * 100
print(acc)

url1 = ("C:\\Users\\sidharth.m\\Desktop\\Project_sid_35352\\Test.csv")
documents1 = pd.read_csv(url1)
array1 = documents1.values
#choose tweet column
x1 = array1[0:, 1]
#x2= (documents1['tweet']).astype(str)

y1 = array1[0:, 0]

X_test = count_vect.transform(x1)
#print(X_test.shape)
Example #53
def main():
    """magic happens here"""
    # preprocess, then train, test, and split
    chess_num_datatrain, chess_num_datatest, chess_num_targettrain, chess_num_targettest = tts_chess_numeric()
    iris_num_datatrain, iris_num_datatest, iris_num_targettrain, iris_num_targettest = tts_iris_numeric()
    letter_num_datatrain, letter_num_datatest, letter_num_targettrain, letter_num_targettest = tts_letter_numeric()

    # For each dataset
    ## Try at least 3 different "regular" learning algorithms and note the results.
    ### DS1 - chess
    print("")
    ##### method 1 - MLP **
    clf_chess_num_MLP = MLPClassifier(solver='adam',
                                      alpha=1e-5,
                                      hidden_layer_sizes=(40, 30),
                                      random_state=1)
    clf_chess_num_MLP.fit(chess_num_datatrain, chess_num_targettrain)
    predictions = clf_chess_num_MLP.predict(chess_num_datatest)
    display_similarity(predictions, chess_num_targettest,
                       "Chess - Neural Network")
    ##### method 2 - Decision Tree
    clf_chess_num_DT = DecisionTreeClassifier(random_state=0)
    clf_chess_num_DT.fit(chess_num_datatrain, chess_num_targettrain)
    predictions = clf_chess_num_DT.predict(chess_num_datatest)
    display_similarity(predictions, chess_num_targettest,
                       "Chess - Decision Tree")
    ##### method 3 - KNN
    clf_chess_num_KNN = KNeighborsClassifier(n_neighbors=7)
    clf_chess_num_KNN.fit(chess_num_datatrain, chess_num_targettrain)
    predictions = clf_chess_num_KNN.predict(chess_num_datatest)
    display_similarity(predictions, chess_num_targettest, "Chess - KNN")
    ### DS2 - iris
    print("")
    ##### method 1 - MLP
    clf_iris_num_MLP = MLPClassifier(solver='adam',
                                     alpha=1e-5,
                                     hidden_layer_sizes=(10, 7),
                                     random_state=1)
    clf_iris_num_MLP.fit(iris_num_datatrain, iris_num_targettrain)
    predictions = clf_iris_num_MLP.predict(iris_num_datatest)
    display_similarity(predictions, iris_num_targettest,
                       "Iris - Neural Network")
    # clf_iris_num_MLP_gs = MLPClassifier()
    # iris_param_grid = [
    #     {
    #         'activation' : ['identity', 'logistic', 'tanh', 'relu'],
    #         'solver' : ['lbfgs', 'sgd', 'adam'],
    #         'hidden_layer_sizes': [
    #          (9,1),(9,2),(9,3),(9,4),(9,5),(9,6),(9,7),(9,8),(9,10),(9,11),(9,12),
    #          (10,1),(10,2),(10,3),(10,4),(10,5),(10,6),(10,7),(10,8),(10,10),(10,11),(10,12),
    #          (11,1),(11,2),(11,3),(11,4),(11,5),(11,6),(11,7),(11,8),(11,10),(11,11),(11,12)
    #          ]
    #     }
    #    ]
    # grid_clf = GridSearchCV(clf_iris_num_MLP_gs, iris_param_grid, cv=3,
    #                        scoring='accuracy')
    # grid_clf.fit(iris_num_datatrain, iris_num_targettrain)
    # print("the best parameters out of those chosen are: ")
    # print(grid_clf.best_params_)
    ##### method 2 - Decision Tree
    clf_iris_num_DT = DecisionTreeClassifier()
    clf_iris_num_DT.fit(iris_num_datatrain, iris_num_targettrain)
    predictions = clf_iris_num_DT.predict(iris_num_datatest)
    display_similarity(predictions, iris_num_targettest,
                       "Iris - Decision Tree")
    ##### method 3 - KNN
    clf_iris_num_KNN = KNeighborsClassifier(n_neighbors=3)
    clf_iris_num_KNN.fit(iris_num_datatrain, iris_num_targettrain)
    predictions = clf_iris_num_KNN.predict(iris_num_datatest)
    display_similarity(predictions, iris_num_targettest, "Iris - KNN")
    ### DS3
    print("")
    ##### method 1 - MLP
    clf_letter_num_MLP = MLPClassifier(solver='adam',
                                       alpha=1e-5,
                                       hidden_layer_sizes=(40, 30),
                                       random_state=1)
    clf_letter_num_MLP.fit(letter_num_datatrain, letter_num_targettrain)
    predictions = clf_letter_num_MLP.predict(letter_num_datatest)
    display_similarity(predictions, letter_num_targettest,
                       "Letter - Neural Network")
    ##### method 2 - Decision Tree
    clf_letter_num_DT = DecisionTreeClassifier()
    clf_letter_num_DT.fit(letter_num_datatrain, letter_num_targettrain)
    predictions = clf_letter_num_DT.predict(letter_num_datatest)
    display_similarity(predictions, letter_num_targettest,
                       "Letter - Decision Tree")
    ##### method 3 - KNN
    clf_letter_num_KNN = KNeighborsClassifier(n_neighbors=3)
    clf_letter_num_KNN.fit(letter_num_datatrain, letter_num_targettrain)
    predictions = clf_letter_num_KNN.predict(letter_num_datatest)
    display_similarity(predictions, letter_num_targettest, "Letter - KNN")
    print("")

    ## Use Bagging and note the results. (Play around with a few different options)
    ### DS1 - Chess
    clf_chess_num_Bagging = BaggingClassifier(bootstrap=True, n_estimators=20)
    clf_chess_num_Bagging.fit(chess_num_datatrain, chess_num_targettrain)
    predictions = clf_chess_num_Bagging.predict(chess_num_datatest)
    display_similarity(predictions, chess_num_targettest, "BAGGING - Chess")
    ### DS2 - Iris
    clf_iris_num_Bagging = BaggingClassifier(bootstrap=True)
    clf_iris_num_Bagging.fit(iris_num_datatrain, iris_num_targettrain)
    predictions = clf_iris_num_Bagging.predict(iris_num_datatest)
    display_similarity(predictions, iris_num_targettest, "BAGGING - Iris")
    ### DS3 - Letter
    clf_letter_num_Bagging = BaggingClassifier(bootstrap=True, n_estimators=20)
    clf_letter_num_Bagging.fit(letter_num_datatrain, letter_num_targettrain)
    predictions = clf_letter_num_Bagging.predict(letter_num_datatest)
    display_similarity(predictions, letter_num_targettest, "BAGGING - Letter")
    print("")

    ## Use AdaBoost and note the results. (Play around with a few different options)
    ### DS1 - Chess
    clf_chess_num_AdaBoost = AdaBoostClassifier()
    clf_chess_num_AdaBoost.fit(chess_num_datatrain, chess_num_targettrain)
    predictions = clf_chess_num_AdaBoost.predict(chess_num_datatest)
    display_similarity(predictions, chess_num_targettest, "ADABOOST - Chess")
    params = clf_chess_num_AdaBoost.get_params()
    print(params)
    ### DS2 - Iris
    clf_iris_num_AdaBoost = AdaBoostClassifier(learning_rate=0.3)
    clf_iris_num_AdaBoost.fit(iris_num_datatrain, iris_num_targettrain)
    predictions = clf_iris_num_AdaBoost.predict(iris_num_datatest)
    display_similarity(predictions, iris_num_targettest, "ADABOOST - Iris")
    params = clf_iris_num_AdaBoost.get_params()
    print(params)
    ### DS3 - Letter
    clf_letter_num_AdaBoost = AdaBoostClassifier(n_estimators=200)
    clf_letter_num_AdaBoost.fit(letter_num_datatrain, letter_num_targettrain)
    predictions = clf_letter_num_AdaBoost.predict(letter_num_datatest)
    display_similarity(predictions, letter_num_targettest, "ADABOOST - Letter")
    params = clf_letter_num_AdaBoost.get_params()
    print(params)
    print("")

    ## Use a random forest and note the results. (Play around with a few different options)
    ### DS1 - Chess
    clf_chess_num_RandomForest = RandomForestClassifier(criterion='entropy',
                                                        bootstrap=False,
                                                        n_estimators=30)
    clf_chess_num_RandomForest.fit(chess_num_datatrain, chess_num_targettrain)
    predictions = clf_chess_num_RandomForest.predict(chess_num_datatest)
    display_similarity(predictions, chess_num_targettest,
                       "RANDOM FOREST - Chess")
    ### DS2 - Iris
    clf_iris_num_RandomForest = RandomForestClassifier()
    clf_iris_num_RandomForest.fit(iris_num_datatrain, iris_num_targettrain)
    predictions = clf_iris_num_RandomForest.predict(iris_num_datatest)
    display_similarity(predictions, iris_num_targettest,
                       "RANDOM FOREST - Iris")
    ### DS3 - Letter
    clf_letter_num_RandomForest = RandomForestClassifier(bootstrap=False)
    clf_letter_num_RandomForest.fit(letter_num_datatrain,
                                    letter_num_targettrain)
    predictions = clf_letter_num_RandomForest.predict(letter_num_datatest)
    display_similarity(predictions, letter_num_targettest,
                       "RANDOM FOREST - Letter")
print("model_1 정확도(학습 데이터) :", model_1.score(X_train, y_train))
print("model_2 정확도(학습 데이터) :", model_2.score(X_train, y_train))

print("model_1 정확도(테스트 데이터) :", model_1.score(X_test, y_test))
print("model_2 정확도(테스트 데이터) :", model_2.score(X_test, y_test))

predicted_1 = model_1.predict(X_test)

print('Confusion Matrix - 1:')
print(confusion_matrix(y_test, predicted_1))

print('Classification Report - 1 :')
print(classification_report(y_test, predicted_1))

predicted_2 = model_2.predict(X_test)

print('Confusion Matrix - 1:')
print(confusion_matrix(y_test, predicted_2))

print('Classification Report - 1 :')
print(classification_report(y_test, predicted_2))








Example #55
def test_parallel():
    """Check parallel computations."""
    rng = check_random_state(0)

    # Classification
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    for n_jobs in [-1, 3]:
        ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                     n_jobs=n_jobs,
                                     random_state=0).fit(X_train, y_train)

        # predict_proba
        ensemble.set_params(n_jobs=1)
        y1 = ensemble.predict_proba(X_test)
        ensemble.set_params(n_jobs=2)
        y2 = ensemble.predict_proba(X_test)
        assert_array_almost_equal(y1, y2)

        ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                     n_jobs=1,
                                     random_state=0).fit(X_train, y_train)

        y3 = ensemble.predict_proba(X_test)
        assert_array_almost_equal(y1, y3)

        # decision_function
        ensemble = BaggingClassifier(SVC(),
                                     n_jobs=n_jobs,
                                     random_state=0).fit(X_train, y_train)

        ensemble.set_params(n_jobs=1)
        decisions1 = ensemble.decision_function(X_test)
        ensemble.set_params(n_jobs=2)
        decisions2 = ensemble.decision_function(X_test)
        assert_array_almost_equal(decisions1, decisions2)

        ensemble = BaggingClassifier(SVC(),
                                     n_jobs=1,
                                     random_state=0).fit(X_train, y_train)

        decisions3 = ensemble.decision_function(X_test)
        assert_array_almost_equal(decisions1, decisions3)

    # Regression
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    for n_jobs in [-1, 3]:
        ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                    n_jobs=3,
                                    random_state=0).fit(X_train, y_train)

        ensemble.set_params(n_jobs=1)
        y1 = ensemble.predict(X_test)
        ensemble.set_params(n_jobs=2)
        y2 = ensemble.predict(X_test)
        assert_array_almost_equal(y1, y2)

        ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                    n_jobs=1,
                                    random_state=0).fit(X_train, y_train)

        y3 = ensemble.predict(X_test)
        assert_array_almost_equal(y1, y3)
Example #56
    else:
        estimator = SVC()
        clf = SVC()

    clf.fit(X_train, y_train)
    y_pred_tree = clf.predict(X_test)

    bag_clf = BaggingClassifier(estimator,
                                n_estimators=n_estimators,
                                max_samples=max_samples,
                                bootstrap=bootstrap_samples,
                                max_features=max_features,
                                bootstrap_features=bootstrap_features,
                                random_state=42)
    bag_clf.fit(X_train, y_train)
    y_pred = bag_clf.predict(X_test)

    orig.empty()

    fig, ax = plt.subplots()
    fig1, ax1 = plt.subplots()

    XX, YY, input_array = draw_meshgrid()
    labels = clf.predict(input_array)
    labels1 = bag_clf.predict(input_array)

    col1, col2 = st.beta_columns(2)
    with col1:
        st.header(estimators)
        ax.scatter(X.T[0], X.T[1], c=y, cmap='rainbow')
        ax.contourf(XX,
Example #57
from sklearn.ensemble import BaggingClassifier
tree = DecisionTreeClassifier(criterion='entropy',
                              random_state=1,
                              max_depth=None)
bag = BaggingClassifier(base_estimator=tree,
                        n_estimators=50,
                        max_samples=1.0,
                        max_features=1.0,
                        bootstrap=True,
                        bootstrap_features=False,
                        n_jobs=1,
                        random_state=1)

##
from sklearn.metrics import accuracy_score
tree = tree.fit(X_train, y_train)
t_train_pred = tree.predict(X_train)
t_test_pred = tree.predict(X_test)
tree_train = accuracy_score(y_train, t_train_pred)
tree_test = accuracy_score(y_test, t_test_pred)
print('Decision tree train/test accuracy %.3f/%.3f' % (tree_train, tree_test))

##
from sklearn.metrics import accuracy_score
bag = bag.fit(X_train, y_train)
b_train_pred = bag.predict(X_train)
b_test_pred = bag.predict(X_test)
bag_train = accuracy_score(y_train, b_train_pred)
bag_test = accuracy_score(y_test, b_test_pred)
print('Bag train/test accuracy %.3f/%.3f' % (bag_train, bag_test))
Example #58
from sklearn.svm import SVC
clf5 = SVC(kernel='rbf')

import xgboost as xgb
model = xgb.XGBClassifier(random_state=1, learning_rate=0.01)

from sklearn.ensemble import BaggingClassifier
clf = BaggingClassifier(base_estimator=clf1, n_estimators=30, random_state=0)

#x1,y1=SMOTE().fit_resample(x1, y1)

print("Starting... ")
clf.fit(x1, y1)

o = clf.predict(x2)
print("End... ")
pred_aud = clf.predict_proba(x2)

cou = 0
tol = 0

pos = [0 for i in range(len(pred_aud[0]))]
pos1 = [0 for i in range(len(pred_aud[0]))]
pos2 = [0 for i in range(len(pred_aud[0]))]

for i in tqdm(range(len(o))):

    tol += 1.0

    pos1[y2[i]] += 1
Example #59
def myclassify_practice_set(numfiers,xtrain,ytrain,xtltrain,xtltest,xtest,ytarget=None,testing=False,grids='ABCDEFGHI'):
    #NOTE we might not need xtltrain
    # xtrain and ytrain are your training set. xtltrain is the indices of corresponding recordings in xtrain and ytrain. these will always be present
    #xtest is your testing set. xtltest is the corresponding indices of the recording. for the practice set xtltest = xtrunclength
    # ytest is optional and depends on if you are using a testing set or the practice set

    # remove NaN, Inf, and -Inf values from the xtest feature matrix
    xtest,xtltest,ytarget = removeNanAndInf(xtest,xtltest,ytarget)
    # print 'finished removal of Nans'

    ytrain = np.ravel(ytrain)
    ytarget = np.ravel(ytarget)


    #if xtest is NxM matrix, returns Nxnumifiers matrix where each column corresponds to a classifiers prediction vector
    count = 0
    # print numfiers

    predictionMat = np.empty((xtest.shape[0],numfiers))
    predictionStringMat = []
    finalPredMat = []
    targetStringMat = []
    targets1 = []
    predictions1 = []

    # svc1 = SVC()
    # svc1.fit(xtrain,ytrain)
    # ytest = svc1.predict(xtest)
    # predictionMat[:,count] = ytest
    # count+=1
    if count < numfiers:
        # votingClassifiers combine completely different machine learning classifiers and use a majority vote
        clff1 = SVC()
        clff2 = RFC(bootstrap=False)
        clff3 = ETC()
        clff4 = neighbors.KNeighborsClassifier()
        clff5 = quadda()



        eclf = VotingClassifier(estimators = [('svc',clff1),('rfc',clff2),('etc',clff3),('knn',clff4),('qda',clff5)])
        eclf = eclf.fit(xtrain,ytrain)
        #print(eclf.score(xtest,ytest))
        # for claf, label in zip([clff1,clff2,clff3,clff4,clff5,eclf],['SVC','RFC','ETC','KNN','QDA','Ensemble']):
        #     cla
        #     scores = crossvalidation.cross_val_score(claf,xtrain,ytrain,scoring='accuracy')
        #     print ()
        ytest = eclf.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:

        bagging2 = BaggingClassifier(ETC(),bootstrap=False,bootstrap_features=False)
        bagging2.fit(xtrain,ytrain)
        #print bagging2.score(xtest,ytest)
        ytest = bagging2.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1


    if count < numfiers:

        tree2 = ETC()
        tree2.fit(xtrain,ytrain)
        ytest = tree2.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        bagging1 = BaggingClassifier(ETC())
        bagging1.fit(xtrain,ytrain)
        #print bagging1.score(xtest,ytest)
        ytest = bagging1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1



    if count < numfiers:
        svc1 = SVC()
        svc1.fit(xtrain,ytrain)
        ytest = svc1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        # Quadradic discriminant analysis - classifier with quadratic decision boundary -
        qda = quadda()
        qda.fit(xtrain,ytrain)
        #print(qda.score(xtest,ytest))
        ytest = qda.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1



    if count < numfiers:

        tree1 = DTC()
        tree1.fit(xtrain,ytrain)
        ytest = tree1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        knn1 = neighbors.KNeighborsClassifier() # this classifies based on the #k nearest neighbors, where k is definted by the user.
        knn1.fit(xtrain,ytrain)
        ytest = knn1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        # linear discriminant analysis - classifier with linear decision boundary -
        lda = linda()
        lda.fit(xtrain,ytrain)
        ytest = lda.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        tree3 = RFC()
        tree3.fit(xtrain,ytrain)
        ytest = tree3.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        bagging3 = BaggingClassifier(RFC(),bootstrap=False,bootstrap_features=False)
        bagging3.fit(xtrain,ytrain)
        ytest = bagging3.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1



    if count < numfiers:
        bagging4 = BaggingClassifier(SVC(),bootstrap=False,bootstrap_features=False)
        bagging4.fit(xtrain,ytrain)
        ytest = bagging4.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        tree4 = RFC(bootstrap=False)
        tree4.fit(xtrain,ytrain)
        ytest = tree4.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        tree6 = GBC()
        tree6.fit(xtrain,ytrain)
        ytest = tree6.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        knn2 = neighbors.KNeighborsClassifier(n_neighbors = 10)
        knn2.fit(xtrain,ytrain)
        ytest = knn2.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        knn3 = neighbors.KNeighborsClassifier(n_neighbors = 3)
        knn3.fit(xtrain,ytrain)
        ytest = knn3.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        knn4 = neighbors.KNeighborsClassifier(algorithm = 'ball_tree')
        knn4.fit(xtrain,ytrain)
        ytest = knn4.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        knn5 = neighbors.KNeighborsClassifier(algorithm = 'kd_tree')
        knn5.fit(xtrain,ytrain)
        ytest = knn5.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1



    if count < numfiers:
        ncc1 = NearestCentroid()
        ncc1.fit(xtrain,ytrain)
        ytest = ncc1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        tree5 = ABC()
        tree5.fit(xtrain,ytrain)
        ytest = tree5.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    # print xtltest
    # print len(ytest)
    for colCount in range(predictionMat.shape[1]):
        tempCol = predictionMat[:,colCount]
        if testing:
            modeCol = temppredWindowVecModeFinder(tempCol,xtltest,4,grids,isPrint=0)
        else:
            modeCol = predWindowVecModeFinder(tempCol,xtltest,4,isPrint=0)

        ytarg = predWindowVecModeFinder(ytarget,xtltest,1,isPrint=0)
        if testing:
             modeStr = temppredVec2Str(modeCol,grids)
        else:
            modeStr = predVec2Str(modeCol)
        modeStrans = predVec2Str(ytarg)
        predictionStringMat.append(modeStr)
        predictions1.append(modeCol)
        finalPredMat += map(int,modeCol)
        targetStringMat.append(modeStrans)
        targets1.append(ytarg)
        if not testing:
            if ytarget is not None:
                #print targets1
                #print ""
                #print predictions1
                confusionme = confusion_matrix(targets1[0],predictions1[0])
                #print "Confusion Matrix is: "
                #print confusionme


    return predictionStringMat, targetStringMat, finalPredMat
Example #60
# -*- coding: utf-8 -*-

from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
clf_bagging = BaggingClassifier(KNeighborsClassifier(),
                                max_samples=0.5,
                                max_features=0.5)

# the classifier must be fitted before it can predict; a minimal usage
# sketch on synthetic data (make_classification here is an assumption,
# any labelled X, y would do):
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
clf_bagging.fit(X, y)
print(clf_bagging.predict(X[:5]))