Example #1
def test_warm_start_equal_n_estimators():
    # Test that nothing happens when fitting without increasing n_estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    clf = BaggingClassifier(n_estimators=5, warm_start=True, random_state=83)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    # modify X to nonsense values, this should not change anything
    X_train += 1.

    assert_warns_message(UserWarning,
                         "Warm-start fitting without increasing n_estimators does not",
                         clf.fit, X_train, y_train)
    assert_array_equal(y_pred, clf.predict(X_test))
Example #2
def query_by_bagging(X, y, current_model, batch_size, rng, base_model=SVC(C=1, kernel='linear'), n_bags=5, method="KL", D=None):
    """
    :param base_model: Model that will be  **fitted every iteration**
    :param n_bags: Number of bags on which train n_bags models
    :param method: 'entropy' or 'KL'
    :return:
    """
    assert method == 'entropy' or method == 'KL'
    eps = 0.0000001
    if method == 'KL':
        assert hasattr(base_model, 'predict_proba'), "Model with probability prediction needs to be passed to this strategy!"
    clfs = BaggingClassifier(base_model, n_estimators=n_bags, random_state=rng)
    clfs.fit(X[y.known], y[y.known])
    pc = clfs.predict_proba(X[np.invert(y.known)])
    # Settles page 17
    if method == 'entropy':
        pc += eps
        fitness = np.sum(pc * np.log(pc), axis=1)
        ids =  np.argsort(fitness)[:batch_size]
    elif method == 'KL':
        p = np.array([clf.predict_proba(X[np.invert(y.known)]) for clf in clfs.estimators_])
        fitness = np.mean(np.sum(p * np.log(p / pc), axis=2), axis=0)
        ids = np.argsort(fitness)[-batch_size:]

    return y.unknown_ids[ids], fitness/np.max(fitness)
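# Hypothetical usage sketch for query_by_bagging above (not part of the original
# snippet). The `y` argument is assumed to be a label container exposing a boolean
# `known` mask, integer `unknown_ids`, and item access -- these names come from how
# the function uses `y`, not from scikit-learn.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.svm import SVC

class MaskedLabels:
    def __init__(self, labels, known_mask):
        self.labels = labels
        self.known = known_mask                       # True where the label is revealed
        self.unknown_ids = np.where(~known_mask)[0]   # indices of the unlabeled pool

    def __getitem__(self, idx):
        return self.labels[idx]

X_al, labels = make_classification(n_samples=200, random_state=0)
known = np.zeros(len(labels), dtype=bool)
known[:20] = True                                     # pretend only 20 labels are known
y_al = MaskedLabels(labels, known)

rng = np.random.RandomState(0)
picked, fitness = query_by_bagging(X_al, y_al, current_model=None, batch_size=5,
                                   rng=rng,
                                   base_model=SVC(C=1, kernel='linear', probability=True),
                                   n_bags=5, method='KL')
print(picked)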
Example #3
class BaggingSK(PoolGenerator):
    '''
    This class should not be used, use brew.generation.bagging.Bagging instead.
    '''

    def __init__(self, base_classifier=None, n_classifiers=100, combination_rule='majority_vote'):

        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers

        # using the sklearn implementation of bagging for now
        self.sk_bagging = BaggingClassifier(base_estimator=base_classifier,
                n_estimators=n_classifiers, max_samples=1.0, max_features=1.0)
        
        self.ensemble = Ensemble()
        self.combiner = Combiner(rule=combination_rule)

    def fit(self, X, y):
        self.sk_bagging.fit(X, y)
        self.ensemble.add_classifiers(self.sk_bagging.estimators_)
        #self.classes_ = set(y)

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
Example #4
def baggedDecisionTree( X_train, y_train, X_test, y_test, nEstimators ):

    print("\n### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###")
    print("baggedDecisionTree()\n")

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myBaggedDecisionTree = BaggingClassifier(
        base_estimator = DecisionTreeClassifier(),
        n_estimators   = nEstimators,
        # max_samples    = X_train.shape[0],
        bootstrap      = True,
        oob_score      = True,
        n_jobs         = -1 # use all available cores
        )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myBaggedDecisionTree.fit(X_train,y_train)
    y_pred = myBaggedDecisionTree.predict(X_test)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    print( "nEstimators: "      + str(nEstimators)                     )
    print( "out-of-bag score: " + str(myBaggedDecisionTree.oob_score_) )
    print( "accuracy score: "   + str(accuracy_score(y_test,y_pred))   )
    print( "out-of-bag decision function:" )
    print( str(myBaggedDecisionTree.oob_decision_function_) )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return( None )
Example #5
def test_bagging_with_pipeline():
    estimator = BaggingClassifier(make_pipeline(SelectKBest(k=1),
                                                DecisionTreeClassifier()),
                                  max_features=2)
    estimator.fit(iris.data, iris.target)
    assert_true(isinstance(estimator[0].steps[-1][1].random_state,
                           int))
Example #6
def bagging(X_train, X_test, y_train, y_test, n_est):
    n_est = 51
    estimators = range(1, n_est)
    decision_clf = DecisionTreeClassifier()
    scores1, scores2 = [], []  # test and train accuracy for each ensemble size

    for est in estimators:
        bagging_clf = BaggingClassifier(decision_clf, n_estimators=est, max_samples=0.67,max_features=0.67, 
                                    bootstrap=True, random_state=9)
        bagging_clf.fit(X_train, y_train)
        # test line
        y_pred_bagging1 = bagging_clf.predict(X_test)
        score_bc_dt1 = accuracy_score(y_test, y_pred_bagging1)
        scores1.append(score_bc_dt1)
        # train line
        y_pred_bagging2 = bagging_clf.predict(X_train)
        score_bc_dt2 = accuracy_score(y_train, y_pred_bagging2)
        scores2.append(score_bc_dt2)
    
    plt.figure(figsize=(10, 6))
    plt.title('Bagging Info')
    plt.xlabel('Estimators')
    plt.ylabel('Scores')
    plt.plot(estimators,scores1,'g',label='test line', linewidth=3)
    plt.plot(estimators,scores2,'c',label='train line', linewidth=3)
    plt.legend()
    plt.show()
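# Possible driver for the bagging() helper above (not in the original snippet);
# it assumes scikit-learn's iris data and pops up the figure via plt.show() inside.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X_iris, y_iris = load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X_iris, y_iris, random_state=9)
bagging(X_tr, X_te, y_tr, y_te, n_est=51)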
Example #7
def test_oob_score_classification():
    # Check that oob prediction is a good estimation of the generalization
    # error.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    for base_estimator in [DecisionTreeClassifier(), SVC()]:
        clf = BaggingClassifier(base_estimator=base_estimator,
                                n_estimators=100,
                                bootstrap=True,
                                oob_score=True,
                                random_state=rng).fit(X_train, y_train)

        test_score = clf.score(X_test, y_test)

        assert_less(abs(test_score - clf.oob_score_), 0.1)

        # Test with few estimators
        assert_warns(UserWarning,
                     BaggingClassifier(base_estimator=base_estimator,
                                       n_estimators=1,
                                       bootstrap=True,
                                       oob_score=True,
                                       random_state=rng).fit,
                     X_train,
                     y_train)
Example #8
def test_bagging_sample_weight_unsupported_but_passed():
    estimator = BaggingClassifier(DummyZeroEstimator())
    rng = check_random_state(0)

    estimator.fit(iris.data, iris.target).predict(iris.data)
    assert_raises(ValueError, estimator.fit, iris.data, iris.target,
                  sample_weight=rng.randint(10, size=(iris.data.shape[0])))
Example #9
def test_warm_start_smaller_n_estimators():
    # Test that a warm-started second fit with a smaller n_estimators raises an error.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = BaggingClassifier(n_estimators=5, warm_start=True)
    clf.fit(X, y)
    clf.set_params(n_estimators=4)
    assert_raises(ValueError, clf.fit, X, y)
Example #10
def test_estimators_samples():
    # Check that format of estimators_samples_ is correct and that results
    # generated at fit time can be identically reproduced at a later time
    # using data saved in object attributes.
    X, y = make_hastie_10_2(n_samples=200, random_state=1)
    bagging = BaggingClassifier(LogisticRegression(), max_samples=0.5,
                                max_features=0.5, random_state=1,
                                bootstrap=False)
    bagging.fit(X, y)

    # Get relevant attributes
    estimators_samples = bagging.estimators_samples_
    estimators_features = bagging.estimators_features_
    estimators = bagging.estimators_

    # Test for correct formatting
    assert_equal(len(estimators_samples), len(estimators))
    assert_equal(len(estimators_samples[0]), len(X) // 2)
    assert_equal(estimators_samples[0].dtype.kind, 'i')

    # Re-fit single estimator to test for consistent sampling
    estimator_index = 0
    estimator_samples = estimators_samples[estimator_index]
    estimator_features = estimators_features[estimator_index]
    estimator = estimators[estimator_index]

    X_train = (X[estimator_samples])[:, estimator_features]
    y_train = y[estimator_samples]

    orig_coefs = estimator.coef_
    estimator.fit(X_train, y_train)
    new_coefs = estimator.coef_

    assert_array_almost_equal(orig_coefs, new_coefs)
Example #11
class ADABoost(Base):

    def train(self, data = None, plugin=None):
        """ With dataframe train mllib """
        super(ADABoost, self).train(data, plugin)

            #cl = svm.SVC(gamma=0.001, C= 100, kernel='linear', probability=True)

        X = self.X_train.iloc[:,:-1]
        Y = self.X_train.iloc[:,-1]

        self.scaler = StandardScaler().fit(X)
        X = self.scaler.transform(X)

        cl = SGDClassifier(loss='hinge')
        p = Pipeline([("Scaler", self.scaler), ("svm", cl)])

        self.clf = BaggingClassifier(p, n_estimators=50)
        #self.clf = AdaBoostClassifier(p, n_estimators=10)
            #self.clf = AdaBoostClassifier(SGDClassifier(loss='hinge'),algorithm='SAMME', n_estimators=10)

        self.clf.fit(X, Y)

    def predict(self, file, plugin=None):
        super(ADABoost, self).predict(file, plugin)

        data = file.vector
        X = data[plugin]
        X = self.scaler.transform(X)
        guess = self.clf.predict(X)
        return self.getTag(guess)
Example #12
def test_estimators_samples_deterministic():
    # This test is a regression test to check that with a random step
    # (e.g. SparseRandomProjection) and a given random state, the results
    # generated at fit time can be identically reproduced at a later time using
    # data saved in object attributes. Check issue #9524 for full discussion.

    iris = load_iris()
    X, y = iris.data, iris.target

    base_pipeline = make_pipeline(SparseRandomProjection(n_components=2),
                                  LogisticRegression())
    clf = BaggingClassifier(base_estimator=base_pipeline,
                            max_samples=0.5,
                            random_state=0)
    clf.fit(X, y)
    pipeline_estimator_coef = clf.estimators_[0].steps[-1][1].coef_.copy()

    estimator = clf.estimators_[0]
    estimator_sample = clf.estimators_samples_[0]
    estimator_feature = clf.estimators_features_[0]

    X_train = (X[estimator_sample])[:, estimator_feature]
    y_train = y[estimator_sample]

    estimator.fit(X_train, y_train)
    assert_array_equal(estimator.steps[-1][1].coef_, pipeline_estimator_coef)
Example #13
def train_classifiers(data):
    train_vars = [
        'X', 'Y',
        'Darkness',
        'Moon',
        'Hour',
        'DayOfWeekInt',
        'Day',
        'Month',
        'Year',
        'PdDistrictInt',
        'TemperatureC',
        'Precipitationmm',
        'InPdDistrict',
        'Conditions',
        'AddressCode',
    ]
    weather_mapping = {
        'Light Drizzle': 1,
        'Drizzle': 2,
        'Light Rain': 3,
        'Rain': 4,
        'Heavy Rain': 5,
        'Thunderstorm': 6,
    }
    data.Precipitationmm = data.Precipitationmm.fillna(-1)
    data.Conditions = data.Conditions.map(weather_mapping).fillna(0)

    train, test = split(data)
    X_train = train[train_vars]
    y_train = train.CategoryInt
    X_test = test[train_vars]
    y_test = test.CategoryInt

    bdt_real_2 = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=8),
        n_estimators=10,
        learning_rate=1
    )

    #bdt_real = DecisionTreeClassifier(max_depth=None, min_samples_split=1,
                                      #random_state=6065)

    bdt_real = BaggingClassifier(base_estimator=bdt_real_2,
                                random_state=6065,
                                n_estimators=100)

    #bdt_real = RandomForestClassifier(random_state=6065,
                                      #n_estimators=200)

    #bdt_real = ExtraTreesClassifier(random_state=6065,
                                    #min_samples_split=5,
                                    #n_estimators=200)

    bdt_real.fit(X_train, y_train)
    y_predict = pandas.Series(bdt_real.predict(X_test))
    print len(y_predict[y_predict == y_test])
    print len(y_predict)
    return bdt_real
Example #14
 def create_estimators(self, X_train, y_train, X_test):
     for model in self.models:
         param_grid = self.create_parameter_grid(model)
         for parameters in param_grid:
             clf = BaggingClassifier(base_estimator=model.set_params(**parameters), n_estimators=self.estimators, max_samples=0.95, n_jobs = 3)
             clf.fit(X_train, y_train)
             prediction = clf.predict_proba(X_test)[:,1]
             self.predictions.append(prediction)
Example #15
def test_oob_score_consistency():
    # Make sure OOB scores are identical when random_state, estimator, and
    # training data are fixed and fitting is done twice
    X, y = make_hastie_10_2(n_samples=200, random_state=1)
    bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5,
                                max_features=0.5, oob_score=True,
                                random_state=1)
    assert_equal(bagging.fit(X, y).oob_score_, bagging.fit(X, y).oob_score_)
Example #16
def test_max_samples_consistency():
    # Make sure the validated max_samples and the original max_samples are
    # identical when a valid integer max_samples is supplied by the user
    max_samples = 100
    X, y = make_hastie_10_2(n_samples=2 * max_samples, random_state=1)
    bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=max_samples, max_features=0.5, random_state=1)
    bagging.fit(X, y)
    assert_equal(bagging._max_samples, max_samples)
Example #17
    def classification(self, x_train, y_train):
        ml = BaggingClassifier(DecisionTreeClassifier())
        ml.fit(x_train, y_train)
#         print y_train[0]
#         print x_train[0]
        y_pred = ml.predict(x_train)
        print 'y_train ',y_train
        print 'y_pred ',y_pred.tolist()
Example #18
def test_bagging_small_max_features():
    # Check that Bagging estimator can accept low fractional max_features

    X = np.array([[1, 2], [3, 4]])
    y = np.array([1, 0])

    bagging = BaggingClassifier(LogisticRegression(),
                                max_features=0.3, random_state=1)
    bagging.fit(X, y)
Example #19
def test_sparse_classification():
    # Check classification for various parameter settings on sparse input.

    class CustomSVC(SVC):
        """SVC variant that records the nature of the training set"""

        def fit(self, X, y):
            super(CustomSVC, self).fit(X, y)
            self.data_type_ = type(X)
            return self

    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)
    parameter_sets = [
        {"max_samples": 0.5,
         "max_features": 2,
         "bootstrap": True,
         "bootstrap_features": True},
        {"max_samples": 1.0,
         "max_features": 4,
         "bootstrap": True,
         "bootstrap_features": True},
        {"max_features": 2,
         "bootstrap": False,
         "bootstrap_features": True},
        {"max_samples": 0.5,
         "bootstrap": True,
         "bootstrap_features": False},
    ]

    for sparse_format in [csc_matrix, csr_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)
        for params in parameter_sets:

            # Trained on sparse format
            sparse_classifier = BaggingClassifier(
                base_estimator=CustomSVC(),
                random_state=1,
                **params
            ).fit(X_train_sparse, y_train)
            sparse_results = sparse_classifier.predict(X_test_sparse)

            # Trained on dense format
            dense_results = BaggingClassifier(
                base_estimator=CustomSVC(),
                random_state=1,
                **params
            ).fit(X_train, y_train).predict(X_test)

            sparse_type = type(X_train_sparse)
            types = [i.data_type_ for i in sparse_classifier.estimators_]

            assert_array_equal(sparse_results, dense_results)
            assert all([t == sparse_type for t in types])
Example #20
def train_and_test(X_train, X_test, y_train, y_test):
    forest = BaggingClassifier(n_estimators=500, random_state=1234)
    forest = forest.fit(X_train, y_train)
    proba = forest.predict_proba(X_test)
    proba = proba[:, 1]
    y_test = np.array(y_test)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, proba, pos_label=1)
    loss = metrics.auc(fpr, tpr)
    print loss
    return loss
Example #21
def test_bagging_classifier_with_missing_inputs():
    # Check that BaggingClassifier can accept X with missing/infinite data
    X = np.array([
        [1, 3, 5],
        [2, None, 6],
        [2, np.nan, 6],
        [2, np.inf, 6],
        [2, np.NINF, 6],
    ])
    y = np.array([3, 6, 6, 6, 6])
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(
        FunctionTransformer(replace, validate=False),
        classifier
    )
    pipeline.fit(X, y).predict(X)
    bagging_classifier = BaggingClassifier(pipeline)
    bagging_classifier.fit(X, y)
    y_hat = bagging_classifier.predict(X)
    assert_equal(y.shape, y_hat.shape)
    bagging_classifier.predict_log_proba(X)
    bagging_classifier.predict_proba(X)

    # Verify that exceptions can be raised by wrapper classifier
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(classifier)
    assert_raises(ValueError, pipeline.fit, X, y)
    bagging_classifier = BaggingClassifier(pipeline)
    assert_raises(ValueError, bagging_classifier.fit, X, y)
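# The replace() transformer used above is not shown in this snippet; a minimal
# sketch consistent with its use (mapping None/NaN/inf to finite values so the
# decision tree can be fitted) might be:
import numpy as np

def replace(X):
    X = np.asarray(X, dtype=float)   # None entries become NaN
    return np.nan_to_num(X)          # NaN -> 0, +/-inf -> large finite values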
Example #22
class BaggingClassifier(BaseEstimator):
    def __init__(self, base_estimator=None, bag_kwargs=None):
        klass = dynamic_load(base_estimator['class'])
        svc = klass(**base_estimator['params'])
        self.__clf = SK_BaggingClassifier(base_estimator=svc, **bag_kwargs)

    def fit(self, X, y):
        return self.__clf.fit(X, y)

    def predict_proba(self, X):
        return self.__clf.predict_proba(X)
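# Hypothetical construction of the wrapper above; dynamic_load is assumed to map a
# dotted class path (e.g. 'sklearn.svm.SVC') to the class itself -- its real contract
# is not shown in this snippet.
clf = BaggingClassifier(
    base_estimator={'class': 'sklearn.svm.SVC',
                    'params': {'C': 1.0, 'kernel': 'linear', 'probability': True}},
    bag_kwargs={'n_estimators': 10, 'max_samples': 0.8},
)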
Example #23
def predict_with_best_model(estimator, xtrain, ytrain, xtest):
    from sklearn.ensemble import BaggingClassifier
    model = BaggingClassifier(base_estimator=estimator, n_estimators=10, max_samples=0.9, max_features=0.9, n_jobs=1, 
                              bootstrap=False, bootstrap_features=False, oob_score=False)
    model = model.fit(xtrain,ytrain)
    y = model.predict_proba(xtest)
#     print("Bagging score with oob estimates: ")
#     print model.oob_score_
    print ("Model used: ")
    print model.base_estimator_
    return y 
Example #24
class BaggingLearner(AbstractLearner):

    def __init__(self):
        self.learner = BaggingClassifier(KNeighborsClassifier())

    def _train(self, x_train, y_train):
        self.learner = self.learner.fit(x_train, y_train)

    def _predict(self, x):
        return self.learner.predict(x)

    def _predict_proba(self, x):
        return self.learner.predict_proba(x)
Example #25
def phenotype_imputation(data, config):
    ''' 
    Function to impute the labels on II based on the classifier learned on I.
    
    Parameters 
    ---------- 
    data : an object of class Dataset that contains: genotypes, covariates, 
        labels and information about random folds 

    config : an object of class ConfigState. It contains the user-entered 
        parameters in a YAML format.
        See the config_file parameter in the main script for more details.
    '''
    # Parameters for this task
    num_folds = data.num_folds  
    task_name    = "phenotype_imputation"
    n_estimators = config.get_entry(task_name, "n_estimators")
    romans_trn   = config.get_entry(task_name, "romans_used_for_learning")
    romans_tst   = config.get_entry(task_name, "romans_used_for_imputing")
    
    # Iterate through the folds: 
    i = 0
    size_of_two = find_vec_entries_that_contain(data.folds[:,0], romans_tst).shape[0]
    soft_labels = np.zeros((size_of_two, num_folds))
    X_scaled = preprocessing.scale(data.clin_covariate.transpose()).transpose()
    fpr = dict()
    tpr = dict()
    thres = dict()
    roc_auc = np.zeros(num_folds)
    for fold in data.folds.transpose():      
        logging.info("Fold=%d" % (i + 1))
        sel_trn = find_vec_entries_that_contain(fold,[romans_trn])
        sel_tst = find_vec_entries_that_contain(fold,[romans_tst])

        model = BaggingClassifier(base_estimator=linear_model.LogisticRegression(),
                    n_estimators=n_estimators, max_samples=0.632, 
# for small set I   n_estimators=n_estimators, max_samples=0.8, 
                    max_features=5, 
                    bootstrap=True, bootstrap_features=True, oob_score=False, 
# for small set I   bootstrap=False, bootstrap_features=True, oob_score=False, 
                    n_jobs=1, random_state=None, verbose=0)
            
        model.fit(X_scaled[:,sel_trn].transpose(), data.labels[:,sel_trn].transpose())

        soft_labels[:,i] = model.predict_proba(X_scaled[:,sel_tst].transpose())[:,1]
        fpr[i], tpr[i], thres[i] = metrics.roc_curve(data.labels[0,sel_tst], soft_labels[:,i])
        roc_auc[i] = metrics.auc(fpr[i], tpr[i])
        i+=1

    # Save the output of this task
    config.save_variable(task_name, "%f", soft_labels=soft_labels, roc_auc=roc_auc)
Example #26
class BaggingDecisionTrees(object):

    def __init__(self, n_estimators):
        self.classifier = BaggingClassifier(n_estimators=n_estimators)

    def fit(self, xs, ys):
        xs = xs.values
        ys = ys['y']
        self.classifier.fit(xs, ys)

    def predict(self, xs):
        xs = xs.values
        ys = self.classifier.predict(xs)
        return ys
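# Hypothetical usage of BaggingDecisionTrees above (not part of the original snippet);
# fit() expects a pandas DataFrame of features and a frame with a 'y' column.
import pandas as pd
from sklearn.datasets import load_iris

iris_data = load_iris()
features = pd.DataFrame(iris_data.data)
targets = pd.DataFrame({'y': iris_data.target})

model = BaggingDecisionTrees(n_estimators=25)
model.fit(features, targets)
print(model.predict(features)[:5])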
Example #27
def test_probability():
    # Predict probabilities.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    with np.errstate(divide="ignore", invalid="ignore"):
        # Normal case
        ensemble = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                     random_state=rng).fit(X_train, y_train)

        assert_array_almost_equal(np.sum(ensemble.predict_proba(X_test),
                                         axis=1),
                                  np.ones(len(X_test)))

        assert_array_almost_equal(ensemble.predict_proba(X_test),
                                  np.exp(ensemble.predict_log_proba(X_test)))

        # Degenerate case, where some classes are missing
        ensemble = BaggingClassifier(base_estimator=LogisticRegression(),
                                     random_state=rng,
                                     max_samples=5).fit(X_train, y_train)

        assert_array_almost_equal(np.sum(ensemble.predict_proba(X_test),
                                         axis=1),
                                  np.ones(len(X_test)))

        assert_array_almost_equal(ensemble.predict_proba(X_test),
                                  np.exp(ensemble.predict_log_proba(X_test)))
Example #28
def main():
    # The competition datafiles are in the directory /input

    # Read output csv format in case the file does not exists
    submit = pd.read_csv('sample_submission.csv')

    # Training cols
    print ("Loading training csv.")
    #train_cols = ['site_name', 'posa_continent', 'user_location_country', 'user_location_region', 'user_location_city', 'orig_destination_distance', 'user_id', 'is_mobile', 'is_package', 'channel', 'srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt', 'srch_destination_id', 'srch_destination_type_id', 'hotel_continent', 'hotel_country', 'hotel_market', 'hotel_cluster']
    train_cols = ['site_name', 'user_location_region', 'is_package', 'srch_adults_cnt', 'srch_children_cnt', 'srch_destination_id', 'hotel_market', 'hotel_country', 'hotel_cluster']
    train = pd.DataFrame(columns=train_cols)
    train_chunk = pd.read_csv('input/train.csv', chunksize=100000)
    print ("Training csv loaded.")

    # Read each chunk to train
    for chunk in train_chunk:
        #train = pd.concat( [ train, chunk ] )
        train = pd.concat( [ train, chunk[chunk['is_booking']==1][train_cols] ] )
        print ("Chunk done")
    # Load each column
    #x_train = train[['site_name', 'posa_continent', 'user_location_country', 'user_location_region', 'user_location_city', 'orig_destination_distance', 'user_id', 'is_mobile', 'is_package', 'channel', 'srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt', 'srch_destination_id', 'srch_destination_type_id', 'hotel_continent', 'hotel_country', 'hotel_market']].values
    x_train = train[['site_name', 'user_location_region', 'is_package', 'srch_adults_cnt', 'srch_children_cnt', 'srch_destination_id', 'hotel_market', 'hotel_country']].values
    y_train = train['hotel_cluster'].values

    # Run RandomForest on training data
    print ("Training RandomForest.")
    rf = RandomForestClassifier(n_estimators=50, max_depth=10, n_jobs=4)
    bclf = BaggingClassifier(rf, n_estimators=2, n_jobs=4)
    bclf.fit(x_train, y_train)
    print ("Training done.")

    print ("Loading testing csv.")
    test_chunk = pd.read_csv('input/test.csv', chunksize=100000)
    print ("Begin testing each chunk.")
    predict = np.array([])
    # Read each chunk to test
    for i, chunk in enumerate(test_chunk):
        #test_X = chunk[['site_name', 'posa_continent', 'user_location_country', 'user_location_region', 'user_location_city', 'orig_destination_distance', 'user_id', 'is_mobile', 'is_package', 'channel', 'srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt', 'srch_destination_id', 'srch_destination_type_id', 'hotel_continent', 'hotel_country', 'hotel_market']].values
        test_X = chunk[['site_name', 'user_location_region', 'is_package', 'srch_adults_cnt', 'srch_children_cnt', 'srch_destination_id', 'hotel_market', 'hotel_country']].values
        test_X = np.nan_to_num(test_X)
        if i > 0:
            predict = np.concatenate( [predict, bclf.predict_proba(test_X)])
        else:
            predict = bclf.predict_proba(test_X)
        print ("Chunk id: " + str(i))

    submit['hotel_cluster'] = np.apply_along_axis(get5Best, 1, predict)
    submit.head()
    submit.to_csv('submission_random_forest.csv', index=False)
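# get5Best is referenced above but not defined in this snippet; a sketch consistent
# with its use in np.apply_along_axis (top-5 most probable hotel clusters per row,
# space-separated for the submission file) could be -- the output format and the
# reliance on bclf.classes_ are assumptions:
def get5Best(row):
    top5 = np.argsort(row)[::-1][:5]
    return ' '.join(str(int(c)) for c in bclf.classes_[top5])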
Example #29
def run_bagging(training_set, train_set_labels,  clsf,validation_set=None, validation_set_labels=None , facc=False):
    from sklearn.ensemble import BaggingClassifier

    bgc = BaggingClassifier(base_estimator=clsf, n_estimators=11, max_samples=1.0, max_features=1.0, bootstrap=True,
                            bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=1, random_state=None,
                            verbose=0)
    # standard_train_inputs = standard_data(training_set)
    # standard_valid_inputs = standard_data(validation_set)
    fbgc = bgc.fit(training_set,train_set_labels.ravel())
    if facc:
        acc = fbgc.score(validation_set,validation_set_labels.ravel())
        print(acc)
        return acc
    else:
        return fbgc
Example #30
def bagging_with_base_estimator(base_estimator, x_train, x_test, y_train,
                                y_test, rands = None):
    """
    Predict the lemons using a Bagging Classifier, with randomly drawn
    fractions both for the number of features and for the size of the
    sample used to train each estimator

    ARGS:

        - x_train: :class:`pandas.DataFrame` of the x_training data

        - y_train: :class:`pandas.Series` of the y_training data

        - x_test: :class:`pandas.DataFrame` of the x_testing data

        - y_test: :class:`pandas.Series` of the y_testing data

        - rands: a :class:`tuple` of the (rs, rf) to seed the sample
        and features of the BaggingClassifier.  If `None`, then
        rands are generated and provided in the return `Series`

    RETURNS:

        :class:`pandas.Series` of the f1-scores and random seeds
    """
    #create a dictionary for the return values
    ret_d = {'train-f1':[], 'test-f1':[], 'rs':[], 'rf':[]}

    #use the randoms provided if there are any, otherwise generate them
    if not rands:
        rs =  numpy.random.rand()
        rf = numpy.random.rand()
        while rf < 0.1:
            rf = numpy.random.rand()
    else:
        rs, rf = rands[0], rands[1]
    #place them into the dictionary
    ret_d['rs'], ret_d['rf'] = rs, rf
    #create and run the bagging classifier
    bc = BaggingClassifier(base_estimator = base_estimator, n_estimators = 300,
                           max_samples = rs, max_features = rf, n_jobs = 1)

    bc.fit(x_train, y_train)
    y_hat_train = bc.predict(x_train)
    ret_d['train-f1'] = f1_score(y_train, y_hat_train)
    y_hat_test = bc.predict(x_test)
    ret_d['test-f1'] = f1_score(y_test, y_hat_test)
    return pandas.Series(ret_d)
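# A possible call to bagging_with_base_estimator above (not in the original snippet),
# on a synthetic binary problem with a fixed (rs, rf) seed pair:
import pandas
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X_syn, y_syn = make_classification(n_samples=300, random_state=0)
x_train, x_test, y_train, y_test = train_test_split(pandas.DataFrame(X_syn),
                                                    pandas.Series(y_syn),
                                                    random_state=0)
print(bagging_with_base_estimator(DecisionTreeClassifier(), x_train, x_test,
                                  y_train, y_test, rands=(0.8, 0.5)))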
Example #31
    # validate_idx = list(range(num_sample))[int(num_sample*0.8):]

    x_test = data_test[fold]

    x_validate = data_train[fold][validate_idx]
    y_validate = label_train[validate_idx]
    w_validate = weight[validate_idx]

    x_train = data_train[fold][train_idx]
    y_train = label_train[train_idx]
    w_train = weight[train_idx]

    # train
    n_estimators = 50
    clf = BaggingClassifier(SVC(**params),
                            max_samples=1.0 / n_estimators,
                            n_estimators=n_estimators,
                            n_jobs=4)
    # clf = SVC(**params)

    clf.fit(x_train, y_train, sample_weight=w_train)

    # save
    with open('saved_models/SVC_{}/fold{}.pth'.format(timestr, fold),
              'wb') as f:
        pickle.dump(clf, f)

    # evaluate
    # on train set
    jigsawevaluator_train = utils.JigsawEvaluator(y_train,
                                                  y_identity[train_idx])
    train_pred = clf.predict_proba(x_train, )
Example #32
    def test_run_and_upload(self):
        # This unit test is meant to test the following functions, using a variety of flows:
        # - openml.runs.run_task()
        # - openml.runs.OpenMLRun.publish()
        # - openml.runs.initialize_model()
        # - [implicitly] openml.setups.initialize_model()
        # - openml.runs.initialize_model_from_trace()
        task_id = 119  # diabetes dataset
        num_test_instances = 253  # 33% holdout task
        num_folds = 1  # because of holdout
        num_iterations = 5  # for base search classifiers

        clfs = []
        random_state_fixtures = []

        lr = LogisticRegression()
        clfs.append(lr)
        random_state_fixtures.append('62501')

        pipeline1 = Pipeline(steps=[('scaler', StandardScaler(
            with_mean=False)), ('dummy', DummyClassifier(strategy='prior'))])
        clfs.append(pipeline1)
        random_state_fixtures.append('62501')

        pipeline2 = Pipeline(
            steps=[('Imputer', Imputer(
                strategy='median')), ('VarianceThreshold',
                                      VarianceThreshold()),
                   ('Estimator',
                    RandomizedSearchCV(DecisionTreeClassifier(), {
                        'min_samples_split': [2**x for x in range(1, 7 + 1)],
                        'min_samples_leaf': [2**x for x in range(0, 6 + 1)]
                    },
                                       cv=3,
                                       n_iter=10))])
        clfs.append(pipeline2)
        random_state_fixtures.append('62501')

        gridsearch = GridSearchCV(
            BaggingClassifier(base_estimator=SVC()), {
                "base_estimator__C": [0.01, 0.1, 10],
                "base_estimator__gamma": [0.01, 0.1, 10]
            })
        clfs.append(gridsearch)
        random_state_fixtures.append('62501')

        randomsearch = RandomizedSearchCV(
            RandomForestClassifier(n_estimators=5), {
                "max_depth": [3, None],
                "max_features": [1, 2, 3, 4],
                "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10],
                "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                "bootstrap": [True, False],
                "criterion": ["gini", "entropy"]
            },
            cv=StratifiedKFold(n_splits=2, shuffle=True),
            n_iter=num_iterations)

        clfs.append(randomsearch)
        # The random state for the RandomizedSearchCV is set after the
        # random state of the RandomForestClassifier, so it has a
        # different value than in the previous examples
        random_state_fixtures.append('12172')

        for clf, rsv in zip(clfs, random_state_fixtures):
            run = self._perform_run(task_id,
                                    num_test_instances,
                                    clf,
                                    random_state_value=rsv)

            # obtain accuracy scores using get_metric_fn:
            accuracy_scores = run.get_metric_fn(sklearn.metrics.accuracy_score)
            # compare with the scores in user defined measures
            accuracy_scores_provided = []
            for rep in run.fold_evaluations['predictive_accuracy'].keys():
                for fold in run.fold_evaluations['predictive_accuracy'][
                        rep].keys():
                    accuracy_scores_provided.append(
                        run.fold_evaluations['predictive_accuracy'][rep][fold])
            self.assertEquals(sum(accuracy_scores_provided),
                              sum(accuracy_scores))

            if isinstance(clf, BaseSearchCV):
                if isinstance(clf, GridSearchCV):
                    grid_iterations = 1
                    for param in clf.param_grid:
                        grid_iterations *= len(clf.param_grid[param])
                    self.assertEqual(len(run.trace_content),
                                     grid_iterations * num_folds)
                else:
                    self.assertEqual(len(run.trace_content),
                                     num_iterations * num_folds)
                check_res = self._check_serialized_optimized_run(run.run_id)
                self.assertTrue(check_res)

            # todo: check if runtime is present
            self._check_fold_evaluations(run.fold_evaluations, 1, num_folds)
            pass
Example #33
import pickle as pickle
from eeg_sandbox import *
import matplotlib.pyplot as plt

# BaggedSVM parameters chosen from gridsearch (deprecated)
C = 16
gamma = 0.0027472527472527475
# import X and y
X_save_path = './X_filtered_non_standardized.pkl'

X, y = load_X_and_y(X_save_path)
X, y = shuffle_X_and_y(X, y)
standardize_X(X)

# give a list of starting features, perhaps found using some other feature selection method.
# note this is optional - to use forward_selection alone simply omit the starting_features parameter
# when calling forward_selection.
starting_features = [30, 43, 49, 64, 108, 134, 159, 167, 200, 281, 299, 330]

# create model
model = BaggingClassifier(SVC(C=C, gamma=gamma),
                          n_estimators=500,
                          max_features=1.0)
#model = SVC(C=C, gamma=gamma)

# run algorithm and display selected features
picked_features, errors = forward_selection(
    X, y, model, 10, max_error=True, k=4, starting_features=starting_features)

print(picked_features)
Example #34
def third_generation(X, y, size=200, seed=None):
    mlp_parameters = list(itertools.product([1, 2, 4, 8, 32, 128],\
                                            [0, 0.2, 0.5, 0.9],
                                            [0.1, 0.3, 0.6]))
    mlp_clf = [
        MLPClassifier(hidden_layer_sizes=(h, ),
                      momentum=m,
                      learning_rate_init=a) for (h, m, a) in mlp_parameters
    ]
    mlp_name = ['mlp_{0}_{1}_{2}'.format(*param) for param in mlp_parameters]

    neigbhors_number = [int(i) for i in np.linspace(1, X.shape[0], 40)]
    weighting_methods = ['uniform', 'distance']
    knn_clf = [
        KNeighborsClassifier(n_neighbors=nn, weights=w)
        for (nn, w) in itertools.product(neigbhors_number, weighting_methods)
    ]
    knn_name = [
        'knn_{0}_{1}'.format(*param) for param in itertools.product(
            neigbhors_number, ['uniform', 'distance'])
    ]
    C = np.logspace(-3, 7, num=11)
    degree = [2, 3, 4]
    gamma = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
    svm_clf_poly = [
        SVC(C=c, kernel='poly', degree=d)
        for (c, d) in itertools.product(C, degree)
    ]
    svm_clf_poly_name = [
        'svm_poly_{0}_{1}'.format(*param)
        for param in itertools.product(C, degree)
    ]
    svm_clf_rbf = [
        SVC(C=c, kernel='rbf', gamma=g)
        for (c, g) in itertools.product(C, gamma)
    ]
    svm_clf_rbf_name = [
        'svm_rbf_{0}_{1}'.format(*param)
        for param in itertools.product(C, gamma)
    ]

    dt_params = list(itertools.product(['gini', 'entropy'], \
                                       [1, 2, 3, 4, 5, None], \
                                       [None, 'sqrt', 'log2'], \
                                       ['best', 'random']))
    dt_clf = [
        DecisionTreeClassifier(criterion=c,
                               max_depth=d,
                               max_features=f,
                               splitter=s) for (c, d, f, s) in dt_params
    ]
    dt_name = ['dt_{0}_{1}_{2}_{3}'.format(*param) for param in dt_params]

    et_clf = [
        ExtraTreeClassifier(criterion=c,
                            max_depth=d,
                            max_features=f,
                            splitter=s) for (c, d, f, s) in dt_params
    ]
    et_name = ['et_{0}_{1}_{2}_{3}'.format(*param) for param in dt_params]

    ada_params = list(itertools.product([2**i for i in range(1, 14)], \
                                        [1, 2, 3]))
    ada_dt_clf = [
        AdaBoostClassifier(n_estimators=n,
                           base_estimator=DecisionTreeClassifier(max_depth=m))
        for (n, m) in ada_params
    ]
    ada_et_clf = [
        AdaBoostClassifier(n_estimators=n,
                           base_estimator=ExtraTreeClassifier(max_depth=m))
        for (n, m) in ada_params
    ]
    ada_dt_name = ['ada_dt_{0}_{1}'.format(*param) for param in ada_params]
    ada_et_name = ['ada_et_{0}_{1}'.format(*param) for param in ada_params]

    nb_bag_est = 50
    nb_bag_stumps = 200
    bag_dt = BaggingClassifier(n_estimators=nb_bag_est,
                               base_estimator=DecisionTreeClassifier())
    bag_et = BaggingClassifier(n_estimators=nb_bag_est,
                               base_estimator=ExtraTreeClassifier())
    bag_stumps = BaggingClassifier(
        n_estimators=nb_bag_stumps,
        base_estimator=DecisionTreeClassifier(max_depth=1))
    bag_dt.fit(X, y)
    bag_et.fit(X, y)
    bag_stumps.fit(X, y)
    dt_bag_clf = bag_dt.estimators_
    et_bag_clf = bag_et.estimators_
    stump_bag_clf = bag_stumps.estimators_
    dt_bag_name = ['dt_bag_{0}'.format(nb_est) for nb_est in range(nb_bag_est)]
    et_bag_name = ['et_bag_{0}'.format(nb_est) for nb_est in range(nb_bag_est)]
    stump_bag_name = [
        'stump_bag_{0}'.format(nb_est) for nb_est in range(nb_bag_stumps)
    ]

    bag_dt_clf = [bag_dt]
    bag_et_clf = [bag_et]
    bag_stump_clf = [bag_stumps]
    bag_dt_name = ['bag_dt_{0}'.format(str(nb_bag_est))]
    bag_et_name = ['bag_et_{0}'.format(str(nb_bag_est))]
    bag_stump_name = ['bag_stump_{0}'.format(str(200))]

    nb_rf = 15
    rf = RandomForestClassifier(n_estimators=nb_rf)
    rf.fit(X, y)
    dt_rf_clf = rf.estimators_
    dt_rf_name = ['dt_rf_{0}'.format(nb_est) for nb_est in range(nb_rf)]

    log_parameters = list(itertools.product(['l1', 'l2'],\
                                            np.logspace(-5, 9, num=15),
                                            [True, False]))
    log_clf = [
        LogisticRegression(penalty=l, C=c, fit_intercept=f)
        for (l, c, f) in log_parameters
    ]
    log_name = ['log_{0}_{1}_{2}'.format(*param) for param in log_parameters]

    sgd_parameters = list(
        itertools.product([
            'hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron',
            'squared_loss', 'huber', 'epsilon_insensitive',
            'squared_epsilon_insensitive'
        ], ['elasticnet'], [True, False], np.arange(0, 1.1, 0.1)))
    sgd_clf = [
        SGDClassifier(loss=l, penalty=p, fit_intercept=f, l1_ratio=l1)
        for (l, p, f, l1) in sgd_parameters
    ]
    sgd_name = [
        'sgd_{0}_{1}_{2}_{3}'.format(*param) for param in sgd_parameters
    ]

    pool = mlp_clf + knn_clf + svm_clf_poly + svm_clf_rbf + dt_clf + et_clf + ada_dt_clf + ada_et_clf + \
                dt_bag_clf + et_bag_clf + stump_bag_clf + bag_dt_clf + bag_et_clf + bag_stump_clf + dt_rf_clf + \
                log_clf + sgd_clf
    pool_name = mlp_name + knn_name + svm_clf_poly_name + svm_clf_rbf_name + dt_name + et_name + ada_dt_name + \
                ada_et_name + dt_bag_name + et_bag_name + stump_bag_name + bag_dt_name + bag_et_name + \
                bag_stump_name + dt_rf_name + log_name + sgd_name

    for model in pool:
        if not check_model_is_fitted(model, X[0, :].reshape((1, -1))):
            model.fit(X, y)

    np.random.seed(seed)
    order = np.random.permutation(range(len(pool)))
    estimators = [pool[i] for i in order[:size]]

    return estimators, pool_name
Example #35
print("Accuracy score for training samples",
      accuracy_score(y_train, dt.predict(X_train)))
final_accuracy_scores_randomForest_gini.append([
    dt,
    confusion_matrix(y_test, dt.predict(X_test)),
    accuracy_score(y_test, dt.predict(X_test)),
    confusion_matrix(y_train, dt.predict(X_train)),
    accuracy_score(y_train, dt.predict(X_train))
])
from sklearn.model_selection import cross_val_score
print("K-Fold results for machine learning model : {} ".format(dt))
print(cross_val_score(dt, X_train, y_train, cv=10))
predicted_randomForest_gini = dt.predict(X_test)
predicted_randomForest_gini
final_accuracy_scores_Bagging = []
dt = BaggingClassifier()
dt.fit(X_train, y_train)
dt.predict(X_train)
dt.predict(X_test)
print("")
print(
    "---------------------------------------------------------------------------------------------------------"
)
print("For the machine learning model : {}".format(i))
print("Confusion matrix for test samples")
print(confusion_matrix(y_test, dt.predict(X_test)))
print("Accuracy score for test samples",
      accuracy_score(y_test, dt.predict(X_test)))
print("Confusion matrix for training samples")
print(confusion_matrix(y_train, dt.predict(X_train)))
print("Accuracy score for training samples",
Example #36
# Computing prediction error
cart_error = np.mean((ypred - y_test)**2)

cart_verror = np.asarray([int(ypred[i] != y_test[i]) for i in range(0, ts)])
cart_error = np.sum(cart_verror)

print("🌲  ----------Decision Tree Classfication----------")
print(cart_error, "misclassified data out of", ts, "(", cart_error / ts,
      "%)\n")
# print ("")
'''--------------------
CART (Decision Tree) + Bagging
http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html
--------------------'''
bagb = BaggingClassifier(dtc,
                         n_estimators=30,
                         bootstrap_features=True,
                         bootstrap=bootstrap)
# adab = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=20,learning_rate=1.5,algorithm="SAMME")
bagb.fit(x_training, y_training)

# Predicting
bagb_pred = bagb.predict(x_test)

# Finding mispredicted samples
bagb_verror = np.asarray(
    [int(bagb_pred[i] != y_test[i]) for i in range(0, ts)])
bagb_error = np.sum(bagb_verror)
bagb_ccidx = np.where(bagb_verror == 0)
bagb_mcidx = np.where(bagb_verror == 1)

print("🌲  ----------Decision Tree Classfication + Bagging----------")
Example #37
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=1,
                                                    stratify=y)

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
tree = DecisionTreeClassifier(criterion='entropy',
                              random_state=1,
                              max_depth=None)
bag = BaggingClassifier(base_estimator=tree,
                        n_estimators=500,
                        max_samples=1.0,
                        max_features=1.0,
                        bootstrap=True,
                        bootstrap_features=False,
                        n_jobs=1,
                        random_state=1)

# Run Decision Tree to see the accuracy of the classifier
from sklearn.metrics import accuracy_score
tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)
tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print('Decision tree train/test accuracies %.3f/%.3f' %
      (tree_train, tree_test))

# Run Bagging classifier
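# The snippet is cut off here; a plausible continuation, mirroring the decision tree
# block above, fits the bagging ensemble and compares its train/test accuracy:
bag = bag.fit(X_train, y_train)
y_train_pred = bag.predict(X_train)
y_test_pred = bag.predict(X_test)
bag_train = accuracy_score(y_train, y_train_pred)
bag_test = accuracy_score(y_test, y_test_pred)
print('Bagging train/test accuracies %.3f/%.3f' % (bag_train, bag_test))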
Example #38
def get_data(file_path):
    with open(file_path) as f:
        df = a2p.load(f)
        df = df.interpolate()
        input_features = df.drop(["defects@{false,true}"], axis=1)
        output_class = np.where(df["defects@{false,true}"] == 'true', 1, 0)
        return np.array(input_features), np.array(output_class)

    return   


X, y = get_data('../cm1.arff')
es = BaggingClassifier(base_estimator= Perceptron(max_iter=1000, class_weight = 'balanced'), 
                    n_estimators=100, 
                    max_samples=1.0, 
                    max_features=1.0, 
                    bootstrap=True,
                    bootstrap_features=False, 
                    n_jobs=4)

new_result = lambda : {'accuracy':[], 'roc_auc': [], 'gmean': [], 'f1':[]}
results = new_result()
k_neigh = 7
threshold = 0.4

# 5 repetitions of 4-fold cross-validation = 20 evaluations
for rep in range(1,6):
    skf = StratifiedKFold(n_splits=4)
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
Example #39
def train():
    # if os.path.exists('dataset/per_feature_matrix'):
    #     per_feature_matrix = pickle.load(open('dataset/per_feature_matrix', 'rb'))
    # else:
    start = time.time()
    print "extracting feature matrix..."
    if 1:
        per_feature_matrix = {}
        for each in os.listdir('dataset/per_feature'):
            path = os.path.join('dataset/per_feature/', each)
            per_feature_matrix = dict(pickle.load(open(path, 'rb')),
                                      **per_feature_matrix)
        per_feature_matrix = per_feature_matrix.values()
        pickle.dump(per_feature_matrix, open('dataset/per_feature_matrix',
                                             'wb'))

    # if os.path.exists('dataset/api_feature_matrix'):
    #     api_feature_matrix = pickle.load(open('dataset/api_feature_matrix', 'rb'))
    # else:
    if 1:
        api_feature_matrix = {}
        for each in os.listdir('dataset/api_feature'):
            path = os.path.join('dataset/api_feature/', each)
            api_feature_matrix = dict(pickle.load(open(path, 'rb')),
                                      **api_feature_matrix)
        api_feature_matrix = api_feature_matrix.values()
        pickle.dump(api_feature_matrix, open('dataset/api_feature_matrix',
                                             'wb'))

    # if os.path.exists('dataset/ngram_feature_matrix'):
    #     ngram_feature_matrix = pickle.load(open('dataset/ngram_feature_matrix', 'rb'))
    # else:
    if 1:
        ngram_feature_matrix = {}
        for each in os.listdir('dataset/ngram_feature'):
            path = os.path.join('dataset/ngram_feature/', each)
            ngram_feature_matrix = dict(pickle.load(open(path, 'rb')),
                                        **ngram_feature_matrix)
        ngram_feature_matrix = ngram_feature_matrix.values()
        pickle.dump(ngram_feature_matrix,
                    open('dataset/ngram_feature_matrix', 'wb'))

    classification = pickle.load(open('dataset/classification', 'rb'))
    if per_feature_matrix is not None and api_feature_matrix is not None and ngram_feature_matrix is not None:
        feature_matrix = _concatenate(per_feature_matrix, api_feature_matrix,
                                      ngram_feature_matrix)
    elif per_feature_matrix is not None:
        feature_matrix = per_feature_matrix
    elif api_feature_matrix is not None:
        feature_matrix = api_feature_matrix
    elif ngram_feature_matrix is not None:
        feature_matrix = ngram_feature_matrix
    else:
        return
    print "extracting feature matrix done."
    print "处理前样本总数:%d" % len(feature_matrix)

    #print len(feature_matrix)
    #print len(classification)

    features = 400
    fsmodel = SelectKBest(chi2, k=features)
    raw_feature_matrix = feature_matrix
    feature_matrix = fsmodel.fit_transform(feature_matrix, classification)

    pickle.dump(fsmodel, open('dataset/fsmodel', 'wb'))

    features = 300
    svc = SVC(kernel="linear", C=1)
    fsmodel2 = RFE(estimator=svc, n_features_to_select=features, step=1)

    #########################    DEBUG    ############################
    #classification = classification[7:]
    ##################################################################
    feature_matrix = fsmodel2.fit_transform(feature_matrix, classification)

    pickle.dump(fsmodel2, open('dataset/fsmodel2', 'wb'))

    #########################    DEBUG    ############################
    b_s = 5  # if you change this, also change the default value in dl.py
    length = len(feature_matrix)
    feature_matrix = feature_matrix[length % b_s:]
    raw_feature_matrix = raw_feature_matrix[length % b_s:]
    classification = classification[length % b_s:]
    print "处理后样本总数:%d" % len(feature_matrix)
    ##################################################################

    #########################    DEBUG    ############################
    fs_vec = []
    for i in range(len(raw_feature_matrix[0])):
        fs_vec.append(i)  # build a special vector whose values equal the column indices

    fs_vec = fsmodel.transform(fs_vec)
    #print fs_vec
    fs_vec = fsmodel2.transform(fs_vec)
    #print fs_vec

    feature_matrix_dl = [x for x in range(len(raw_feature_matrix))]
    for i in range(len(feature_matrix_dl)):
        feature_matrix_dl[i] = [
            x for x in range(len(raw_feature_matrix[0]) - features)
        ]
    temp = 0
    for i in range(len(raw_feature_matrix[0])):
        if i not in fs_vec:
            print "第%d列特征没有选用" % i
            for j in range(len(feature_matrix_dl)):
                feature_matrix_dl[j][temp] = raw_feature_matrix[j][i]
            temp = temp + 1

    #print "行数%d" % len(feature_matrix_dl)
    #print "列数%d" % len(feature_matrix_dl[0])
    #print feature_matrix_dl

    ##################################################################
    #hiddeny, da = test_dA(feature_matrix_dl, len(feature_matrix_dl[0]))
    # hiddeny2, test = test_dA(feature_matrix,len(feature_matrix[0]), batch_size=6, da_object = da)
    hiddeny, da = test_rbm(feature_matrix_dl, len(feature_matrix_dl[0]))
    #print len(feature_matrix)
    print "浅度特征数:%d" % len(feature_matrix[0])
    #print len(hiddeny)
    print "深度特征数:%d" % len(hiddeny[0])
    # print (hiddeny == hiddeny2).all()

    # persist the deep feature extractor
    pickle.dump(da, open('dataset/rbmmodel', 'wb'))

    # deep feature fusion
    feature_matrix = numpy.concatenate((feature_matrix, hiddeny), axis=1)

    Z = []
    count = 0
    for i in feature_matrix:
        Z.append([])
        for j in i:
            Z[count].append(j)

        count += 1

    feature_matrix = Z

    # print feature_matrix

    Z = []
    for i in classification:
        Z.append(int(i))

    classification = Z

    if 1:
        per_feature_matrix2 = {}
        for each in os.listdir('test/per_feature'):
            path = os.path.join('test/per_feature/', each)
            per_feature_matrix2 = dict(pickle.load(open(path, 'rb')),
                                       **per_feature_matrix2)
        per_feature_matrix2 = per_feature_matrix2.values()
        pickle.dump(per_feature_matrix2, open('test/per_feature_matrix', 'wb'))

    # if os.path.exists('dataset/api_feature_matrix'):
    #     api_feature_matrix = pickle.load(open('dataset/api_feature_matrix', 'rb'))
    # else:
    if 1:
        api_feature_matrix2 = {}
        for each in os.listdir('test/api_feature'):
            path = os.path.join('test/api_feature/', each)
            api_feature_matrix2 = dict(pickle.load(open(path, 'rb')),
                                       **api_feature_matrix2)
        api_feature_matrix2 = api_feature_matrix2.values()
        pickle.dump(api_feature_matrix2, open('test/api_feature_matrix', 'wb'))

    # if os.path.exists('dataset/ngram_feature_matrix'):
    #     ngram_feature_matrix = pickle.load(open('dataset/ngram_feature_matrix', 'rb'))
    # else:
    if 1:
        ngram_feature_matrix2 = {}
        for each in os.listdir('test/ngram_feature'):
            path = os.path.join('test/ngram_feature/', each)
            ngram_feature_matrix2 = dict(pickle.load(open(path, 'rb')),
                                         **ngram_feature_matrix2)
        ngram_feature_matrix2 = ngram_feature_matrix2.values()
        pickle.dump(ngram_feature_matrix2,
                    open('test/ngram_feature_matrix', 'wb'))

    classification2 = pickle.load(open('test/classification', 'rb'))
    if per_feature_matrix2 is not None and api_feature_matrix2 is not None and ngram_feature_matrix2 is not None:
        feature_matrix2 = _concatenate(per_feature_matrix2,
                                       api_feature_matrix2,
                                       ngram_feature_matrix2)
    elif per_feature_matrix2 is not None:
        feature_matrix2 = per_feature_matrix2
    elif api_feature_matrix2 is not None:
        feature_matrix2 = api_feature_matrix2
    elif ngram_feature_matrix2 is not None:
        feature_matrix2 = ngram_feature_matrix2
    else:
        return
    print "extracting feature matrix done."
    print "处理前样本总数:%d" % len(feature_matrix2)

    #print len(feature_matrix)
    #print len(classification)

    features = 400
    fsmodel2 = SelectKBest(chi2, k=features)
    raw_feature_matrix2 = feature_matrix2
    feature_matrix2 = fsmodel.fit_transform(feature_matrix2, classification2)

    features2 = 300
    svc = SVC(kernel="linear", C=1)
    fsmodel2 = RFE(estimator=svc, n_features_to_select=features2, step=1)
    feature_matrix2 = fsmodel2.fit_transform(feature_matrix2, classification2)

    #########################    DEBUG    ############################
    b_s = 5  # if you change this, also change the default value in dl.py
    length = len(feature_matrix2)
    feature_matrix2 = feature_matrix2[length % b_s:]
    raw_feature_matrix2 = raw_feature_matrix2[length % b_s:]
    classification2 = classification2[length % b_s:]
    print "处理后样本总数:%d" % len(feature_matrix2)
    ##################################################################

    #########################    DEBUG    ############################
    fs_vec2 = []
    for i in range(len(raw_feature_matrix2[0])):
        fs_vec2.append(i)  # build a special vector whose values equal the column indices

    fs_vec2 = fsmodel.transform(fs_vec2)
    #print fs_vec
    fs_vec2 = fsmodel2.transform(fs_vec2)
    #print fs_vec

    feature_matrix_dl2 = [x for x in range(len(raw_feature_matrix2))]
    for i in range(len(feature_matrix_dl2)):
        feature_matrix_dl2[i] = [
            x for x in range(len(raw_feature_matrix2[0]) - features2)
        ]
    temp = 0
    for i in range(len(raw_feature_matrix2[0])):
        if i not in fs_vec2:
            print "第%d列特征没有选用" % i
            for j in range(len(feature_matrix_dl2)):
                feature_matrix_dl2[j][temp] = raw_feature_matrix2[j][i]
            temp = temp + 1

    hiddeny2, da = test_rbm(feature_matrix_dl2, len(feature_matrix_dl2[0]))
    #print len(feature_matrix)
    print "浅度特征数:%d" % len(feature_matrix2[0])
    #print len(hiddeny)
    print "深度特征数:%d" % len(hiddeny2[0])
    # print (hiddeny == hiddeny2).all()

    # fuse the deep features with the selected shallow features
    feature_matrix2 = numpy.concatenate((feature_matrix2, hiddeny2), axis=1)

    # Convert the fused numpy matrix back to a plain list of lists.
    feature_matrix2 = [list(row) for row in feature_matrix2]

    # print feature_matrix

    classification2 = [int(label) for label in classification2]
    '''
    kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True)
    print "\nlearning with RF..."
    rf = RandomForestClassifier(n_estimators=300, min_samples_split=10)
    rf.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    scores = cross_validation.cross_val_score(rf, feature_matrix2, classification2, cv=kf)
    print scores
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    print "learning with RF done.\n"
    pickle.dump(rf, open('dataset/model', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
	
    print "learning with GBDT..."
    gbdt = GradientBoostingClassifier(n_estimators=300, learning_rate=1.0,
    max_depth=100, min_samples_split=10, random_state=0)
    gbdt.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    scores = cross_validation.cross_val_score(gbdt, feature_matrix2, classification2, cv=kf)
    print scores
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    print "learning with GBDT done.\n"
    pickle.dump(gbdt, open('dataset/model2', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
	
    print "learning with AdaBoost..."
    ada = AdaBoostClassifier(n_estimators=300)
    ada.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    scores = cross_validation.cross_val_score(ada, feature_matrix2, classification2, cv=kf)
    print scores
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    print "learning with AdaBoost done.\n"
    pickle.dump(ada, open('dataset/model3', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
	
    print "learning with LogisticRegression..."
    lr = LogisticRegression()
    lr.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    scores = cross_validation.cross_val_score(lr, feature_matrix2, classification2, cv=kf)
    print scores
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    print "learning with LogisticRegression done.\n"
    pickle.dump(lr, open('dataset/model4', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
    
    kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True)
    print "\nlearning with RF..."
    rf = RandomForestClassifier(n_estimators=300, min_samples_split=10)
    rf.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    predicted = cross_validation.cross_val_predict(rf, feature_matrix2, classification2, cv=kf)
    print "Confusion matrix: "
    print metrics.confusion_matrix(classification2, predicted)
    print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted))
    print "Precision: "
    print metrics.precision_score(classification2, predicted, average=None)
    print "Recall: "
    print metrics.recall_score(classification2, predicted, average=None)
    print "F1 "
    print metrics.f1_score(classification2, predicted, average=None)
    print "learning with RF done.\n"
    pickle.dump(rf, open('dataset/model', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
	
    print "learning with GBDT..."
    gbdt = GradientBoostingClassifier(n_estimators=300, learning_rate=1.0,
    max_depth=100, min_samples_split=10, random_state=0)
    gbdt.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    predicted = cross_validation.cross_val_predict(gbdt, feature_matrix2, classification2, cv=kf)
    print "Confusion matrix: "
    print metrics.confusion_matrix(classification2, predicted)
    print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted))
    print "Precision: "
    print metrics.precision_score(classification2, predicted, average=None)
    print "Recall: "
    print metrics.recall_score(classification2, predicted, average=None)
    print "F1 "
    print metrics.f1_score(classification2, predicted, average=None)
    print "learning with GBDT done.\n"
    pickle.dump(gbdt, open('dataset/model2', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
	
    print "learning with AdaBoost..."
    ada = AdaBoostClassifier(n_estimators=300)
    ada.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    predicted = cross_validation.cross_val_predict(ada, feature_matrix2, classification2, cv=kf)
    print "Confusion matrix: "
    print metrics.confusion_matrix(classification2, predicted)
    print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted))
    print "Precision: "
    print metrics.precision_score(classification2, predicted, average=None)
    print "Recall: "
    print metrics.recall_score(classification2, predicted, average=None)
    print "F1 "
    print metrics.f1_score(classification2, predicted, average=None)
    print "learning with AdaBoost done.\n"
    pickle.dump(ada, open('dataset/model3', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
	
    print "learning with LogisticRegression..."
    lr = LogisticRegression()
    lr.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    predicted = cross_validation.cross_val_predict(lr, feature_matrix2, classification2, cv=kf)
    print "Confusion matrix: "
    print metrics.confusion_matrix(classification2, predicted)
    print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted))
    print "Precision: "
    print metrics.precision_score(classification2, predicted, average=None)
    print "Recall: "
    print metrics.recall_score(classification2, predicted, average=None)
    print "F1 "
    print metrics.f1_score(classification2, predicted, average=None)
    print "learning with LogisticRegression done.\n"
    pickle.dump(lr, open('dataset/model4', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
    '''
    '''
    kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True)
    print "\nlearning with SVC..."
    slffork=SVC(kernel='rbf',probability = True)
    slffork.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    predicted = cross_validation.cross_val_predict(slffork, feature_matrix2, classification2, cv=kf)
    print "Confusion matrix: "
    print metrics.confusion_matrix(classification2, predicted)
    print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted))
    print "Precision: "
    print metrics.precision_score(classification2, predicted, average=None)
    print "Recall: "
    print metrics.recall_score(classification2, predicted, average=None)
    print "F1 "
    print metrics.f1_score(classification2, predicted, average=None)
    print "learning with SVC done.\n"
    pickle.dump(slffork, open('dataset/model2', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
    '''
    '''
    print "learning with BaggingClassifier..."
    kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True)
    baggingfork = BaggingClassifier(KNeighborsClassifier(),max_samples=0.5,max_features=0.5)
    baggingfork.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    predicted = cross_validation.cross_val_predict(baggingfork, feature_matrix2, classification2, cv=kf)
    print "Confusion matrix: "
    print metrics.confusion_matrix(classification2, predicted)
    print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted))
    print "Precision: "
    print metrics.precision_score(classification2, predicted, average=None)
    print "Recall: "
    print metrics.recall_score(classification2, predicted, average=None)
    print "F1 "
    print metrics.f1_score(classification2, predicted, average=None)
    print "learning with BaggingClassifier done.\n"
    pickle.dump(baggingfork, open('dataset/model2', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
    '''
    '''kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True)'''
    rf = RandomForestClassifier(n_estimators=300, min_samples_split=10)
    gbdt = GradientBoostingClassifier(n_estimators=300,
                                      learning_rate=1.0,
                                      max_depth=100,
                                      min_samples_split=10,
                                      random_state=0)
    ada = AdaBoostClassifier(n_estimators=300)
    #slf1=SVC(kernel='rbf',probability = True)
    bagging = BaggingClassifier(KNeighborsClassifier(),
                                max_samples=0.5,
                                max_features=0.5)

    print "learning with Voting Classifier..."
    vc = VotingClassifier(estimators=[('rf', rf), ('ada', ada),
                                      ('bagging', bagging), ('gbdt', gbdt)],
                          voting='soft',
                          weights=[1.5, 1.5, 1.3, 1.5])
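    # voting='soft' averages the estimators' predicted class probabilities,
    # weighted per estimator by the weights list, instead of counting hard votes.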
    vc.fit(feature_matrix2, classification2)
    '''
    print "Cross Validating..."
    predicted = cross_validation.cross_val_predict(vc, feature_matrix2, classification2, cv=kf)
    print "Confusion matrix: "
    print metrics.confusion_matrix(classification2, predicted)
    print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted))
    print "Precision: "
    print metrics.precision_score(classification2, predicted, average=None)
    print "Recall: "
    print metrics.recall_score(classification2, predicted, average=None)
    print "F1 "
    print metrics.f1_score(classification2, predicted, average=None)
    '''
    print "learning with Ensemble Classifier done.\n"
    pickle.dump(vc, open('dataset/model_final', 'wb'))  # persist the trained model
    print 'time :%f' % (time.time() - start)
classReport(y_train, y_train_pred4, y_pred4, y_test)

#Cross fit metrics
scores4 = cross_val_score(clf4, features, target, cv=cv)
mean_score(scores4)
print_score(scores4)

######################################################################################
#using bagging
print('\nResult of Bagging Classifier')
#classifier and fit
#clf5 = BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_features=0.5)
clf5 = Pipeline([('scaler', StandardScaler()),
                 ('bagging',
                  BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                    max_samples=0.5,
                                    max_features=1.0,
                                    n_estimators=20))])
clf5 = clf5.fit(X_train, y_train)

#prediction
y_train_pred5 = clf5.predict(X_train)
y_pred5 = clf5.predict(X_test)

#metrics creation
classReport(y_train, y_train_pred5, y_pred5, y_test)

#Cross fit metrics
scores5 = cross_val_score(clf5, features, target, cv=cv)
mean_score(scores5)
print_score(scores5)
Пример #41
0
acc11 = accuracy_score(y_test, final_pred1, normalize=True)
acc12 = accuracy_score(y_test, final_pred2)
acc13 = accuracy_score(y_test, final_pred3)
acc14 = accuracy_score(y_test, final_pred6)
acc15 = accuracy_score(y_test, final_pred7)
acc16 = accuracy_score(y_test, final_pred8)
acc17 = accuracy_score(y_test, final_pred9)

#Pre6=metrics.precision_score(y_test,pred6,average=None)

model = AdaBoostClassifier(random_state=1)
model.fit(x_train, y_train)
ac = model.score(x_test, y_test)

model12 = BaggingClassifier(DecisionTreeClassifier(random_state=1))
model12.fit(x_train, y_train)
ac12 = model12.score(x_test, y_test)

model13 = xgb.XGBClassifier(random_state=1, learning_rate=0.01)
#model13.fit(x_train,y_train)
#acc13=model13.score(x_test,y_test)

print("\n\n")
print("Random Forest :", end="")
print(acc1)
print("Kneighbors :", end="")
print(acc2)
print("SVM :", end="")
print(acc3)
print("Linear Regression :", end="")
Пример #42
0
def bagging(train_x, train_y, test_x, test_y, msno_df):
    print ("Bagging")
    clf = BaggingClassifier(base_estimator=tree.DecisionTreeClassifier(), max_samples=0.9, n_estimators=30, bootstrap=False)
    checkResult(clf, "Bagging", train_x, train_y, test_x, test_y, msno_df)
Пример #43
0
clf = AdaBoostClassifier()
clf.fit(x_train, y_train)

print("AdaBoost classifier")
print(clf.score(x_test, y_test))
print("\n")

#Bagging Classifier

x_train, x_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.1,
                                                    random_state=42)

clf = BaggingClassifier()
clf.fit(x_train, y_train)

print("Bagging classifier")
print(clf.score(x_test, y_test))
print("\n")

#ExtraTrees Classifier

x_train, x_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.1,
                                                    random_state=42)

clf = ExtraTreesClassifier()
clf.fit(x_train, y_train)
Пример #44
0
print('The cross validated score is',cross.mean())


# ## Bagging
# 
# Bagging is a general ensemble method. It works by training similar classifiers on random subsets of the dataset and then averaging all of their predictions. Because of the averaging, the variance of the ensemble is reduced. Unlike the Voting Classifier, Bagging makes use of similar (same-type) classifiers.
# 
# #### Bagged KNN
# 
# Bagging works best with models that have high variance; an example of this is a Decision Tree or Random Forest. We can use KNN with a small value of **n_neighbours**, as a small value of n_neighbours gives a high-variance model.

# In[ ]:


from sklearn.ensemble import BaggingClassifier
model=BaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors=3),random_state=0,n_estimators=700)
model.fit(train_X,train_Y)
prediction=model.predict(test_X)
print('The accuracy for bagged KNN is:',metrics.accuracy_score(prediction,test_Y))
result=cross_val_score(model,X,Y,cv=10,scoring='accuracy')
print('The cross validated score for bagged KNN is:',result.mean())


# #### Bagged DecisionTree
# 

# In[ ]:


model=BaggingClassifier(base_estimator=DecisionTreeClassifier(),random_state=0,n_estimators=100)
model.fit(train_X,train_Y)
Пример #45
0
        GradientBoostingClassifier(),
        dict(learning_rate=[0.001, 0.01, 0.1],
             n_estimators=[10, 100, 1000],
             subsample=[0.5, 0.7, 1.0],
             max_depth=[3, 7, 9])
    ],
    'RidgeClassifier': [
        RidgeClassifier(),
        dict(alpha=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
    ],
    'LogisticRegression': [
        LogisticRegression(),
        dict(solver=['newton-cg', 'lbfgs', 'liblinear'],
             penalty=['l2'],
             C=[100, 10, 1.0, 0.1, 0.01])
    ],
    'BaggingClassifier':
    [BaggingClassifier(),
     dict(n_estimators=[10, 100, 1000])],
    'ANN': [
        MLPClassifier(),
        dict(activation=['identity', 'logistic', 'tanh', 'relu'],
             hidden_layer_sizes=[(i, j) for i in range(1, 101, 10)
                                 for j in range(1, 101, 10)])
    ]
}
which_algorithm_to_run = [
    'random forest', 'SVM', 'GB', 'RidgeClassifier', 'LogisticRegression',
    'BaggingClassifier', 'ANN'
]
Пример #46
0
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    knn_class = KNeighborsClassifier().fit(X_train, y_train)
    knn_pred = knn_class.predict(X_test)
    print('=' * 64)
    print('Accuracy with only KNeighborsClassifier:',
          accuracy_score(knn_pred, y_test))

    #bag_class = BaggingClassifier(base_estimator=KNeighborsClassifier(),
    #                              n_estimators=50).fit(X_train, y_train)
    #bag_pred = bag_class.predict(X_test)
    #print(accuracy_score(bag_pred, y_test))
    #print('='*64)

    classifier = {
        'KNeighbors': KNeighborsClassifier(),
        'LogisticRegression': LogisticRegression(),
        'LinearSCV': LinearSVC(),
        'SVC': SVC(),
        'SGDC': SGDClassifier(),
        'DecisionTree': DecisionTreeClassifier(),
        'RandomTreeForest': RandomForestClassifier(random_state=0)
    }

    for name, estimator in classifier.items():
        bag_class = BaggingClassifier(base_estimator=estimator,
                                      n_estimators=30).fit(X_train, y_train)
        bag_pred = bag_class.predict(X_test)

        print(f'Accuracy Bagging with {name}:',
              accuracy_score(bag_pred, y_test))
Пример #47
0
svmendzeit = timeit.default_timer()
print("SVM zeit: ", svmendzeit - svmzeit)
print("Linear :", "Score 1: ", score_linear1, "Score 2: ", score_linear2,
      "Poly : ", "Score 1: ", score_poly1, "Score 2: ", score_poly2,
      "Score 1: ", score_gauss1, "Score 2 : ", score_gauss2)
print(" ")
#"""

# In[7]
"""Bagging """

#"""
startBag = timeit.default_timer()

model_linear = BaggingClassifier(base_estimator=svm_linear,
                                 n_estimators=50,
                                 random_state=1)
score_linear_bag1 = model_linear.fit(X1_train,
                                     y1_train).score(X1_test, y1_test)
score_linear_bag2 = model_linear.fit(X2_train,
                                     y2_train).score(X2_test, y2_test)
bag1 = timeit.default_timer()
print("linear bag score zeit : ", startBag - bag1)

model_poly = BaggingClassifier(base_estimator=svm_poly,
                               n_estimators=50,
                               random_state=1)
score_poly_bag1 = model_poly.fit(X1_train, y1_train).score(X1_test, y1_test)
score_poly_bag2 = model_poly.fit(X2_train, y2_train).score(X2_test, y2_test)
bag2 = timeit.default_timer()
print("poly bag score zeit", bag2 - bag1)
Пример #48
0
print(testData.head())

# Initialize the DSBox Encoder

hp = EncHyperparameter(text2int=True,
                       n_limit=12,
                       categorical_features='95in10')
enc = Encoder(hyperparams=hp)
enc.set_training_data(inputs=trainData)
enc.fit()

print(type(enc.get_params()))
print(enc.get_params())

imputer = Imputer()
model = BaggingClassifier()

print(trainData.columns)

encodedTrainData = enc.produce(inputs=trainData).value
processedTrainData = imputer.fit_transform(encodedTrainData)
trainedModel = model.fit(processedTrainData,
                         np.asarray(trainTargets[target_name]))

print(encodedTrainData.columns)  # encoded result

predictedTargets = trainedModel.predict(
    imputer.fit_transform(enc.produce(inputs=testData).value))

# Append the d3mindex column to the predicted targets
predictedTargets = pd.DataFrame({
Пример #49
0
def test_base():
    """Check BaseEnsemble methods."""
    ensemble = BaggingClassifier(base_estimator=Perceptron(), n_estimators=3)

    iris = load_iris()
    ensemble.fit(iris.data, iris.target)
    ensemble.estimators_ = []  # empty the list and create estimators manually

    ensemble._make_estimator()
    ensemble._make_estimator()
    ensemble._make_estimator()
    ensemble._make_estimator(append=False)

    assert_equal(3, len(ensemble))
    assert_equal(3, len(ensemble.estimators_))

    assert_true(isinstance(ensemble[0], Perceptron))
Пример #50
0
import kfold_template

import pandas

# from sklearn import tree
from sklearn.ensemble import BaggingClassifier

dataset = pandas.read_csv("dataset.csv")

target = dataset.iloc[:, 30].values
data = dataset.iloc[:, 0:30].values

# print(target)
# print(data)

machine = BaggingClassifier(n_estimators=21)

r2_scores, accuracy_scores, confusion_matrices = kfold_template.run_kfold(
    3, data, target, machine, 1, 1)

print(r2_scores)
print(accuracy_scores)
for i in confusion_matrices:
    print(i)
Пример #51
0
                  "--size",
                  default=1000,
                  help="config the data set size",
                  action="store",
                  type="int",
                  dest="size")

(options, args) = parser.parse_args()

if options.size != DATASET_SIZE:
    dataset_size = options.size
else:
    dataset_size = DATASET_SIZE

bag_clf = BaggingClassifier(KNeighborsClassifier(),
                            max_samples=0.5,
                            max_features=0.5)

ada_clf = AdaBoostClassifier(n_estimators=5)

rdf_clf = RandomForestClassifier(n_estimators=5,
                                 criterion="gini",
                                 max_depth=None,
                                 min_samples_split=2,
                                 min_samples_leaf=1,
                                 min_weight_fraction_leaf=0.,
                                 max_features="auto",
                                 max_leaf_nodes=None,
                                 min_impurity_split=1e-7,
                                 bootstrap=True,
                                 oob_score=False,
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestClassifier
from vecstack import stacking
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score
from drop_highlycorelated import clf,xtrain,ytrain,xtest,ytest,X_important_train,X_important_test

models = [
    svm.SVC(kernel='linear',C=1),
        
    RandomForestClassifier(random_state=42, n_jobs=-1, 
                          n_estimators=1000, max_depth=3),
    BaggingClassifier(svm.SVC(kernel='linear',C=1))
]

S_train, S_test = stacking(models,                     # list of models
                           X_important_train, ytrain, X_important_test,  # data
                           regression=True,            # regression task (if you need
                                                       #     classification - set to False)
                           mode='oof_pred_bag',        # mode: oof for train set, predict test
                                                       #     set in each fold and find mean
                           save_dir=None,              # do not save result and log (to save
                                                       #     in current dir - set to '.')
                           metric=mean_absolute_error, # metric: callable
                           n_folds=4,                  # number of folds
                           shuffle=True,               # shuffle the data
                           random_state=0,             # ensure reproducibility
                           verbose=2)
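
# A follow-up sketch (not part of the original snippet): the out-of-fold matrix
# S_train and the averaged test matrix S_test returned by stacking() are
# normally fed to a second-level model. BaggingClassifier is reused here purely
# as an illustrative second-level learner.
second_level = BaggingClassifier(svm.SVC(kernel='linear', C=1), n_estimators=10)
second_level.fit(S_train, ytrain)
print(accuracy_score(ytest, second_level.predict(S_test)))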
Пример #53
0
def second_generation(X, y, seed=None):
    features = []
    ### 25 x 2 bagged trees
    bag_gini = BaggingClassifier(
        base_estimator=DecisionTreeClassifier(criterion='gini'),
        n_estimators=25,
        random_state=seed)
    bag_gini.fit(X, y)
    bag_gini_names = ['bag_gini_' + str(i) for i in range(25)]
    features.extend(
        [np.arange(X.shape[1]) for _ in range(len(bag_gini_names))])

    bag_entropy = BaggingClassifier(
        base_estimator=DecisionTreeClassifier(criterion='entropy'),
        n_estimators=25,
        random_state=3 * seed**2)
    bag_entropy.fit(X, y)
    bag_entropy_names = ['bag_entropy_' + str(i) for i in range(25)]
    features.extend(
        [np.arange(X.shape[1]) for _ in range(len(bag_entropy_names))])

    ### 25 x 2 random subspaces
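    # Random subspaces: bootstrap=False with max_features=sqrt(d) trains each
    # tree on all samples but only a random subset of columns; the columns used
    # by each estimator are recorded in estimators_features_.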
    rs_gini = BaggingClassifier(
        base_estimator=DecisionTreeClassifier(criterion='gini'),
        n_estimators=25,
        max_features=int(np.sqrt(X.shape[1])),
        bootstrap=False,
        random_state=seed)
    rs_gini.fit(X, y)
    rs_gini_names = ['rs_gini_' + str(i) for i in range(25)]
    features.extend(rs_gini.estimators_features_)

    rs_entropy = BaggingClassifier(
        base_estimator=DecisionTreeClassifier(criterion='entropy'),
        n_estimators=25,
        max_features=int(np.sqrt(X.shape[1])),
        bootstrap=False,
        random_state=3 * seed**2)
    rs_entropy.fit(X, y)
    rs_entropy_names = ['rs_entropy_' + str(i) for i in range(25)]
    features.extend(rs_entropy.estimators_features_)

    ### 14 Ada
    nb_stumps = [2, 4, 8, 16, 32, 64, 128]
    ada_st_gini = [
        AdaBoostClassifier(base_estimator=DecisionTreeClassifier(
            criterion='gini', max_depth=1),
                           n_estimators=st,
                           random_state=seed) for st in nb_stumps
    ]
    ada_st_gini_names = ['ada_st_gini_' + str(i) for i in nb_stumps]
    features.extend(
        [np.arange(X.shape[1]) for _ in range(len(ada_st_gini_names))])
    for clf in ada_st_gini:
        clf.fit(X, y)

    ada_st_entropy = [
        AdaBoostClassifier(base_estimator=DecisionTreeClassifier(
            criterion='entropy', max_depth=1),
                           n_estimators=st,
                           random_state=3 * seed**2) for st in nb_stumps
    ]
    ada_st_entropy_names = ['ada_st_entropy_' + str(i) for i in nb_stumps]
    features.extend(
        [np.arange(X.shape[1]) for _ in range(len(ada_st_entropy_names))])
    for clf in ada_st_entropy:
        clf.fit(X, y)

    ### 8 Ada DT
    nb_dt = [2, 4, 8, 16]
    ada_dt_gini = [
        AdaBoostClassifier(base_estimator=DecisionTreeClassifier(
            criterion='gini', max_depth=3),
                           n_estimators=dt,
                           random_state=seed) for dt in nb_dt
    ]
    ada_dt_gini_names = ['ada_dt_gini_' + str(i) for i in nb_dt]
    features.extend(
        [np.arange(X.shape[1]) for _ in range(len(ada_dt_gini_names))])
    for clf in ada_dt_gini:
        clf.fit(X, y)

    ada_dt_entropy = [
        AdaBoostClassifier(base_estimator=DecisionTreeClassifier(
            criterion='entropy', max_depth=3),
                           n_estimators=dt,
                           random_state=3 * seed**2) for dt in nb_dt
    ]
    ada_dt_entropy_names = ['ada_dt_entropy_' + str(i) for i in nb_dt]
    features.extend(
        [np.arange(X.shape[1]) for _ in range(len(ada_dt_entropy_names))])
    for clf in ada_dt_entropy:
        clf.fit(X, y)

    ### 24 ANN
    mlp_parameters = list(itertools.product([1, 2, 4, 8, 32, 128],\
                                            [0, 0.2, 0.5, 0.9]))
    mlp_clf = [
        MLPClassifier(hidden_layer_sizes=(h, ), momentum=m)
        for (h, m) in mlp_parameters
    ]
    for clf in mlp_clf:
        clf.fit(X, y)
    mlp_name = ['mlp_{0}_{1}'.format(*param) for param in mlp_parameters]
    features.extend([np.arange(X.shape[1]) for _ in range(len(mlp_name))])

    ### 54 SVM
    C = np.logspace(-3, 2, num=6)
    gamma = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]

    svm_linear = [SVC(C=c, kernel='poly', degree=1) for c in C]
    for clf in svm_linear:
        clf.fit(X, y)
    svm_linear_names = ['svm_linear_' + str(c) for c in C]
    features.extend(
        [np.arange(X.shape[1]) for _ in range(len(svm_linear_names))])

    svm_rbf = [SVC(C=c, gamma=g) for c, g in itertools.product(C, gamma)]
    for clf in svm_rbf:
        clf.fit(X, y)
    svm_rbf_names = [
        'svm_rbf_{0}_{1}'.format(*param)
        for param in itertools.product(C, gamma)
    ]
    features.extend([np.arange(X.shape[1]) for _ in range(len(svm_rbf_names))])

    pool = bag_gini.estimators_ + bag_entropy.estimators_ + rs_gini.estimators_ + rs_entropy.estimators_ + \
           ada_st_gini + ada_st_entropy + ada_dt_gini + ada_dt_entropy + mlp_clf + svm_linear + svm_rbf

    pool_name = bag_gini_names + bag_entropy_names + rs_gini_names + rs_entropy_names + ada_st_gini_names + \
                ada_st_entropy_names + ada_dt_gini_names + ada_dt_entropy_names + mlp_name + svm_linear_names + \
                svm_rbf_names

    return pool, pool_name, features
Пример #54
0
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y,test_size=.30,random_state=355)


# In[46]:


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier


# In[49]:


bag_decision = BaggingClassifier(DecisionTreeClassifier()) 


# In[50]:


bag_decision.fit(X_train, y_train)


# In[51]:


bag_decision.score(X_test, y_test)


# In[52]:
Пример #55
0
column_names = ["Sample Code Number","Clump Thickness","Uniformity of Cell Size"\
,"Uniformity of Cell Shape","Marginal Adhesion","Single Epithelial Cell Size"\
,"Bare Nuclei","Bland Chromatin","Normal Nuclei","Mitoses","Class Label"]
cancer_data.columns = column_names
del cancer_data["Sample Code Number"]
cancer_data = cancer_data.iloc[np.random.permutation(len(cancer_data))]
class_labels = cancer_data["Class Label"]
del cancer_data["Class Label"] 

train_data,test_data,train_labels,test_labels = cross_validation.train_test_split(cancer_data,class_labels,test_size=0.3)

#Initializing classifiers 

rf = RandomForestClassifier(n_estimators=101)
ada = AdaBoostClassifier(n_estimators=101)
bagging = BaggingClassifier(n_estimators=101)
grad_boost = GradientBoostingClassifier(n_estimators=101)
mnb = MultinomialNB()
gnb = GaussianNB()
bnb = BernoulliNB()
brm = BernoulliRBM()
percept = Perceptron()
svm = SVC()
knn = KNeighborsClassifier(n_neighbors=5)
radnn = RadiusNeighborsClassifier(radius=10.3)

classifiers = [rf,ada,bagging,grad_boost,mnb,gnb,bnb,percept,svm,knn,radnn]
classifier_names = ["Random Forests","Adaboost","Bagging","Gradient Boost","Multinomial NB"\
,"Gaussian NB","Bernoulli NB","Perceptron","SVM (RBF)","KNN (K=5)","RadiusNN(r=10.3)"]

for classifier,classifier_name in zip(classifiers,classifier_names):
Пример #56
0
In sklearn, you can evaluate the OOB accuracy of an ensemble classifier by setting the parameter oob_score to True during instantiation. After training the classifier, the OOB accuracy can be obtained by accessing the .oob_score_ attribute from the corresponding instance.

In your environment, we have made available the class DecisionTreeClassifier from sklearn.tree.

Instructions
100 XP
Import BaggingClassifier from sklearn.ensemble.

Instantiate a DecisionTreeClassifier with min_samples_leaf set to 8.

Instantiate a BaggingClassifier consisting of 50 trees and set oob_score to True."""
# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

# Import BaggingClassifier
from sklearn.ensemble import BaggingClassifier

# Instantiate dt
dt = DecisionTreeClassifier(min_samples_leaf=8, random_state=1)

# Instantiate bc
bc = BaggingClassifier(base_estimator=dt, 
                       n_estimators=50,
                       oob_score=True,
                       random_state=1)
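
# Hedged continuation of the exercise above: X_train and y_train are assumed to
# exist in the exercise environment. After fitting, the out-of-bag accuracy is
# read from the fitted model's oob_score_ attribute.
bc.fit(X_train, y_train)
print('OOB accuracy: {:.3f}'.format(bc.oob_score_))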


                    """DEVELOPER"""
                """BasitAminBhatti"""
                    """Github"""
        """https://github.com/basitaminbhatti"""
Пример #57
0
from sklearn.model_selection import train_test_split, GridSearchCV

#accuracy = []
#for i in range(200):
#    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.90)
#    clf = tree.DecisionTreeClassifier()
#    clf.fit(X_train, y_train)
#    Z = clf.predict(X_test)
#    accuracy.append(clf.score(X_test,y_test))
#print(np.mean(accuracy))
#print(np.std(accuracy))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.90)
clf = BaggingClassifier(tree.DecisionTreeClassifier(),
                        max_samples=0.5,
                        max_features=0.5,
                        n_estimators=200)
clf.fit(X_train, y_train)
Z = clf.predict(X_test)
print(clf.score(X_test, y_test))

#plotAccuracyBagging(X, y, nbLoop=201)

#arbre = BaggingClassifier(tree.DecisionTreeClassifier(), n_estimators=200)
#parameters = {'max_samples' : np.random.uniform(0,1,10), 'max_features' : np.random.uniform(0,1,10)}
#clf = GridSearchCV(arbre, parameters, cv=5)
#clf.fit(X_train, y_train)
#print(clf.best_params_)

clf = RandomForestClassifier(n_estimators=200)
clf.fit(X_train, y_train)
Пример #58
0
x = dataset.values[:, 0:10]
y = dataset.values[:, 10]

x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x, y, train_size=0.7, test_size=0.3, random_state=0, stratify=y)

models = [('Logistic Regression',
           LogisticRegression(random_state=0, solver='lbfgs', max_iter=1000)),
          ('Linear Discriminant Analysis', LinearDiscriminantAnalysis()),
          ('Support Vector Machine',
           SVC(random_state=0, gamma='scale', probability=True)),
          ('Naive-Bayes Classifier', GaussianNB()),
          ('K-Nearest Neighborhood', KNeighborsClassifier(n_neighbors=11)),
          ('Basic Decision Tree', DecisionTreeClassifier(random_state=0)),
          ('Bagged Tree', BaggingClassifier(random_state=0)),
          ('Boosted Tree', GradientBoostingClassifier(random_state=0)),
          ('Random Forest',
           RandomForestClassifier(random_state=0, n_estimators=100))]

accuracy_list = []
cv_score_list = []
roc_auc_list = []
i = 0

print(
    "Model Name                     Accuracy       CV Score       ROC-AUC Score"
)
print(
    "----------                     --------       --------       -------------"
)
Пример #59
0
class _BaggingClassifierImpl:
    def __init__(
        self,
        base_estimator=None,
        n_estimators=10,
        *,
        max_samples=1.0,
        max_features=1.0,
        bootstrap=True,
        bootstrap_features=False,
        oob_score=False,
        warm_start=False,
        n_jobs=None,
        random_state=None,
        verbose=0,
    ):
        estimator_impl = base_estimator

        self._hyperparams = {
            "base_estimator": estimator_impl,
            "n_estimators": n_estimators,
            "max_samples": max_samples,
            "max_features": max_features,
            "bootstrap": bootstrap,
            "bootstrap_features": bootstrap_features,
            "oob_score": oob_score,
            "warm_start": warm_start,
            "n_jobs": n_jobs,
            "random_state": random_state,
            "verbose": verbose,
        }
        self._wrapped_model = SKLModel(**self._hyperparams)
        self._hyperparams["base_estimator"] = base_estimator

    def get_params(self, deep=True):
        out = self._wrapped_model.get_params(deep=deep)
        # we want to return the lale operator, not the underlying impl
        out["base_estimator"] = self._hyperparams["base_estimator"]
        return out

    def fit(self, X, y, sample_weight=None):
        if isinstance(X, pd.DataFrame):
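            # Bagging passes plain numpy sub-samples to each base estimator, so
            # wrap the estimator with a transformer that restores the original
            # DataFrame column names before fitting.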
            feature_transformer = FunctionTransformer(
                func=lambda X_prime: pd.DataFrame(X_prime, columns=X.columns),
                inverse_func=None,
                check_inverse=False,
            )
            self._hyperparams["base_estimator"] = (
                feature_transformer >> self._hyperparams["base_estimator"]
            )
            self._wrapped_model = SKLModel(**self._hyperparams)
        self._wrapped_model.fit(X, y, sample_weight)

        return self

    def predict(self, X, **predict_params):
        return self._wrapped_model.predict(X, **predict_params)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)

    def predict_log_proba(self, X):
        return self._wrapped_model.predict_log_proba(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)

    def score(self, X, y, sample_weight=None):
        return self._wrapped_model.score(X, y, sample_weight)
Пример #60
0
    predict_test = estimator.predict(test)
    #*** checking results
    print('the in-bag prediction accuracy rate is:',
          (y_data == predict_data).sum() / float(y_data.shape[0]))
    #*** output the result
    print('save the result in test_result.csv file.......')
    output = pd.read_csv('sample_submission.csv')
    output.loc[:, 'Label'] = predict_test
    output.to_csv('test_result.csv', index=False)
    total.loc[:, 'knn'] = predict_test

#*************** bagging model:
#different types of pipline, with pca or not
bagging_pip = Pipeline([
    ('scaler', ScalingByRange()), ('pca', PCA()),
    ('bagging', BaggingClassifier(SVC(C=9, gamma=0.04, kernel='rbf')))
])  #max_depth=10,

# bagging_cv = True
bagging_cv = False
# bagging_cv = None

if bagging_cv is None:
    print('we skip the bagging model training at this time.......')

elif bagging_cv:
    #************ select partial of train to improve speed
    # train = train.iloc[0:5000,:]
    # y_train = y_train.iloc[0:5000]

    #************ test the parameter setting manually here