Example #1
def test_string_labels_refit_false():
    np.random.seed(123)
    clf1 = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf2 = RandomForestClassifier(n_estimators=10)
    clf3 = GaussianNB()

    y_str = y.copy()
    y_str = y_str.astype(str)
    y_str[:50] = 'a'
    y_str[50:100] = 'b'
    y_str[100:150] = 'c'

    clf1.fit(X, y_str)
    clf2.fit(X, y_str)
    clf3.fit(X, y_str)

    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                  voting='hard',
                                  refit=False)

    eclf.fit(X, y_str)
    assert round(eclf.score(X, y_str), 2) == 0.97

    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                  voting='soft',
                                  refit=False)

    eclf.fit(X, y_str)
    assert round(eclf.score(X, y_str), 2) == 0.97
Example #2
def test_string_labels_refit_false():
    np.random.seed(123)
    clf1 = LogisticRegression()
    clf2 = RandomForestClassifier()
    clf3 = GaussianNB()

    y_str = y.copy()
    y_str = y_str.astype(str)
    y_str[:50] = 'a'
    y_str[50:100] = 'b'
    y_str[100:150] = 'c'

    clf1.fit(X, y_str)
    clf2.fit(X, y_str)
    clf3.fit(X, y_str)

    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                  voting='hard',
                                  refit=False)

    eclf.fit(X, y_str)
    assert round(eclf.score(X, y_str), 2) == 0.97

    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                  voting='soft',
                                  refit=False)

    eclf.fit(X, y_str)
    assert round(eclf.score(X, y_str), 2) == 0.97
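
Both variants above share one idea: the base estimators are fitted once up front, and refit=False tells EnsembleVoteClassifier to reuse them instead of re-fitting clones. A minimal self-contained sketch of the same pattern (assuming iris data in place of the module-level X, y, and an mlxtend version that still accepts refit; see the note after Example #3):

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from mlxtend.classifier import EnsembleVoteClassifier

X_demo, y_demo = load_iris(return_X_y=True)
y_demo = y_demo.astype(str)  # string labels are fine, as the tests above verify

clf_a = LogisticRegression(solver='liblinear', multi_class='ovr').fit(X_demo, y_demo)
clf_b = GaussianNB().fit(X_demo, y_demo)

eclf_demo = EnsembleVoteClassifier(clfs=[clf_a, clf_b], voting='hard', refit=False)
eclf_demo.fit(X_demo, y_demo)  # no re-training; only the label mapping is set up
print(round(eclf_demo.score(X_demo, y_demo), 2))
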
Example #3
def test_fit_base_estimators_false():
    np.random.seed(123)
    clf1 = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf2 = RandomForestClassifier(n_estimators=10)
    clf3 = GaussianNB()

    clf1.fit(X, y)
    clf2.fit(X, y)
    clf3.fit(X, y)

    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                  voting='hard',
                                  fit_base_estimators=False)

    eclf.fit(X, y)
    assert round(eclf.score(X, y), 2) == 0.97
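
This test covers the same pre-fitted-estimator scenario as Examples #1 and #2, but with the newer parameter name: mlxtend renamed refit to fit_base_estimators. A small version-tolerant construction sketch (make_prefit_ensemble is a hypothetical helper, not part of mlxtend):

def make_prefit_ensemble(clfs, voting='hard'):
    # Prefer the newer keyword; fall back for older mlxtend releases.
    try:
        return EnsembleVoteClassifier(clfs=clfs, voting=voting,
                                      fit_base_estimators=False)
    except TypeError:
        # older mlxtend used refit=False for the same behavior
        return EnsembleVoteClassifier(clfs=clfs, voting=voting, refit=False)
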
Example #4
                                       n_jobs=-1)
random_forest.fit(X_train, Y_train)
Y_pred_rf = random_forest.predict(X_test)
#random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
print('acc_rf = ' + str(acc_random_forest))

rf = RandomForestClassifier(n_jobs=-1)
eclf = EnsembleVoteClassifier(clfs=[
    random_forest, decision_tree, Ada_Boost, sgd, linear_svc, perceptron,
    gaussian, knn, svc, logreg
],
                              weights=[4, 1, 0, 2, 1, 1, 1, 1, 0, 1])
eclf.fit(X_train, Y_train)
Y_pred = eclf.predict(X_test)
acc_ensemble_vote = round(eclf.score(X_train, Y_train) * 100, 2)
print('acc_ensemble_vote = ' + str(acc_ensemble_vote))
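
The weights list above makes some classifiers count more than others in the hard vote: each model's prediction is tallied weight times and the class with the largest weighted count wins. A toy illustration with made-up votes (the labels here are placeholders, not from the dataset):

import numpy as np

votes = np.array(['a', 'b', 'b'])   # made-up predictions from three models
weights = np.array([4, 1, 1])       # the first model outweighs the other two
classes, idx = np.unique(votes, return_inverse=True)
print(classes[np.argmax(np.bincount(idx, weights=weights))])  # -> 'a'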

base_predictions_train = pd.DataFrame({
    'RandomForest': Y_pred_rf.ravel(),
    'DecisionTree': Y_pred_dt.ravel(),
    'AdaBoost': Y_pred_adab.ravel(),
    'SGD': Y_pred_sgd.ravel(),
    'Linear SVC': Y_pred_lsvc.ravel(),
    'Perceptron': Y_pred_perc.ravel(),
Example #5
results_acc['svc'] = accscorsv
results_f1['svc']  = f1scorsv
#########################Boosting#################################

log = LogisticRegression(solver='lbfgs', class_weight='balanced')
ada = AdaBoostClassifier(n_estimators=5, base_estimator=log)
grad_boost = GradientBoostingClassifier(n_estimators=100)
xgb = XGBClassifier(max_depth=8, learning_rate=0.001, use_label_encoder=False)

ensemble = EnsembleVoteClassifier(clfs=[ada, grad_boost, xgb], voting='hard')

ensemble.fit(X_train, y_train)

y_preden = ensemble.predict(X_test)
f1scoren = metrics.f1_score(y_test, y_preden)
accscoren = ensemble.score(X_test, y_test)
results_acc['ensemble'] = accscoren
results_f1['ensemble']  = f1scoren

print(classification_report(y_test, y_preden))
plot_conf(ensemble)
###############################################################################

naive = GaussianNB(var_smoothing=2e-9)
naive.fit(X_train, y_train)

y_pred  = naive.predict(X_test)
f1scornb = metrics.f1_score(y_test, y_pred)
accscornb = naive.score(X_test, y_test)
results_acc['NB']  = accscornb
results_f1['NB']   = f1scornb
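
Since every model in this script records into results_acc and results_f1, a quick side-by-side view is easy to build. A short sketch, assuming those two dicts are populated as above:

import pandas as pd

summary = pd.DataFrame({'accuracy': results_acc, 'f1': results_f1})
print(summary.sort_values('f1', ascending=False))
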
Example #6
#clf_MNB= MNB()
eclf = EnsembleVoteClassifier(clfs=[clf_RF, clf_ET, clf_svc, clf_DT],
                              weights=[1, 1, 1, 1])

labels = [
    'Random Forest', 'Extra Trees', 'Support Vector', 'Decision Tree',
    'Ensemble Vote'
]
for clf, label in zip([clf_RF, clf_ET, clf_svc, clf_DT, eclf], labels):

    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" %
          (scores.mean(), scores.std(), label))

eclf.fit(X_train, y_train)
confidence = eclf.score(X_test, y_test)
print(confidence)

example_measures = np.array([[4, 2, 1, 1, 1, 2, 3, 2, 1]])
example_measures = example_measures.reshape(len(example_measures), -1)
prediction = eclf.predict(example_measures)
print(prediction)

col_dict = dict(list(enumerate(df.columns)))
col_dict

X = np.array(df.drop(['class'], axis=1), dtype=np.float64)
y = np.array(df['class'], dtype=np.int64)
plot_decision_regions(
    X=X,
    y=y,
Example #7
import itertools
import multiprocessing
import numbers
from functools import partial
from random import randint

import numpy as np
from mlxtend.classifier import EnsembleVoteClassifier

# Note: BaseModelIdx is a project-local wrapper class (not shown in this snippet).


class ExtendedBaggingClassifier:
    def __init__(self, voting="hard", verbose=False, parallel=True, target_name='target'):
        self.models = []
        self.temporary_models = []
        self.voting = voting
        self.predictions = []
        self.votingClassifier = None
        self.verbose = verbose
        self.parallel = parallel
        self.target_name = target_name

    def _get_models(self):
        base_models = []
        for model in self.models:
            base_models.append(model.model)
        return base_models

    def add_models(self, model, params):
        """
        Create all the possible combinations of the model with given parameters.
        Usage example:
            params = {
                'C': np.logspace(0, 4, num=10),
                'penalty': ['l1', 'l2'],
                'solver': ['liblinear', 'saga']
            }

            custom_bagging = ExtendedBaggingClassifier(verbose=True, parallel=True)
            custom_bagging.add_models(LogisticRegression, params)

        :param model: the model class itself, passed uninstantiated; the constructor is called internally
        :param params: key-value pairs of hyperparameter values used to generate all possible models
        :return: the number of models staged for the ensemble
        """
        if self.votingClassifier is not None:
            self.votingClassifier = None
        keys = list(params)
        for values in itertools.product(*map(params.get, keys)):
            model_instance = model(**dict(zip(keys, values)))
            self.temporary_models.append((str(model_instance), model_instance))
        return len(self.temporary_models)

    def add_model(self, model):
        """
        Add a single, already-instantiated model to the ensemble.
        :param model: instance of the model
        :return: the number of models staged for the ensemble
        See also :add_models.
        """
        if self.votingClassifier is not None:
            self.votingClassifier = None
        self.temporary_models.append((str(model), model))
        return len(self.temporary_models)

    def _commit_single_model(self, n_samples, temp_model):
        """
        train_set, oob_set = self._generate_bootstrap_sample(Xy)
        return BaseModel(temp_model[0], temp_model[1], train_set, oob_set, self.target_name)
        """
        sampled_idx, unsampled_idx = self._generate_indexes(len(self.temporary_models), n_samples)
        return BaseModelIdx(temp_model[0], temp_model[1], sampled_idx, unsampled_idx, self.target_name)

    def _commit_models(self, X, y):
        """
        Create index sets for the train and OOB validation sets.
        """
        if X.shape[0] != y.shape[0]:
            raise ValueError('X and y must contain the same number of samples')

        if self.parallel:
            pool = multiprocessing.Pool(processes=None)
            f = partial(self._commit_single_model, X.shape[0])
            self.models = pool.map(f, self.temporary_models)
            pool.close()
            pool.join()
        else:
            for temp_model in self.temporary_models:
                self.models.append(self._commit_single_model(X.shape[0], temp_model))

    def _fit_single_model(self, X, y, single_model):
        return single_model.fit(X, y)

    def fit(self, X, y):
        """
        Train all the models in the ensemble.
        :param X: feature values of the training set
        :param y: target values of the training set
        :return: ---
        """
        self._commit_models(X, y)
        if self.parallel:
            pool = multiprocessing.Pool(processes=None)
            f = partial(self._fit_single_model, X, y)
            self.models = pool.map(f, self.models)
            pool.close()
            pool.join()
        else:
            for model in self.models:
                self._fit_single_model(X, y, model)
        self.votingClassifier = EnsembleVoteClassifier(clfs=self._get_models(), voting=self.voting, refit=False)
        self.votingClassifier.fit(X, y)

    def _predict_single_model(self, X, model):
        return model.name, model.predict(X)

    def predict_each_model(self, X):
        """
        Perform a prediction with each model in the ensemble. Note: fit(X, y) must be called first.
        :param X: Features dataframe to be used for predictions
        :return: List of predictions with model name associated
        """
        if len(self.models) == 0:
            raise ValueError('No fitted models available; call fit(X, y) first')
        predictions = []
        if self.parallel:
            pool = multiprocessing.Pool(processes=None)
            f = partial(self._predict_single_model, X)
            predictions = pool.map(f, self.models)
            pool.close()
            pool.join()
        else:
            for model in self.models:
                predictions.append(self._predict_single_model(X, model))
        return predictions

    def score(self, X, y):
        """
        Score the voting ensemble on the given feature values X and target values y. Useful for validation/testing purposes.
        :param X: features dataframe
        :param y: target dataframe
        :return: accuracy score of the voting ensemble
        """
        return self.votingClassifier.score(X, y)

    def predict(self, X):
        """
        Perform a prediction using the models as an ensemble. Note: fit(X, y) must be called before getting the
        predictions.
        :param X: input values to be used for predictions
        :return: ensemble predictions
        """
        return self.votingClassifier.predict(X)

    def _get_single_oob(self, X, y, model):
        return model.name, model.score(X, y)

    def models_oob_score(self, X, y):
        '''
        Computes the OOB score for each model in the ensemble
        :return: list of OOB scores, one for each model in the ensemble
        '''

        oob_scores = []
        if self.parallel:
            pool = multiprocessing.Pool(processes=None)
            f = partial(self._get_single_oob, X, y)
            oob_scores = pool.map(f, self.models)
            pool.close()
            pool.join()
        else:
            for model in self.models:
                oob_scores.append((self._get_single_oob(X, y, model)))
        return oob_scores

    def _ret_accuracy(self, array):
        return array[1]

    def best_model(self, X, y):
        '''
        Find the best model comparing performances over OOB set
        :return: the model with the best OOB score
        '''
        performances = self.models_oob_score(X, y)
        return max(performances, key=self._ret_accuracy)

    '''def _generate_bootstrap_sample(self, X):
        df_boot = X.sample(n=X.shape[0], replace=True, random_state=randint(0, 10000))
        oob = pd.concat([df_boot, X]).drop_duplicates(keep=False)
        if self.verbose is True:
            print("OOB set size: %.2f" % float(oob.shape[0] / df_boot.shape[0] * 100), "%")
            print("OOB set abs.:   %i" % oob.shape[0])
        return df_boot, oob'''

    def _generate_indexes(self, num_models, n_samples):
        rand_state = randint(0, num_models)
        sampled_idxs   = self._generate_sample_indices(rand_state, n_samples)
        unsampled_idxs = self._generate_unsampled_indices(rand_state, n_samples)
        return sampled_idxs, unsampled_idxs

    def _generate_unsampled_indices(self, random_state, n_samples):
        sample_indices = self._generate_sample_indices(random_state, n_samples)
        sample_counts = np.bincount(sample_indices, minlength=n_samples)
        unsampled_mask = sample_counts == 0
        indices_range = np.arange(n_samples)
        unsampled_indices = indices_range[unsampled_mask]
        return unsampled_indices

    def _generate_sample_indices(self, random_state, n_samples):
        random_instance = self._check_random_state(random_state)
        sample_indices = random_instance.randint(0, n_samples, n_samples)

        return sample_indices

    def _check_random_state(self, seed):
        if isinstance(seed, numbers.Integral):
            return np.random.RandomState(seed)
        if isinstance(seed, np.random.RandomState):
            return seed
        raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
                         ' instance' % seed)
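
The bootstrap/OOB helpers at the bottom of the class mirror scikit-learn's internal sampling: draw n_samples indices with replacement, then treat the never-drawn indices as the out-of-bag set. A quick standalone check with toy numbers:

import numpy as np

rng = np.random.RandomState(0)
n_samples = 10
sampled = rng.randint(0, n_samples, n_samples)  # bootstrap draw with replacement
oob = np.arange(n_samples)[np.bincount(sampled, minlength=n_samples) == 0]
print(sampled)  # indices forming the bootstrap training set
print(oob)      # indices never drawn -> usable for OOB validation
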
Example #8
                                       ('encoder', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(remainder='drop',
                                 transformers=[('numerical', num_pipe_4,
                                                num_feat),
                                               ('categorical', cat_pipe,
                                                categorical_feat)])

model_4 = Pipeline([('preprocessor', preprocessor),
                    ('classifier', AdaBoostClassifier())])

model_7 = Pipeline([('preprocessor', preprocessor),
                    ('pca', TruncatedSVD(n_components=5)),
                    ('classifier',
                     XGBClassifier(objective='multi:softmax',
                                   booster='gbtree',
                                   num_class=4,
                                   eval_metric='merror',
                                   eta=.2,
                                   max_depth=14,
                                   colsample_bytree=.4))])
model_3 = Pipeline([('preprocessor', preprocessor),
                    ('classifier', GradientBoostingClassifier())])

eclf = EnsembleVoteClassifier(clfs=[model_4, model_7, model_3], voting='hard')
eclf.fit(X_train, y_train)
eclf_score = eclf.score(X_train, y_train)

make_submission(eclf, X_test)
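
The score above is computed on the training data, which is optimistic. A hedged sketch of a fairer estimate, cross-validating the same pipeline ensemble before writing the submission:

from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(eclf, X_train, y_train, cv=5)
print('cv accuracy: %.3f +/- %.3f' % (cv_scores.mean(), cv_scores.std()))
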
Example #9
                              weights=[2, 1, 1],
                              voting='soft')

# In[85]:

clf1.fit(X, y)
clf2.fit(X, y)
clf3.fit(X, y)
eclf.fit(X, y)

# In[86]:

print(clf1.score(X, y))
print(clf2.score(X, y))
print(clf3.score(X, y))
print(eclf.score(X, y))

# Select the Features for Decision Boundaries Plot

# In[87]:

# select only 2 features in dataset for plot boundaries
X_array = X.iloc[:, 0:2]

# In[88]:

y_ = y.to_numpy()
X_ = X_array.to_numpy()
print(X_.shape, y_.shape)
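
A plausible continuation of this snippet (assumptions: y holds integer class labels, as plot_decision_regions requires, and the ensemble is refit on the two selected features, since a model trained on the full feature set cannot score a 2-D grid):

from mlxtend.plotting import plot_decision_regions
import matplotlib.pyplot as plt

eclf.fit(X_, y_)  # refit on the 2-feature view
plot_decision_regions(X=X_, y=y_, clf=eclf)
plt.show()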