def test_string_labels_refit_false():
    """Hard and soft voting with pre-fitted estimators (refit=False) on string labels."""
    np.random.seed(123)
    members = [
        LogisticRegression(solver='liblinear', multi_class='ovr'),
        RandomForestClassifier(n_estimators=10),
        GaussianNB(),
    ]
    # Replace the numeric targets with three string classes.
    y_str = y.copy().astype(str)
    y_str[:50] = 'a'
    y_str[50:100] = 'b'
    y_str[100:150] = 'c'
    # Fit every base estimator up front; the ensemble must reuse them as-is.
    for member in members:
        member.fit(X, y_str)
    for mode in ('hard', 'soft'):
        ensemble = EnsembleVoteClassifier(clfs=members, voting=mode,
                                          refit=False)
        ensemble.fit(X, y_str)
        assert round(ensemble.score(X, y_str), 2) == 0.97
def test_string_labels_refit_false():
    """refit=False must work with string class labels and default estimators."""
    np.random.seed(123)
    members = [LogisticRegression(), RandomForestClassifier(), GaussianNB()]
    # Turn the numeric targets into three string-valued classes.
    y_str = y.copy().astype(str)
    y_str[:50] = 'a'
    y_str[50:100] = 'b'
    y_str[100:150] = 'c'
    # Pre-fit each base estimator; refit=False keeps them untouched.
    for member in members:
        member.fit(X, y_str)
    for mode in ('hard', 'soft'):
        ensemble = EnsembleVoteClassifier(clfs=members, voting=mode,
                                          refit=False)
        ensemble.fit(X, y_str)
        assert round(ensemble.score(X, y_str), 2) == 0.97
def test_fit_base_estimators_false():
    """Pre-fitted estimators must be reused when fit_base_estimators=False."""
    np.random.seed(123)
    members = [
        LogisticRegression(solver='liblinear', multi_class='ovr'),
        RandomForestClassifier(n_estimators=10),
        GaussianNB(),
    ]
    for member in members:
        member.fit(X, y)
    ensemble = EnsembleVoteClassifier(clfs=members, voting='hard',
                                      fit_base_estimators=False)
    ensemble.fit(X, y)
    assert round(ensemble.score(X, y), 2) == 0.97
# NOTE(review): fragment — the RandomForestClassifier(...) call opens before
# this chunk, and the pd.DataFrame({...}) literal continues past its end.
                                       n_jobs=-1)
random_forest.fit(X_train, Y_train)
Y_pred_rf = random_forest.predict(X_test)
#random_forest.score(X_train, Y_train)
# Training-set accuracy (percent, 2 dp) — not a held-out score.
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
print('acc_rf = ' + str(acc_random_forest))
rf = RandomForestClassifier(n_jobs=-1)  # NOTE(review): `rf` looks unused below
# Weighted hard-voting ensemble; weights of 0 effectively drop Ada_Boost and svc.
eclf = EnsembleVoteClassifier(clfs=[
    random_forest, decision_tree, Ada_Boost, sgd, linear_svc, perceptron,
    gaussian, knn, svc, logreg
], weights=[4, 1, 0, 2, 1, 1, 1, 1, 0, 1])
eclf.fit(X_train, Y_train)
Y_pred = eclf.predict(X_test)
acc_ensemble_vote = round(eclf.score(X_train, Y_train) * 100, 2)
print('acc_ensemble_vote = ' + str(acc_ensemble_vote))
# Collect each base model's test-set predictions in one frame
# (presumably as meta-features for stacking — TODO confirm with the caller).
base_predictions_train = pd.DataFrame({
    'RandomForest': Y_pred_rf.ravel(),
    'DecisionTree': Y_pred_dt.ravel(),
    'AdaBoost': Y_pred_adab.ravel(),
    'SGD': Y_pred_sgd.ravel(),
    'Linear SVC': Y_pred_lsvc.ravel(),
    'Perceptron': Y_pred_perc.ravel(),
# Record the SVC results, then evaluate a hard-voting boosting ensemble
# (AdaBoost over balanced logistic regression, gradient boosting, XGBoost)
# and finally a Gaussian Naive Bayes baseline, storing accuracy/F1 for each.
results_acc['svc'] = accscorsv
results_f1['svc'] = f1scorsv

#########################Boosting#################################
log = LogisticRegression(solver='lbfgs', class_weight='balanced')
ada = AdaBoostClassifier(n_estimators=5, base_estimator=log)
grad_boost = GradientBoostingClassifier(n_estimators=100)
xgb = XGBClassifier(max_depth=8, learning_rate=0.001, use_label_encoder=False)

ensemble = EnsembleVoteClassifier(clfs=[ada, grad_boost, xgb], voting='hard')
ensemble.fit(X_train, y_train)
y_preden = ensemble.predict(X_test)
f1scoren = metrics.f1_score(y_test, y_preden)
accscoren = ensemble.score(X_test, y_test)
results_acc['ensemble'] = accscoren
results_f1['ensemble'] = f1scoren
# BUG FIX: report the ensemble's own predictions (y_preden); the original
# passed the stale y_pred left over from a previous model's section.
print(classification_report(y_test, y_preden))
plot_conf(ensemble)

###############################################################################
naive = GaussianNB(var_smoothing=2e-9)
naive.fit(X_train, y_train)
y_pred = naive.predict(X_test)
f1scornb = metrics.f1_score(y_test, y_pred)
accscornb = naive.score(X_test, y_test)
results_acc['NB'] = accscornb
results_f1['NB'] = f1scornb
#clf_MNB= MNB() eclf = EnsembleVoteClassifier(clfs=[clf_RF, clf_ET, clf_svc, clf_DT], weights=[1, 1, 1, 1]) labels = [ 'Random Forest', 'Extra Trees', 'Support Vector', 'Decision Tree', 'Ensemble Vote' ] for clf, label in zip([clf_RF, clf_ET, clf_svc, clf_DT, eclf], labels): scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy') print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label)) eclf.fit(X_train, y_train) confidence = eclf.score(X_test, y_test) print(confidence) example_measures = np.array([[4, 2, 1, 1, 1, 2, 3, 2, 1]]) example_measures = example_measures.reshape(len(example_measures), -1) prediction = eclf.predict(example_measures) print(prediction) col_dict = dict(list(enumerate(df.columns))) col_dict X = np.array(df.drop(['class'], 1), dtype=np.float64) y = np.array(df['class'], dtype=np.int64) plot_decision_regions( X=X, y=y,
class ExtendedBaggingClassifier:
    """Bagging-style ensemble over arbitrary scikit-learn-compatible models.

    Candidate models are staged with :meth:`add_models` / :meth:`add_model`,
    each committed model receives its own bootstrap sample of the training
    data (with the held-out rows usable as an OOB validation set), and the
    fitted models are combined through mlxtend's ``EnsembleVoteClassifier``.
    """

    def __init__(self, voting="hard", verbose=False, parallel=True, target_name='target'):
        # Committed models (BaseModelIdx wrappers); filled by _commit_models.
        self.models = []
        # Staging area for models added before the ensemble is committed.
        self.temporary_models = []
        self.voting = voting
        self.predictions = []
        self.votingClassifier = None
        self.verbose = verbose
        self.parallel = parallel
        self.target_name = target_name

    def _get_models(self):
        """Return the underlying estimator of every committed model."""
        base_models = []
        for model in self.models:
            base_models.append(model.model)
        return base_models

    def add_models(self, model, params):
        """
        Create all the possible combinations of the model with given parameters.
        Usage example:
        params = {
        'C': np.logspace(0, 4, num=10),
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga']
        }
        custom_bagging = ExtendedBaggingClassifier(verbose=True, parallel=True)
        custom_bagging.add_models(LogisticRegression, params)
        :param model: The name of the model (passed without calling the constructor)
        that is intended to be used
        :param params: key-value pairs of hyperparameters that will be used to
        generate all the possible models
        :return: the number of models of the ensemble
        """
        # Adding models invalidates any previously built voting classifier.
        if self.votingClassifier is not None:
            self.votingClassifier = None
        keys = list(params)
        # Cartesian product of all hyperparameter values -> one model each.
        for values in itertools.product(*map(params.get, keys)):
            model_instance = model(**dict(zip(keys, values)))
            self.temporary_models.append((str(model_instance), model_instance))
        return len(self.temporary_models)

    def add_model(self, model):
        """
        Add a single pre-constructed model to the ensemble.
        :param model: instance of the model
        :return: the number of staged models. See also :add_models.
        """
        if self.votingClassifier is not None:
            self.votingClassifier = None
        self.temporary_models.append((str(model), model))
        return len(self.temporary_models)

    def _commit_single_model(self, n_samples, temp_model):
        """Wrap one staged model with its bootstrap / OOB index sets."""
        sampled_idx, unsampled_idx = self._generate_indexes(len(self.temporary_models),
                                                            n_samples)
        return BaseModelIdx(temp_model[0], temp_model[1], sampled_idx,
                            unsampled_idx, self.target_name)

    def _commit_models(self, X, y):
        """
        Create index sets for train and OOB validation sets.
        :raises ValueError: if X and y have a different number of rows.
        """
        if X.shape[0] != y.shape[0]:
            raise ValueError('It seems that target values (y) are not the same as feature values (X)')
        if self.parallel:
            pool = multiprocessing.Pool(processes=None)
            f = partial(self._commit_single_model, X.shape[0])
            self.models = pool.map(f, self.temporary_models)
            pool.close()
            pool.join()
        else:
            for temp_model in self.temporary_models:
                self.models.append(self._commit_single_model(X.shape[0], temp_model))

    def _fit_single_model(self, X, y, single_model):
        """Fit one wrapped model; returned so pool.map can collect results."""
        return single_model.fit(X, y)

    def fit(self, X, y):
        """
        Train all the models in the ensemble, then build the voting classifier.
        :param X: Features values of trainset
        :param y: Target values of trainset
        :return: ---
        """
        # NOTE(review): committing is disabled here — self.models must already
        # be populated (e.g. by a prior _commit_models call). TODO confirm.
        # self._commit_models(X, y)
        if self.parallel:
            pool = multiprocessing.Pool(processes=None)
            f = partial(self._fit_single_model, X, y)
            self.models = pool.map(f, self.models)
            pool.close()
            pool.join()
        else:
            for model in self.models:
                self._fit_single_model(X, y, model)
        # refit=False: the voting classifier reuses the already-fitted models.
        self.votingClassifier = EnsembleVoteClassifier(clfs=self._get_models(),
                                                       voting=self.voting,
                                                       refit=False)
        self.votingClassifier.fit(X, y)

    def _predict_single_model(self, X, model):
        """Return (model name, predictions of that model on X)."""
        return model.name, model.predict(X)

    def predict_each_model(self, X):
        """
        Perform a prediction for each model in the ensemble.
        NOTE! fit(X,y) is required before.
        :param X: Features dataframe to be used for predictions
        :return: List of predictions with model name associated
        :raises ValueError: if no models have been fitted yet.
        """
        if len(self.models) == 0:
            raise ValueError('Probably fit(X,y) method was not called before. Call it!')
        predictions = []
        if self.parallel:
            pool = multiprocessing.Pool(processes=None)
            f = partial(self._predict_single_model, X)
            predictions = pool.map(f, self.models)
            pool.close()
            pool.join()
        else:
            for model in self.models:
                # BUG FIX: arguments were swapped (model, X) in the original,
                # which crashed the serial path; signature is (X, model).
                predictions.append(self._predict_single_model(X, model))
        return predictions

    def score(self, X, y):
        """
        Get the score given X as features values and y as target values.
        Useful for validation/testing purposes.
        :param X: Features dataframe of trainset
        :param y: Target dataframe of trainset
        :return: score
        """
        return self.votingClassifier.score(X, y)

    def predict(self, X):
        """
        Perform a prediction considering the models as an ensemble.
        NOTE! fit(X,y) must be called before getting the predictions.
        :param X: input values to be used for predictions
        :return: list of predictions with model name associated
        """
        return self.votingClassifier.predict(X)

    def _get_single_oob(self, X, y, model):
        """Return (model name, OOB score of that model)."""
        return model.name, model.score(X, y)

    def models_oob_score(self, X, y):
        """
        Compute the OOB score for each model in the ensemble.
        :return: list of OOB scores, one for each model in the ensemble
        """
        oob_scores = []
        if self.parallel:
            pool = multiprocessing.Pool(processes=None)
            f = partial(self._get_single_oob, X, y)
            oob_scores = pool.map(f, self.models)
            pool.close()
            pool.join()
        else:
            for model in self.models:
                oob_scores.append((self._get_single_oob(X, y, model)))
        return oob_scores

    def _ret_accuracy(self, array):
        """Sort key: the score element of a (name, score) pair."""
        return array[1]

    def best_model(self, X, y):
        """
        Find the best model comparing performances over the OOB set.
        :return: the (name, score) pair with the best OOB score
        """
        performances = self.models_oob_score(X, y)
        # Ascending sort, then pop the last element -> highest score.
        performances.sort(key=self._ret_accuracy, reverse=False)
        return performances.pop()

    def _generate_indexes(self, num_models, n_samples):
        """Return (bootstrap indices, OOB indices) for one model.

        NOTE(review): the seed is drawn from [0, num_models], so with few
        models different models can share identical bootstrap samples —
        presumably unintended; confirm before relying on sample diversity.
        """
        rand_state = randint(0, num_models)
        sampled_idxs = self._generate_sample_indices(rand_state, n_samples)
        unsampled_idxs = self._generate_unsampled_indices(rand_state, n_samples)
        return sampled_idxs, unsampled_idxs

    def _generate_unsampled_indices(self, random_state, n_samples):
        """Indices of rows never drawn by the bootstrap for this seed (OOB set)."""
        sample_indices = self._generate_sample_indices(random_state, n_samples)
        sample_counts = np.bincount(sample_indices, minlength=n_samples)
        unsampled_mask = sample_counts == 0
        indices_range = np.arange(n_samples)
        unsampled_indices = indices_range[unsampled_mask]
        return unsampled_indices

    def _generate_sample_indices(self, random_state, n_samples):
        """Draw n_samples row indices with replacement (a bootstrap sample)."""
        random_instance = self._check_random_state(random_state)
        sample_indices = random_instance.randint(0, n_samples, n_samples)
        return sample_indices

    def _check_random_state(self, seed):
        """Coerce an int seed or RandomState into a RandomState instance."""
        if isinstance(seed, numbers.Integral):
            return np.random.RandomState(seed)
        if isinstance(seed, np.random.RandomState):
            return seed
        raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
                         ' instance' % seed)
# NOTE(review): fragment — the categorical pipeline (cat_pipe) opens before
# this chunk; only its final encoder step is visible here.
                     ('encoder', OneHotEncoder(handle_unknown='ignore'))])
# Route numeric and categorical columns through their pipelines; drop the rest.
preprocessor = ColumnTransformer(remainder='drop',
                                 transformers=[('numerical', num_pipe_4, num_feat),
                                               ('categorical', cat_pipe, categorical_feat)])
model_4 = Pipeline([('preprocessor', preprocessor),
                    ('classifier', AdaBoostClassifier())])
# XGBoost on a 5-component SVD projection of the preprocessed features.
model_7 = Pipeline([('preprocessor', preprocessor),
                    ('pca', TruncatedSVD(n_components=5)),
                    ('classifier', XGBClassifier(objective='multi:softmax',
                                                 booster='gbtree',
                                                 nrounds='min.error.idx',
                                                 num_class=4,
                                                 maximize=False,
                                                 eval_metric='merror',
                                                 eta=.2,
                                                 max_depth=14,
                                                 colsample_bytree=.4))])
model_3 = Pipeline([('preprocessor', preprocessor),
                    ('classifier', GradientBoostingClassifier())])
# Majority (hard) vote over the three pipelines.
eclf = EnsembleVoteClassifier(clfs=[model_4, model_7, model_3], voting='hard')
eclf.fit(X_train, y_train)
# NOTE(review): this is training-set accuracy, not a validation score.
eclf_score = eclf.score(X_train, y_train)
make_submission(eclf, X_test)
# NOTE(review): fragment — the EnsembleVoteClassifier(...) call opens before
# this chunk; only its trailing arguments are visible.
                              weights=[2, 1, 1], voting='soft')

# In[85]:
clf1.fit(X, y)
clf2.fit(X, y)
clf3.fit(X, y)
eclf.fit(X, y)

# In[86]:
# Training-set accuracy of each base model and the soft-voting ensemble.
print(clf1.score(X, y))
print(clf2.score(X, y))
print(clf3.score(X, y))
print(eclf.score(X, y))

# Select the Features for Decision Boundaries Plot

# In[87]:
# select only 2 features in dataset for plot boundaries
X_array = X.iloc[:, 0:2]

# In[88]:
# NOTE(review): DataFrame.as_matrix was removed in pandas 1.0 — these calls
# need .to_numpy() (or np.asarray) on any modern pandas.
y_ = pd.DataFrame.as_matrix(y)
X_ = pd.DataFrame.as_matrix(X_array)
X_.shape
y_.shape