def fit(self, x, y):
    """Fit the MetaEstimator on training data and return ``self``.

    Steps, in order:

    1. If ``self.method_type`` is unset, infer it from ``y``: more unique
       values than ``self.cutoff_categorical`` means ``'regr'``, otherwise
       ``'classif'``.
    2. Resolve ``self.estimators`` (defaults, ``self.get_estim``, or the
       entry of a user-supplied pair matching the problem type).
    3. For classification, record the training classes in ``self.classes``.
    4. Fit according to ``self.method`` and store the result in
       ``self.fitted``.

    Parameters
    ----------
    x : array-like
        Training features; forwarded unchanged to the underlying estimators.
    y : array-like
        Training targets.

    Returns
    -------
    self
        The instance itself, with ``self.fitted`` set when a fitting branch
        matched.
    """
    # Determine if this is a regression or classification problem by comparing
    # the number of unique target values against the threshold.
    if self.method_type is None:
        n_unique = len(np.unique(y, axis=0))
        self.method_type = 'regr' if n_unique > self.cutoff_categorical else 'classif'

    # Fetch the appropriate estimator(s).
    if self.estimators is None:
        if self.method is not None:
            # presumably populates self.estimators for the chosen ensembling
            # method -- defined outside this block, TODO confirm
            self.get_estim(y)
        elif self.method_type == 'regr':
            # NOTE(review): `normalize` was removed from LassoCV in newer
            # scikit-learn releases -- confirm the targeted version.
            self.estimators = linear_model.LassoCV(normalize=True)
        elif self.method_type == 'classif':
            self.estimators = ensemble.RandomForestClassifier(random_state=1)
    # User-supplied estimators are indexed by problem type: [0] for
    # regression, [1] for classification.
    # NOTE(review): this overwrites self.estimators with the selected entry,
    # so a second fit() call would index into that entry -- confirm intended.
    elif self.method_type == 'regr':
        self.estimators = self.estimators[0]
    elif self.method_type == 'classif':
        self.estimators = self.estimators[1]

    # Collect information on classes in the training set (needed later).
    if self.method_type == 'classif':
        self.classes = dummy.DummyClassifier().fit(x, y).classes_

    # Fit according to the respective ensembling method.
    if self.method == 'stacking':
        if self.method_type == 'regr':
            self.fitted = regressor.StackingRegressor(
                regressors=self.estimators,
                meta_regressor=linear_model.LinearRegression(),
            ).fit(x, y)
        elif self.method_type == 'classif':
            self.fitted = classifier.StackingClassifier(
                classifiers=self.estimators,
                meta_classifier=linear_model.LogisticRegression(random_state=1),
            ).fit(x, y)
    elif self.method == 'multiplexing':
        # Rebuild the list on every fit so that entries from a previous call
        # cannot shift the selected index.
        self.losses = [
            np.mean(cross_val_score(candidate, x, y))
            for candidate in self.estimators
        ]
        # cross_val_score with default scoring returns scores where higher is
        # better, so the best candidate is the arg-max (the attribute keeps
        # its historical name `losses` for compatibility).
        best_index = int(np.argmax(self.losses))
        self.fitted = self.estimators[best_index].fit(x, y)
    else:
        self.fitted = self.estimators.fit(x, y)

    return self
# Remove identifier/free-text columns and the target from the frame (in
# place), then one-hot encode the categorical feature columns.
titanic_all.drop(
    ['PassengerId', 'Name', 'Cabin', 'Ticket', 'Survived'],
    axis=1,
    inplace=True,
)
features = ['Sex', 'Embarked', 'Pclass', 'Title', 'FamilyCategory']
titanic_all = pd.get_dummies(titanic_all, columns=features)

# The first titanic_train.shape[0] rows of titanic_all are used for training
# (presumably titanic_all is train followed by test -- verify upstream).
y_train = titanic_train['Survived']
X_train = titanic_all[0:titanic_train.shape[0]]

# Stacked model: random forest + AdaBoost at level one, a decision tree as
# the meta-classifier.
dtSuper = tree.DecisionTreeClassifier(random_state=100)
rf1 = ensemble.RandomForestClassifier(random_state=100)
ada2 = ensemble.AdaBoostClassifier(random_state=100)
stack_estimator = mlxClassifier.StackingClassifier(
    classifiers=[rf1, ada2],
    meta_classifier=dtSuper,
)

# Hyper-parameter grid searched with 10-fold cross-validation.
stack_grid = {
    'randomforestclassifier__n_estimators': [5, 10],
    'adaboostclassifier__n_estimators': [10, 50],
    'meta-decisiontreeclassifier__min_samples_split': [2, 3],
}
grid_stack_estimator = model_selection.GridSearchCV(
    stack_estimator,
    stack_grid,
    cv=10,
)
grid_stack_estimator.fit(X_train, y_train)

# Show the fitted level-one classifiers and the fitted meta-classifier of the
# best configuration.
final_model = grid_stack_estimator.best_estimator_
print(final_model.clfs_)
print(final_model.meta_clf_)

# The remaining rows of titanic_all are scored and written to titanic_test.
X_test = titanic_all[titanic_train.shape[0]:]
titanic_test['Survived'] = grid_stack_estimator.predict(X_test)
# Select features using the already-fitted model `rf` (prefit=True means it
# is not refitted here), and reduce the training matrix accordingly.
fs_model = feature_selection.SelectFromModel(rf, prefit=True)
X_train1 = fs_model.transform(X_train)
X_train1.shape  # bare expression: only displays something in an interactive session
selected_features = X_train.columns[fs_model.get_support()]

# Level-one learners for the stack.
dt1 = tree.DecisionTreeClassifier(random_state=100)
knn2 = neighbors.KNeighborsClassifier()
gb3 = ensemble.GradientBoostingClassifier(random_state=100)

# Meta-classifier; with use_probas=True it is trained on the class
# probabilities of the level-one classifiers.
lr = linear_model.LogisticRegression(random_state=100)
stack_estimator = classifier.StackingClassifier(
    classifiers=[dt1, knn2, gb3],
    meta_classifier=lr,
    use_probas=True,
)

# Hyper-parameter grid searched with 10-fold cross-validation on the
# selected features.
stack_grid = {
    'decisiontreeclassifier__min_samples_split': [2, 3],
    'kneighborsclassifier__n_neighbors': [1, 3, 5, 8],
    'gradientboostingclassifier__n_estimators': [10, 50],
    'meta-logisticregression__C': [0.1, 10.0],
}
grid_stack_estimator = model_selection.GridSearchCV(
    stack_estimator,
    stack_grid,
    cv=10,
)
grid_stack_estimator.fit(X_train1, y_train)

# Report the best cross-validated score and the parameters that achieved it.
print(grid_stack_estimator.best_score_)
print(grid_stack_estimator.best_params_)
# One-hot encode the categorical columns, then drop identifier/free-text
# columns and the target to obtain the feature matrix.
cat_columns = ['Sex', 'Embarked', 'Pclass', 'Title', 'Age1', 'FamilySize1']
titanic_train1 = pd.get_dummies(titanic_train, columns=cat_columns)
titanic_train1.drop(
    ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Survived'],
    axis=1,
    inplace=True,
)

y_train = titanic_train['Survived']
X_train = titanic_train1

# Level-one learners and the logistic-regression meta-classifier.
knn = neighbors.KNeighborsClassifier()
gnb = naive_bayes.GaussianNB()
rf = ensemble.RandomForestClassifier(random_state=100)
lr = linear_model.LogisticRegression(random_state=100)

# use_probas=True: the meta-classifier is trained on level-one class
# probabilities; store_train_meta_features=True keeps those meta-features.
sclf = classifier.StackingClassifier(
    classifiers=[knn, gnb, rf],
    meta_classifier=lr,
    store_train_meta_features=True,
    use_probas=True,
)

# Hyper-parameter grid searched with 10-fold cross-validation; the best
# configuration is refitted and training scores are recorded as well.
st_grid = {
    'kneighborsclassifier__n_neighbors': [3, 4, 5],
    'randomforestclassifier__n_estimators': [10, 50],
    'meta-logisticregression__C': [0.1, 0.5],
}
grid_classifier = model_selection.GridSearchCV(
    sclf,
    st_grid,
    cv=10,
    refit=True,
    return_train_score=True,
)
grid_classifier.fit(X_train, y_train)

# Print the list of parameter combinations that were evaluated.
results = grid_classifier.cv_results_
print(results.get('params'))
# Horizontal bar chart of `features` (defined outside this section --
# presumably per-feature importances, verify upstream).
features.plot(kind='barh', figsize=(20, 20))

# Select features using the already-fitted model `rf` (prefit=True means it
# is not refitted here), and reduce the training matrix accordingly.
fs_model = feature_selection.SelectFromModel(rf, prefit=True)
X_train1 = fs_model.transform(X_train)
X_train1.shape  # bare expression: only displays something in an interactive session
selected_features = X_train.columns[fs_model.get_support()]

# Level-one learners and the logistic-regression meta-classifier.
rf1 = ensemble.RandomForestClassifier(random_state=100)
knn2 = neighbors.KNeighborsClassifier()
gb3 = ensemble.GradientBoostingClassifier(random_state=100)
lr = linear_model.LogisticRegression(random_state=100)
stack_estimator = classifier.StackingClassifier(
    classifiers=[rf1, knn2, gb3],
    meta_classifier=lr,
    store_train_meta_features=True,
)

# Hyper-parameter grid searched with 10-fold cross-validation on the
# selected features.
stack_grid = {
    'kneighborsclassifier__n_neighbors': [1, 5],
    'randomforestclassifier__n_estimators': [10, 50],
    'gradientboostingclassifier__n_estimators': [10, 50],
    'meta-logisticregression__C': [0.1, 10.0],
}
grid_stack_estimator = model_selection.GridSearchCV(
    stack_estimator,
    stack_grid,
    cv=10,
)
grid_stack_estimator.fit(X_train1, y_train)

# Report the best cross-validated score and the parameters that achieved it.
print(grid_stack_estimator.best_score_)
print(grid_stack_estimator.best_params_)
#
# fs_model = feature_selection.SelectFromModel(rf, prefit=True)
# X_train1 = fs_model.transform(X_train)
# X_train1.shape
# selected_features = X_train.columns[fs_model.get_support()]
#==============================================================================

# The feature-selection step above is commented out, so the stack below is
# trained on the full X_train.

# Level-one learners and the logistic-regression meta-classifier.
dt1 = tree.DecisionTreeClassifier(random_state=100)
rf2 = ensemble.RandomForestClassifier(random_state=100)
gb3 = ensemble.GradientBoostingClassifier(random_state=100)
lr = linear_model.LogisticRegression(random_state=100)
stack_estimator = mlxClassifier.StackingClassifier(
    classifiers=[dt1, rf2, gb3],
    meta_classifier=lr,
)

# Hyper-parameter grid searched with 10-fold cross-validation.
stack_grid = {
    'decisiontreeclassifier__min_samples_split': [2, 3],
    'randomforestclassifier__n_estimators': [5, 10],
    'gradientboostingclassifier__n_estimators': [10, 50],
    'meta-logisticregression__C': [0.1, 10.0],
}
grid_stack_estimator = model_selection.GridSearchCV(
    stack_estimator,
    stack_grid,
    cv=10,
)
grid_stack_estimator.fit(X_train, y_train)

# Report the best cross-validated score.
print(grid_stack_estimator.best_score_)
# Level-one learners. `knn_estimator` is defined outside this section.
kernel_svm_estimator = svm.SVC(kernel='rbf')

# Logistic regression on top of an RBF kernel transformation, as a pipeline.
stages = [
    ('features', kutils.KernelTransformer('rbf')),
    ('clf', linear_model.LogisticRegression()),
]
lr_pipeline = pipeline.Pipeline(stages)

rf_estimator = ensemble.RandomForestClassifier()
gb_estimator = ensemble.GradientBoostingClassifier()

stage1_estimators = [
    gb_estimator,
    knn_estimator,
    rf_estimator,
    kernel_svm_estimator,
    lr_pipeline,
]

# Level-two (meta) learner combining the level-one outputs.
stage2_estimator = linear_model.LogisticRegression()
stacking_estimator = mlxtnd.StackingClassifier(stage1_estimators, stage2_estimator)

# Hyper-parameter grid; the numeric ranges are written out explicitly.
stacking_grid = {
    'gradientboostingclassifier__max_depth': [2],
    'gradientboostingclassifier__n_estimators': [300, 400],
    'gradientboostingclassifier__learning_rate': [0.1, 0.2, 0.5, 1.0],
    'kneighborsclassifier__n_neighbors': [6, 7, 8, 9],
    'randomforestclassifier__max_depth': [6, 7],
    'randomforestclassifier__n_estimators': [200, 300],
    'svc__gamma': [0.001, 0.01],
    'svc__C': [0.001, 0.01, 1, 10],
    'meta_classifier__C': [0.1, 10.0],
}

# The search itself is delegated to the project helper (defined elsewhere).
stacking_final_estimator = cutils.grid_search_best_model(
    stacking_estimator,
    stacking_grid,
    X_train,
    y_train,
)

# Load the test set from the directory held in `dir` (set outside this
# section) and print its shape and column summary.
test_csv_path = os.path.join(dir, 'test.csv')
titanic_test = pd.read_csv(test_csv_path)
print(titanic_test.shape)
# NOTE(review): DataFrame.info() prints its summary itself and returns None,
# so this also prints a trailing "None" line.
print(titanic_test.info())