def test_custom_metric_and_scores_proba(self):
    """Custom metric with ``needs_proba=True``: stored scores must match
    sklearn's ``cross_val_score`` with an equivalent scorer."""
    # Reference scores computed directly with sklearn.
    ref_model = LogisticRegression(random_state=0)
    ref_scorer = make_scorer(roc_auc_score_universal, needs_proba=True)
    expected = cross_val_score(ref_model, X_train, y=y_train, cv=n_folds,
                               scoring=ref_scorer, n_jobs=1, verbose=0)

    stack = StackingTransformer([('logit', LogisticRegression(random_state=0))],
                                regression=False, n_folds=n_folds,
                                shuffle=False, variant='B', random_state=0,
                                stratified=True, needs_proba=True,
                                metric=roc_auc_score_universal, verbose=0)

    # fit then inspect the per-fold scores
    stack = stack.fit(X_train, y_train)
    scores_after_fit = stack.scores_[0].copy()

    # fit_transform (also checks refitting an already fitted transformer)
    _ = stack.fit_transform(X_train, y_train)
    scores_after_refit = stack.scores_[0].copy()

    assert_array_equal(expected, scores_after_fit)
    assert_array_equal(expected, scores_after_refit)

    # mean and std reported by the transformer
    assert_equal(np.mean(expected), stack.mean_std_[0][1])
    assert_equal(np.std(expected), stack.mean_std_[0][2])
def test_custom_metric_and_scores_labels(self):
    """Custom label-based metric (zero_one_loss): stored scores must match
    sklearn's ``cross_val_score`` with an equivalent scorer."""
    # Reference scores computed directly with sklearn.
    ref_model = LogisticRegression(random_state=0, solver='liblinear',
                                   multi_class='ovr')
    ref_scorer = make_scorer(zero_one_loss)
    expected = cross_val_score(ref_model, X_train, y=y_train, cv=n_folds,
                               scoring=ref_scorer, n_jobs=1, verbose=0)

    estimators = [('logit', LogisticRegression(random_state=0,
                                               solver='liblinear',
                                               multi_class='ovr'))]
    stack = StackingTransformer(estimators, regression=False,
                                n_folds=n_folds, shuffle=False, variant='B',
                                random_state=0, stratified=True,
                                metric=zero_one_loss, verbose=0)

    # fit then inspect the per-fold scores
    stack = stack.fit(X_train, y_train)
    scores_after_fit = stack.scores_[0].copy()

    # fit_transform (also checks refitting an already fitted transformer)
    _ = stack.fit_transform(X_train, y_train)
    scores_after_refit = stack.scores_[0].copy()

    assert_array_equal(expected, scores_after_fit)
    assert_array_equal(expected, scores_after_refit)

    # mean and std reported by the transformer
    assert_equal(np.mean(expected), stack.mean_std_[0][1])
    assert_equal(np.std(expected), stack.mean_std_[0][2])
def prepare_stacking_pipe(l1_estim, variantAB, metric_score, nfold):
    """Build a classification stacking pipeline.

    Level-1 out-of-fold features come from a StackingTransformer over
    ``l1_estim``; a logistic-regression meta-learner sits on top.

    l1_estim:     list of (name, estimator) tuples for level 1.
    variantAB:    vecstack variant, 'A' or 'B'.
    metric_score: callable metric reported during stacking.
    nfold:        number of CV folds.
    Returns a sklearn Pipeline with steps 'stacking' and 'meta_model'.
    """
    stacker = StackingTransformer(estimators=l1_estim,
                                  regression=False,
                                  variant=variantAB,
                                  needs_proba=False,
                                  metric=metric_score,
                                  n_folds=nfold,
                                  shuffle=True,
                                  stratified=True,
                                  random_state=SEED,
                                  verbose=2)
    meta = LogisticRegression(C=1, multi_class='ovr', penalty='l2',
                              solver='liblinear', random_state=SEED)
    return Pipeline([('stacking', stacker), ('meta_model', meta)])
def objective(params, keker):
    """Hyperopt-style objective: fit a one-model stacked pipeline, return MAPE.

    params: sampled hyperparameters (float-valued integer params are cast).
    keker:  regressor class to instantiate with ``params``.
    Returns the MAPE on (X_val, y_val); lower is better.
    """
    # Fix: work on a copy so the caller's search-space dict is not mutated
    # between trials.
    params = dict(params)
    params['max_depth'] = int(params['max_depth'])
    params['n_estimators'] = int(params['n_estimators'])
    if keker is LGBMRegressor:
        # LightGBM: keep num_leaves consistent with the tree depth.
        params['num_leaves'] = 2 ** params['max_depth']
    reg = keker(**params)
    estimators = [
        ('reg', reg),
    ]
    final_estimator = WeightedRegressor()
    stack = StackingTransformer(estimators=estimators, variant='A',
                                regression=True, n_folds=3, shuffle=False,
                                random_state=None)
    steps = [('stack', stack), ('final_estimator', final_estimator)]
    pipe = Pipeline(steps)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_val)
    score = mean_absolute_percentage_error(y_val, y_pred)
    logger.info(f'MAPE on valid: {score}, params: {params}')
    return score
def test_variant_B_labels(self):
    """Variant B with label predictions: output must match plain sklearn
    ``cross_val_predict`` (train) and a refit model's ``predict`` (test)."""
    # Reference computed directly with sklearn.
    ref_model = LogisticRegression(random_state=0)
    expected_train = cross_val_predict(ref_model, X_train, y=y_train,
                                       cv=n_folds, n_jobs=1, verbose=0,
                                       method='predict').reshape(-1, 1)
    ref_model = ref_model.fit(X_train, y_train)
    expected_test = ref_model.predict(X_test).reshape(-1, 1)

    stack = StackingTransformer([('logit', LogisticRegression(random_state=0))],
                                regression=False, n_folds=n_folds,
                                shuffle=False, variant='B', random_state=0,
                                stratified=True, verbose=0)

    # fit then transform
    stack = stack.fit(X_train, y_train)
    assert_array_equal(expected_train, stack.transform(X_train))
    assert_array_equal(expected_test, stack.transform(X_test))

    # fit_transform (also checks refitting an already fitted transformer)
    assert_array_equal(expected_train, stack.fit_transform(X_train, y_train))
    assert_array_equal(expected_test, stack.transform(X_test))
def test_variant_B_default_classifier_proba(self):
    """Variant B, ``estimators=None`` falls back to the default classifier.

    With ``needs_proba=True`` the transformer output must match plain
    sklearn ``cross_val_predict`` / ``predict_proba`` computed with the
    same DummyClassifier.
    """
    # reference
    model = DummyClassifier(strategy='constant', constant=1)
    S_train_1 = cross_val_predict(model, X_train, y=y_train, cv=n_folds,
                                  n_jobs=1, verbose=0,
                                  method='predict_proba')
    model = model.fit(X_train, y_train)
    S_test_1 = model.predict_proba(X_test)
    # fit then transform (no estimators given -> default estimator is used)
    stack = StackingTransformer(estimators=None, regression=False,
                                n_folds=n_folds, shuffle=False, variant='B',
                                random_state=0, needs_proba=True, verbose=0)
    stack = stack.fit(X_train, y_train)
    S_train_2 = stack.transform(X_train)
    S_test_2 = stack.transform(X_test)
    # fit_transform
    # also check refitting already fitted transformer
    S_train_3 = stack.fit_transform(X_train, y_train)
    S_test_3 = stack.transform(X_test)
    # compare
    assert_array_equal(S_train_1, S_train_2)
    assert_array_equal(S_test_1, S_test_2)
    assert_array_equal(S_train_1, S_train_3)
    assert_array_equal(S_test_1, S_test_3)
def test_variant_A_proba_shuffle_random_state(self):
    """Variant A with predict_proba, ``shuffle=True`` and fixed random_state.

    The reference test-set prediction is built manually: each fold's model
    predicts the full test set, then per-class columns are averaged across
    folds. The reference train-set prediction passes the *same* CV
    generator to ``cross_val_predict`` so the shuffled splits line up.
    """
    S_test_1 = np.zeros((X_test.shape[0], n_classes))
    # One block of n_classes columns per fold.
    S_test_temp = np.zeros((X_test.shape[0], n_folds * n_classes))
    # Using StratifiedKFold because by default cross_val_predict uses
    # StratifiedKFold
    kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)
    for fold_counter, (tr_index, te_index) in enumerate(
            kf.split(X_train, y_train)):
        # Split data and target
        X_tr = X_train[tr_index]
        y_tr = y_train[tr_index]
        # X_te = X_train[te_index]
        # y_te = y_train[te_index]
        model = LogisticRegression(random_state=0)
        model = model.fit(X_tr, y_tr)
        # This fold's n_classes-wide column block.
        col_slice_fold = slice(fold_counter * n_classes,
                               fold_counter * n_classes + n_classes)
        S_test_temp[:, col_slice_fold] = model.predict_proba(X_test)
    # Average each class's probability column over all folds.
    for class_id in range(n_classes):
        S_test_1[:, class_id] = np.mean(S_test_temp[:, class_id::n_classes],
                                        axis=1)
    model = LogisticRegression(random_state=0)
    # !!! Important. Here we pass CV-generator ``cv=kf`` not number of folds
    S_train_1 = cross_val_predict(model, X_train, y=y_train, cv=kf,
                                  n_jobs=1, verbose=0,
                                  method='predict_proba')
    # fit then transform
    estimators = [('logit', LogisticRegression(random_state=0))]
    stack = StackingTransformer(estimators, regression=False,
                                n_folds=n_folds, shuffle=True, variant='A',
                                random_state=0, stratified=True,
                                needs_proba=True, verbose=0)
    stack = stack.fit(X_train, y_train)
    S_train_2 = stack.transform(X_train)
    S_test_2 = stack.transform(X_test)
    # fit_transform
    # also check refitting already fitted transformer
    S_train_3 = stack.fit_transform(X_train, y_train)
    S_test_3 = stack.transform(X_test)
    # compare
    assert_array_equal(S_train_1, S_train_2)
    assert_array_equal(S_test_1, S_test_2)
    assert_array_equal(S_train_1, S_train_3)
    assert_array_equal(S_test_1, S_test_3)
def test_variant_A_labels(self):
    """Variant A with label (class) predictions.

    Reference test-set prediction is the per-fold majority vote
    (``scipy.stats.mode`` over the fold columns); reference train-set
    prediction comes from ``cross_val_predict``.
    """
    S_test_temp = np.zeros((X_test.shape[0], n_folds))
    # Using StratifiedKFold because by default cross_val_predict uses
    # StratifiedKFold.
    # Fix: random_state removed — with shuffle=False it had no effect, and
    # modern scikit-learn raises ValueError when random_state is set
    # while shuffle is False.
    kf = StratifiedKFold(n_splits=n_folds, shuffle=False)
    for fold_counter, (tr_index, te_index) in enumerate(
            kf.split(X_train, y_train)):
        # Split data and target
        X_tr = X_train[tr_index]
        y_tr = y_train[tr_index]
        model = LogisticRegression(random_state=0, solver='liblinear',
                                   multi_class='ovr')
        model = model.fit(X_tr, y_tr)
        S_test_temp[:, fold_counter] = model.predict(X_test)
    # Majority vote across fold columns gives the reference test prediction.
    S_test_1 = st.mode(S_test_temp, axis=1)[0]
    model = LogisticRegression(random_state=0, solver='liblinear',
                               multi_class='ovr')
    S_train_1 = cross_val_predict(model, X_train, y=y_train, cv=n_folds,
                                  n_jobs=1, verbose=0,
                                  method='predict').reshape(-1, 1)
    # fit then transform
    estimators = [('logit', LogisticRegression(random_state=0,
                                               solver='liblinear',
                                               multi_class='ovr'))]
    stack = StackingTransformer(estimators, regression=False,
                                n_folds=n_folds, shuffle=False, variant='A',
                                random_state=0, stratified=True, verbose=0)
    stack = stack.fit(X_train, y_train)
    S_train_2 = stack.transform(X_train)
    S_test_2 = stack.transform(X_test)
    # fit_transform
    # also check refitting already fitted transformer
    S_train_3 = stack.fit_transform(X_train, y_train)
    S_test_3 = stack.transform(X_test)
    # compare
    assert_array_equal(S_train_1, S_train_2)
    assert_array_equal(S_test_1, S_test_2)
    assert_array_equal(S_train_1, S_train_3)
    assert_array_equal(S_test_1, S_test_3)
def test_variant_B_2_estimators_proba(self):
    """Variant B with two estimators and predict_proba output columns."""
    # Reference: per-estimator sklearn predictions, concatenated column-wise.
    expected_train_parts, expected_test_parts = [], []
    for base in (LogisticRegression(random_state=0, solver='liblinear',
                                    multi_class='ovr'),
                 GaussianNB()):
        expected_train_parts.append(
            cross_val_predict(base, X_train, y=y_train, cv=n_folds,
                              n_jobs=1, verbose=0, method='predict_proba'))
        base = base.fit(X_train, y_train)
        expected_test_parts.append(base.predict_proba(X_test))
    expected_train = np.c_[expected_train_parts[0], expected_train_parts[1]]
    expected_test = np.c_[expected_test_parts[0], expected_test_parts[1]]

    estimators = [('logit', LogisticRegression(random_state=0,
                                               solver='liblinear',
                                               multi_class='ovr')),
                  ('bayes', GaussianNB())]
    stack = StackingTransformer(estimators, regression=False,
                                n_folds=n_folds, shuffle=False, variant='B',
                                random_state=0, stratified=True,
                                needs_proba=True, verbose=0)

    # fit then transform
    stack = stack.fit(X_train, y_train)
    assert_array_equal(expected_train, stack.transform(X_train))
    assert_array_equal(expected_test, stack.transform(X_test))

    # fit_transform (also checks refitting an already fitted transformer)
    assert_array_equal(expected_train, stack.fit_transform(X_train, y_train))
    assert_array_equal(expected_test, stack.transform(X_test))
def main():
    """End-to-end script: read data, stack XGB/LGBM/CatBoost, write submission."""
    logger.info('start reading...')
    df_train = pd.read_csv('data/train.csv', parse_dates=['OrderedDate'])
    df_val = pd.read_csv('data/validation.csv', parse_dates=['OrderedDate'])
    # NOTE(review): validation rows are appended to the training set, yet MAPE
    # is later reported on this same validation split — the reported score is
    # on seen data. Confirm this is an intentional final refit.
    df_train = pd.concat([df_train, df_val], axis=0)
    df_test = pd.read_csv('data/test.csv', parse_dates=['OrderedDate'])
    logger.info('end reading')
    logger.info('start preprocessing...')
    X_train = preprocess(df_train)
    y_train = df_train['RTA']
    X_val = preprocess(df_val)
    y_val = df_val['RTA']
    X_test = preprocess(df_test)
    logger.info('end preprocessing.')
    # Level-1 base regressors; WeightedRegressor blends their predictions.
    estimators = [('xgb', XGBRegressor(**xgb_params)),
                  ('lgb', LGBMRegressor(**lgb_params)),
                  ('cat', CatBoostRegressor(**cat_params))]
    final_estimator = WeightedRegressor()
    stack = StackingTransformer(estimators=estimators, variant='A',
                                regression=True, n_folds=5, shuffle=False,
                                random_state=None)
    steps = [('stack', stack), ('final_estimator', final_estimator)]
    pipe = Pipeline(steps)
    logger.info('start training...')
    pipe.fit(X_train, y_train)
    logger.info('end training.')
    y_pred = pipe.predict(X_val)
    logger.info(
        f'MAPE on valid: {mean_absolute_percentage_error(y_val, y_pred)}')
    y_test = pipe.predict(X_test)
    df_test['Prediction'] = y_test
    df_test = df_test[['Id', 'Prediction']]
    # NOTE(review): index=None — likely intended as index=False (suppress the
    # index column); verify pandas treats None as falsy here.
    df_test.to_csv('data/submission.csv', index=None)
    logger.info('the end!')
def stack_predict(self, df, holdout, pipes, amount=2):
    """Stack the top ``amount`` pipelines' models and predict on the holdout.

    Returns (final_estimator, y_test, predictions_on_holdout).
    """
    X, y = self.split_x_y(df)
    X_test, y_test = self.split_x_y(holdout)

    # Reuse the best pipeline's preprocessing (every step but the model).
    preprocess = Pipeline(self.top_pipeline(pipes).steps[:-1])
    X = preprocess.fit_transform(X)
    X_test = preprocess.transform(X_test)

    # The final step of each top-ranked pipeline becomes a base estimator.
    estimators = [(str(rank), self.top_pipeline(pipes, rank).steps[-1][1])
                  for rank in range(amount)]

    # Regression task iff the configured metric is a regression scorer.
    regression = self.METRIC in {
        "explained_variance",
        "neg_mean_absolute_error",
        "neg_mean_squared_error",
        "neg_mean_squared_log_error",
        "neg_median_absolute_error",
        "r2",
    }

    stack = StackingTransformer(estimators, regression)
    stack.fit(X, y)
    S_train = stack.transform(X)
    S_test = stack.transform(X_test)

    # Best base model doubles as the meta-learner on the stacked features.
    final_estimator = estimators[0][1]
    final_estimator.fit(S_train, y)
    return final_estimator, y_test, final_estimator.predict(S_test)
def lvl2_xgb_vsrandomsearch(rawdf, results_dir, pp_choice, param_dir,
                            passthrough, final_pp_choice=None):
    """Tune a level-2 XGB meta-model with RandomizedSearchCV on stacked features.

    rawdf:           raw frame; all columns but the last are features, last is target.
    results_dir:     directory where CV results are pickled.
    pp_choice:       key selecting the level-1 preprocessing pipeline.
    param_dir:       pickle of level-1 RandomizedSearchCV results per model.
    passthrough:     if True, raw features are concatenated to stacked ones.
    final_pp_choice: preprocessing choice for the final estimator (passthrough only).
    """
    x_train = rawdf.iloc[:, :-1]
    y_train = rawdf.iloc[:, -1]
    model_store = ['rf', 'et', 'xgb']
    model_object = {
        'xgb': XGBRegressor(),
        'rf': RandomForestRegressor(),
        'et': ExtraTreesRegressor()
    }
    with open(param_dir, 'rb') as f:
        model_results = pickle.load(f)
    # Sort each model's CV table so the best mean test score comes first.
    model_results = {k: pd.DataFrame(v).sort_values('mean_test_score',
                                                    ascending=False)
                     for k, v in model_results.items()}
    # Strip the pipeline prefix ('step__param' -> 'param') and apply params.
    # NOTE(review): .loc[0, 'params'] selects the row with original *label* 0,
    # not the positional best after sorting — confirm this is intended.
    model_object = {k: model_object[k].set_params(
                        **{kk.split('__')[1]: vv
                           for kk, vv in v.loc[0, 'params'].items()})
                    for k, v in model_results.items()}
    preprocess_pipeline = pp_selector(pp_choice)
    lvl1_pipeline = [(model_name, model_object[model_name])
                     for model_name in model_store]
    stack = StackingTransformer(estimators=lvl1_pipeline,  # base estimators
                                regression=True,  # regression task (if you need
                                                  # classification - set to False)
                                variant='A',  # oof for train set, predict test
                                              # set in each fold and find mean
                                metric=rmsle,  # metric: callable
                                n_folds=5,  # number of folds
                                shuffle=True,  # shuffle the data
                                random_state=0,  # ensure reproducibility
                                verbose=0)
    stack.fit(preprocess_pipeline.fit_transform(x_train), y_train)
    # NOTE(review): the preprocessing pipeline is fit_transform-ed a second
    # time on the same data — redundant (same result) but wasteful.
    s_train = stack.transform(preprocess_pipeline.fit_transform(x_train))
    if passthrough:
        # Raw features pass through alongside the stacked level-1 outputs.
        final_est = Pipeline([
            ('final_preprocess',
             final_est_pipeline(feature_names=x_train.columns.tolist(),
                                preprocess_pipeline=pp_selector(final_pp_choice),
                                no_of_lvl1=len(lvl1_pipeline))),
            # ('debugger', DebuggerTransformer(info='final')),
            ('final_est', XGBRegressor())
        ])
        est_name = 'final_est__'
        train = np.concatenate((s_train, x_train.values), axis=1)
    else:
        final_est = XGBRegressor()
        est_name = ''
        train = s_train
    # Search space for the level-2 XGB (prefixed when inside a Pipeline).
    final_estimator_params = {f'{est_name}n_estimators':
                                  scipy.stats.randint(150, 1000),
                              f'{est_name}learning_rate':
                                  scipy.stats.uniform(0.01, 0.59),
                              f'{est_name}subsample':
                                  scipy.stats.uniform(0.3, 0.6),
                              f'{est_name}max_depth':
                                  scipy.stats.randint(1, 16),
                              f'{est_name}colsample_bytree':
                                  scipy.stats.uniform(0.5, 0.4),
                              f'{est_name}min_child_weight': [1, 2, 3, 4],
                              f'{est_name}gamma':
                                  scipy.stats.expon(scale=0.05),
                              }
    est = RandomizedSearchCV(final_est,
                             param_distributions=final_estimator_params,
                             cv=5, n_iter=100,
                             scoring=make_scorer(rmsle,
                                                 greater_is_better=False),
                             verbose=1, n_jobs=-1)
    est.fit(train, y_train)
    score = {'lvl2ptvs_xgb': est.cv_results_}
    results_dir = create_results_directory(results_dir)
    with open(f'{results_dir}/results_store.pkl', 'wb') as f:
        pickle.dump(score, f)
def test_variant_A_2_estimators_proba(self):
    """Variant A with two estimators and predict_proba.

    For each estimator, the reference test prediction averages each class's
    probability column over the per-fold models; the reference train
    prediction comes from ``cross_val_predict``. Both estimators'
    predictions are concatenated column-wise.
    """
    # Estimator 1
    S_test_1_e1 = np.zeros((X_test.shape[0], n_classes))
    S_test_temp_e1 = np.zeros((X_test.shape[0], n_folds * n_classes))
    # Using StratifiedKFold because by default cross_val_predict uses
    # StratifiedKFold.
    # Fix: random_state removed — with shuffle=False it had no effect, and
    # modern scikit-learn raises ValueError when random_state is set
    # while shuffle is False.
    kf = StratifiedKFold(n_splits=n_folds, shuffle=False)
    for fold_counter, (tr_index, te_index) in enumerate(
            kf.split(X_train, y_train)):
        # Split data and target
        X_tr = X_train[tr_index]
        y_tr = y_train[tr_index]
        model = LogisticRegression(random_state=0, solver='liblinear',
                                   multi_class='ovr')
        model = model.fit(X_tr, y_tr)
        # This fold's n_classes-wide column block.
        col_slice_fold = slice(fold_counter * n_classes,
                               fold_counter * n_classes + n_classes)
        S_test_temp_e1[:, col_slice_fold] = model.predict_proba(X_test)
    for class_id in range(n_classes):
        S_test_1_e1[:, class_id] = np.mean(
            S_test_temp_e1[:, class_id::n_classes], axis=1)
    model = LogisticRegression(random_state=0, solver='liblinear',
                               multi_class='ovr')
    S_train_1_e1 = cross_val_predict(model, X_train, y=y_train, cv=n_folds,
                                     n_jobs=1, verbose=0,
                                     method='predict_proba')
    # Estimator 2
    S_test_1_e2 = np.zeros((X_test.shape[0], n_classes))
    S_test_temp_e2 = np.zeros((X_test.shape[0], n_folds * n_classes))
    # Same fix as above: no random_state with shuffle=False.
    kf = StratifiedKFold(n_splits=n_folds, shuffle=False)
    for fold_counter, (tr_index, te_index) in enumerate(
            kf.split(X_train, y_train)):
        # Split data and target
        X_tr = X_train[tr_index]
        y_tr = y_train[tr_index]
        model = GaussianNB()
        model = model.fit(X_tr, y_tr)
        col_slice_fold = slice(fold_counter * n_classes,
                               fold_counter * n_classes + n_classes)
        S_test_temp_e2[:, col_slice_fold] = model.predict_proba(X_test)
    for class_id in range(n_classes):
        S_test_1_e2[:, class_id] = np.mean(
            S_test_temp_e2[:, class_id::n_classes], axis=1)
    model = GaussianNB()
    S_train_1_e2 = cross_val_predict(model, X_train, y=y_train, cv=n_folds,
                                     n_jobs=1, verbose=0,
                                     method='predict_proba')
    S_train_1 = np.c_[S_train_1_e1, S_train_1_e2]
    S_test_1 = np.c_[S_test_1_e1, S_test_1_e2]
    # fit then transform
    estimators = [('logit', LogisticRegression(random_state=0,
                                               solver='liblinear',
                                               multi_class='ovr')),
                  ('bayes', GaussianNB())]
    stack = StackingTransformer(estimators, regression=False,
                                n_folds=n_folds, shuffle=False, variant='A',
                                random_state=0, stratified=True,
                                needs_proba=True, verbose=0)
    stack = stack.fit(X_train, y_train)
    S_train_2 = stack.transform(X_train)
    S_test_2 = stack.transform(X_test)
    # fit_transform
    # also check refitting already fitted transformer
    S_train_3 = stack.fit_transform(X_train, y_train)
    S_test_3 = stack.transform(X_test)
    # compare
    assert_array_equal(S_train_1, S_train_2)
    assert_array_equal(S_test_1, S_test_2)
    assert_array_equal(S_train_1, S_train_3)
    assert_array_equal(S_test_1, S_test_3)
# NOTE(review): fragment — the opening of this estimator list (and the
# search_* objects holding tuned hyperparameters) lies outside this chunk.
    # NOTE(review): best_params_ (a dict) is passed positionally, so it lands
    # in Lasso's first parameter (alpha) rather than being unpacked — likely
    # should be Lasso(**search_Lasso.best_params_, ...). Confirm.
    ('lasso', Lasso(search_Lasso.best_params_, random_state=0)),
    ('svm', SVR(kernel='poly', degree=2)),
    ('kn', KNeighborsRegressor(n_jobs=-1, n_neighbors=4)),
    ('et', ExtraTreesRegressor(**search_EX.best_params_, random_state=0,
                               n_jobs=-1)),
    ('rf', RandomForestRegressor(**search_RF.best_params_, random_state=0,
                                 n_jobs=-1)),
    ('xgb', XGBRegressor(**search_skrg.best_params_,
                         objective='reg:squarederror',
                         tree_method='gpu_hist', random_state=0, n_jobs=-1))
]

# Level-1 stacking over the estimator bank above.
stack = StackingTransformer(estimators=estimators_L1,  # base estimators
                            regression=True,  # regression task (if you need
                                              # classification - set to False)
                            variant='B',  # oof for train set, predict test
                                          # set in each fold and find mean
                            metric=mean_absolute_error,  # metric: callable
                            n_folds=5,  # number of folds
                            shuffle=False,  # shuffle the data
                            random_state=0,  # ensure reproducibility
                            verbose=1)
stack = stack.fit(X_train, y_train)
S_train = stack.transform(X_train)
S_test = stack.transform(X_test)

# Level-2 meta-model trained on the stacked features.
final_estimator = XGBRegressor(random_state=0, n_jobs=-1, learning_rate=0.4,
                               tree_method='gpu_hist',
                               objective='reg:squarederror', n_estimators=300,
                               max_depth=18)
final_estimator = final_estimator.fit(S_train, y_train)
# Predict
y_pred = final_estimator.predict(S_test)
print("catboost params random chosen: ", cb_params_rand)

# Second bank of base models; each wrapped booster only sees the chosen
# column subset.
pipe_models_2 = [
    ('xgb', make_pipeline(ColumnSelector(cols=tuple(subset)),
                          WrapXGB(**xgb_params_rand))),
    ('lgbm', make_pipeline(ColumnSelector(cols=tuple(subset)),
                           WrapLGB(**lgb_params_rand))),
    ('cb', make_pipeline(ColumnSelector(cols=tuple(subset)),
                         WrapCB(**cb_params_rand))),
]

# Both stacking transformers share identical settings; only the model bank
# differs.
_stack_kwargs = dict(regression=False, variant='A', needs_proba=True,
                     metric=auc, n_folds=5, stratified=True, shuffle=True,
                     random_state=0, verbose=1)
stack1 = StackingTransformer(pipe_models_1, **_stack_kwargs)
stack2 = StackingTransformer(pipe_models_2, **_stack_kwargs)
def final_submission(X_train, Y, X_test):
    """Build the final submission file (submit.csv).

    Blends (1) an XGB meta-model over stacked LGBM/RF probabilities plus
    their 0.7/0.3 weighted average, with (2) a 0.7/0.3 weighted average of
    LGBM/RF trained on an under-sampled dataset; writes argmax labels.

    NOTE(review): X_train1 / X_test1 are module-level globals (extra feature
    blocks row-aligned with X_train / X_test) — confirm their provenance.
    """
    # Define base estimators for stacking
    estimators = [('lgbm', LGBMClassifier(random_state=0, n_estimators=520,
                                          learning_rate=0.1, num_leaves=31,
                                          is_unbalance=True)),
                  ('rf', RandomForestClassifier(random_state=0, max_depth=10,
                                                class_weight={0: 0.2,
                                                              1: 0.8},
                                                n_estimators=500,
                                                max_features=None,
                                                n_jobs=4))]
    # Perform stacking
    stack = StackingTransformer(estimators, regression=False, verbose=2,
                                needs_proba=True, stratified=True,
                                shuffle=True)
    stack = stack.fit(X_train, Y)
    # Get the stacked features
    S_train = stack.transform(X_train)
    S_test = stack.transform(X_test)
    # Also take the weighted average of the stacked features as another feature
    # (columns 0/1 are LGBM class probabilities, 2/3 are RF's).
    S_train_av, S_test_av = np.zeros(
        (len(S_train), 2), dtype=np.float32), np.zeros((len(S_test), 2),
                                                       dtype=np.float32)
    for index, vals in enumerate(S_train):
        S_train_av[index, 0] = (vals[0] * 0.7) + (vals[2] * 0.3)
        S_train_av[index, 1] = (vals[1] * 0.7) + (vals[3] * 0.3)
    for index, vals in enumerate(S_test):
        S_test_av[index, 0] = (vals[0] * 0.7) + (vals[2] * 0.3)
        S_test_av[index, 1] = (vals[1] * 0.7) + (vals[3] * 0.3)
    # Define the final estimator
    model = XGBClassifier(random_state=0, n_jobs=4, max_depth=4,
                          scale_pos_weight=2.5, n_estimators=200,
                          learning_rate=0.1, gamma=1)
    model.fit(np.concatenate((S_train, S_train_av, X_train1), axis=1), Y)
    preds4 = model.predict_proba(
        np.concatenate((S_test, S_test_av, X_test1), axis=1))
    # Now perform random under-sampling on the data
    rus = RandomUnderSampler(random_state=0, sampling_strategy=0.3)
    X_train, Y_ = rus.fit_resample(X_train, Y)
    # Get predictions from models on this majority class under-sampled dataset
    model1 = LGBMClassifier(random_state=0, n_estimators=100,
                            learning_rate=0.1, num_leaves=31,
                            categorical_feature=[8, 9, 10, 11, 12, 13, 14])
    model2 = RandomForestClassifier(random_state=0, max_depth=13,
                                    n_estimators=100, max_features=None,
                                    n_jobs=4,
                                    class_weight={0: 0.4, 1: 0.6})
    model1.fit(X_train, Y_), model2.fit(X_train, Y_)
    preds1, preds2 = model1.predict_proba(X_test), model2.predict_proba(X_test)
    # Get weighted average predictions
    preds3 = list()
    for a, b in zip(preds1, preds2):
        preds3.append([(0.7 * a[0]) + (0.3 * b[0]),
                       (0.7 * a[1]) + (0.3 * b[1])])
    # Finally, perform weighted average prediction of stacked ensemble and
    # weighted average ensemble
    preds = list()
    for a, b in zip(preds3, preds4):
        preds.append([(0.5 * a[0]) + (0.5 * b[0]),
                      (0.5 * a[1]) + (0.5 * b[1])])
    preds = np.array(preds)
    preds = np.argmax(preds, axis=1)
    # Make the submission!
    # NOTE(review): prefer a with-block so the file is closed on error too.
    fp = open("submit.csv", "w")
    fp.write("labels\n")
    for pred in preds:
        fp.write(str(pred) + "\n")
    fp.close()
def test_variant_B_verbose(self):
    """The verbose level must not affect variant-B label output."""
    # Reference predictions via plain sklearn.
    ref = LogisticRegression(random_state=0, solver='liblinear',
                             multi_class='ovr')
    expected_train = cross_val_predict(ref, X_train, y=y_train, cv=n_folds,
                                       n_jobs=1, verbose=0,
                                       method='predict').reshape(-1, 1)
    ref = ref.fit(X_train, y_train)
    expected_test = ref.predict(X_test).reshape(-1, 1)

    # Identical checks for every supported verbosity level.
    for verbose_level in (0, 1, 2):
        estimators = [('lr', LogisticRegression(random_state=0,
                                                solver='liblinear',
                                                multi_class='ovr'))]
        stack = StackingTransformer(estimators, regression=False,
                                    n_folds=n_folds, shuffle=False,
                                    variant='B', random_state=0,
                                    stratified=True, verbose=verbose_level)
        # fit then transform
        stack = stack.fit(X_train, y_train)
        assert_array_equal(expected_train, stack.transform(X_train))
        assert_array_equal(expected_test, stack.transform(X_test))
        # fit_transform (also checks refitting an already fitted transformer)
        assert_array_equal(expected_train,
                           stack.fit_transform(X_train, y_train))
        assert_array_equal(expected_test, stack.transform(X_test))
# Create training and testing sets X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0, test_size=0.1) X, Y = X_train, Y_train # Perform Feature ranking with recursive feature elimination (5 features only) selector = RFE(LinearSVC(random_state=0, class_weight='balanced', tol=1.0, max_iter=1000), 5, step=3) X, X_test = selector.fit_transform(X, Y), selector.transform(X_test) print(X.shape) print(selector) print(X_test) # Features Selected by above step #['Clusterin_Apo_J', 'Cystatin_C', 'FAS', 'NrCAM', 'tau'] # Perform Stacking models = [('xgb', XGBClassifier(random_state=0)), ('svc', LinearSVC(random_state=0, class_weight='balanced', tol=1.0, max_iter=1000))] stack = StackingTransformer(models, regression=False, verbose=0) stack = stack.fit(X, Y) pickle.dump(stack, open('stacker.pkl','wb')) X, X_test = np.concatenate((X, stack.transform(X)), axis=1), np.concatenate((X_test, stack.transform(X_test)), axis=1) # Let's test the effect of different threshold values and record it # in order to take the best threshold valuess threshold, final_scores = [-0.8], list() for i in range(21): scores, roc_auc = list(), 0 kfolds = StratifiedKFold(n_splits=10, shuffle=True, random_state=0) for train, test in kfolds.split(X, Y): x_train, x_test = X[train], X[test] y_train, y_test = Y[train], Y[test] model = LinearSVC(random_state=0, tol=1.0, class_weight='balanced')
# NOTE(review): fragment — this chunk begins mid-call; the constructor that
# receives n_estimators=500 (and voting_models / bag_knn / bag_dt / SEED /
# *_params definitions) lies outside this view.
                          n_estimators=500)

# boosting
ada = AdaBoostClassifier(random_state=SEED, **ada_params)
gb = GradientBoostingClassifier(random_state=SEED, **gb_params)
# NOTE(review): 'xgboost_paramas' looks like a typo of 'xgboost_params' —
# cannot verify the definition from this chunk.
xgboost = xgb.XGBClassifier(random_state=SEED, **xgboost_paramas)

# define estimator for stacking transformer
estimator = [('voting', voting_models), ('bag_knn', bag_knn),
             ('bag_dt', bag_dt), ('ada', ada), ('gb', gb),
             ('xgboost', xgboost)]

stack = StackingTransformer(estimator, regression=False, needs_proba=False,
                            variant='A', metric=metrics.accuracy_score,
                            n_folds=4, stratified=True, shuffle=True,
                            random_state=0, verbose=2)
stack = stack.fit(X_train, y_train)
filename = 'stack.sav'
# NOTE(review): the open() handle is never closed — prefer a with-block.
pickle.dump(stack, open(filename, 'wb'))

# stacked feature
S_train = stack.transform(X_train)
S_test = stack.transform(X_test)

# Create the parameter grid (continues beyond this chunk)
params = {
    'eta': [0.1, 0.2, 0.3, 0.4, 0.5],
def cv(num_splits, X_train, Y):
    """Cross-validated F1 evaluation of the full blending scheme.

    For each stratified fold: stack LGBM/RF probabilities into an XGB
    meta-model (plus their 0.7/0.3 weighted average and the X_train1 extra
    feature block), blend with an under-sampled LGBM/RF ensemble, then
    score F1 and print per-fold and average scores.

    num_splits: number of stratified folds.
    X_train, Y: full feature matrix and labels.
    NOTE(review): X_train1 is a module-level global row-aligned with X_train.
    """
    # Define the type of cross-validation.
    # Fix: the original passed random_state=0 without shuffle=True, which
    # modern scikit-learn rejects (ValueError) and which had no effect
    # anyway, since unshuffled splits are deterministic.
    kf, scores = StratifiedKFold(n_splits=num_splits), list()
    # Perform CV
    for train_index, test_index in kf.split(X_train, Y):
        # Splitting into train and test
        x_train, y_train, x_train1 = X_train[train_index], Y[
            train_index], X_train1[train_index]
        x_test, y_test, x_test1 = X_train[test_index], Y[test_index], X_train1[
            test_index]
        # Define base estimators for stacking
        estimators = [('lgbm', LGBMClassifier(random_state=0,
                                              n_estimators=520,
                                              learning_rate=0.1,
                                              num_leaves=31,
                                              is_unbalance=True)),
                      ('rf', RandomForestClassifier(random_state=0,
                                                    max_depth=10,
                                                    class_weight={0: 0.2,
                                                                  1: 0.8},
                                                    n_estimators=500,
                                                    max_features=None,
                                                    n_jobs=4))]
        # Perform stacking
        stack = StackingTransformer(estimators, regression=False, verbose=2,
                                    needs_proba=True, stratified=True,
                                    shuffle=True)
        stack = stack.fit(x_train, y_train)
        # Get the stacked features
        S_train = stack.transform(x_train)
        S_test = stack.transform(x_test)
        # Weighted average of the stacked features as extra features
        # (columns 0/1: LGBM probabilities, 2/3: RF probabilities).
        S_train_av, S_test_av = np.zeros(
            (len(S_train), 2), dtype=np.float32), np.zeros((len(S_test), 2),
                                                           dtype=np.float32)
        for index, vals in enumerate(S_train):
            S_train_av[index, 0] = (vals[0] * 0.7) + (vals[2] * 0.3)
            S_train_av[index, 1] = (vals[1] * 0.7) + (vals[3] * 0.3)
        for index, vals in enumerate(S_test):
            S_test_av[index, 0] = (vals[0] * 0.7) + (vals[2] * 0.3)
            S_test_av[index, 1] = (vals[1] * 0.7) + (vals[3] * 0.3)
        # Define the final estimator
        model = XGBClassifier(random_state=0, n_jobs=4, max_depth=4,
                              scale_pos_weight=2.5, n_estimators=200,
                              learning_rate=0.1, gamma=1)
        model.fit(np.concatenate((S_train, S_train_av, x_train1), axis=1),
                  y_train)
        preds4 = model.predict_proba(
            np.concatenate((S_test, S_test_av, x_test1), axis=1))
        # Now perform random under-sampling on the data
        rus = RandomUnderSampler(random_state=0, sampling_strategy=0.3)
        x_train, y_train_ = rus.fit_resample(x_train, y_train)
        # Get predictions from models on this majority class under-sampled
        # dataset
        model1 = LGBMClassifier(random_state=0, n_estimators=100,
                                learning_rate=0.1, num_leaves=31,
                                categorical_feature=[8, 9, 10, 11, 12, 13,
                                                     14])
        model2 = RandomForestClassifier(random_state=0, max_depth=13,
                                        n_estimators=100, max_features=None,
                                        n_jobs=4,
                                        class_weight={0: 0.4, 1: 0.6})
        model1.fit(x_train, y_train_), model2.fit(x_train, y_train_)
        preds1, preds2 = model1.predict_proba(x_test), model2.predict_proba(
            x_test)
        # Get weighted average predictions
        preds3 = list()
        for a, b in zip(preds1, preds2):
            preds3.append([(0.7 * a[0]) + (0.3 * b[0]),
                           (0.7 * a[1]) + (0.3 * b[1])])
        # Finally, perform weighted average prediction of stacked ensemble
        # and weighted average ensemble
        preds = list()
        for a, b in zip(preds3, preds4):
            preds.append([(0.5 * a[0]) + (0.5 * b[0]),
                          (0.5 * a[1]) + (0.5 * b[1])])
        preds = np.array(preds)
        preds = np.argmax(preds, axis=1)
        # Check out the score
        scores.append(f1_score(y_test, preds))
        print("Score: ", scores[-1])
    print("Average Score: ", sum(scores) / len(scores))
df_train_set['is_weekend'] = np.where(df_train_set['travel_date'] >= 5, 1, 0) # ------ model X = df_train_set.drop(["number_of_tickets"], axis=1) y = df_train_set.number_of_tickets X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25, shuffle=True) from vecstack import StackingTransformer estimators_L1 = [ ('et', ExtraTreesRegressor(n_estimators=100, criterion="mae", random_state=1)), ('rf', RandomForestRegressor(n_estimators=100, criterion="mse",max_depth=10,min_samples_split=9,min_samples_leaf=1,min_weight_fraction_leaf=0,max_leaf_nodes=None,min_impurity_decrease=0.0005)), ('xgb', XGBRegressor(n_estimators=100, criterion="mae", max_depth=12, subsample=0.5, learning_rate=0.05, colsample_bytree=0.9)) ] # Stacking stack = StackingTransformer(estimators=estimators_L1,regression=True,shuffle=True,random_state=0,verbose=2, stratified=True, n_folds=5) stack = stack.fit(X_train, y_train) S_train = stack.transform(X_train) S_test = stack.transform(X_test) # Use 2nd level estimator to get final prediction estimator_L2 = XGBRegressor(random_state=0,n_jobs=-1,learning_rate=0.1,n_estimators=100,max_depth=3) estimator_L2 = estimator_L2.fit(S_train, y_train) y_pred = estimator_L2.predict(S_test) # Final prediction score print('Final score: [%.8f]' % mean_absolute_error(y_test, y_pred))
from xgboost import XGBRegressor
from utils import xgb_params, lgb_params, cat_params, WeightedRegressor, \
    mean_absolute_percentage_error, PROCESSED_DATA

# Pre-split, preprocessed data produced by an earlier pipeline stage.
X_train, y_train, X_val, y_val, X_test, Test_ID = joblib.load(PROCESSED_DATA)

# Level-1 base regressors; WeightedRegressor blends their predictions.
estimators = [('xgb', XGBRegressor(**xgb_params)),
              ('lgb', LGBMRegressor(**lgb_params)),
              ('cat', CatBoostRegressor(**cat_params))]
final_estimator = WeightedRegressor()
stack = StackingTransformer(estimators=estimators, variant='A',
                            regression=True, n_folds=5, shuffle=False,
                            random_state=None)
steps = [('stack', stack), ('final_estimator', final_estimator)]
pipe = Pipeline(steps)
logger.info('start training...')
pipe.fit(X_train, y_train)
logger.info('end training.')
y_pred = pipe.predict(X_val)
logger.info(f'MAPE on valid: {mean_absolute_percentage_error(y_val, y_pred)}')
y_test = pipe.predict(X_test)