    def test_custom_metric_and_scores_proba(self):

        model = LogisticRegression(random_state=0)
        scorer = make_scorer(roc_auc_score_universal, needs_proba=True)
        scores_1 = cross_val_score(model, X_train, y=y_train,
                                   cv=n_folds, scoring=scorer,
                                   n_jobs=1, verbose=0)
        
        # fit then transform
        estimators = [('logit', LogisticRegression(random_state=0))]
        stack = StackingTransformer(estimators, regression=False,
                                    n_folds=n_folds, shuffle=False,
                                    variant='B', random_state=0,
                                    stratified=True, needs_proba=True,
                                    metric=roc_auc_score_universal, verbose=0)
        stack = stack.fit(X_train, y_train)
        scores_2 = stack.scores_[0].copy()
            
        # fit_transform
        # also check refitting already fitted transformer
        _ = stack.fit_transform(X_train, y_train)
        scores_3 = stack.scores_[0].copy()
        
        assert_array_equal(scores_1, scores_2)
        assert_array_equal(scores_1, scores_3)
        
        # mean and std
        mean_1 = np.mean(scores_1)
        std_1 = np.std(scores_1)
        
        mean_2 = stack.mean_std_[0][1]
        std_2 = stack.mean_std_[0][2]
        
        assert_equal(mean_1, mean_2)
        assert_equal(std_1, std_2)
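
# The helper `roc_auc_score_universal` is not defined in this snippet. A
# minimal sketch of what it could look like, assuming it simply wraps
# sklearn's roc_auc_score so one callable handles binary and multiclass
# probabilities (the real helper may differ):
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize


def roc_auc_score_universal(y_true, y_proba):
    y_proba = np.asarray(y_proba)
    if y_proba.ndim == 1 or y_proba.shape[1] == 2:
        # binary task: score the positive-class probabilities
        pos = y_proba if y_proba.ndim == 1 else y_proba[:, 1]
        return roc_auc_score(y_true, pos)
    # multiclass task: one-vs-rest macro average over binarized labels
    y_bin = label_binarize(y_true, classes=np.unique(y_true))
    return roc_auc_score(y_bin, y_proba, average='macro')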
Example #2
    def test_custom_metric_and_scores_labels(self):

        model = LogisticRegression(random_state=0,
                                   solver='liblinear',
                                   multi_class='ovr')
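        # greater_is_better is left at its default (True), so cross_val_score
        # below returns the raw zero_one_loss values (no sign flip), matching
        # what StackingTransformer stores in scores_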
        scorer = make_scorer(zero_one_loss)
        scores_1 = cross_val_score(model,
                                   X_train,
                                   y=y_train,
                                   cv=n_folds,
                                   scoring=scorer,
                                   n_jobs=1,
                                   verbose=0)

        # fit then transform
        estimators = [('logit',
                       LogisticRegression(random_state=0,
                                          solver='liblinear',
                                          multi_class='ovr'))]
        stack = StackingTransformer(estimators,
                                    regression=False,
                                    n_folds=n_folds,
                                    shuffle=False,
                                    variant='B',
                                    random_state=0,
                                    stratified=True,
                                    metric=zero_one_loss,
                                    verbose=0)
        stack = stack.fit(X_train, y_train)
        scores_2 = stack.scores_[0].copy()

        # fit_transform
        # also check refitting already fitted transformer
        _ = stack.fit_transform(X_train, y_train)
        scores_3 = stack.scores_[0].copy()

        assert_array_equal(scores_1, scores_2)
        assert_array_equal(scores_1, scores_3)

        # mean and std
        mean_1 = np.mean(scores_1)
        std_1 = np.std(scores_1)

        mean_2 = stack.mean_std_[0][1]
        std_2 = stack.mean_std_[0][2]

        assert_equal(mean_1, mean_2)
        assert_equal(std_1, std_2)
Example #3
def prepare_stacking_pipe(l1_estim, variantAB, metric_score, nfold):
    """preparing the stcking pipe model for classification
     Meta-learner used is Logistic Regression        """
    # initialize stacking transformer
    stack_L1 = StackingTransformer(estimators=l1_estim,
                                   regression=False,
                                   variant=variantAB,
                                   needs_proba=False,
                                   metric=metric_score,
                                   n_folds=nfold,
                                   shuffle=True,
                                   stratified=True,
                                   random_state=SEED,
                                   verbose=2)

    # final meta learner model
    meta_learner = LogisticRegression(C=1,
                                      multi_class='ovr',
                                      penalty='l2',
                                      solver='liblinear',
                                      random_state=SEED)

    # creating stacking Pipeline
    stack_steps = [('stacking', stack_L1), ('meta_model', meta_learner)]

    return Pipeline(stack_steps)
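
# Hypothetical usage sketch for prepare_stacking_pipe (the estimator list,
# metric, and data names below are illustrative assumptions; SEED is the
# global used inside the function):
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

l1_estim = [('lr', LogisticRegression(random_state=SEED)),
            ('rf', RandomForestClassifier(random_state=SEED))]
stack_pipe = prepare_stacking_pipe(l1_estim, variantAB='A',
                                   metric_score=accuracy_score, nfold=5)
stack_pipe = stack_pipe.fit(X_train, y_train)  # fits base models, then meta-learner
y_pred = stack_pipe.predict(X_test)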
Example #4
def objective(params, keker):
    params['max_depth'] = int(params['max_depth'])
    params['n_estimators'] = int(params['n_estimators'])

    if keker is LGBMRegressor:
        params['num_leaves'] = 2 ** params['max_depth']

    reg = keker(**params)

    estimators = [
        ('reg', reg),
    ]

    final_estimator = WeightedRegressor()

    stack = StackingTransformer(estimators=estimators, variant='A',
                                regression=True, n_folds=3, shuffle=False,
                                random_state=None)
    steps = [('stack', stack),
             ('final_estimator', final_estimator)]
    pipe = Pipeline(steps)

    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_val)
    score = mean_absolute_percentage_error(y_val, y_pred)
    logger.info(f'MAPE on valid: {score}, params: {params}')
    return score
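
# `objective` takes the hyperparameter dict first and the regressor class
# second, which suggests it is meant to be driven by a tuner. A sketch using
# hyperopt (an assumption; the search-space bounds are illustrative, and
# X_train / y_train / X_val / y_val are assumed to be in scope):
from functools import partial

from hyperopt import fmin, hp, tpe

space = {
    'max_depth': hp.quniform('max_depth', 3, 12, 1),
    'n_estimators': hp.quniform('n_estimators', 100, 1000, 50),
    'learning_rate': hp.loguniform('learning_rate', -5, 0),
}
best = fmin(fn=partial(objective, keker=LGBMRegressor),
            space=space, algo=tpe.suggest, max_evals=50)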
Example #5
    def test_variant_B_labels(self):
        # reference
        model = LogisticRegression(random_state=0)
        S_train_1 = cross_val_predict(model, X_train, y=y_train,
                                      cv=n_folds, n_jobs=1, verbose=0,
                                      method='predict').reshape(-1, 1)
        model = model.fit(X_train, y_train)
        S_test_1 = model.predict(X_test).reshape(-1, 1)

        # fit then transform
        estimators = [('logit', LogisticRegression(random_state=0))]
        stack = StackingTransformer(estimators, regression=False,
                                    n_folds=n_folds, shuffle=False,
                                    variant='B', random_state=0,
                                    stratified=True, verbose=0)
        stack = stack.fit(X_train, y_train)
        S_train_2 = stack.transform(X_train)
        S_test_2 = stack.transform(X_test)
            
        # fit_transform
        # also check refitting already fitted transformer
        S_train_3 = stack.fit_transform(X_train, y_train)
        S_test_3 = stack.transform(X_test)
        
        # compare
        assert_array_equal(S_train_1, S_train_2)
        assert_array_equal(S_test_1, S_test_2)
        
        assert_array_equal(S_train_1, S_train_3)
        assert_array_equal(S_test_1, S_test_3)
Example #6
    def test_variant_B_default_classifier_proba(self):
        # reference
        model = DummyClassifier(strategy='constant', constant=1)
        S_train_1 = cross_val_predict(model, X_train, y=y_train,
                                      cv=n_folds, n_jobs=1, verbose=0,
                                      method='predict_proba')
        model = model.fit(X_train, y_train)
        S_test_1 = model.predict_proba(X_test)

        # fit then transform
        stack = StackingTransformer(estimators=None, regression=False,
                                    n_folds=n_folds, shuffle=False,
                                    variant='B', random_state=0,
                                    needs_proba=True, verbose=0)
        stack = stack.fit(X_train, y_train)
        S_train_2 = stack.transform(X_train)
        S_test_2 = stack.transform(X_test)
            
        # fit_transform
        # also check refitting already fitted transformer
        S_train_3 = stack.fit_transform(X_train, y_train)
        S_test_3 = stack.transform(X_test)
        
        # compare
        assert_array_equal(S_train_1, S_train_2)
        assert_array_equal(S_test_1, S_test_2)
        
        assert_array_equal(S_train_1, S_train_3)
        assert_array_equal(S_test_1, S_test_3)
Example #7
    def test_variant_A_proba_shuffle_random_state(self):

        S_test_1 = np.zeros((X_test.shape[0], n_classes))
        S_test_temp = np.zeros((X_test.shape[0], n_folds * n_classes))
        # Using StratifiedKFold because by default cross_val_predict uses StratifiedKFold
        kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)
        for fold_counter, (tr_index,
                           te_index) in enumerate(kf.split(X_train, y_train)):
            # Split data and target
            X_tr = X_train[tr_index]
            y_tr = y_train[tr_index]
            # X_te = X_train[te_index]
            # y_te = y_train[te_index]
            model = LogisticRegression(random_state=0)
            model = model.fit(X_tr, y_tr)
            col_slice_fold = slice(fold_counter * n_classes,
                                   fold_counter * n_classes + n_classes)
            S_test_temp[:, col_slice_fold] = model.predict_proba(X_test)
        for class_id in range(n_classes):
            S_test_1[:, class_id] = np.mean(S_test_temp[:,
                                                        class_id::n_classes],
                                            axis=1)

        model = LogisticRegression(random_state=0)
        # !!! Important. Here we pass CV-generator ``cv=kf`` not number of folds
        S_train_1 = cross_val_predict(model,
                                      X_train,
                                      y=y_train,
                                      cv=kf,
                                      n_jobs=1,
                                      verbose=0,
                                      method='predict_proba')

        # fit then transform
        estimators = [('logit', LogisticRegression(random_state=0))]
        stack = StackingTransformer(estimators,
                                    regression=False,
                                    n_folds=n_folds,
                                    shuffle=True,
                                    variant='A',
                                    random_state=0,
                                    stratified=True,
                                    needs_proba=True,
                                    verbose=0)
        stack = stack.fit(X_train, y_train)
        S_train_2 = stack.transform(X_train)
        S_test_2 = stack.transform(X_test)

        # fit_transform
        # also check refitting already fitted transformer
        S_train_3 = stack.fit_transform(X_train, y_train)
        S_test_3 = stack.transform(X_test)

        # compare
        assert_array_equal(S_train_1, S_train_2)
        assert_array_equal(S_test_1, S_test_2)

        assert_array_equal(S_train_1, S_train_3)
        assert_array_equal(S_test_1, S_test_3)
Example #8
    def test_variant_A_labels(self):

        S_test_temp = np.zeros((X_test.shape[0], n_folds))
        # Using StratifiedKFold because by default cross_val_predict uses StratifiedKFold
        # (random_state is only used when shuffle=True, so it is not set here)
        kf = StratifiedKFold(n_splits=n_folds, shuffle=False)
        for fold_counter, (tr_index,
                           te_index) in enumerate(kf.split(X_train, y_train)):
            # Split data and target
            X_tr = X_train[tr_index]
            y_tr = y_train[tr_index]
            # X_te = X_train[te_index]
            # y_te = y_train[te_index]
            model = LogisticRegression(random_state=0,
                                       solver='liblinear',
                                       multi_class='ovr')
            model = model.fit(X_tr, y_tr)
            S_test_temp[:, fold_counter] = model.predict(X_test)
        S_test_1 = st.mode(S_test_temp, axis=1)[0]

        model = LogisticRegression(random_state=0,
                                   solver='liblinear',
                                   multi_class='ovr')
        S_train_1 = cross_val_predict(model,
                                      X_train,
                                      y=y_train,
                                      cv=n_folds,
                                      n_jobs=1,
                                      verbose=0,
                                      method='predict').reshape(-1, 1)

        # fit then transform
        estimators = [('logit',
                       LogisticRegression(random_state=0,
                                          solver='liblinear',
                                          multi_class='ovr'))]
        stack = StackingTransformer(estimators,
                                    regression=False,
                                    n_folds=n_folds,
                                    shuffle=False,
                                    variant='A',
                                    random_state=0,
                                    stratified=True,
                                    verbose=0)
        stack = stack.fit(X_train, y_train)
        S_train_2 = stack.transform(X_train)
        S_test_2 = stack.transform(X_test)

        # fit_transform
        # also check refitting already fitted transformer
        S_train_3 = stack.fit_transform(X_train, y_train)
        S_test_3 = stack.transform(X_test)

        # compare
        assert_array_equal(S_train_1, S_train_2)
        assert_array_equal(S_test_1, S_test_2)

        assert_array_equal(S_train_1, S_train_3)
        assert_array_equal(S_test_1, S_test_3)
Example #9
    def test_variant_B_2_estimators_proba(self):

        model = LogisticRegression(random_state=0,
                                   solver='liblinear',
                                   multi_class='ovr')
        S_train_1_e1 = cross_val_predict(model,
                                         X_train,
                                         y=y_train,
                                         cv=n_folds,
                                         n_jobs=1,
                                         verbose=0,
                                         method='predict_proba')
        model = model.fit(X_train, y_train)
        S_test_1_e1 = model.predict_proba(X_test)

        model = GaussianNB()
        S_train_1_e2 = cross_val_predict(model,
                                         X_train,
                                         y=y_train,
                                         cv=n_folds,
                                         n_jobs=1,
                                         verbose=0,
                                         method='predict_proba')
        model = model.fit(X_train, y_train)
        S_test_1_e2 = model.predict_proba(X_test)

        S_train_1 = np.c_[S_train_1_e1, S_train_1_e2]
        S_test_1 = np.c_[S_test_1_e1, S_test_1_e2]

        # fit then transform
        estimators = [('logit',
                       LogisticRegression(random_state=0,
                                          solver='liblinear',
                                          multi_class='ovr')),
                      ('bayes', GaussianNB())]
        stack = StackingTransformer(estimators,
                                    regression=False,
                                    n_folds=n_folds,
                                    shuffle=False,
                                    variant='B',
                                    random_state=0,
                                    stratified=True,
                                    needs_proba=True,
                                    verbose=0)
        stack = stack.fit(X_train, y_train)
        S_train_2 = stack.transform(X_train)
        S_test_2 = stack.transform(X_test)

        # fit_transform
        # also check refitting already fitted transformer
        S_train_3 = stack.fit_transform(X_train, y_train)
        S_test_3 = stack.transform(X_test)

        assert_array_equal(S_train_1, S_train_2)
        assert_array_equal(S_test_1, S_test_2)

        assert_array_equal(S_train_1, S_train_3)
        assert_array_equal(S_test_1, S_test_3)
Example #10
def main():
    logger.info('start reading...')
    df_train = pd.read_csv('data/train.csv', parse_dates=['OrderedDate'])
    df_val = pd.read_csv('data/validation.csv', parse_dates=['OrderedDate'])
    df_train = pd.concat([df_train, df_val], axis=0)
    df_test = pd.read_csv('data/test.csv', parse_dates=['OrderedDate'])

    logger.info('end reading')
    logger.info('start preprocessing...')

    X_train = preprocess(df_train)
    y_train = df_train['RTA']

    X_val = preprocess(df_val)
    y_val = df_val['RTA']

    X_test = preprocess(df_test)

    logger.info('end preprocessing.')

    estimators = [('xgb', XGBRegressor(**xgb_params)),
                  ('lgb', LGBMRegressor(**lgb_params)),
                  ('cat', CatBoostRegressor(**cat_params))]

    final_estimator = WeightedRegressor()

    stack = StackingTransformer(estimators=estimators,
                                variant='A',
                                regression=True,
                                n_folds=5,
                                shuffle=False,
                                random_state=None)
    steps = [('stack', stack), ('final_estimator', final_estimator)]
    pipe = Pipeline(steps)

    logger.info('start training...')

    pipe.fit(X_train, y_train)

    logger.info('end training.')

    y_pred = pipe.predict(X_val)
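    # note: df_val was concatenated into df_train above, so the MAPE logged
    # here is effectively an in-sample estimate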
    logger.info(
        f'MAPE on valid: {mean_absolute_percentage_error(y_val, y_pred)}')

    y_test = pipe.predict(X_test)
    df_test['Prediction'] = y_test
    df_test = df_test[['Id', 'Prediction']]
    df_test.to_csv('data/submission.csv', index=False)

    logger.info('the end!')
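
# `WeightedRegressor` and `mean_absolute_percentage_error` come from the
# local `utils` module (see the import in the last example), which is not
# shown. A minimal sketch of compatible implementations (assumptions, not
# the actual utils code):
import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin


def mean_absolute_percentage_error(y_true, y_pred):
    # one common MAPE definition; assumes y_true contains no zeros
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100


class WeightedRegressor(BaseEstimator, RegressorMixin):
    """Blend the stacked base-model predictions with one weight per column."""

    def fit(self, X, y):
        X, y = np.asarray(X), np.asarray(y)
        # least-squares weights over the base-model prediction columns
        self.coef_, *_ = np.linalg.lstsq(X, y, rcond=None)
        return self

    def predict(self, X):
        return np.asarray(X) @ self.coef_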
Example #11
    def stack_predict(self, df, holdout, pipes, amount=2):
        X, y = self.split_x_y(df)
        X_test, y_test = self.split_x_y(holdout)

        pipe = Pipeline(self.top_pipeline(pipes).steps[:-1])
        X = pipe.fit_transform(X)
        X_test = pipe.transform(X_test)

        estimators = []

        for i in range(amount):
            estimators.append((str(i), self.top_pipeline(pipes,
                                                         i).steps[-1][1]))

        regression = False

        if self.METRIC in [
                "explained_variance",
                "neg_mean_absolute_error",
                "neg_mean_squared_error",
                "neg_mean_squared_log_error",
                "neg_median_absolute_error",
                "r2",
        ]:
            regression = True

        stack = StackingTransformer(estimators, regression)
        stack.fit(X, y)

        S_train = stack.transform(X)
        S_test = stack.transform(X_test)

        final_estimator = estimators[0][1]
        final_estimator.fit(S_train, y)

        return final_estimator, y_test, final_estimator.predict(S_test)
Example #12
def lvl2_xgb_vsrandomsearch(rawdf, results_dir, pp_choice, param_dir, passthrough, final_pp_choice=None):
    x_train = rawdf.iloc[:, :-1]
    y_train = rawdf.iloc[:, -1]
    model_store = ['rf', 'et', 'xgb']
    model_object = {
        'xgb': XGBRegressor(),
        'rf': RandomForestRegressor(),
        'et': ExtraTreesRegressor()
    }

    with open(param_dir, 'rb') as f:
        model_results = pickle.load(f)
    model_results = {k: (pd.DataFrame(v)
                         .sort_values('mean_test_score', ascending=False)
                         .reset_index(drop=True))  # so .loc[0] picks the best row
                     for k, v in model_results.items()}
    model_object = {k: model_object[k].set_params(
                        **{kk.split('__')[1]: vv
                           for kk, vv in v.loc[0, 'params'].items()})
                    for k, v in model_results.items()}

    preprocess_pipeline = pp_selector(pp_choice)

    lvl1_pipeline = [(model_name, model_object[model_name])
                     for model_name in model_store]

    stack = StackingTransformer(estimators=lvl1_pipeline,  # base estimators
                                regression=True,  # regression task (if you need
                                #     classification - set to False)
                                variant='A',  # oof for train set, predict test
                                #     set in each fold and find mean
                                metric=rmsle,  # metric: callable
                                n_folds=5,  # number of folds
                                shuffle=True,  # shuffle the data
                                random_state=0,  # ensure reproducibility
                                verbose=0)
    x_train_pp = preprocess_pipeline.fit_transform(x_train)
    stack.fit(x_train_pp, y_train)
    # reuse the already-fitted preprocessing instead of fit_transform-ing again
    s_train = stack.transform(x_train_pp)

    if passthrough:
        final_est = Pipeline([
            ('final_preprocess', final_est_pipeline(feature_names=x_train.columns.tolist(),
                                                    preprocess_pipeline=pp_selector(final_pp_choice),
                                                    no_of_lvl1=len(lvl1_pipeline))),
            #('debugger', DebuggerTransformer(info='final')),
            ('final_est', XGBRegressor())
        ])
        est_name = 'final_est__'
        train = np.concatenate((s_train, x_train.values), axis=1)
    else:
        final_est = XGBRegressor()
        est_name = ''
        train = s_train

    final_estimator_params = {f'{est_name}n_estimators': scipy.stats.randint(150, 1000),
                              f'{est_name}learning_rate': scipy.stats.uniform(0.01, 0.59),
                              f'{est_name}subsample': scipy.stats.uniform(0.3, 0.6),
                              f'{est_name}max_depth': scipy.stats.randint(1, 16),
                              f'{est_name}colsample_bytree': scipy.stats.uniform(0.5, 0.4),
                              f'{est_name}min_child_weight': [1, 2, 3, 4],
                              f'{est_name}gamma': scipy.stats.expon(scale=0.05),
                              }

    est = RandomizedSearchCV(final_est,
                             param_distributions=final_estimator_params,
                             cv=5,
                             n_iter=100,
                             scoring=make_scorer(rmsle, greater_is_better=False),
                             verbose=1,
                             n_jobs=-1)

    est.fit(train, y_train)
    score = {'lvl2ptvs_xgb': est.cv_results_}
    results_dir = create_results_directory(results_dir)
    with open(f'{results_dir}/results_store.pkl', 'wb') as f:
        pickle.dump(score, f)
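
# `rmsle` is referenced above but defined elsewhere; a common definition,
# given here as an assumption:
import numpy as np
from sklearn.metrics import mean_squared_log_error


def rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred))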
Example #13
    def test_variant_A_2_estimators_proba(self):

        # Estimator 1
        S_test_1_e1 = np.zeros((X_test.shape[0], n_classes))
        S_test_temp_e1 = np.zeros((X_test.shape[0], n_folds * n_classes))
        # Using StratifiedKFold because by default cross_val_predict uses StratifiedKFold
        # (random_state is only used when shuffle=True, so it is not set here)
        kf = StratifiedKFold(n_splits=n_folds, shuffle=False)
        for fold_counter, (tr_index,
                           te_index) in enumerate(kf.split(X_train, y_train)):
            # Split data and target
            X_tr = X_train[tr_index]
            y_tr = y_train[tr_index]
            # X_te = X_train[te_index]
            # y_te = y_train[te_index]
            model = LogisticRegression(random_state=0,
                                       solver='liblinear',
                                       multi_class='ovr')
            model = model.fit(X_tr, y_tr)
            col_slice_fold = slice(fold_counter * n_classes,
                                   fold_counter * n_classes + n_classes)
            S_test_temp_e1[:, col_slice_fold] = model.predict_proba(X_test)
        for class_id in range(n_classes):
            S_test_1_e1[:, class_id] = np.mean(
                S_test_temp_e1[:, class_id::n_classes], axis=1)

        model = LogisticRegression(random_state=0,
                                   solver='liblinear',
                                   multi_class='ovr')
        S_train_1_e1 = cross_val_predict(model,
                                         X_train,
                                         y=y_train,
                                         cv=n_folds,
                                         n_jobs=1,
                                         verbose=0,
                                         method='predict_proba')

        # Estimator 2
        S_test_1_e2 = np.zeros((X_test.shape[0], n_classes))
        S_test_temp_e2 = np.zeros((X_test.shape[0], n_folds * n_classes))
        # Using StratifiedKFold because by default cross_val_predict uses StratifiedKFold
        # (random_state is only used when shuffle=True, so it is not set here)
        kf = StratifiedKFold(n_splits=n_folds, shuffle=False)
        for fold_counter, (tr_index,
                           te_index) in enumerate(kf.split(X_train, y_train)):
            # Split data and target
            X_tr = X_train[tr_index]
            y_tr = y_train[tr_index]
            # X_te = X_train[te_index]
            # y_te = y_train[te_index]
            model = GaussianNB()
            model = model.fit(X_tr, y_tr)
            col_slice_fold = slice(fold_counter * n_classes,
                                   fold_counter * n_classes + n_classes)
            S_test_temp_e2[:, col_slice_fold] = model.predict_proba(X_test)
        for class_id in range(n_classes):
            S_test_1_e2[:, class_id] = np.mean(
                S_test_temp_e2[:, class_id::n_classes], axis=1)

        model = GaussianNB()
        S_train_1_e2 = cross_val_predict(model,
                                         X_train,
                                         y=y_train,
                                         cv=n_folds,
                                         n_jobs=1,
                                         verbose=0,
                                         method='predict_proba')

        S_train_1 = np.c_[S_train_1_e1, S_train_1_e2]
        S_test_1 = np.c_[S_test_1_e1, S_test_1_e2]

        # fit then transform
        estimators = [('logit',
                       LogisticRegression(random_state=0,
                                          solver='liblinear',
                                          multi_class='ovr')),
                      ('bayes', GaussianNB())]
        stack = StackingTransformer(estimators,
                                    regression=False,
                                    n_folds=n_folds,
                                    shuffle=False,
                                    variant='A',
                                    random_state=0,
                                    stratified=True,
                                    needs_proba=True,
                                    verbose=0)
        stack = stack.fit(X_train, y_train)
        S_train_2 = stack.transform(X_train)
        S_test_2 = stack.transform(X_test)

        # fit_transform
        # also check refitting already fitted transformer
        S_train_3 = stack.fit_transform(X_train, y_train)
        S_test_3 = stack.transform(X_test)

        # compare
        assert_array_equal(S_train_1, S_train_2)
        assert_array_equal(S_test_1, S_test_2)

        assert_array_equal(S_train_1, S_train_3)
        assert_array_equal(S_test_1, S_test_3)
Example #14
estimators_L1 = [
    ('lasso', Lasso(**search_Lasso.best_params_, random_state=0)),
    ('svm', SVR(kernel='poly', degree=2)),
    ('kn', KNeighborsRegressor(n_jobs=-1, n_neighbors=4)),
    ('et', ExtraTreesRegressor(**search_EX.best_params_, random_state=0,
                               n_jobs=-1)),
    ('rf', RandomForestRegressor(**search_RF.best_params_, random_state=0,
                                 n_jobs=-1)),
    ('xgb', XGBRegressor(**search_skrg.best_params_,
                         objective='reg:squarederror',
                         tree_method='gpu_hist', random_state=0, n_jobs=-1))
]

stack = StackingTransformer(estimators=estimators_L1,   # base estimators
                            regression=True,            # regression task (if you need
                                                        #     classification - set to False)
                            variant='B',                # oof for train set, predict test
                                                        #     set in each fold and find mean
                            metric=mean_absolute_error, # metric: callable
                            n_folds=5,                  # number of folds
                            shuffle=False,              # do not shuffle the data
                            random_state=0,             # ensure reproducibility
                            verbose=1)

stack = stack.fit(X_train, y_train)
S_train = stack.transform(X_train)
S_test = stack.transform(X_test)
final_estimator = XGBRegressor(random_state=0, n_jobs=-1, learning_rate=0.4,
                               tree_method='gpu_hist',
                               objective='reg:squarederror',
                               n_estimators=300, max_depth=18)
final_estimator = final_estimator.fit(S_train, y_train)

# Predict
y_pred = final_estimator.predict(S_test)
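
# Follow-up sketch: score the hold-out predictions with the same metric the
# transformer reported per fold (assumes y_test is available in scope):
print('Final MAE: [%.8f]' % mean_absolute_error(y_test, y_pred))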
Example #15
print("catboost params random chosen: ", cb_params_rand)
pipe_models_2 = [('xgb',
                  make_pipeline(ColumnSelector(cols=tuple(subset)),
                                WrapXGB(**xgb_params_rand))),
                 ('lgbm',
                  make_pipeline(ColumnSelector(cols=tuple(subset)),
                                WrapLGB(**lgb_params_rand))),
                 ('cb',
                  make_pipeline(ColumnSelector(cols=tuple(subset)),
                                WrapCB(**cb_params_rand)))]

stack1 = StackingTransformer(pipe_models_1,
                             regression=False,
                             variant='A',
                             needs_proba=True,
                             metric=auc,
                             n_folds=5,
                             stratified=True,
                             shuffle=True,
                             random_state=0,
                             verbose=1)

stack2 = StackingTransformer(pipe_models_2,
                             regression=False,
                             variant='A',
                             needs_proba=True,
                             metric=auc,
                             n_folds=5,
                             stratified=True,
                             shuffle=True,
                             random_state=0,
                             verbose=1)
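
# The `auc` callable passed as `metric` above is not shown. Since
# needs_proba=True, it receives a probability matrix; a sketch assuming a
# binary task:
from sklearn.metrics import roc_auc_score


def auc(y_true, y_proba):
    # score the positive-class column of the (n_samples, 2) proba matrix
    return roc_auc_score(y_true, y_proba[:, 1])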
Example #16
def final_submission(X_train, Y, X_test):
    # Define base estimators for stacking
    estimators = [('lgbm',
                   LGBMClassifier(random_state=0,
                                  n_estimators=520,
                                  learning_rate=0.1,
                                  num_leaves=31,
                                  is_unbalance=True)),
                  ('rf',
                   RandomForestClassifier(random_state=0,
                                          max_depth=10,
                                          class_weight={
                                              0: 0.2,
                                              1: 0.8
                                          },
                                          n_estimators=500,
                                          max_features=None,
                                          n_jobs=4))]
    # Perform stacking
    stack = StackingTransformer(estimators,
                                regression=False,
                                verbose=2,
                                needs_proba=True,
                                stratified=True,
                                shuffle=True)
    stack = stack.fit(X_train, Y)

    # Get the stacked features
    S_train = stack.transform(X_train)
    S_test = stack.transform(X_test)
    # Also take the weighted average of the stacked features as another feature
    S_train_av = np.zeros((len(S_train), 2), dtype=np.float32)
    S_test_av = np.zeros((len(S_test), 2), dtype=np.float32)
    for index, vals in enumerate(S_train):
        S_train_av[index, 0] = (vals[0] * 0.7) + (vals[2] * 0.3)
        S_train_av[index, 1] = (vals[1] * 0.7) + (vals[3] * 0.3)
    for index, vals in enumerate(S_test):
        S_test_av[index, 0] = (vals[0] * 0.7) + (vals[2] * 0.3)
        S_test_av[index, 1] = (vals[1] * 0.7) + (vals[3] * 0.3)

    # Define the final estimator
    model = XGBClassifier(random_state=0,
                          n_jobs=4,
                          max_depth=4,
                          scale_pos_weight=2.5,
                          n_estimators=200,
                          learning_rate=0.1,
                          gamma=1)
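    # X_train1 / X_test1 hold extra features and are assumed to be defined at
    # module scope (they are not parameters of this function)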
    model.fit(np.concatenate((S_train, S_train_av, X_train1), axis=1), Y)
    preds4 = model.predict_proba(
        np.concatenate((S_test, S_test_av, X_test1), axis=1))

    # Now perform random under-sampling on the data
    rus = RandomUnderSampler(random_state=0, sampling_strategy=0.3)
    X_train, Y_ = rus.fit_resample(X_train, Y)

    # Get predictions from models on this majority class under-sampled dataset
    model1 = LGBMClassifier(random_state=0,
                            n_estimators=100,
                            learning_rate=0.1,
                            num_leaves=31,
                            categorical_feature=[8, 9, 10, 11, 12, 13, 14])
    model2 = RandomForestClassifier(random_state=0,
                                    max_depth=13,
                                    n_estimators=100,
                                    max_features=None,
                                    n_jobs=4,
                                    class_weight={
                                        0: 0.4,
                                        1: 0.6
                                    })
    model1.fit(X_train, Y_), model2.fit(X_train, Y_)
    preds1, preds2 = model1.predict_proba(X_test), model2.predict_proba(X_test)

    # Get weighted average predictions
    preds3 = list()
    for a, b in zip(preds1, preds2):
        preds3.append([(0.7 * a[0]) + (0.3 * b[0]),
                       (0.7 * a[1]) + (0.3 * b[1])])

    # Finally, perform weighted average prediction of stacked ensemble and weighted average ensemble
    preds = list()
    for a, b in zip(preds3, preds4):
        preds.append([(0.5 * a[0]) + (0.5 * b[0]),
                      (0.5 * a[1]) + (0.5 * b[1])])
    preds = np.array(preds)
    preds = np.argmax(preds, axis=1)

    # Make the submission!
    fp = open("submit.csv", "w")
    fp.write("labels\n")
    for pred in preds:
        fp.write(str(pred) + "\n")
    fp.close()
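    # The two per-row loops above can be replaced by one vectorized expression
    # (columns 0-1 are the LGBM probabilities, columns 2-3 the RF ones), e.g.:
    #   S_train_av = (0.7 * S_train[:, :2] + 0.3 * S_train[:, 2:4]).astype(np.float32)
    #   S_test_av = (0.7 * S_test[:, :2] + 0.3 * S_test[:, 2:4]).astype(np.float32)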
Example #17
    def test_variant_B_verbose(self):

        model = LogisticRegression(random_state=0,
                                   solver='liblinear',
                                   multi_class='ovr')
        S_train_1 = cross_val_predict(model,
                                      X_train,
                                      y=y_train,
                                      cv=n_folds,
                                      n_jobs=1,
                                      verbose=0,
                                      method='predict').reshape(-1, 1)
        model = model.fit(X_train, y_train)
        S_test_1 = model.predict(X_test).reshape(-1, 1)

        # verbose=0
        # fit then transform
        estimators = [('lr',
                       LogisticRegression(random_state=0,
                                          solver='liblinear',
                                          multi_class='ovr'))]
        stack = StackingTransformer(estimators,
                                    regression=False,
                                    n_folds=n_folds,
                                    shuffle=False,
                                    variant='B',
                                    random_state=0,
                                    stratified=True,
                                    verbose=0)
        stack = stack.fit(X_train, y_train)
        S_train_2 = stack.transform(X_train)
        S_test_2 = stack.transform(X_test)

        # fit_transform
        # also check refitting already fitted transformer
        S_train_3 = stack.fit_transform(X_train, y_train)
        S_test_3 = stack.transform(X_test)

        # verbose=1
        # fit then transform
        estimators = [('lr',
                       LogisticRegression(random_state=0,
                                          solver='liblinear',
                                          multi_class='ovr'))]
        stack = StackingTransformer(estimators,
                                    regression=False,
                                    n_folds=n_folds,
                                    shuffle=False,
                                    variant='B',
                                    random_state=0,
                                    stratified=True,
                                    verbose=1)
        stack = stack.fit(X_train, y_train)
        S_train_4 = stack.transform(X_train)
        S_test_4 = stack.transform(X_test)

        # fit_transform
        # also check refitting already fitted transformer
        S_train_5 = stack.fit_transform(X_train, y_train)
        S_test_5 = stack.transform(X_test)

        # verbose=2
        # fit then transform
        estimators = [('lr',
                       LogisticRegression(random_state=0,
                                          solver='liblinear',
                                          multi_class='ovr'))]
        stack = StackingTransformer(estimators,
                                    regression=False,
                                    n_folds=n_folds,
                                    shuffle=False,
                                    variant='B',
                                    random_state=0,
                                    stratified=True,
                                    verbose=2)
        stack = stack.fit(X_train, y_train)
        S_train_6 = stack.transform(X_train)
        S_test_6 = stack.transform(X_test)

        # fit_transform
        # also check refitting already fitted transformer
        S_train_7 = stack.fit_transform(X_train, y_train)
        S_test_7 = stack.transform(X_test)

        assert_array_equal(S_train_1, S_train_2)
        assert_array_equal(S_test_1, S_test_2)

        assert_array_equal(S_train_1, S_train_3)
        assert_array_equal(S_test_1, S_test_3)

        assert_array_equal(S_train_1, S_train_4)
        assert_array_equal(S_test_1, S_test_4)

        assert_array_equal(S_train_1, S_train_5)
        assert_array_equal(S_test_1, S_test_5)

        assert_array_equal(S_train_1, S_train_6)
        assert_array_equal(S_test_1, S_test_6)

        assert_array_equal(S_train_1, S_train_7)
        assert_array_equal(S_test_1, S_test_7)
Example #18
# Create training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0, test_size=0.1)
X, Y = X_train, Y_train

# Perform feature ranking with recursive feature elimination (keep 5 features)
selector = RFE(LinearSVC(random_state=0, class_weight='balanced',
                         tol=1.0, max_iter=1000),
               n_features_to_select=5, step=3)
X, X_test = selector.fit_transform(X, Y), selector.transform(X_test)
print(X.shape)
print(selector)
print(X_test)
# Features Selected by above step
#['Clusterin_Apo_J', 'Cystatin_C', 'FAS', 'NrCAM', 'tau']

# Perform Stacking
models = [('xgb', XGBClassifier(random_state=0)),
          ('svc', LinearSVC(random_state=0, class_weight='balanced',
                            tol=1.0, max_iter=1000))]
stack = StackingTransformer(models, regression=False, verbose=0)
stack = stack.fit(X, Y)
with open('stacker.pkl', 'wb') as f:
    pickle.dump(stack, f)
X = np.concatenate((X, stack.transform(X)), axis=1)
X_test = np.concatenate((X_test, stack.transform(X_test)), axis=1)

# Let's test the effect of different threshold values and record the results
# in order to pick the best threshold value
threshold, final_scores = [-0.8], list()
for i in range(21):
    scores, roc_auc = list(), 0
    kfolds = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    for train, test in kfolds.split(X, Y):
        x_train, x_test = X[train], X[test]
        y_train, y_test = Y[train], Y[test]

        model = LinearSVC(random_state=0, tol=1.0, class_weight='balanced')
Example #19

# boosting
ada = AdaBoostClassifier(random_state=SEED, **ada_params)
gb = GradientBoostingClassifier(random_state=SEED, **gb_params)
xgboost = xgb.XGBClassifier(random_state=SEED, **xgboost_paramas)

# define estimator for stacking transformer
estimator = [('voting', voting_models), ('bag_knn', bag_knn),
             ('bag_dt', bag_dt), ('ada', ada), ('gb', gb),
             ('xgboost', xgboost)]
stack = StackingTransformer(estimator,
                            regression=False,
                            needs_proba=False,
                            variant='A',
                            metric=metrics.accuracy_score,
                            n_folds=4,
                            stratified=True,
                            shuffle=True,
                            random_state=0,
                            verbose=2)
stack = stack.fit(X_train, y_train)
filename = 'stack.sav'
with open(filename, 'wb') as f:
    pickle.dump(stack, f)

# stacked feature
S_train = stack.transform(X_train)
S_test = stack.transform(X_test)

# Create the parameter grid
params = {
    'eta': [0.1, 0.2, 0.3, 0.4, 0.5],
Example #20
def cv(num_splits, X_train, Y):
    # Define the type of cross-validation
    # (random_state is only used when shuffle=True, so it is not set here)
    kf, scores = StratifiedKFold(n_splits=num_splits), list()

    # Perform CV
    for train_index, test_index in kf.split(X_train, Y):
        # Splitting into train and test
        x_train, y_train, x_train1 = (X_train[train_index], Y[train_index],
                                      X_train1[train_index])
        x_test, y_test, x_test1 = (X_train[test_index], Y[test_index],
                                   X_train1[test_index])

        # Define base estimators for stacking
        estimators = [('lgbm',
                       LGBMClassifier(random_state=0,
                                      n_estimators=520,
                                      learning_rate=0.1,
                                      num_leaves=31,
                                      is_unbalance=True)),
                      ('rf',
                       RandomForestClassifier(random_state=0,
                                              max_depth=10,
                                              class_weight={
                                                  0: 0.2,
                                                  1: 0.8
                                              },
                                              n_estimators=500,
                                              max_features=None,
                                              n_jobs=4))]
        # Perform stacking
        stack = StackingTransformer(estimators,
                                    regression=False,
                                    verbose=2,
                                    needs_proba=True,
                                    stratified=True,
                                    shuffle=True)
        stack = stack.fit(x_train, y_train)

        # Get the stacked features
        S_train = stack.transform(x_train)
        S_test = stack.transform(x_test)
        # Also take the weighted average of the stacked features as another feature
        S_train_av = np.zeros((len(S_train), 2), dtype=np.float32)
        S_test_av = np.zeros((len(S_test), 2), dtype=np.float32)
        for index, vals in enumerate(S_train):
            S_train_av[index, 0] = (vals[0] * 0.7) + (vals[2] * 0.3)
            S_train_av[index, 1] = (vals[1] * 0.7) + (vals[3] * 0.3)
        for index, vals in enumerate(S_test):
            S_test_av[index, 0] = (vals[0] * 0.7) + (vals[2] * 0.3)
            S_test_av[index, 1] = (vals[1] * 0.7) + (vals[3] * 0.3)

        # Define the final estimator
        model = XGBClassifier(random_state=0,
                              n_jobs=4,
                              max_depth=4,
                              scale_pos_weight=2.5,
                              n_estimators=200,
                              learning_rate=0.1,
                              gamma=1)
        model.fit(np.concatenate((S_train, S_train_av, x_train1), axis=1),
                  y_train)
        preds4 = model.predict_proba(
            np.concatenate((S_test, S_test_av, x_test1), axis=1))

        # Now perform random under-sampling on the data
        rus = RandomUnderSampler(random_state=0, sampling_strategy=0.3)
        x_train, y_train_ = rus.fit_resample(x_train, y_train)

        # Get predictions from models on this majority class under-sampled dataset
        model1 = LGBMClassifier(random_state=0,
                                n_estimators=100,
                                learning_rate=0.1,
                                num_leaves=31,
                                categorical_feature=[8, 9, 10, 11, 12, 13, 14])
        model2 = RandomForestClassifier(random_state=0,
                                        max_depth=13,
                                        n_estimators=100,
                                        max_features=None,
                                        n_jobs=4,
                                        class_weight={
                                            0: 0.4,
                                            1: 0.6
                                        })
        model1.fit(x_train, y_train_), model2.fit(x_train, y_train_)
        preds1, preds2 = model1.predict_proba(x_test), model2.predict_proba(
            x_test)

        # Get weighted average predictions
        preds3 = list()
        for a, b in zip(preds1, preds2):
            preds3.append([(0.7 * a[0]) + (0.3 * b[0]),
                           (0.7 * a[1]) + (0.3 * b[1])])

        # Finally, perform weighted average prediction of stacked ensemble and weighted average ensemble
        preds = list()
        for a, b in zip(preds3, preds4):
            preds.append([(0.5 * a[0]) + (0.5 * b[0]),
                          (0.5 * a[1]) + (0.5 * b[1])])
        preds = np.array(preds)
        preds = np.argmax(preds, axis=1)

        # Check out the score
        scores.append(f1_score(y_test, preds))
        print("Score: ", scores[-1])
    print("Average Score: ", sum(scores) / len(scores))
Example #21
df_train_set['is_weekend'] = np.where(df_train_set['travel_date'] >= 5, 1, 0)

# ------ model
X = df_train_set.drop(["number_of_tickets"], axis=1)
y = df_train_set.number_of_tickets  

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25, shuffle=True)



from vecstack import StackingTransformer

estimators_L1 = [
    ('et', ExtraTreesRegressor(n_estimators=100, criterion="mae",
                               random_state=1)),
    ('rf', RandomForestRegressor(n_estimators=100, criterion="mse",
                                 max_depth=10, min_samples_split=9,
                                 min_samples_leaf=1,
                                 min_weight_fraction_leaf=0,
                                 max_leaf_nodes=None,
                                 min_impurity_decrease=0.0005)),
    # XGBRegressor has no `criterion` parameter, so it is dropped here
    ('xgb', XGBRegressor(n_estimators=100, max_depth=12, subsample=0.5,
                         learning_rate=0.05, colsample_bytree=0.9))
]
# Stacking
stack = StackingTransformer(estimators=estimators_L1, regression=True,
                            shuffle=True, random_state=0, verbose=2,
                            stratified=True, n_folds=5)
stack = stack.fit(X_train, y_train)

S_train = stack.transform(X_train)
S_test = stack.transform(X_test)

# Use 2nd level estimator to get final prediction
estimator_L2 = XGBRegressor(random_state=0, n_jobs=-1, learning_rate=0.1,
                            n_estimators=100, max_depth=3)
estimator_L2 = estimator_L2.fit(S_train, y_train)
y_pred = estimator_L2.predict(S_test)

# Final prediction score
print('Final score: [%.8f]' % mean_absolute_error(y_test, y_pred))
Example #22
from xgboost import XGBRegressor

from utils import xgb_params, lgb_params, cat_params, WeightedRegressor, mean_absolute_percentage_error, \
    PROCESSED_DATA

X_train, y_train, X_val, y_val, X_test, Test_ID = joblib.load(PROCESSED_DATA)

estimators = [('xgb', XGBRegressor(**xgb_params)),
              ('lgb', LGBMRegressor(**lgb_params)),
              ('cat', CatBoostRegressor(**cat_params))]

final_estimator = WeightedRegressor()

stack = StackingTransformer(estimators=estimators,
                            variant='A',
                            regression=True,
                            n_folds=5,
                            shuffle=False,
                            random_state=None)
steps = [('stack', stack), ('final_estimator', final_estimator)]
pipe = Pipeline(steps)

logger.info('start training...')

pipe.fit(X_train, y_train)

logger.info('end training.')

y_pred = pipe.predict(X_val)
logger.info(f'MAPE on valid: {mean_absolute_percentage_error(y_val, y_pred)}')

y_test = pipe.predict(X_test)