예제 #1
0
    def test_cross_val_meta_stack(self):
        """End-to-end check of CrossValStack.cross_val_meta_stack on digits data.

        Tunes XGBoost / random-forest / extra-trees hyper-parameters, runs the
        3-fold meta stacker, and asserts that one result entry per base model
        comes back.  The printed AUC scores are diagnostic only.
        """
        x, y = DataGenerator.get_digits_data()

        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

        # Initial hyperopt search spaces for each base learner.
        xgb_initparam = ParamsGenerator.get_xgb_init_param()
        rf_initparam = ParamsGenerator.get_rf_init_param()
        ext_initparam = ParamsGenerator.get_ext_init_param()

        # Tune each base learner on the training split only.
        xgb_bestparam = CrossValStack.get_best_xgbopt(x_train, y_train, xgb_initparam)
        rf_bestparam = CrossValStack.get_best_sklopt(x_train, y_train, rf_initparam)
        ext_bestparam = CrossValStack.get_best_etopt(x_train, y_train, ext_initparam)

        # 3-fold meta stacking; res appears to hold one predict_proba matrix
        # per base model (indexed [i][:, 1] below for the positive class).
        res = CrossValStack.cross_val_meta_stack(x_train, y_train, x_test, xgb_bestparam, rf_bestparam, ext_bestparam,
                                                 csvstack_cv=3)
        # Column [:, 1]: probability of the positive class from each model.
        dfres = pd.DataFrame([res[0][:, 1], res[1][:, 1], res[2][:, 1]]).transpose()
        dfres.columns = ['p1', 'p2', 'p3']

        # Re-fit each tuned model on the full training split and score x_test,
        # to compare single-model AUC against the stacked ensemble below.
        y_test_xgb = CrossValStack.predict_opt_clf(XGBOpt.XGBOpt(x_train, y_train), xgb_bestparam, x_test, x_test)[0]
        y_test_skl = CrossValStack.predict_opt_clf(SklearnOpt.SklearnOpt(x_train, y_train), rf_bestparam, x_test, x_test)[0]
        y_test_ext = CrossValStack.predict_opt_clf(SklearnOpt.SklearnOpt(x_train, y_train), ext_bestparam, x_test, x_test)[0]

        print metrics.roc_auc_score(y_test, y_test_xgb)
        print metrics.roc_auc_score(y_test, y_test_skl)
        print metrics.roc_auc_score(y_test, y_test_ext)

        # Simple average of the three stacked probability columns.
        print metrics.roc_auc_score(y_test, (dfres.p1+dfres.p2+dfres.p3).values/3)

        # One prediction set per base model.
        self.assertEqual(len(res), 3)
예제 #2
0
def stack_that(x_train, y_train, x_test, train_idx, stack_idx, rfparams,
               extparams, xgbparams):
    """Level-1 stacking over three tuned base models.

    Fits XGBoost, random-forest and extra-trees optimizers on the train fold,
    predicts on the stack fold and on x_test, then fits a logistic regression
    on the three stack-fold prediction columns and returns its class
    probabilities for x_test.
    """
    fit_x = x_train.iloc[train_idx]
    fit_y = y_train.iloc[train_idx]
    stack_x = x_train.iloc[stack_idx]
    stack_y = y_train.iloc[stack_idx]

    logging.info(" >>> DGH >>>  prediction")
    # (optimizer factory, tuned params) for each base learner, in a fixed
    # order so the meta-feature columns stay deterministic.
    model_specs = [(XGBOpt.XGBOpt, xgbparams),
                   (SklearnOpt.SklearnOpt, rfparams),
                   (SklearnOpt.SklearnOpt, extparams)]
    stack_preds = []
    test_preds = []
    for make_opt, params in model_specs:
        opt = make_opt(fit_x, fit_y)
        p_stack, p_test = predict_opt_clf(opt, params, stack_x, x_test)
        stack_preds.append(p_stack)
        test_preds.append(p_test)

    logging.info(" >>> DGH >>>  prediction  =>  stacking")
    # One column per base model, one row per sample.
    meta_train = pd.DataFrame(np.transpose(np.array(stack_preds)))
    meta_test = pd.DataFrame(np.transpose(np.array(test_preds)))

    meta_clf = LogisticRegression()
    meta_clf.fit(meta_train, stack_y)

    return meta_clf.predict_proba(meta_test)
예제 #3
0
def meta_stack_that(x_train, y_train, x_test, train_idx, stack_idx, rfparams,
                    extparams, xgbparams):
    """Stacking with cluster-interaction meta features.

    Like stack_that, but first assigns each stack/test sample to a KMeans
    cluster (in 10-component PCA space) and feeds the logistic meta learner
    the products of cluster-membership dummies with each base model's
    predictions, so the meta weights can differ per cluster.

    Returns the meta learner's predict_proba matrix for x_test.
    """

    pca = PCA(n_components=10)
    kmeans = KMeans(n_clusters=3)

    x_train_train = x_train.iloc[train_idx]
    y_train_train = y_train.iloc[train_idx]
    x_train_stack = x_train.iloc[stack_idx]
    y_train_stack = y_train.iloc[stack_idx]

    logging.info(" >>> DGH >>> kmean-pca")
    # NOTE(review): PCA and KMeans are fitted on the stack fold (not the
    # train fold) and then applied to x_test — confirm this is intended.
    x_train_stack_cls = kmeans.fit_predict((pca.fit_transform(x_train_stack)))
    x_test_stack_cls = kmeans.predict((pca.transform(x_test)))

    # One-hot cluster membership; reset_index aligns rows positionally with
    # the prediction frames built below.
    # NOTE(review): if a cluster is absent from one of the two label arrays,
    # get_dummies yields different column sets for stack vs test — verify.
    x_cls_stack = pd.get_dummies(x_train_stack_cls,
                                 prefix='cls').reset_index(drop=True)
    x_cls_test = pd.get_dummies(x_test_stack_cls,
                                prefix='cls').reset_index(drop=True)

    logging.info(" >>> DGH >>> kmean-pca  =>  prediction")
    xgbopt = XGBOpt.XGBOpt(x_train_train, y_train_train)
    y_pred_stack_1, y_pred_test_1 = predict_opt_clf(xgbopt, xgbparams,
                                                    x_train_stack, x_test)

    skopt = SklearnOpt.SklearnOpt(x_train_train, y_train_train)
    y_pred_stack_2, y_pred_test_2 = predict_opt_clf(skopt, rfparams,
                                                    x_train_stack, x_test)

    skopt = SklearnOpt.SklearnOpt(x_train_train, y_train_train)
    y_pred_stack_3, y_pred_test_3 = predict_opt_clf(skopt, extparams,
                                                    x_train_stack, x_test)

    logging.info(" >>> DGH >>> kmean-pca  =>  prediction  =>  stacking")
    # One column per base model, one row per sample.
    x_pred_stack = pd.DataFrame(
        np.transpose(np.array([y_pred_stack_1, y_pred_stack_2,
                               y_pred_stack_3])))
    x_pred_test = pd.DataFrame(
        np.transpose(np.array([y_pred_test_1, y_pred_test_2, y_pred_test_3])))

    # Interaction features: cluster dummy x model prediction.  The 'ms_'
    # columns added here do not affect the loop, since `.columns` was
    # materialized once when the outer for statement started.
    for col1 in x_cls_stack.columns:
        for col2 in x_pred_stack.columns:
            x_cls_stack['ms_' + str(col1) + '_' + str(
                col2)] = x_cls_stack[col1] * x_pred_stack[col2].reset_index(
                    drop=True)
            x_cls_test[
                'ms_' + str(col1) + '_' +
                str(col2)] = x_cls_test[col1] * x_pred_test[col2].reset_index(
                    drop=True)

    # Meta learner sees only the interaction columns, not the raw dummies.
    lr = LogisticRegression()
    lr.fit(x_cls_stack[[c for c in x_cls_stack.columns if c.startswith('ms')]],
           y_train_stack)

    return lr.predict_proba(
        x_cls_test[[c for c in x_cls_test.columns if c.startswith('ms')]])
예제 #4
0
 def test_lropt_logloss(self):
     """Hyperopt search for logistic regression on digits reaches log-loss < 0.011."""
     features, target = DataGenerator.get_digits_data()
     optimizer = SklearnOpt.SklearnOpt(features, target)
     search_space = HyperoptParam.HyperoptParam.param_space_clf_skl_lr
     search_space['eval_metric'] = 'logloss'
     search_space['type'] = 'logistic_regression'
     result = optimizer.run_hp(search_space)
     self.assertIsNotNone(result)
     self.assertLess(optimizer.score, 0.011)
예제 #5
0
 def test_etopt_logloss(self):
     """Hyperopt search for extra-trees on digits should reach log-loss < 0.03."""
     x, y = DataGenerator.get_digits_data()
     skopt = SklearnOpt.SklearnOpt(x, y)
     # NOTE(review): reuses the random-forest search space for extra trees;
     # presumably the two share hyper-parameters — confirm there is no
     # dedicated param_space for extra trees.
     param = HyperoptParam.HyperoptParam.param_space_reg_skl_rf
     param['eval_metric'] = 'logloss'
     param['type'] = 'extra_trees'
     best = skopt.run_hp(param)
     self.assertIsNotNone(best)
     self.assertLess(skopt.score, 0.03)
예제 #6
0
 def test_rfopt_auc(self):
     """Hyperopt random-forest search on digits reaches AUC > 0.99 (score is negated AUC)."""
     features, target = DataGenerator.get_digits_data()
     optimizer = SklearnOpt.SklearnOpt(features, target)
     search_space = HyperoptParam.HyperoptParam.param_space_reg_skl_rf
     search_space['eval_metric'] = 'auc'
     search_space['type'] = 'random_forest'
     result = optimizer.run_hp(search_space)
     self.assertIsNotNone(result)
     # score is minimized, so -0.99 means AUC above 0.99.
     self.assertLess(optimizer.score, -0.99)
예제 #7
0
    def test_random_forest(self):
        """Tune and evaluate a random forest on the adult dataset.

        Adds missing-value indicators, numerizes features via Automaton,
        runs the hyperopt search, and prints validation AUC / log-loss
        (diagnostic only — no assertions here).
        """
        # loading
        x, y = DataGenerator.get_adult_data()

        # cleaning: flag missing values in-place before encoding.
        MissingValues.add_miss_val_indicator(x)

        x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=42)

        # Encode categorical columns consistently across both splits.
        x_train_1, x_valid_1 = Automaton.numerize(x_train, x_valid)

        # Hyperopt search, then refit and score on the validation split.
        sklparam = Cvs.get_best_sklopt(x_train_1, y_train, ParamsGenerator.get_rf_init_param())
        skopt = SklearnOpt.SklearnOpt(x_train_1, y_train)
        y_pred_valid, _ = Cvs.predict_opt_clf(skopt, sklparam, x_valid_1, x_valid_1)

        print 'Random Forest'
        print metrics.roc_auc_score(y_valid, y_pred_valid)
        print metrics.log_loss(y_valid, y_pred_valid)
예제 #8
0
def get_best_etopt(x, y, params):
    """Run the hyperopt search for an extra-trees SklearnOpt on (x, y).

    Returns a (best_result, params) pair so callers keep the search space
    alongside the optimizer's output.
    """
    optimizer = SklearnOpt.SklearnOpt(x, y)
    best = optimizer.run_hp(params)
    return best, params