Example #1
from fklearn.training.classification import xgb_classification_learner


def param_train_fn(space, train_set):
    return xgb_classification_learner(
        features=["x"],
        target="target",
        learning_rate=space["learning_rate"],
        num_estimators=space["num_estimators"])(train_set)
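A `param_train_fn` like this is the kind of function typically handed to a hyperparameter tuner (e.g. fklearn's tuning utilities): each point of the search `space` yields a fully configured learner. A minimal sketch of driving it by hand, where the `space` dict and toy DataFrame are assumptions for illustration only:

    import pandas as pd

    # Hypothetical search-space point and toy training data, for illustration only.
    space = {"learning_rate": 0.1, "num_estimators": 10}
    train_set = pd.DataFrame({"x": [1.0, 2.0, 3.0, 4.0], "target": [0, 1, 0, 1]})

    # param_train_fn returns fklearn's usual learner output: a prediction
    # function, the scored training set, and a training log dict.
    predict_fn, scored_train, log = param_train_fn(space, train_set)
    assert scored_train["prediction"].between(0, 1).all()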
Example #2
from collections import Counter

import numpy as np
import pandas as pd

from fklearn.training.classification import xgb_classification_learner


def test_xgb_classification_learner():
    df_train_binary = pd.DataFrame({
        'id': ["id1", "id2", "id3", "id4"],
        'x1': [10.0, 13.0, 10.0, 13.0],
        "x2": [0, 1, 1, 0],
        "w": [2, 1, 2, 0.5],
        'y': [0, 1, 0, 1]
    })

    df_test_binary = pd.DataFrame({
        'id': ["id4", "id4", "id5", "id6"],
        'x1': [12.0, 1000.0, -4.0, 0.0],
        "x2": [1, 1, 0, 1],
        "w": [1, 2, 0, 0.5],
        'y': [1, 0, 0, 1]
    })

    df_train_multinomial = pd.DataFrame({
        'id': ["id1", "id2", "id3", "id4", "id3", "id4"],
        'x1': [10.0, 13.0, 10.0, 13.0, 10.0, 13.0],
        "x2": [0, 1, 1, 0, 1, 0],
        "w": [2, 1, 2, 0.5, 2, 0.5],
        'y': [0, 1, 2, 1, 2, 0]
    })

    df_test_multinomial = pd.DataFrame({
        'id': ["id4", "id4", "id5", "id6", "id5", "id6"],
        'x1': [12.0, 1000.0, -4.0, 0.0, -4.0, 0.0],
        "x2": [1, 1, 0, 1, 0, 1],
        "w": [1, 2, 0, 0.5, 0, 0.5],
        'y': [1, 2, 0, 1, 2, 0]
    })

    features = ["x1", "x2"]

    learner_binary = xgb_classification_learner(features=features,
                                                target="y",
                                                learning_rate=0.1,
                                                num_estimators=20,
                                                extra_params={"max_depth": 4, "seed": 42},
                                                prediction_column="prediction",
                                                weight_column="w")

    predict_fn_binary, pred_train_binary, log = learner_binary(df_train_binary)

    pred_test_binary = predict_fn_binary(df_test_binary)

    expected_col_train = df_train_binary.columns.tolist() + ["prediction"]
    expected_col_test = df_test_binary.columns.tolist() + ["prediction"]

    assert Counter(expected_col_train) == Counter(pred_train_binary.columns.tolist())
    assert Counter(expected_col_test) == Counter(pred_test_binary.columns.tolist())
    assert pred_test_binary.prediction.max() < 1
    assert pred_test_binary.prediction.min() > 0
    assert (pred_test_binary.columns == pred_train_binary.columns).all()

    # SHAP test (binary only)
    pred_shap = predict_fn_binary(df_test_binary, apply_shap=True)
    assert "shap_values" in pred_shap.columns
    assert "shap_expected_value" in pred_shap.columns
    assert np.vstack(pred_shap["shap_values"]).shape == (4, 2)

    # test multinomial case
    learner_multinomial = xgb_classification_learner(features=features,
                                                     target="y",
                                                     learning_rate=0.1,
                                                     num_estimators=20,
                                                     extra_params={"max_depth": 2,
                                                                   "seed": 42,
                                                                   "objective": 'multi:softprob',
                                                                   "num_class": 3},
                                                     prediction_column="prediction")

    predict_fn_multinomial, pred_train_multinomial, log = learner_multinomial(df_train_multinomial)

    pred_test_multinomial = predict_fn_multinomial(df_test_multinomial)

    expected_col_train = df_train_multinomial.columns.tolist() + ["prediction_0", "prediction_1", "prediction_2",
                                                                  "prediction"]
    expected_col_test = df_test_multinomial.columns.tolist() + ["prediction_0", "prediction_1", "prediction_2",
                                                                "prediction"]

    assert Counter(expected_col_train) == Counter(pred_train_multinomial.columns.tolist())
    assert Counter(expected_col_test) == Counter(pred_test_multinomial.columns.tolist())
    assert (pred_test_multinomial.columns == pred_train_multinomial.columns).all()
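Outside of a test, the learner follows fklearn's standard contract: calling the configured learner on a training frame returns a prediction function, the scored training set, and a log dict. A minimal standalone sketch, where the toy frame below is an assumption for illustration:

    import pandas as pd

    from fklearn.training.classification import xgb_classification_learner

    # Toy data, for illustration only.
    df = pd.DataFrame({"x1": [10.0, 13.0, 10.0, 13.0],
                       "x2": [0, 1, 1, 0],
                       "y": [0, 1, 0, 1]})

    learner = xgb_classification_learner(features=["x1", "x2"],
                                         target="y",
                                         learning_rate=0.1,
                                         num_estimators=10)

    predict_fn, scored_train, log = learner(df)   # fit once on the training frame
    scored_new = predict_fn(df)                   # reuse predict_fn on any new frame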
Example #3
from typing import Any, Dict, List, TypeVar

import numpy as np
import pandas as pd
from toolz import assoc, compose

from fklearn.common_docstrings import learner_pred_fn_docstring
from fklearn.training.classification import xgb_classification_learner
from fklearn.types import LearnerReturnType

T = TypeVar("T")  # type of the split-bin keys


def xgb_octopus_classification_learner(
        train_set: pd.DataFrame,
        learning_rate_by_bin: Dict[T, float],
        num_estimators_by_bin: Dict[T, int],
        extra_params_by_bin: Dict[T, Dict[str, Any]],
        features_by_bin: Dict[T, List[str]],
        train_split_col: str,
        train_split_bins: List,
        nthread: int,
        target_column: str,
        prediction_column: str = "prediction") -> LearnerReturnType:
    """
    An Octopus ensemble lets you inject domain-specific knowledge by forcing a split on an initial feature instead of
    trusting the tree model to find that split on its own. It works by first partitioning your dataset on that feature
    and then training one individual model on each partition.

    Parameters
    ----------
    train_set: pd.DataFrame
        A Pandas' DataFrame with features, target columns and a splitting column that must be categorical.

    learning_rate_by_bin: dict
        A dictionary mapping each split value to the learning rate of that split's XGBoost model. Ex: if you split
        your training by tenure and the tenure column has integer values [1,2,3,...,12], you have to specify a
        learning rate for each split::

            {
                1: 0.08,
                2: 0.08,
                ...
                12: 0.1
            }

    num_estimators_by_bin: dict
        A dictionary mapping each split value to the number of tree estimators of that split's XGBoost model. Ex: if
        you split your training by tenure and the tenure column has integer values [1,2,3,...,12], you have to specify
        the number of estimators for each split::

            {
                1: 300,
                2: 250,
                ...
                12: 300
            }

    extra_params_by_bin: dict
        A dictionary mapping each split value to a dictionary of extra XGBoost parameters for that split's model. Ex:
        if you split your training by tenure and the tenure column has integer values [1,2,3,...,12], you have to
        specify an extra-parameters dictionary for each split::

            {
                1: {
                    'reg_alpha': 0.0,
                    'colsample_bytree': 0.4,
                    ...
                    'colsample_bylevel': 0.8
                },
                2: {
                    'reg_alpha': 0.1,
                    'colsample_bytree': 0.6,
                    ...
                    'colsample_bylevel': 0.4
                },
                ...
                12: {
                    'reg_alpha': 0.0,
                    'colsample_bytree': 0.7,
                    ...
                    'colsample_bylevel': 1.0
                }
            }

    features_by_bin: dict
        A dictionary mapping each split value to the list of features used by that split's model. Ex: if you split
        your training by tenure and the tenure column has integer values [1,2,3,...,12], you have to specify a list of
        features for each split::

            {
                1: ["feature-1", "feature-2", "feature-3", ...],
                2: ["feature-1", "feature-3", "feature-5", ...],
                ...
                12: ["feature-2", "feature-4", "feature-8", ...]
            }

    train_split_col: str
        The name of the categorical column where the model will make the splits. Ex: if you want to split your training
        by tenure, you can have a categorical column called "tenure".

    train_split_bins: list
        A list with the actual values of the categories from the `train_split_col`. Ex: if you want to split your
        training by tenure and you have a tenure column with integer values [1,2,3,...,12], you can pass this list and
        your training will be split into 12 different models.

    nthread: int
        Number of threads for the XGBoost learners.

    target_column: str
        The name of the target column.

    prediction_column: str
        The name of the column with the predictions from the model.
    """

    # One fully configured XGBoost learner per split bin.
    train_fns = {
        b: xgb_classification_learner(
            features=features_by_bin[b],
            learning_rate=learning_rate_by_bin[b],
            num_estimators=num_estimators_by_bin[b],
            target=target_column,
            extra_params=assoc(extra_params_by_bin[b], 'nthread', nthread),
            prediction_column=prediction_column + "_bin_" + str(b))
        for b in train_split_bins
    }

    # Partition the training set by the value of the split column.
    train_sets = {
        b: train_set[train_set[train_split_col] == b]
        for b in train_split_bins
    }

    train_results = {b: train_fns[b](train_sets[b]) for b in train_split_bins}

    # each entry of train_results is a 3-tuple: (prediction function, scored train set, train log)
    pred_fns = {b: train_results[b][0] for b in train_split_bins}
    train_logs = {b: train_results[b][2] for b in train_split_bins}

    def p(df: pd.DataFrame) -> pd.DataFrame:
        # Score the input with every per-bin model, then keep, for each row,
        # the prediction produced by the model of that row's split bin.
        pred_fn = compose(*pred_fns.values())

        scored = pred_fn(df).assign(
            pred_bin=prediction_column + "_bin_" + df[train_split_col].astype(str))

        # DataFrame.lookup was removed in pandas 2.0; positional indexing is the
        # equivalent way of picking one column per row.
        picked = scored.to_numpy()[np.arange(len(scored)),
                                   scored.columns.get_indexer(scored["pred_bin"])]

        return (scored
                .assign(**{prediction_column: picked.astype(float)})
                .drop("pred_bin", axis=1))

    p.__doc__ = learner_pred_fn_docstring("xgb_octopus_classification_learner")

    log = {
        'xgb_octopus_classification_learner': {
            'features': features_by_bin,
            'target': target_column,
            'prediction_column': prediction_column,
            'package': "xgboost",
            'train_logs': train_logs,
            'parameters': extra_params_by_bin,
            'training_samples': len(train_set)
        }
    }

    return p, p(train_set), log
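A usage sketch follows; the column names, bins, and per-bin settings below are hypothetical, chosen only to show the expected shape of each argument:

    import pandas as pd

    # Hypothetical training set split by a categorical "tenure" column (bins 1 and 2).
    train_df = pd.DataFrame({
        "tenure": [1, 1, 1, 1, 2, 2, 2, 2],
        "f1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
        "f2": [0, 1, 0, 1, 1, 0, 1, 0],
        "target": [0, 1, 0, 1, 1, 0, 1, 0],
    })

    bins = [1, 2]
    predict_fn, scored_train, log = xgb_octopus_classification_learner(
        train_set=train_df,
        learning_rate_by_bin={b: 0.1 for b in bins},
        num_estimators_by_bin={b: 10 for b in bins},
        extra_params_by_bin={b: {"max_depth": 2, "seed": 42} for b in bins},
        features_by_bin={b: ["f1", "f2"] for b in bins},
        train_split_col="tenure",
        train_split_bins=bins,
        nthread=1,
        target_column="target",
        prediction_column="prediction",
    )

    # scored_train now carries one prediction_bin_* column per bin plus the
    # final "prediction" column picked according to each row's tenure.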