def run_experiment(model_name: str, get_data: callable, compute_permutation: bool,
                   save_results: bool, model, exp_results_path):
    X, y = get_data()
    seed(7)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=VAL_RATIO)
    preprocessing_pipeline.fit(X_train)
    X_train = preprocessing_pipeline.transform(X_train)
    X_test = preprocessing_pipeline.transform(X_test)

    print("binning data")
    num_cols = get_num_cols(X_train.dtypes)
    bin_mapper = BinMapper(max_bins=256, random_state=42)
    X_train.loc[:, num_cols] = bin_mapper.fit_transform(
        X_train.loc[:, num_cols].values)
    X_test.loc[:, num_cols] = bin_mapper.transform(X_test.loc[:,
                                                              num_cols].values)

    original_dtypes = X_train.dtypes
    model.fit(X_train, y_train)
    test_prediction = model.predict(X_test)
    if compute_permutation:
        permutation_train = model.compute_fi_permutation(X_train,
                                                         y_train).to_dict()
        permutation_test = model.compute_fi_permutation(X_test,
                                                        y_test).to_dict()
    else:
        # No permutation importance requested: fall back to one NaN per feature.
        # (.index is needed here; iterating the dtypes Series directly would
        # yield the dtypes themselves rather than the column names.)
        nan_series = Series({col: nan for col in original_dtypes.index})
        permutation_train = nan_series
        permutation_test = nan_series

    # Crude heuristic: treat a binary target as a classification problem.
    is_classification = len(unique(y)) == 2
    if is_classification:
        df = DataFrame()
        df['p'] = test_prediction  # reuse the prediction computed above
        df['y'] = y_test.values    # .values avoids index misalignment when y_test is a Series
        df = df[df.p.notna()]
        logloss = log_loss(df['y'], df['p'])
    else:
        logloss = nan

    fi = Series(model.compute_feature_importance(method='gain'))
    fi = normalize_series(fi).to_dict()
    leaves_per_tree = [tree.n_leaves for tree in model.trees]
    results = dict(model=model_name,
                   ntrees=len(model.trees),
                   leaves=leaves_per_tree,
                   nleaves=sum(leaves_per_tree),
                   logloss=logloss,
                   gain=fi,
                   permutation_train=permutation_train,
                   permutation_test=permutation_test)

    if save_results:
        DataFrame(Series(results)).T.to_csv(exp_results_path)
    print(logloss)
    return results
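

# A minimal usage sketch for run_experiment. Hedged: the 'lgbm' registry key
# and the results path below are illustrative assumptions, not part of this
# script.
#
#     model = GBM_REGRESSORS['lgbm']()
#     run_experiment(model_name='lgbm',
#                    get_data=get_x_y_boston,
#                    compute_permutation=False,
#                    save_results=True,
#                    model=model,
#                    exp_results_path=Path('results/lgbm.csv'))
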
def bin_numeric_features(x_train, x_test, contains_num_features):
    """Bin the numeric columns of both frames, fitting the bin thresholds on
    the training data only (same logic as the inline step in run_experiment)."""
    if contains_num_features:
        print("binning data")
        num_cols = get_num_cols(x_train.dtypes)
        bin_mapper = BinMapper(max_bins=256, random_state=42)
        x_train.loc[:, num_cols] = bin_mapper.fit_transform(
            x_train.loc[:, num_cols].values)
        x_test.loc[:, num_cols] = bin_mapper.transform(
            x_test.loc[:, num_cols].values)
    return x_train, x_test
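
# Usage sketch, assuming the same train/test frames as in run_experiment:
#     X_train, X_test = bin_numeric_features(X_train, X_test,
#                                            contains_num_features=True)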


def get_x_y(a, k1=K1, sigma=SIGMA, nrows=NROWS):
    # NOTE: the original header and the construction of X were truncated in
    # this excerpt. The signature and the two feature columns below are a
    # hedged reconstruction from the surviving body and the call site
    # get_x_y(2); K1, SIGMA and NROWS are assumed module-level constants.
    X = DataFrame({'X1': random.random(nrows),           # numeric feature in [0, 1)
                   'X2': random.randint(0, k1, nrows)})  # categorical feature with k1 levels
    y = (a * (X['X1'] > 0.5)
         + (10 - a) * X['X2'].isin(list(range(k1 // 2)))
         + random.random(nrows) * sigma)

    return X, y


if __name__ == '__main__':
    MULTIPLE_EXPERIMENTS = True
    KFOLD = False
    ONE_HOT = False
    COMPUTE_PERMUTATION = True
    RESULTS_DIR = Path("k_50_sigma_5_x1_num_for_paper_k_200_sigma_20_nrows100/")

    REGRESSION = True
    x, y = get_x_y(2)
    contains_num_features = len(get_num_cols(x.dtypes)) > 0
    pp = (get_preprocessing_pipeline if contains_num_features
          else get_preprocessing_pipeline_only_cat)
    predictors = GBM_REGRESSORS if REGRESSION else GBM_CLASSIFIERS

    config = Config(
        multiple_experiments=MULTIPLE_EXPERIMENTS,
        n_experiments=10,  # 100
        kfold_flag=KFOLD,
        compute_permutation=COMPUTE_PERMUTATION,
        save_results=True,
        one_hot=ONE_HOT,
        contains_num_features=contains_num_features,
        seed=SEED,
        kfolds=KFOLDS,
        predictors=predictors,
        columns_to_remove=[],
    )  # NOTE: the original call was truncated here; any further fields are lost

Example 4

if __name__ == '__main__':
    # NOTE: this excerpt starts mid-script; the guard and the two flags below
    # are assumed so the fragment reads as a complete block.
    FAST = True
    KFOLD = False
    np.random.seed(3)
    # Histogram-binned ("fast") CART vs. exact CART, each with an optional k-fold variant.
    if FAST:
        model = FastCartGradientBoostingRegressorKfold if KFOLD else FastCartGradientBoostingRegressor
    else:
        model = CartGradientBoostingRegressorKfold if KFOLD else CartGradientBoostingRegressor
    reg = model(max_depth=3)

    start = time.time()
    X, y = get_x_y_boston()
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.1,
                                                        random_state=42)

    if FAST:
        # The binned ("fast") models expect pre-binned numeric features;
        # fit the bin thresholds on the training split only.
        num_cols = get_num_cols(X.dtypes)
        bin_mapper = BinMapper(max_bins=256, random_state=42)
        X_train.loc[:, num_cols] = bin_mapper.fit_transform(
            X_train.loc[:, num_cols].values)
        X_test.loc[:, num_cols] = bin_mapper.transform(
            X_test.loc[:, num_cols].values)

    reg.fit(X_train, y_train)
    end = time.time()
    print(f"data prep + fit took {end - start:.3f}s")  # timer started before loading
    start = time.time()
    print(f"mse is {mean_squared_error(y_test, reg.predict(X_test))}")
    end = time.time()
    print(f"predict + mse took {end - start:.3f}s")
    tree_vis = TreeVisualizer()
    tree_vis.plot(reg.trees[0])
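
# Follow-up sketch: the same gain-based importances used in run_experiment can
# be inspected here as well (assuming this regressor exposes the same
# compute_feature_importance API):
#     fi = Series(reg.compute_feature_importance(method='gain'))
#     print(normalize_series(fi).sort_values(ascending=False))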