Пример #1
0
#from support import load_bulldozer
import support
from stratx.partdep import plot_stratpd, plot_catstratpd, plot_catstratpd_gridsearch
from stratx.featimp import importances

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Compact NumPy printing: 2 decimals, no scientific notation for small values,
# wide lines, and arrays are summarized only above 2000 elements.
np.set_printoptions(precision=2, suppress=True, linewidth=300, threshold=2000)

# Seeding is currently disabled for this script.
#np.random.seed(1)

# Earlier loading path, kept for reference:
# n = 25_000
# X, y = support.load_bulldozer(n)
# Load the "bulldozer" data set with "SalePrice" as the target name. The
# result is unpacked into the full X/y and (presumably) a train/test split.
X, y, X_train, X_test, y_train, y_test = support.load_dataset("bulldozer", "SalePrice")

# Force auctioneerID to 64-bit integers.
# NOTE(review): only X is cast here, while the importances() call below is
# given X_train/y_train -- confirm whether X_train needs the same cast.
X['auctioneerID'] = X['auctioneerID'].astype(np.int64)

# df = pd.read_csv("../../pd/bulldozer20k.csv")
# X = df.drop('SalePrice', axis=1)
# y = df['SalePrice']


I = importances(#X, y,
                X_train, y_train,
                n_trials=1,
                normalize=False,
                drop_high_stddev=2.0,
                min_samples_leaf=20,
                cat_min_samples_leaf=20,
Пример #2
0
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from timeit import default_timer as timer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

import xgboost as xgb
from sklearn import svm

# Fix NumPy's global RNG; this seed was chosen because it demonstrates
# different RF and GBM importances.
np.random.seed(1)  # choose a seed that demonstrates diff RF/GBM importances

# Previous direct loading path, kept for reference:
# boston = load_boston()
# X = pd.DataFrame(boston.data, columns=boston.feature_names)
# y = pd.Series(boston.target)

# Load the "boston" data set with "MEDV" as the target name. The result is
# unpacked into the full X/y and (presumably) a train/test split.
X, y, X_train, X_test, y_train, y_test = load_dataset("boston", "MEDV")

n = X.shape[0]  # number of rows in the full data set
n_shap = len(X_test)  # test all: use every row of the test split

# One row of four side-by-side axes on a 10 x 2.5 inch figure.
fig, axes = plt.subplots(1, 4, figsize=(10, 2.5))

# Ordinary least squares baseline on standardized features.
lm = LinearRegression()
# Learn the per-column mean/std from the training split only and apply that
# same transform to both splits. Fitting a second scaler on X_test would put
# the two splits on different scales, so coefficients learned on the training
# scale would be applied to inconsistently scaled test features.
scaler = StandardScaler().fit(X_train)
X_train_ = pd.DataFrame(scaler.transform(X_train), columns=X.columns)
X_test_ = pd.DataFrame(scaler.transform(X_test), columns=X.columns)
lm.fit(X_train_, y_train)
lm_score = lm.score(X_test_, y_test)  # R^2 on the held-out split
print("OLS validation R^2", lm_score)
ols_shap_I = shap_importances(lm, X_train_, X_test_,
Пример #3
0
import support
from stratx.partdep import plot_stratpd, plot_catstratpd,\
                           plot_catstratpd_gridsearch, plot_stratpd_gridsearch
import matplotlib.pyplot as plt
from stratx.featimp import importances
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Seeding is currently disabled for this script.
# NOTE(review): the importances() call below passes bootstrap=True; if that
# resampling draws from NumPy's global RNG, results will differ between
# runs -- confirm whether the seed should be re-enabled.
#np.random.seed(5)

# Load the "rent" data set with 'price' as the target name. The result is
# unpacked into the full X/y and (presumably) a train/test split.
X, y, X_train, X_test, y_train, y_test = support.load_dataset("rent", 'price')

# X, y = X[:100], y[:100]

# X, y = support.load_rent(n=15_000)
# print(X.shape)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# tuned_params = support.models[("rent", "RF")]
# rf = RandomForestRegressor(**tuned_params, n_jobs=-1)
# rf.fit(X_train, y_train)
# print("R^2 test",rf.score(X_test,y_test))

I = importances(
    X,
    y,
    n_trials=1,
    normalize=False,
    bootstrap=True,
    # bootstrap=False,
    # subsample_size=.7,
Пример #4
0
import support
from stratx import importances, plot_importances
import matplotlib.pyplot as plt
import numpy as np
from support import synthetic_files

# For each synthetic data set: load it, compute feature importances over
# repeated trials, print the result, and plot the leading entries by 'Impact'.
for dataset in synthetic_files:
    print(f"Plot stability for {dataset}")
    # Reseed on every iteration so each data set starts from the same
    # NumPy RNG state.
    np.random.seed(1)
    # Only X and y are used in the visible part of this loop; the split
    # variables are unpacked but not referenced here.
    X, y, X_train, X_test, y_train, y_test = support.load_dataset(
        dataset, targetname='response')

    print(X.shape)

    # 30 trials with bootstrap disabled and subsample_size=.75 (presumably a
    # 75% subsample per trial -- confirm against stratx), run with n_jobs=4.
    I = importances(X,
                    y,
                    bootstrap=False,
                    n_trials=30,
                    subsample_size=.75,
                    n_jobs=4)

    print(I)

    # The 'Importance' plot is intentionally disabled:
    # Don't need for continuous x_i since importance == impact when
    # there are n unique values for n values
    # plot_importances(I[0:10], imp_range=(0, 0.4), sortby='Importance')
    #plt.savefig(f"../images/{dataset}-stability-importance.pdf", bbox_inches="tight", pad_inches=0)
    # plt.show()
    # plt.close()

    # Plot the first 20 entries of I, sorted by 'Impact', on a fixed
    # 0..0.4 importance range.
    plot_importances(I[0:20], imp_range=(0, 0.4), sortby='Impact')