#from support import load_bulldozer import support from stratx.partdep import plot_stratpd, plot_catstratpd, plot_catstratpd_gridsearch from stratx.featimp import importances import numpy as np import pandas as pd import matplotlib.pyplot as plt np.set_printoptions(precision=2, suppress=True, linewidth=300, threshold=2000) #np.random.seed(1) # n = 25_000 # X, y = support.load_bulldozer(n) X, y, X_train, X_test, y_train, y_test = support.load_dataset("bulldozer", "SalePrice") X['auctioneerID'] = X['auctioneerID'].astype(np.int64) # df = pd.read_csv("../../pd/bulldozer20k.csv") # X = df.drop('SalePrice', axis=1) # y = df['SalePrice'] I = importances(#X, y, X_train, y_train, n_trials=1, normalize=False, drop_high_stddev=2.0, min_samples_leaf=20, cat_min_samples_leaf=20,
from sklearn.utils import resample from sklearn.model_selection import train_test_split from timeit import default_timer as timer from sklearn.preprocessing import StandardScaler from sklearn.model_selection import GridSearchCV import xgboost as xgb from sklearn import svm np.random.seed(1) # choose a seed that demonstrates diff RF/GBM importances # boston = load_boston() # X = pd.DataFrame(boston.data, columns=boston.feature_names) # y = pd.Series(boston.target) X, y, X_train, X_test, y_train, y_test = load_dataset("boston", "MEDV") n = X.shape[0] n_shap = len(X_test) # test all fig, axes = plt.subplots(1, 4, figsize=(10, 2.5)) lm = LinearRegression() X_train_ = StandardScaler().fit_transform(X_train) X_train_ = pd.DataFrame(X_train_, columns=X.columns) X_test_ = StandardScaler().fit_transform(X_test) X_test_ = pd.DataFrame(X_test_, columns=X.columns) lm.fit(X_train_, y_train) lm_score = lm.score(X_test_, y_test) print("OLS validation R^2", lm_score) ols_shap_I = shap_importances(lm, X_train_, X_test_,
import support from stratx.partdep import plot_stratpd, plot_catstratpd,\ plot_catstratpd_gridsearch, plot_stratpd_gridsearch import matplotlib.pyplot as plt from stratx.featimp import importances import numpy as np from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestRegressor #np.random.seed(5) X, y, X_train, X_test, y_train, y_test = support.load_dataset("rent", 'price') # X, y = X[:100], y[:100] # X, y = support.load_rent(n=15_000) # print(X.shape) # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # tuned_params = support.models[("rent", "RF")] # rf = RandomForestRegressor(**tuned_params, n_jobs=-1) # rf.fit(X_train, y_train) # print("R^2 test",rf.score(X_test,y_test)) I = importances( X, y, n_trials=1, normalize=False, bootstrap=True, # bootstrap=False, # subsample_size=.7,
# Stability study: for every synthetic data set, compute importances over
# repeated trials and plot the top rows sorted by 'Impact'.
# NOTE(review): this fragment arrived with its line breaks and indentation
# stripped; the layout below is reconstructed from statement boundaries.
import support
from stratx import importances, plot_importances
import matplotlib.pyplot as plt
import numpy as np
from support import synthetic_files

for dataset in synthetic_files:
    print(f"Plot stability for {dataset}")

    # Re-seed before each data set is loaded and processed.
    np.random.seed(1)
    splits = support.load_dataset(dataset, targetname='response')
    X, y, X_train, X_test, y_train, y_test = splits
    print(X.shape)

    # 30 trials on 75% subsamples, without bootstrapping, on 4 workers.
    trial_settings = dict(
        bootstrap=False,
        n_trials=30,
        subsample_size=0.75,
        n_jobs=4,
    )
    I = importances(X, y, **trial_settings)
    print(I)

    # Don't need for continuous x_i since importance == impact when
    # there are n unique values for n values
    # plot_importances(I[0:10], imp_range=(0, 0.4), sortby='Importance')
    # plt.savefig(f"../images/{dataset}-stability-importance.pdf", bbox_inches="tight", pad_inches=0)
    # plt.show()
    # plt.close()

    # NOTE(review): indentation was lost in the source; this call is placed
    # inside the loop because the adjacent (disabled) savefig is per-dataset —
    # confirm.
    plot_importances(I[:20], imp_range=(0, 0.4), sortby='Impact')