def main():
    cal_housing = fetch_california_housing()

    # split 80/20 train-test
    X_train, X_test, y_train, y_test = train_test_split(cal_housing.data,
                                                        cal_housing.target,
                                                        test_size=0.2,
                                                        random_state=1)
    names = cal_housing.feature_names

    print('_' * 80)
    print("Training GBRT...")
    clf = GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                    learning_rate=0.1, loss='huber',
                                    random_state=1)
    clf.fit(X_train, y_train)
    print("done.")

    print('_' * 80)
    print('Convenience plot with ``partial_dependence_plots``')
    print()

    features = [0, 5, 1, 2, (5, 1)]
    fig, axs = plot_partial_dependence(clf, X_train, features,
                                       feature_names=names,
                                       n_jobs=3, grid_resolution=50)
    fig.suptitle('Partial dependence of house value on nonlocation features '
                 'for the California housing dataset')
    plt.subplots_adjust(top=0.9)

    print('_' * 80)
    print('Custom 3d plot via ``partial_dependence``')
    print()
    fig = plt.figure()

    target_feature = (1, 5)
    pdp, (x_axis, y_axis) = partial_dependence(clf, target_feature,
                                               X=X_train, grid_resolution=50)
    XX, YY = np.meshgrid(x_axis, y_axis)
    Z = pdp.T.reshape(XX.shape).T
    ax = Axes3D(fig)
    surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1, cmap=plt.cm.BuPu)
    ax.set_xlabel(names[target_feature[0]])
    ax.set_ylabel(names[target_feature[1]])
    ax.set_zlabel('Partial dependence')
    # pretty init view
    ax.view_init(elev=22, azim=122)
    plt.colorbar(surf)
    plt.suptitle('Partial dependence of house value on median age and '
                 'average occupancy')
    plt.subplots_adjust(top=0.9)

    plt.show()
def test():
    # data
    cal_housing = fetch_california_housing()
    X = cal_housing['data']
    Y = np.reshape(cal_housing['target'], (-1, 1))

    # train models
    iters = 100
    name = ["0", "1", "2", "3", "4"]
    model = [SCNN(8, 1, 0, update=Update.Rprop()),
             SCNN(8, 1, 1, update=Update.Rprop()),
             SCNN(8, 1, 2, update=Update.Rprop()),
             SCNN(8, 1, 3, update=Update.Rprop()),
             SCNN(8, 1, 4, update=Update.Rprop())]
    error = np.zeros((len(model), iters))
    for i in range(iters):
        for m in range(len(model)):
            error[m, i] = model[m].partial_fit(X, Y)
        print(i + 1, "complete")

    # plot results
    plt.figure()
    plt.title('Error Curves')
    for m in range(len(model)):
        plt.semilogy(error[m], label=name[m])
    plt.legend()
    plt.show()
def test_abs_loss():
    X, y = california_housing.fetch_california_housing(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    regressor = GradientBooster(n_iter=100, loss_function=AbsoluteLoss())
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)

    print(f"Test Mean absolute error: {mean_absolute_error(y_pred, y_test):.4f}")
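# `GradientBooster` and `AbsoluteLoss` are project-local classes, not part of
# scikit-learn. As a point of reference only, a minimal sketch of what an L1
# loss needs to expose to a hand-rolled boosting loop: a loss value and the
# negative gradient, which for absolute error is the sign of the residual.
# The `loss`/`negative_gradient` interface here is an assumption, not the
# project's actual API.
import numpy as np

class AbsoluteLossSketch:
    def loss(self, y_true, y_pred):
        # mean absolute deviation of the current predictions
        return np.mean(np.abs(y_true - y_pred))

    def negative_gradient(self, y_true, y_pred):
        # pseudo-residuals for L1 loss: sign(y - y_pred)
        return np.sign(y_true - y_pred)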
def main():
    # fetch California housing dataset
    try:
        cal_housing = fetch_california_housing()
    except HTTPError:
        print("Failed downloading California housing data.")
        return

    # split 80/20 train-test
    X_train, X_test, y_train, y_test = train_test_split(cal_housing.data,
                                                        cal_housing.target,
                                                        test_size=0.2,
                                                        random_state=1)
    names = cal_housing.feature_names

    print('_' * 80)
    print("Training GBRT...")
    clf = GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                    learning_rate=0.1, loss='huber',
                                    random_state=1)
    clf.fit(X_train, y_train)
    print("done.")

    print('_' * 80)
    print('Convenience plot with ``partial_dependence_plots``')
    print()

    features = [0, 5, 1, 2, (5, 1)]
    fig, axs = plot_partial_dependence(clf, X_train, features,
                                       feature_names=names,
                                       n_jobs=3, grid_resolution=50)
    fig.suptitle('Partial dependence of house value on nonlocation features\n'
                 'for the California housing dataset')
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

    print('_' * 80)
    print('Custom 3d plot via ``partial_dependence``')
    print()
    fig = plt.figure()

    target_feature = (1, 5)
    pdp, (x_axis, y_axis) = partial_dependence(clf, target_feature,
                                               X=X_train, grid_resolution=50)
    XX, YY = np.meshgrid(x_axis, y_axis)
    Z = pdp.T.reshape(XX.shape).T
    ax = Axes3D(fig)
    surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1, cmap=plt.cm.BuPu)
    ax.set_xlabel(names[target_feature[0]])
    ax.set_ylabel(names[target_feature[1]])
    ax.set_zlabel('Partial dependence')
    # pretty init view
    ax.view_init(elev=22, azim=122)
    plt.colorbar(surf)
    plt.suptitle('Partial dependence of house value on median age and '
                 'average occupancy')
    plt.subplots_adjust(top=0.9)

    plt.show()
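# The two `main()` variants above target the pre-0.20 scikit-learn API, where
# `plot_partial_dependence` and `partial_dependence` lived in
# `sklearn.ensemble.partial_dependence`; that module was removed in later
# releases. A sketch of the same convenience plot against the current
# `sklearn.inspection` API, assuming scikit-learn >= 1.0 (where
# `PartialDependenceDisplay.from_estimator` is available):
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import PartialDependenceDisplay
from sklearn.model_selection import train_test_split

cal_housing = fetch_california_housing()
X_train, X_test, y_train, y_test = train_test_split(
    cal_housing.data, cal_housing.target, test_size=0.2, random_state=1)

clf = GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                learning_rate=0.1, loss='huber',
                                random_state=1)
clf.fit(X_train, y_train)

# same one- and two-way features as the legacy example
features = [0, 5, 1, 2, (5, 1)]
display = PartialDependenceDisplay.from_estimator(
    clf, X_train, features, feature_names=cal_housing.feature_names,
    grid_resolution=50)
display.figure_.suptitle('Partial dependence of house value '
                         '(modern sklearn.inspection API)')
plt.show()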
def generate_house_data():
    from sklearn.datasets.california_housing import fetch_california_housing
    from sklearn.preprocessing import StandardScaler

    houses = fetch_california_housing()
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(houses.data)

    X_train, X_val, Y_train, Y_val = train_test_split(scaled_data, houses.target)
    print('Train data shape', X_train.shape)
    print('Validation data shape', X_val.shape)
    return X_train, X_val, Y_train, Y_val
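# A short usage sketch for `generate_house_data()`; the estimator is an
# arbitrary stand-in (plain Ridge), not something the original code specifies.
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

X_train, X_val, Y_train, Y_val = generate_house_data()

model = Ridge(alpha=1.0)  # any regressor works on the standardized features
model.fit(X_train, Y_train)
print('Validation R^2:', r2_score(Y_val, model.predict(X_val)))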
def gbm_monotone_smoke_test():
    cal_housing = fetch_california_housing()
    data = h2o.H2OFrame(cal_housing.data, column_names=cal_housing.feature_names)
    data["target"] = h2o.H2OFrame(cal_housing.target)
    train, test = data.split_frame([0.6], seed=123)

    feature_names = ['MedInc', 'AveOccup', 'HouseAge']
    monotone_constraints = {"MedInc": 1, "AveOccup": -1, "HouseAge": 1}
    gbm_mono = H2OGradientBoostingEstimator(
        monotone_constraints=monotone_constraints, seed=42)
    gbm_mono.train(x=feature_names, y="target",
                   training_frame=train, validation_frame=test)
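# The constraints above force predictions to be non-decreasing in MedInc and
# HouseAge and non-increasing in AveOccup. A hedged sketch of a spot-check:
# score a synthetic frame in which only MedInc varies. The grid bounds and
# the fixed values for the other features are arbitrary choices, not taken
# from the original test.
import numpy as np
import pandas as pd

grid = pd.DataFrame({
    "MedInc": np.linspace(0.5, 15.0, 50),
    "AveOccup": 3.0,
    "HouseAge": 25.0,
})
preds = gbm_mono.predict(h2o.H2OFrame(grid)).as_data_frame()["predict"]

# with monotone_constraints={"MedInc": 1}, predictions must never decrease
assert (preds.diff().dropna() >= 0).all(), "MedInc monotonicity violated"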
def test_weighted_nurturing():
    # X1 = np.arange(0, 10, 0.1)
    # X2 = np.arange(10, 20, 0.1)
    # y_train = np.sin(X1).ravel() + np.cos(X2).ravel()
    # X_train = pd.DataFrame(np.array([X1, X2]).T, columns=['x1', 'x2'])
    # X_test, y_test = X_train, y_train
    cal_housing = fetch_california_housing()
    X_train, X_test, y_train, y_test = \
        train_test_split(cal_housing.data, cal_housing.target,
                         test_size=0.3, random_state=5)
    X_train = pd.DataFrame(X_train, columns=cal_housing.feature_names)
    X_test = pd.DataFrame(X_test, columns=cal_housing.feature_names)

    model_params = {
        'max_depth': 10,
        'n_estimators': 200,
        'random_state': 5,
        'n_jobs': -1,
        'bootstrap': True,
    }
    with StopWatch('weighted nurturing'):
        nurtured_ensemble = weighted_nurturing(
            RandomForestRegressor, X_train, y_train,
            feature_names=X_train.columns,
            n_iterations=20, metric_function='mse',
            n_tunes=50, update_weight=0.3,
            model_params=model_params)
    print("The mean-squared error for nurtured ensemble prediction: {}".format(
        mean_squared_error(nurtured_ensemble.predict(X_test), y_test)))

    # plain_rf = RandomForestRegressor(
    #     min_samples_leaf=100, max_depth=8,
    #     n_estimators=1000, random_state=5, n_jobs=-1)
    plain_params = {'random_state': 5}
    plain_rf = RandomForestRegressor(n_estimators=3000, max_depth=8,
                                     n_jobs=-1, **plain_params)
    plain_rf.fit(X_train, y_train)
    plain_gbr = GradientBoostingRegressor(
        n_estimators=1000, max_depth=5, **plain_params)
    plain_gbr.fit(X_train, y_train)
    print("The mean-squared error for plain RandomForest prediction: {}".format(
        mean_squared_error(plain_rf.predict(X_test), y_test)))
    print("The mean-squared error for plain GradientBoost prediction: {}".format(
        mean_squared_error(plain_gbr.predict(X_test), y_test)))
run_id, experiment_id, experiment_name, run_id, metric, run_mode = create_predict_widgets(exp_name, metrics)
run_id, experiment_id, experiment_name, run_id, metric, run_mode, exp_name

# COMMAND ----------

dump_run_id(run_id)

# COMMAND ----------

# MAGIC %md ### Read data

# COMMAND ----------

from sklearn.datasets.california_housing import fetch_california_housing
data = fetch_california_housing().data

# COMMAND ----------

# MAGIC %md ### Review the MLflow UI

# COMMAND ----------

display_run_uri(experiment_id, run_id)

# COMMAND ----------

# MAGIC %md ### Predict
# MAGIC
# MAGIC Let's now register our Keras model as a Spark UDF to apply to rows in parallel.
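# COMMAND ----------

# A minimal sketch of the missing prediction cell, assuming the model was
# logged under the run above; `model_uri` and `spark_df` are placeholders,
# not names from the original notebook.
import mlflow.pyfunc
from pyspark.sql.functions import struct

model_uri = "runs:/{}/model".format(run_id)  # hypothetical artifact path
predict_udf = mlflow.pyfunc.spark_udf(spark, model_uri)

# apply the model to each row of a Spark DataFrame of features in parallel
predictions = spark_df.withColumn("prediction", predict_udf(struct(*spark_df.columns)))
display(predictions)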
from sklearn.datasets.california_housing import fetch_california_housing
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
import numpy as np
from sklearn.neighbors import BallTree

## --------- create dataframe, add .target ----------
dataset = fetch_california_housing()
df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
df['target'] = pd.Series(dataset.target)

## --------- 2 features chosen for training ---------
X = df[['Population', 'HouseAge']]
print('_____________________________________')
print('Dataset before train split')
print(X.describe())
y = df.target

## ------------- split for train & test -------------
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10,
                                                    test_size=0.2)
df2 = pd.DataFrame(data=X_train, columns=['Population', 'HouseAge'])

## ---------------- feature distance ----------------
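# The script breaks off at the 'feature distance' header before using the
# BallTree import. A sketch of the likely next step: nearest-neighbor queries
# in the two-feature space. Querying the test rows with k=5 is an assumption,
# not the original author's code.
tree = BallTree(df2.values)
dist, ind = tree.query(X_test.values, k=5)

print('Nearest-neighbor distances for the first test row:', dist[0])
print('Indices of its 5 closest training rows:', ind[0])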
import importlib

packages = [
    'pandas', 'IPython', 'statsmodels', 'sklearn', 'seaborn',
    'requests', 'scipy', 'notebook'
]

bad = []
for package in packages:
    try:
        importlib.import_module(package)
    except ImportError:
        bad.append("Can't import %s" % package)
else:
    if len(bad) > 0:
        print('\n'.join(bad))
    else:
        from sklearn.datasets import california_housing
        print("Caching california_housing")
        data = california_housing.fetch_california_housing()
        print("All good. Enjoy the tutorial!")
def main():
    cal_housing = fetch_california_housing()
    X, y = cal_housing.data, cal_housing.target
    names = cal_housing.feature_names

    # Center target to avoid gradient boosting init bias: gradient boosting
    # with the 'recursion' method does not account for the initial estimator
    # (here the average target, by default)
    y -= y.mean()

    print("Training MLPRegressor...")
    est = MLPRegressor(activation='logistic')
    est.fit(X, y)
    print('Computing partial dependence plots...')
    # We don't compute the 2-way PDP (5, 1) here, because it is a lot slower
    # with the brute method.
    features = [0, 5, 1, 2]
    plot_partial_dependence(est, X, features, feature_names=names,
                            n_jobs=3, grid_resolution=50)
    fig = plt.gcf()
    fig.suptitle('Partial dependence of house value on non-location features\n'
                 'for the California housing dataset, with MLPRegressor')
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

    print("Training GradientBoostingRegressor...")
    est = GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                    learning_rate=0.1, loss='huber',
                                    random_state=1)
    est.fit(X, y)
    print('Computing partial dependence plots...')
    features = [0, 5, 1, 2, (5, 1)]
    plot_partial_dependence(est, X, features, feature_names=names,
                            n_jobs=3, grid_resolution=50)
    fig = plt.gcf()
    fig.suptitle('Partial dependence of house value on non-location features\n'
                 'for the California housing dataset, with Gradient Boosting')
    plt.subplots_adjust(top=0.9)

    print('Custom 3d plot via ``partial_dependence``')
    fig = plt.figure()

    target_feature = (1, 5)
    pdp, axes = partial_dependence(est, X, target_feature,
                                   grid_resolution=50)
    XX, YY = np.meshgrid(axes[0], axes[1])
    Z = pdp[0].T
    ax = Axes3D(fig)
    surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                           cmap=plt.cm.BuPu, edgecolor='k')
    ax.set_xlabel(names[target_feature[0]])
    ax.set_ylabel(names[target_feature[1]])
    ax.set_zlabel('Partial dependence')
    # pretty init view
    ax.view_init(elev=22, azim=122)
    plt.colorbar(surf)
    plt.suptitle('Partial dependence of house value on median\n'
                 'age and average occupancy, with Gradient Boosting')
    plt.subplots_adjust(top=0.9)
    plt.show()
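# The centering comment above refers to the `method` parameter of
# `sklearn.inspection.partial_dependence`: tree ensembles can use the fast
# 'recursion' path, which ignores the initial estimator (hence centering y),
# while other estimators fall back to 'brute'. A small sketch; `est_mlp` and
# `est_gbr` stand in for the two fitted estimators above, which the original
# code both names `est`.
from sklearn.inspection import partial_dependence

# brute force: works for any estimator, averages predictions over the data
pd_mlp = partial_dependence(est_mlp, X, [0], method='brute',
                            grid_resolution=50)

# recursion: tree-ensemble-only fast path that skips the init estimator
pd_gbr = partial_dependence(est_gbr, X, [0], method='recursion',
                            grid_resolution=50)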
def ml_GradientBoostingClassifier2(self):
    # This example shows how to obtain partial dependence plots from a
    # GradientBoostingRegressor trained on the California housing dataset.
    cal_housing = fetch_california_housing()

    # split 80/20 train-test
    X_train, X_test, y_train, y_test = train_test_split(cal_housing.data,
                                                        cal_housing.target,
                                                        test_size=0.2,
                                                        random_state=1)
    names = cal_housing.feature_names

    print("Training GBRT...")
    clf = GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                    learning_rate=0.1, loss='huber',
                                    random_state=1)
    clf.fit(X_train, y_train)
    print(" done.")

    print('Convenience plot with ``partial_dependence_plots``')

    features = [0, 5, 1, 2, (5, 1)]
    fig, axs = plot_partial_dependence(clf, X_train, features,
                                       feature_names=names,
                                       n_jobs=3, grid_resolution=50)
    fig.suptitle('Partial dependence of house value on nonlocation features\n'
                 'for the California housing dataset')
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

    print('Custom 3d plot via ``partial_dependence``')
    fig = plt.figure()

    target_feature = (1, 5)
    pdp, axes = partial_dependence(clf, target_feature,
                                   X=X_train, grid_resolution=50)
    XX, YY = np.meshgrid(axes[0], axes[1])
    Z = pdp[0].reshape(list(map(np.size, axes))).T
    ax = Axes3D(fig)
    surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                           cmap=plt.cm.BuPu, edgecolor='k')
    ax.set_xlabel(names[target_feature[0]])
    ax.set_ylabel(names[target_feature[1]])
    ax.set_zlabel('Partial dependence')
    # pretty init view
    ax.view_init(elev=22, azim=122)
    plt.colorbar(surf)
    plt.suptitle('Partial dependence of house value on median\n'
                 'age and average occupancy')
    plt.subplots_adjust(top=0.9)
    plt.show()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time   : 2018/7/31 0031 20:35
# @Author : Shulin Liu
from sklearn.datasets.california_housing import fetch_california_housing
from sklearn import tree
import pydotplus
from IPython.display import Image
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

house = fetch_california_housing()
print(house.DESCR)
dtr = tree.DecisionTreeRegressor(max_depth=2)  # maximum tree depth of 2
dtr.fit(house.data[:, [6, 7]], house.target)
'''
Visualize the decision tree
'''
dot_data = tree.export_graphviz(dtr, out_file=None,
                                feature_names=house.feature_names[6:8],
                                filled=True, impurity=False, rounded=True)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.get_nodes()[7].set_fillcolor("#FFF2DD")
Image(graph.create_png())
graph.write_png('dtr_white_background.png')
'''
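# The file imports `train_test_split`, `GridSearchCV`, and
# `RandomForestRegressor` but is cut off before using them. A hedged sketch
# of the step those imports usually set up, tuning a random forest on the
# same two location features; the parameter grid is an assumption, the
# original file's grid is not shown.
X_train, X_test, y_train, y_test = train_test_split(
    house.data[:, [6, 7]], house.target, test_size=0.2, random_state=42)

param_grid = {'min_samples_split': [3, 6, 9],
              'n_estimators': [10, 50, 100]}
grid = GridSearchCV(RandomForestRegressor(random_state=42),
                    param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)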
def main():
    cal_housing = fetch_california_housing()
    X, y = cal_housing.data, cal_housing.target
    names = cal_housing.feature_names

    # Center target to avoid gradient boosting init bias: gradient boosting
    # with the 'recursion' method does not account for the initial estimator
    # (here the average target, by default)
    y -= y.mean()

    print("Training SNN_Regressor...")
    est = SNN_Regressor(8, 1, 10, 10,
                        hiddenAct=Activation.Tanh(),
                        error=Error.Mse(),
                        update=Update.RmsProp(0.001, rateDecay=0.9))
    t = [
        (3, lambda e: e.cool()),                # cool
        (6, lambda e: Trainer.prune(e, X, y)),  # prune
        # (18, lambda e: e.cool()),             # cool
        (9, lambda e: Trainer.grow(e, max(1, 1 + int(np.log(e.hiddenSize_ + 1))))),  # grow
        # (11, lambda e: e.cool()),             # cool
    ]
    growLoss = Trainer.train(est, X, y, batch=1, maxIter=100, triggers=t)
    est.maxIter_ = 1000
    plt.semilogy(growLoss, label='Grow')
    plt.legend()
    # plt.show()
    # pdb.set_trace()
    print("SNN weights:", est.weight_)
    print("SNN dweight:", est.dWeight_)
    print("SNN nHidden:", est.hiddenSize_)

    print('Computing partial dependence plots...')
    # We don't compute the 2-way PDP (5, 1) here, because it is a lot slower
    # with the brute method.
    features = [0, 5, 1, 2]
    plot_partial_dependence(est, X, features, feature_names=names,
                            n_jobs=3, grid_resolution=50)
    fig = plt.gcf()
    fig.suptitle('Partial dependence of house value on non-location features\n'
                 'for the California housing dataset, with SNN_Regressor...')
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

    print("Training MLPRegressor...")
    est = MLPRegressor(activation='logistic')
    est.fit(X, y)
    print('MLP Loss: ', np.average(Error.Mse().f(y, est.predict(X))))
    print('Computing partial dependence plots...')
    # We don't compute the 2-way PDP (5, 1) here, because it is a lot slower
    # with the brute method.
    features = [0, 5, 1, 2]
    plot_partial_dependence(est, X, features, feature_names=names,
                            n_jobs=3, grid_resolution=50)
    fig = plt.gcf()
    fig.suptitle('Partial dependence of house value on non-location features\n'
                 'for the California housing dataset, with MLPRegressor')
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

    print("Training GradientBoostingRegressor...")
    est = GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                    learning_rate=0.1, loss='huber',
                                    random_state=1)
    est.fit(X, y)
    print('Computing partial dependence plots...')
    features = [0, 5, 1, 2, (5, 1)]
    plot_partial_dependence(est, X, features, feature_names=names,
                            n_jobs=3, grid_resolution=50)
    fig = plt.gcf()
    fig.suptitle('Partial dependence of house value on non-location features\n'
                 'for the California housing dataset, with Gradient Boosting')
    plt.subplots_adjust(top=0.9)

    print('Custom 3d plot via ``partial_dependence``')
    fig = plt.figure()

    target_feature = (1, 5)
    pdp, axes = partial_dependence(est, X, target_feature,
                                   grid_resolution=50)
    XX, YY = np.meshgrid(axes[0], axes[1])
    Z = pdp[0].T
    ax = Axes3D(fig)
    surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                           cmap=plt.cm.BuPu, edgecolor='k')
    ax.set_xlabel(names[target_feature[0]])
    ax.set_ylabel(names[target_feature[1]])
    ax.set_zlabel('Partial dependence')
    # pretty init view
    ax.view_init(elev=22, azim=122)
    plt.colorbar(surf)
    plt.suptitle('Partial dependence of house value on median\n'
                 'age and average occupancy, with Gradient Boosting')
    plt.subplots_adjust(top=0.9)
    plt.show()
def setUp(self):
    self.dataset = fetch_california_housing()
""" print(__doc__) import numpy as np import pylab as pl from mpl_toolkits.mplot3d import Axes3D from sklearn.cross_validation import train_test_split from sklearn.ensemble import GradientBoostingRegressor from sklearn.ensemble.partial_dependence import plot_partial_dependence from sklearn.ensemble.partial_dependence import partial_dependence from sklearn.datasets.california_housing import fetch_california_housing # fetch California housing dataset cal_housing = fetch_california_housing() # split 80/20 train-test X_train, X_test, y_train, y_test = train_test_split(cal_housing.data, cal_housing.target, test_size=0.2, random_state=1) names = cal_housing.feature_names print('_' * 80) print("Training GBRT...") clf = GradientBoostingRegressor(n_estimators=100, max_depth=4, learning_rate=0.1, loss='huber', random_state=1) clf.fit(X_train, y_train) print("done.")
import importlib packages = ["pandas", "IPython", "statsmodels", "sklearn", "seaborn", "toolz", "bs4", "requests", "scipy", "tables"] bad = [] for package in packages: try: importlib.import_module(package) except ImportError: bad.append("Can't import %s" % package) else: if len(bad) > 0: print("\n".join(bad)) else: from sklearn.datasets import california_housing print("Caching california_housing") data = california_housing.fetch_california_housing() print("All good. Enjoy the tutorial!")
def __init__(self):
    self.data = california_housing.fetch_california_housing()
    self.X_train, self.X_test, self.y_train, self.y_test = \
        train_test_split(self.data.data, self.data.target,
                         random_state=42, test_size=0.2)
""" for stuff on techniques used for mnist http://yann.lecun.com/exdb/mnist/ """ from sklearn.datasets import load_digits from sklearn.datasets.california_housing import fetch_california_housing # write data into ./mldata/mnist-original.mat mnist = load_digits(return_X_y=True) housing = fetch_california_housing(data_home='.')
#!/usr/bin/env python 3.6
# -*- coding: utf-8 -*-
"""
# @Company : CNC Center, School of Mechanical Engineering, Huazhong University of Science and Technology
# @version : V1.0
# @Author  : lizhaofu
# @contact : [email protected]  2018--2022
# @Time    : 2019/11/27 16:27
# @File    : decision_tree_regression_visualization.py
# @Software: PyCharm
"""

from sklearn.datasets.california_housing import fetch_california_housing
import pydotplus

housing = fetch_california_housing()  # load scikit-learn's built-in dataset
# print(housing.DESCR)
print(housing.data.shape)
print(housing.data[1])
print(len(housing.data[1]))
print(housing.feature_names)
print(housing.target)

# select the features to use and build a decision tree
from sklearn import tree

dtr = tree.DecisionTreeRegressor(max_depth=4)
dtr.fit(housing.data[:, [3, 4, 5, 6, 7]], housing.target)  # includes the house's longitude and latitude

# print the basic parameters of the fitted model (some are defaults)
print(dtr)
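# `pydotplus` is imported but unused in the excerpt, suggesting the file goes
# on to render the fitted tree. A minimal sketch of that step, mirroring the
# export_graphviz pattern used elsewhere in this collection; the output
# filename is a placeholder.
dot_data = tree.export_graphviz(dtr, out_file=None,
                                feature_names=housing.feature_names[3:8],
                                filled=True, impurity=False, rounded=True)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_png('decision_tree_regression.png')  # hypothetical filename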
def get_datas(self):
    """Fetch the California housing data."""
    housing = fetch_california_housing()
    return housing
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 26 09:35:56 2019

@author: Morten Sahlertz
"""

from sklearn.datasets.california_housing import fetch_california_housing
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm

cal_housing = fetch_california_housing()
X, y = cal_housing.data, cal_housing.target
names = cal_housing.feature_names

#%% New variable names
median_income = X[:, np.newaxis, 0]
median_house_value = y

#%% Histogram for median income
fig1 = plt.figure()
ax = fig1.add_subplot(111)
ax.hist(median_income, bins=100)  # histogram
ax.set_title('Median Income Histogram')
ax.set_xlabel('Median Income')
ax.set_ylabel('Occurrences')

#%% Standard deviation, mean and median
standard = np.std(median_income)
print('The standard deviation is: {0:.5f}'.format(standard))
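#%% A plausible next cell: `norm` is imported but unused in the excerpt.
# Fit a normal distribution to median income and overlay its density on the
# histogram; this is a sketch, not the author's missing code.
mu, std = norm.fit(median_income.ravel())
xs = np.linspace(median_income.min(), median_income.max(), 200)

fig2 = plt.figure()
ax = fig2.add_subplot(111)
ax.hist(median_income.ravel(), bins=100, density=True)
ax.plot(xs, norm.pdf(xs, mu, std), 'r-', label='fitted normal')
ax.set_xlabel('Median Income')
ax.legend()
plt.show()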
def read_data(self):
    self.__all = fetch_california_housing()
    self.__train_feature, self.__test_feature, self.__train_label, self.__test_label = \
        train_test_split(self.__all.data, self.__all.target,
                         test_size=0.2, random_state=1)