def test_feature_importance_regression():
    """Test that Gini importance is calculated correctly.

    This test follows the example from [1]_ (pg. 373).

    .. [1] Friedman, J., Hastie, T., & Tibshirani, R. (2001). The elements
       of statistical learning. New York: Springer series in statistics.
    """
    california = fetch_california_housing()
    X, y = california.data, california.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    reg = GradientBoostingRegressor(loss='huber', learning_rate=0.1,
                                    max_leaf_nodes=6, n_estimators=100,
                                    random_state=0)
    reg.fit(X_train, y_train)
    sorted_idx = np.argsort(reg.feature_importances_)[::-1]
    sorted_features = [california.feature_names[s] for s in sorted_idx]

    # The most important feature is the median income by far.
    assert sorted_features[0] == 'MedInc'

    # The three subsequent features are the following. Their relative ordering
    # might change a bit depending on the randomness of the trees and the
    # train / test split.
    assert set(sorted_features[1:4]) == {'Longitude', 'AveOccup', 'Latitude'}
def test_krige_housing():
    try:
        housing = fetch_california_housing()
    except PermissionError:
        # This can raise a permission error on Appveyor
        pytest.skip('Failed to load california housing dataset')

    # take only the first 1000 samples
    p = housing['data'][:1000, :-2]
    x = housing['data'][:1000, -2:]
    target = housing['target'][:1000]

    p_train, p_test, y_train, y_test, x_train, x_test = \
        train_test_split(p, target, x, train_size=0.7, random_state=10)

    for ml_model, krige_method in _methods():
        reg_kr_model = RegressionKriging(regression_model=ml_model,
                                         method=krige_method,
                                         n_closest_points=2)
        reg_kr_model.fit(p_train, x_train, y_train)
        if krige_method == 'ordinary':
            assert reg_kr_model.score(p_test, x_test, y_test) > 0.5
        else:
            assert reg_kr_model.score(p_test, x_test, y_test) > 0.0
def load_data_target(name):
    """Load data and target given the name of the dataset."""
    if name == "Boston":
        data = load_boston()
    elif name == "Housing":
        data = fetch_california_housing()
        dataset_size = 1000  # this is necessary so that SVR does not slow down too much
        data["data"] = data["data"][:dataset_size]
        data["target"] = data["target"][:dataset_size]
    elif name == "digits":
        data = load_digits()
    elif name == "Climate Model Crashes":
        try:
            data = fetch_mldata("climate-model-simulation-crashes")
        except HTTPError:
            url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00252/pop_failures.dat"
            data = urlopen(url).read().decode().split('\n')[1:]
            data = [[float(v) for v in d.split()] for d in data]
            samples = np.array(data)
            data = dict()
            data["data"] = samples[:, :-1]
            data["target"] = np.array(samples[:, -1], dtype=int)
    else:
        raise ValueError("dataset not supported.")
    return data["data"], data["target"]
def getCaliforniaHousingData(housingFILE):

    print("\ngetCaliforniaHousingData():")

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    if os.path.isfile(path=housingFILE) == False:
        print("downloading California housing data set from Internet ...")
        from sklearn.datasets import fetch_california_housing
        with open(file=housingFILE, mode='wb') as myFile:
            pickle.dump(obj=fetch_california_housing(), file=myFile)
    else:
        print("loading California housing data set from drive ...")
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    with open(file=housingFILE, mode='rb') as myFile:
        housing = pickle.load(myFile)
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    housingData = housing.data
    housingTarget = housing.target
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return (housingData, housingTarget)
#
# In the previous notebook, we presented the general cross-validation framework
# and how to assess if a predictive model is underfitting, overfitting, or
# generalizing. Besides these aspects, it is also important to understand how
# the different errors are influenced by the number of samples available.
#
# In this notebook, we will show this aspect by looking at the variability of
# the different errors.
#
# Let's first load the data and create the same model as in the previous
# notebook.

# %%
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing(as_frame=True)
data, target = housing.data, housing.target
target *= 100  # rescale the target in k$

# %% [markdown]
# ```{note}
# If you want a deeper overview regarding this dataset, you can refer to the
# Appendix - Datasets description section at the end of this MOOC.
# ```

# %%
from sklearn.tree import DecisionTreeRegressor

regressor = DecisionTreeRegressor()

# %% [markdown]
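The notebook cell above announces a study of how errors vary with the number of samples, but the visible snippet stops after creating the regressor. A minimal sketch of one way to inspect that variability with scikit-learn's `learning_curve`; the `ShuffleSplit` setup, the training sizes, and the scoring metric are assumptions, not the original notebook's code.

# %%
# Sketch (assumed continuation): error variability across CV splits and
# training-set sizes for the decision tree defined above.
import numpy as np
from sklearn.model_selection import ShuffleSplit, learning_curve

cv = ShuffleSplit(n_splits=30, test_size=0.2, random_state=0)
train_sizes, train_scores, test_scores = learning_curve(
    regressor, data, target,
    train_sizes=np.linspace(0.1, 1.0, 5),
    cv=cv, scoring="neg_mean_absolute_error", n_jobs=2,
)
# Errors are the negated scores; their spread across splits shows variability.
train_errors, test_errors = -train_scores, -test_scores
print(train_errors.mean(axis=1), test_errors.mean(axis=1))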
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

Copyright Brian Dolhansky 2014
[email protected]
"""

import numpy as np
from data_utils import split_train_test, RMSE
from linear_regression import LinearRegression
from sklearn import preprocessing
from sklearn.datasets import fetch_california_housing

print "Loading data..."
housing = fetch_california_housing(data_home='/home/bdol/data')
train_data, test_data, train_target, test_target = split_train_test(
    housing.data, housing.target)

# Normalize the data
train_data = preprocessing.scale(train_data)
test_data = preprocessing.scale(test_data)

# Append bias feature
train_data = np.hstack(
    (train_data, np.ones((train_data.shape[0], 1), dtype=train_data.dtype)))
test_data = np.hstack(
    (test_data, np.ones((test_data.shape[0], 1), dtype=test_data.dtype)))
train_target = train_target[:, None]
test_target = test_target[:, None]
# cross-validation could be used.
#
# This exercise will make you implement the same search but using the class
# `GridSearchCV`.
#
# First, we will:
#
# * load the california housing dataset;
# * split the data into a training and testing set;
# * create a machine learning pipeline composed of a standard scaler to
#   normalize the data, and a ridge regression as a linear model.

# %%
from sklearn.datasets import fetch_california_housing

X, y = fetch_california_housing(as_frame=True, return_X_y=True)
X.head()

# %%
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# %%
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Ridge

ridge = make_pipeline(StandardScaler(), Ridge())

# %% [markdown]
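The exercise asks for a search with `GridSearchCV`; a minimal sketch of how it could be wired onto the pipeline above. The grid of alpha values is an illustrative assumption, not the exercise's intended solution.

# %%
# Illustrative sketch only: grid-search the ridge regularization strength.
import numpy as np
from sklearn.model_selection import GridSearchCV

param_grid = {"ridge__alpha": np.logspace(-3, 3, 7)}
search = GridSearchCV(ridge, param_grid=param_grid, cv=5)
search.fit(X_train, y_train)
print(search.best_params_, search.score(X_test, y_test))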
def test_rf_regression(datatype, split_algo, mode, column_info,
                       max_features, rows_sample):

    ncols, n_info = column_info
    use_handle = True

    if mode == 'unit':
        X, y = make_regression(n_samples=500, n_features=ncols,
                               n_informative=n_info, random_state=123)
    elif mode == 'quality':
        X, y = fetch_california_housing(return_X_y=True)
    else:
        X, y = make_regression(n_samples=100000, n_features=ncols,
                               n_informative=n_info, random_state=123)

    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)

    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle, n_streams=1)

    # Initialize and fit using cuML's random forest regression model
    cuml_model = curfr(max_features=max_features, rows_sample=rows_sample,
                       n_bins=16, split_algo=split_algo, split_criterion=2,
                       min_rows_per_node=2, seed=123, n_streams=1,
                       n_estimators=50, handle=handle, max_leaves=-1,
                       max_depth=16, accuracy_metric='mse')
    cuml_model.fit(X_train, y_train)

    # predict using FIL
    fil_preds = cuml_model.predict(X_test, predict_model="GPU")
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype)
    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)

    # Initialize, fit and predict using
    # sklearn's random forest regression model
    if mode != "stress":
        sk_model = skrfr(n_estimators=50, max_depth=16,
                         min_samples_split=2, max_features=max_features,
                         random_state=10)
        sk_model.fit(X_train, y_train)
        sk_predict = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_predict, convert_dtype=datatype)
        assert fil_r2 >= (sk_r2 - 0.07)
    assert fil_r2 >= (cu_r2 - 0.02)
import tensorflow as tf
import numpy as np
from sklearn.datasets import fetch_california_housing

# Fetch the California housing dataset
housing = fetch_california_housing(data_home='./scikit_learn_data',
                                   download_if_missing=True)
# m rows, n columns
m, n = housing.data.shape
feature_names = housing.feature_names

# 20640 8
# ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
print(m, n)
print(feature_names)
print(housing.data[0:3], housing.target[0:3], type(housing.target[0:3]))

# np.c_ concatenates the two matrices column-wise (adds the bias column x0 = 1)
housing_data_plus_bias = np.c_[np.ones((m, 1)), housing.data]
print(housing_data_plus_bias.shape)

# Create two TensorFlow constant nodes, X and y, to hold the data and labels
X = tf.constant(housing_data_plus_bias, dtype=tf.float32, name='X')
# X = tf.constant(housing.data, dtype=tf.float32, name='X')
# reshape the target row vector into a column vector
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name='y')

# Use TensorFlow's matrix operations to solve for theta
XT = tf.transpose(X)
# The normal equation computes the optimal theta in a single step
theta = tf.matmul(tf.matmul(tf.matrix_inverse(tf.matmul(XT, X)), XT), y)

# Tensor("MatMul_2:0", shape=(9, 1), dtype=float32)
print(theta)
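The script above only prints the symbolic `theta` tensor. A minimal sketch of how its value could be obtained, assuming TensorFlow 1.x graph mode as in the snippet, plus a NumPy cross-check of the same normal equation.

# Sketch (assumes TensorFlow 1.x graph mode): evaluate theta in a session
# and cross-check against NumPy's least-squares solution.
with tf.Session() as sess:
    theta_value = theta.eval()
print(theta_value)

# NumPy solution of the same linear system for comparison
theta_np, *_ = np.linalg.lstsq(housing_data_plus_bias,
                               housing.target.reshape(-1, 1), rcond=None)
print(theta_np)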
import time

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--tabularpath", type=str, default='', nargs="?",
                        help="Path of Tabular Data, i.e./.../data.csv")
    # Hyper parameters for conversion
    parser.add_argument("--num_quantile", type=int, default=2, nargs="?",
                        help="q param in https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.qcut.html")
    parser.add_argument("--min_unique_val_per_column", type=int, default=2, nargs="?",
                        help="Apply Quantile-based discretization function on those columns having at least such "
                             "unique values.")
    args = parser.parse_args()

    # DASK can be applied.
    print('Tabular data is being read')
    try:
        df = pd.read_csv(args.tabularpath)
    except FileNotFoundError:
        print('File not found, We will use california housing dataset from sklearn')
        from sklearn import datasets
        X, y = datasets.fetch_california_housing(return_X_y=True)
        df = pd.DataFrame(X)

    print('Original Tabular data: {0} by {1}'.format(*df.shape))
    print('Quantisation starts')
    X_transformed = QCUT(min_unique_val_per_column=args.min_unique_val_per_column,
                         num_quantile=args.num_quantile).transform(df)
    X_transformed.index = 'Event_' + X_transformed.index.astype(str)
    print('Graph data being generated')
    kg = GraphGenerator().transform(X_transformed)
def california_housing_dataset(batch_size, device):
    """
    This named constructor builds a DataSet from the California Housing
    dataset.
    """
    from sklearn.datasets import fetch_california_housing
    cal_housing = fetch_california_housing()

    non_outliers = np.ones(cal_housing.data.shape[0], dtype=bool)
    for idx in range(cal_housing.data.shape[1]):
        column = cal_housing.data[:, idx]
        cutoffs = np.percentile(column, (1.0, 99.0))
        non_outliers = np.logical_and(
            non_outliers, np.logical_and(column > cutoffs[0],
                                         column < cutoffs[1]))
    # cutoffs = np.percentile(cal_housing.target, (1.0, 99.0))
    # non_outliers = np.logical_and(
    #     non_outliers, np.logical_and(cal_housing.target > cutoffs[0],
    #                                  cal_housing.target < cutoffs[1])
    # )
    cal_housing.data = cal_housing.data[non_outliers]
    cal_housing.target = cal_housing.target[non_outliers]

    x_train, x_test, y_train, y_test = train_test_split(cal_housing.data,
                                                        cal_housing.target,
                                                        test_size=0.2,
                                                        random_state=0)

    non_outliers = np.ones(x_test.shape[0], dtype=bool)
    for idx in range(x_test.shape[1]):
        column = x_test[:, idx]
        cutoffs = np.percentile(column, (5.0, 95.0))
        non_outliers = np.logical_and(
            non_outliers, np.logical_and(column > cutoffs[0],
                                         column < cutoffs[1]))
    x_test = x_test[non_outliers]
    y_test = y_test[non_outliers]

    x_dimensions = cal_housing.feature_names
    train_size = x_train.shape[0]
    test_size = x_test.shape[0]
    y_train = y_train[..., np.newaxis]
    y_test = y_test[..., np.newaxis]
    params_desc = "train size: {}/test size: {}".format(train_size, test_size)
    return DataSets(
        x_train,
        y_train,
        x_test,
        y_test,
        x_dimensions,
        "Price",
        batch_size,
        "california housing dataset",
        params_desc,
        device,
    )
import tensorflow as tf
import numpy as np
from sklearn.datasets import fetch_california_housing

# Download the dataset right away
housing = fetch_california_housing(download_if_missing=True)
# Get the number of rows and columns of the data
m, n = housing.data.shape
print(m, n)
print(housing.data, housing.target)
print(housing.feature_names)

# Add an extra bias input feature (x0 = 1) to all training samples;
# since this uses NumPy, it executes immediately
housing_data_plus_bias = np.c_[np.ones((m, 1)), housing.data]

# Create two TensorFlow constant nodes, X and y, to hold the data and labels
X = tf.constant(housing_data_plus_bias, dtype=tf.float32, name='X')
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name='y')

# Use TensorFlow's matrix operations to solve for theta
XT = tf.transpose(X)
# The normal equation computes the optimal theta in a single step
theta = tf.matmul(tf.matmul(tf.matrix_inverse(tf.matmul(XT, X)), XT), y)
print(theta)

# with tf.Session() as sess:
#     theta_value = theta.eval()
#     sess.run(theta)
#
# print(theta_value)
import tensorflow as tf
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

# Multivariate linear regression in TensorFlow, solved with gradient descent
n_epochs = 10000
learning_rate = 0.01

housing = fetch_california_housing(data_home="D:/sklearn_data",
                                   download_if_missing=True)
m, n = housing.data.shape
housing_data_plus_bias = np.c_[np.ones((m, 1)), housing.data]

# Normalization can be done with TensorFlow, NumPy, or sklearn's StandardScaler.
# StandardScaler performs mean and variance normalization by default;
# both help gradient descent converge faster.
# The way you build your training set determines what the trained model can do!
scaler = StandardScaler().fit(housing_data_plus_bias)
scaled_housing_data_plus_bias = scaler.transform(housing_data_plus_bias)

X = tf.constant(scaled_housing_data_plus_bias, dtype=tf.float32, name='X')
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name='y')

# random_uniform creates a graph node holding random values of the given
# shape and range, much like NumPy's rand()
theta = tf.Variable(tf.random_uniform([n + 1, 1], -1.0, 1.0), name='theta')
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")

# Gradient formula: (y_pred - y) * xj
gradients = 2 / m * tf.matmul(tf.transpose(X), error)
# For batch gradient descent the update is theta_new = theta - learning_rate * gradients
training_op = tf.assign(theta, theta - learning_rate * gradients)
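The snippet defines `training_op` but stops before running it. A minimal training-loop sketch, assuming the usual TensorFlow 1.x session pattern; the logging interval is an arbitrary choice.

# Sketch of the missing training loop (assumes TensorFlow 1.x graph mode).
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        if epoch % 1000 == 0:
            # Periodically report the current mean squared error
            print("Epoch", epoch, "MSE =", mse.eval())
        sess.run(training_op)
    best_theta = theta.eval()
print(best_theta)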
import tensorflow as tf
import numpy as np
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing(data_home=r"G:\ML\dataset\scikit_learn_data",
                                   download_if_missing=True)
m, n = housing.data.shape
print(m, n)
print(housing.target.shape)
print(housing.target)
# print(housing.data, housing.target)
print(housing.feature_names)

housing_data_plus_bias = np.c_[np.ones((m, 1)), housing.data]
y_pre = housing.target.reshape(-1, 1)
print(y_pre.shape)

X = tf.constant(housing_data_plus_bias, dtype=tf.float32, name='X')
Y = tf.constant(y_pre, dtype=tf.float32, name='y')
XT = tf.transpose(X)
theta = tf.matmul(tf.matmul(tf.matrix_inverse(tf.matmul(XT, X)), XT), Y)

with tf.Session() as sess:
    theta_value = theta.eval()
    print(theta_value)
from sklearn.datasets import fetch_california_housing

cali = fetch_california_housing()  # Bunch object
# print(cali.DESCR)
print(cali.data.shape)
print(cali.target.shape)
print(cali.feature_names)

import pandas as pd

pd.set_option("precision", 4)
pd.set_option("max_columns", 9)       # display up to 9 columns in DataFrame
pd.set_option("display.width", None)  # auto-detect the display width

cali_df = pd.DataFrame(cali.data, columns=cali.feature_names)
cali_df["MedHouseValue"] = pd.Series(cali.target)
print(cali_df.head())

sample_df = cali_df.sample(frac=0.1, random_state=17)

import matplotlib.pyplot as plt
import seaborn as sns

sns.set(font_scale=2)
sns.set_style("whitegrid")
for feature in cali.feature_names:
    plt.figure(figsize=(8, 4.5))  # 8"-by-4.5" figure
    sns.scatterplot(
        data=sample_df,
def main():
    assert not tf.executing_eagerly()

    now = datetime.utcnow().strftime('%Y%m%d%H%M%S')
    root_logdir = 'tf_logs'
    logdir = '{}/run-{}/'.format(root_logdir, now)

    housing = fetch_california_housing()
    m, n = housing.data.shape
    housing_data_plus_bias = np.c_[np.ones((m, 1)), housing.data]
    scaler = MinMaxScaler()
    housing_data_plus_bias_scaled = scaler.fit_transform(
        housing_data_plus_bias)
    X = housing_data_plus_bias_scaled
    print(X.dtype)
    y = housing.target.reshape(-1, 1)

    n_epochs = 100
    learning_rate = 0.01
    batch_size = 100
    n_batches = int(np.ceil(m / batch_size))
    grad = 'optimizer'

    X_batch = tf.compat.v1.placeholder(tf.float32, shape=(None, n + 1),
                                       name='X_batch')
    y_batch = tf.compat.v1.placeholder(tf.float32, shape=(None, 1),
                                       name='y_batch')
    theta = tf.Variable(tf.compat.v1.random_uniform([n + 1, 1], -1.0, 1.0),
                        name="theta")
    y_pred = tf.matmul(X_batch, theta, name="predictions")
    error = y_pred - y_batch
    mse = tf.reduce_mean(tf.square(error), name="mse")

    if grad == 'autodiff':
        gradients = tf.gradients(mse, [theta])[0]
        training_op = tf.compat.v1.assign(theta,
                                          theta - learning_rate * gradients)
    elif grad == 'optimizer':
        optimizer = tf.compat.v1.train.GradientDescentOptimizer(
            learning_rate=learning_rate)
        training_op = optimizer.minimize(mse)
    else:
        gradients = 2 / m * tf.matmul(tf.transpose(X_batch), error)
        training_op = tf.compat.v1.assign(theta,
                                          theta - learning_rate * gradients)

    init = tf.compat.v1.global_variables_initializer()
    mse_summary = tf.compat.v1.summary.scalar('MSE', mse)
    file_writer = tf.compat.v1.summary.FileWriter(
        logdir, tf.compat.v1.get_default_graph())

    # for saving the model
    # saver = tf.compat.v1.train.Saver()

    with tf.compat.v1.Session() as sess:
        sess.run(init)
        for epoch in range(n_epochs):
            # if epoch % 100 == 0:
            #     print("Epoch: ", epoch, "MSE: ", mse.eval())
            #     save_path = saver.save(sess, 'model.ckpt')
            for batch_index in range(n_batches):
                X_batch_fetched, y_batch_fetched = fetch_batch(
                    X, y, batch_index, batch_size)
                print('Epoch: {}, Batch: {}, MSE: {}'.format(
                    epoch, batch_index,
                    mse.eval(feed_dict={
                        X_batch: X_batch_fetched,
                        y_batch: y_batch_fetched
                    })))
                if batch_index % 10 == 0:
                    summary_str = mse_summary.eval(feed_dict={
                        X_batch: X_batch_fetched,
                        y_batch: y_batch_fetched
                    })
                    step = epoch * n_batches + batch_index
                    file_writer.add_summary(summary_str, step)
                sess.run(training_op,
                         feed_dict={
                             X_batch: X_batch_fetched,
                             y_batch: y_batch_fetched
                         })
        best_theta = theta.eval()
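`fetch_batch` is called in `main()` but its definition is not shown. A minimal sketch of what such a helper could look like, assuming simple random mini-batch sampling; the per-batch seeding is an assumption for reproducibility, not the original implementation.

# Hypothetical fetch_batch helper (not part of the original file): draw a
# random mini-batch from X and y, seeded per batch index for repeatability.
def fetch_batch(X, y, batch_index, batch_size):
    rng = np.random.RandomState(batch_index)
    indices = rng.randint(X.shape[0], size=batch_size)
    return X[indices], y[indices]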
from matplotlib import pyplot as plt
from matplotlib import cm

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer
from sklearn.datasets import fetch_california_housing

print(__doc__)

dataset = fetch_california_housing()
X_full, y_full = dataset.data, dataset.target

# Take only 2 features to make visualization easier
# Feature 0 has a long tail distribution.
# Feature 5 has a few but very large outliers.
X = X_full[:, [0, 5]]

distributions = [
    ('Unscaled data', X),
    ('Data after standard scaling',
        StandardScaler().fit_transform(X)),
    ('Data after min-max scaling',
        MinMaxScaler().fit_transform(X)),
    ('Data after max-abs scaling',
def fetch(*args, **kwargs):
    return fetch_california_housing(*args, download_if_missing=False, **kwargs)
def get_dataset(dataset_name='breast_cancer'):
    """Retrieve one of the standard datasets in sklearn.

    :param dataset_name: the dataset name to use from sklearn. Valid values
        are `breast_cancer`, `digits`, `iris`, `wine` for classification and
        `boston`, `diabetes` and `california` for regression. Default is
        `breast_cancer`.
    :type dataset_name: str
    :return: Five variables are returned. First is the dataset itself without
        the target values; second includes the target values; third has all
        the categorical columns; fourth has all the integer columns and the
        last informs if it is a classification problem (True) or a regression
        problem (False).
    :rtype: np.array, np.array, np.array, np.array, Boolean
    """
    if dataset_name == 'breast_cancer':
        # loading the dataset
        X, y = load_breast_cancer(return_X_y=True)
        # informing categorical columns and their available values
        categorical_columns = {}
        integer_columns = []
        is_classification = True
    elif dataset_name == 'digits':
        # loading the dataset
        X, y = load_digits(return_X_y=True)
        # informing categorical columns and their available values
        categorical_columns = {}
        integer_columns = list(range(64))
        is_classification = True
    elif dataset_name == 'iris':
        # loading the dataset
        X, y = load_iris(return_X_y=True)
        # informing categorical columns and their available values
        categorical_columns = {}
        integer_columns = []
        is_classification = True
    elif dataset_name == 'wine':
        # loading the dataset
        X, y = load_wine(return_X_y=True)
        # informing categorical columns and their available values
        categorical_columns = {}
        integer_columns = [4, 12]
        is_classification = True
    elif dataset_name == 'boston':
        X, y = load_boston(return_X_y=True)
        # informing categorical columns and their available values
        categorical_columns = {3: [0, 1]}
        integer_columns = [8, 9]
        is_classification = False
    elif dataset_name == 'diabetes':
        # loading the dataset
        X, y = load_diabetes(return_X_y=True)
        # informing categorical columns and their available values
        categorical_columns = {1: [0.05068012, -0.04464164]}
        integer_columns = []
        is_classification = False
    elif dataset_name == 'california':
        # loading the dataset
        X, y = fetch_california_housing(return_X_y=True)
        # informing categorical columns and their available values
        categorical_columns = {}
        integer_columns = [1, 4]
        is_classification = False

    return X, y, categorical_columns, integer_columns, is_classification
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from scipy.stats import randint

X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# define the parameter space that will be searched over
param_distributions = {
    'n_estimators': randint(1, 5),
    'max_depth': randint(5, 10)
}

# now create a searchCV object and fit it to the data
search = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=0),
                            n_iter=5,
                            param_distributions=param_distributions,
                            random_state=0)
search.fit(X_train, y_train)
print(search.best_params_)

# the search object now acts like a normal random forest estimator
# with max_depth=9 and n_estimators=4
print(search.score(X_test, y_test))
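Once the search above has been fit, every sampled candidate can be inspected via the estimator's `cv_results_` attribute; a small illustrative add-on, with the column selection chosen for readability rather than taken from the original script.

# Optional follow-up (illustrative): inspect every sampled candidate.
import pandas as pd

results = pd.DataFrame(search.cv_results_)
print(results[['param_n_estimators', 'param_max_depth',
               'mean_test_score', 'rank_test_score']])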
First, we'll load some packages.
'''

# -

import pandas as pd
import xgboost
import shap
from sklearn import datasets as ds

# + [markdown]
'''
Next, we'll load some data. Let's use the California housing data set.
'''

# +

calif_house_data = ds.fetch_california_housing()

print('\n')
print(calif_house_data['data'])
print('\n')
print(calif_house_data['target'])
print('\n')
print(calif_house_data['feature_names'])
print('\n')
print(calif_house_data['DESCR'])
print('\n')
print('Features - # rows, # columns:', calif_house_data['data'].shape)
print('\n')
print('Target variable - # rows:', calif_house_data['target'].shape)

# + [markdown]
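The notebook imports `xgboost` and `shap`, but the visible cells stop after loading the data. A minimal sketch of the kind of step those imports suggest; the model settings are arbitrary assumptions and this is not the original notebook's continuation.

# +
# Illustrative sketch (assumed continuation): fit a small XGBoost regressor
# and compute SHAP values for its predictions.
X = pd.DataFrame(calif_house_data['data'],
                 columns=calif_house_data['feature_names'])
y = calif_house_data['target']

model = xgboost.XGBRegressor(n_estimators=100, max_depth=4)
model.fit(X, y)

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)
print(shap_values.shape)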
from sklearn.datasets import fetch_california_housing

california = fetch_california_housing()  # Bunch object
'''
print(california.DESCR)
print(california.data.shape)
print(california.target.shape)
print(california.feature_names)
'''

import pandas as pd

pd.set_option("precision", 4)         # 4-digit precision for floats
pd.set_option("max_columns", 9)       # display up to 9 columns in DataFrame outputs
pd.set_option("display.width", None)  # auto-detect the display width for wrapping

# creates the initial DataFrame using the data in california.data and with the
# column names specified based on the features of the sample
california_df = pd.DataFrame(california.data, columns=california.feature_names)

# add a column to the DataFrame for the median house values stored in california.target
california_df["MedHouseValue"] = pd.Series(california.target)

print(california_df.head())

# using the describe method of dataframes we can get some statistical information
def load_dataset(name, num_features=5, random_state=42, flatten=False,
                 show_corr_matrix=True, show_subplots=True):
    '''
    Args:
        name (str): name of dataset ('mnist', 'fmnist', 'iris',
            'breast_cancer', 'diabetes', 'housing')
        num_features (int): number of features to view in subplots; if None
            then num_features includes all features
        random_state (int): specify random state
        flatten (bool): returns images with shape (-1, 784) if True, else
            returns images with shape (-1, 28, 28)
        show_corr_matrix (bool): whether to show correlation matrix
        show_subplots (bool): whether to show subplots comparing the
            num_features

    Returns:
        (X_train, y_train): training data (numpy arrays)
        (X_test, y_test): testing data (20% of total data)
        col_names (list): feature names
    '''
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    if name == 'mnist':
        (X_train, y_train), (X_test, y_test) = mnist.load_data()
        if flatten:
            X_train = X_train.reshape(-1, 784)
            X_test = X_test.reshape(-1, 784)
        X_train = X_train / 255.0
        X_test = X_test / 255.0
        return (X_train, y_train), (X_test, y_test)
    elif name == 'fashion_mnist' or name == 'fmnist':
        (X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()
        if flatten:
            X_train = X_train.reshape(-1, 784)
            X_test = X_test.reshape(-1, 784)
        return (X_train, y_train), (X_test, y_test)
    elif name == 'iris':
        data = datasets.load_iris()
        X = data.data
        y = data.target
        col_names = data.feature_names
        df = pd.DataFrame(X, columns=col_names)
        num = num_features
        if num_features is None:
            num = len(col_names)
        if show_subplots:
            plot_subplots(df, col_names, num)
        if show_corr_matrix:
            corr_img = correlations(df, col_names)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42)
    elif name == 'cancer' or name == 'breast_cancer':
        data = datasets.load_breast_cancer()
        X = data.data
        y = data.target
        col_names = data.feature_names
        df = pd.DataFrame(X, columns=col_names)
        num = num_features
        if num_features is None:
            num = len(col_names)
        if show_subplots:
            plot_subplots(df, col_names, num)
        if show_corr_matrix:
            corr_img = correlations(df, col_names)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42)
    elif name == 'diabetes':
        data = datasets.load_diabetes()
        X = data.data
        y = data.target
        col_names = data.feature_names
        df = pd.DataFrame(X, columns=col_names)
        num = num_features
        if num_features is None:
            num = len(col_names)
        if show_subplots:
            plot_subplots(df, col_names, num)
        if show_corr_matrix:
            corr_img = correlations(df, col_names)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42)
    elif name == 'house' or name == 'california_housing' or name == 'housing':
        data = datasets.fetch_california_housing()
        X = data.data
        y = data.target
        col_names = data.feature_names
        df = pd.DataFrame(X, columns=col_names)
        num = num_features
        if num_features is None:
            num = len(col_names)
        if show_subplots:
            plot_subplots(df, col_names, num)
        if show_corr_matrix:
            corr_img = correlations(df, col_names)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42)

    if show_corr_matrix:
        plt.title("Correlation Matrix")

    return (X_train, y_train), (X_test, y_test), col_names
# regression MLP model
import tensorflow as tf
from sklearn import model_selection, preprocessing
from sklearn import datasets

# Using the sklearn california housing data
housing = datasets.fetch_california_housing()
X_train_full, X_test, y_train_full, y_test = model_selection.train_test_split(
    housing.data, housing.target)
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
    X_train_full, y_train_full)

# Standard scaling the data
scaler = preprocessing.StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

# Creating a simple model
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(30, activation="selu",
                          input_shape=X_train.shape[1:]),
    tf.keras.layers.Dense(10, activation="selu"),
    tf.keras.layers.Dense(1)
])

# Compiling using huber loss
model.compile(loss=tf.keras.losses.Huber(), optimizer="adam")

# training
history = model.fit(X_train,
            metrics = metric(torch.clamp(y_hat, min=0),
                             torch.clamp(y[:, 1], min=0),
                             **metric_params)
        else:
            metrics = metric(y_hat, y[:, 1], **metric_params)
        self.log(
            f"{tag}_{metric_str}",
            metrics,
            on_epoch=True,
            on_step=False,
            logger=True,
            prog_bar=True,
        )
        return metrics


dataset = fetch_california_housing(data_home="data", as_frame=True)
dataset.frame["HouseAgeBin"] = pd.qcut(dataset.frame["HouseAge"], q=4)
dataset.frame.HouseAgeBin = "age_" + dataset.frame.HouseAgeBin.cat.codes.astype(str)

test_idx = dataset.frame.sample(int(0.2 * len(dataset.frame)),
                                random_state=42).index
test = dataset.frame[dataset.frame.index.isin(test_idx)]
train = dataset.frame[~dataset.frame.index.isin(test_idx)]

epochs = 15
batch_size = 128
steps_per_epoch = int((len(train) // batch_size) * 0.9)
data_config = DataConfig(
    target=["HouseAgeBin"] + dataset.target_names,
    continuous_cols=[
import pylab as pl
import numpy as np
from matplotlib import pyplot as plt
from sompy.sompy import SOMFactory
from sklearn.datasets import fetch_california_housing
from sompy.visualization.mapview import View2D
from sompy.visualization.bmuhits import BmuHitsView

data = fetch_california_housing()
descr = data.DESCR
names = fetch_california_housing().feature_names + ["HouseValue"]
data = np.column_stack([data.data, data.target])

print(descr)
print("FEATURES: ", ", ".join(names))

sm = SOMFactory().build(data, normalization='var', initialization='random',
                        component_names=names)
sm.train(n_job=1, verbose=False, train_rough_len=2, train_finetune_len=5)

topographic_error = sm.calculate_topographic_error()
quantization_error = np.mean(sm._bmu[1])
print("Topographic error = %s; Quantization error = %s" %
      (topographic_error, quantization_error))

view2D = View2D(10, 10, 'rand data', text_size=10)
view2D.show(sm, col_sz=4, which_dim='all', denormalize=True)

vhts = BmuHitsView(10, 10, 'Hits Map', text_size=7)
def test_run():
    X, y = fetch_california_housing(data_home=TEST_FOLDER, return_X_y=True)
    data = pd.DataFrame(X)
    data['target'] = y

    Pipeline = compose([('scaler', StandardScaler),
                        ('lin_reg', LinearRegression)])
    search_spaces = [
        SearchSpace(id='Linear Regression', model_class=LinearRegression),
        SearchSpace(id='Lasso', model_class=Lasso),
        SearchSpace(id='Pipeline', model_class=Pipeline,
                    parameters_values=dict(
                        scaler__with_mean=[True, False],
                        scaler__with_std=[True, False],
                        lin_reg__fit_intercept=[True, False],
                        lin_reg__normalize=[True, False]))
    ]
    config = Config(local_dir=TEST_FOLDER, problem_type='regression',
                    score_function=r2_score, search_spaces=search_spaces,
                    ensemble_id='Ensemble', stagnation=1)
    engine = Engine(config)

    train_data, test_data = train_test_split(data, test_size=0.2)
    train_data_original, test_data_original = train_data.copy(), test_data.copy()
    engine.load_train_data(train_data, 'target')
    engine.load_test_data(test_data)

    if engine.is_running():
        raise AssertionError()
    engine.restart()
    sleep(2)
    if not engine.is_running():
        raise AssertionError()
    sleep(5)

    status = engine.request_status()
    if len(status.scores) != len(search_spaces) + 1 or \
            len(status.ensemble_weights) != len(search_spaces):
        raise AssertionError()
    if status.train_predictions.shape[0] != train_data.shape[0]:
        raise AssertionError()
    if status.test_predictions.shape[0] != test_data.shape[0]:
        raise AssertionError()
    for base_model in status.base_models.values():
        for feature in base_model['features']:
            if feature not in test_data.columns or feature not in train_data.columns:
                raise AssertionError()

    engine.interrupt()
    if engine.is_running():
        raise AssertionError()

    engine.clean_test_data(restart=True)
    sleep(5)
    if not engine.is_running():
        raise AssertionError()

    engine.shuffle_train_data(restart=True)
    sleep(5)
    status = engine.request_status()
    if status.test_predictions is not None:
        raise AssertionError()

    engine.interrupt()
    status.build_report()
    status.build_report(include_features=True)

    pd.testing.assert_frame_equal(train_data, train_data_original)
    pd.testing.assert_frame_equal(test_data, test_data_original)
# models result in more powerful and robust models with less hassle.
#
# We will start by loading the california housing dataset. We recall that the
# goal in this dataset is to predict the median house value in some district
# in California based on demographic and geographic data.

# %% [markdown]
# ```{note}
# If you want a deeper overview regarding this dataset, you can refer to the
# Appendix - Datasets description section at the end of this MOOC.
# ```

# %%
from sklearn.datasets import fetch_california_housing

data, target = fetch_california_housing(as_frame=True, return_X_y=True)
target *= 100  # rescale the target in k$

# %% [markdown]
# ```{caution}
# Here and later, we use the name `data` and `target` to be explicit. In
# scikit-learn documentation, `data` is commonly named `X` and `target` is
# commonly called `y`.
# ```

# %% [markdown]
# We will check the statistical performance of a decision tree regressor with
# default parameters.

# %%
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeRegressor
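The cell above imports `cross_validate` and `DecisionTreeRegressor` but the visible snippet ends before using them. A minimal sketch of the evaluation the text announces; the number of folds and the scoring metric are assumptions, not necessarily the notebook's exact settings.

# %%
# Sketch of the announced evaluation (assumed continuation; cv=10 is arbitrary).
regressor = DecisionTreeRegressor()
cv_results = cross_validate(regressor, data, target, cv=10,
                            scoring="neg_mean_absolute_error")
errors = -cv_results["test_score"]
print(f"Mean absolute error: {errors.mean():.2f} +/- {errors.std():.2f} k$")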
import tensorflow as tf
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

n_epochs = 100000
learning_rate = 0.001

housing = fetch_california_housing(
    data_home="C:/Users/28542/scikit_learn_data", download_if_missing=True)
m, n = housing.data.shape
print(m, n)
housing_data_plus_bias = np.c_[np.ones((m, 1)), housing.data]

# Normalization can be done with TensorFlow, NumPy, or sklearn's StandardScaler.
# StandardScaler performs mean and variance normalization by default;
# both help gradient descent converge faster.
# The way you build your training set determines what the trained model can do!
scaler = StandardScaler().fit(housing_data_plus_bias)
scaled_housing_data_plus_bias = scaler.transform(housing_data_plus_bias)

X = tf.constant(scaled_housing_data_plus_bias, dtype=tf.float32, name='X')
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name='y')

# random_uniform creates a graph node holding random values of the given
# shape and range, much like NumPy's rand()
theta = tf.Variable(tf.random_uniform([n + 1, 1], -1.0, 1.0), name='theta')
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")

# Gradient formula: (y_pred - y) * xj
gradients = 2 / m * tf.matmul(tf.transpose(X), error)
# For batch gradient descent the update is theta_new = theta - learning_rate * gradients
training_op = tf.assign(theta, theta - learning_rate * gradients)
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import keras

housing = fetch_california_housing()
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data, housing.target, random_state=0)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

# basic version
# input = keras.layers.Input(shape=X_train.shape[1:])
# hidden1 = keras.layers.Dense(30, activation="relu")(input)
# hidden2 = keras.layers.Dense(30, activation="relu")(hidden1)
# concat = keras.layers.Concatenate()([input, hidden2])
# output = keras.layers.Dense(1)(concat)
# model = keras.models.Model(inputs=[input], outputs=[output])

# send a subset of the features through the wide path, and a different subset
# through the deep path
input_A = keras.layers.Input(shape=[5])
input_B = keras.layers.Input(shape=[6])
hidden1 = keras.layers.Dense(30, activation="relu")(input_B)
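The snippet stops after the first hidden layer of the deep path. A minimal sketch of how the wide and deep paths are typically joined and trained; the layer sizes, optimizer, and the 5/6-column feature split are assumptions that mirror the `Input` shapes above, not the original file's continuation.

# Sketch of the remaining wiring (assumed continuation, common wide & deep pattern).
hidden2 = keras.layers.Dense(30, activation="relu")(hidden1)
concat = keras.layers.concatenate([input_A, hidden2])
output = keras.layers.Dense(1)(concat)
model = keras.models.Model(inputs=[input_A, input_B], outputs=[output])
model.compile(loss="mse", optimizer="sgd")

# Wide path sees the first 5 features, deep path sees features 2 onward.
X_train_A, X_train_B = X_train[:, :5], X_train[:, 2:]
X_valid_A, X_valid_B = X_valid[:, :5], X_valid[:, 2:]
history = model.fit((X_train_A, X_train_B), y_train, epochs=20,
                    validation_data=((X_valid_A, X_valid_B), y_valid))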
def test_california_housing_oob():
    X, y = fetch_california_housing(return_X_y=True)
    run_regression_test(X, y, min_training_score=.79, grace=0.15, oob=True)
import tensorflow as tf
import numpy as np
from sklearn.datasets import fetch_california_housing

# Download the dataset right away (or load it from the local cache)
housing = fetch_california_housing(data_home="C:/Users/28542/scikit_learn_data",
                                   download_if_missing=False)
# Get the number of rows and columns of the data
m, n = housing.data.shape

# Add an extra bias input feature (x0 = 1) to all training samples;
# since this uses NumPy, it executes immediately
housing_data_plus_bias = np.c_[np.ones((m, 1)), housing.data]

# Create two TensorFlow constant nodes, X and y, to hold the data and labels
X = tf.constant(housing_data_plus_bias, dtype=tf.float32, name='X')
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name='y')

# Use TensorFlow's matrix operations to solve for theta
XT = tf.transpose(X)
# The normal equation computes the optimal theta in a single step
theta = tf.matmul(tf.matmul(tf.matrix_inverse(tf.matmul(XT, X)), XT), y)

with tf.Session() as sess:
    theta_value = theta.eval()  # sess.run(theta)
    print(theta_value)
def test_california_housing():
    X, y = fetch_california_housing(return_X_y=True)
    run_regression_test(X, y, ntrials=10, grace=0.19)
import tensorflow as tf
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

# TensorFlow can compute gradients for us, but it also offers more convenient
# ways to solve the problem: a set of optimizers, including a gradient descent
# optimizer. Replacing the corresponding lines of the previous code keeps
# everything working.

# Set the hyperparameters. A grid search simply enumerates combinations of
# hyperparameters to find the set that minimizes the loss function.
n_epochs = 1000
learning_rate = 0.01

# Load the data. All of it is handed to the X and y nodes at once, so the
# gradient descent below is batch gradient descent (BGD); for larger datasets
# we would prefer mini-batch GD.
housing = fetch_california_housing()
m, n = housing.data.shape
housing_data_plus_bias = np.c_[np.ones((m, 1)), housing.data]

# Normalization can be done with TensorFlow, NumPy, or sklearn's StandardScaler
scaler = StandardScaler().fit(housing_data_plus_bias)
scaled_housing_data_plus_bias = scaler.transform(housing_data_plus_bias)

# X and y below could be changed to placeholders to switch to mini-batch GD.
# Build the computation graph.
X = tf.constant(scaled_housing_data_plus_bias, dtype=tf.float32, name='X')
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name='y')

# random_uniform creates a graph node holding random values of the given
# shape and range, much like NumPy's rand()
theta = tf.Variable(tf.random_uniform([n + 1, 1], -1.0, 1.0), name='theta')
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
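The comments above say the manual gradient/assign lines can be replaced by one of TensorFlow's built-in optimizers; a minimal sketch of that replacement plus the training loop, assuming TensorFlow 1.x graph mode (the logging interval is an arbitrary choice).

# Sketch of the optimizer-based variant the comments describe
# (assumes TensorFlow 1.x graph mode).
mse = tf.reduce_mean(tf.square(error), name="mse")
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(mse)

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        if epoch % 100 == 0:
            print("Epoch", epoch, "MSE =", mse.eval())
        sess.run(training_op)
    best_theta = theta.eval()
print(best_theta)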
import numpy as np
import pandas as pd

from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.datasets import fetch_california_housing
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

N_SPLITS = 5

rng = np.random.RandomState(0)

X_full, y_full = fetch_california_housing(return_X_y=True)
# ~2k samples is enough for the purpose of the example.
# Remove the following two lines for a slower run with different error bars.
X_full = X_full[::10]
y_full = y_full[::10]
n_samples, n_features = X_full.shape

# Estimate the score on the entire dataset, with no missing values
br_estimator = BayesianRidge()
score_full_data = pd.DataFrame(
    cross_val_score(
        br_estimator, X_full, y_full, scoring="neg_mean_squared_error",
        cv=N_SPLITS
    ),
    columns=["Full Data"],
)
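The imports above bring in `SimpleImputer`, `IterativeImputer`, and `make_pipeline`, but the visible portion only scores the full data. A minimal sketch of how one imputation strategy could be scored the same way; the missing-value injection (one missing value per row) is a simplified assumption, not the example's exact setup.

# Sketch (assumed continuation, simplified): add one missing value per row,
# then score a mean-imputation pipeline like the full-data baseline above.
X_missing = X_full.copy()
missing_rows = np.arange(n_samples)
missing_cols = rng.choice(n_features, n_samples, replace=True)
X_missing[missing_rows, missing_cols] = np.nan

imputer_pipeline = make_pipeline(SimpleImputer(strategy="mean"), br_estimator)
score_mean_impute = cross_val_score(
    imputer_pipeline, X_missing, y_full,
    scoring="neg_mean_squared_error", cv=N_SPLITS
)
print(score_mean_impute.mean())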
parser.add_argument('--test_size', type=float, default=0.1, help="TestSize")
args = parser.parse_args()

print("\n" + "Arguments are: " + "\n")
print(args)

import time
from tqdm import tqdm

# Loading datasets
if args.ds == 'boston':
    X, y = load_boston()['data'], load_boston()['target']
elif args.ds == 'diabetes':
    X, y = load_diabetes()['data'], load_diabetes()['target']
elif args.ds == 'cali':
    X, y = fetch_california_housing()['data'], fetch_california_housing()['target']
else:
    X, y = make_regression(n_samples=args.n_samples, n_features=args.n_feats,
                           noise=args.noise, random_state=0)

start = time.time()
array_wob, array_wb = experiment_1(X=X, y=y, n_train_iter=args.n_train_iter,
                                   n_average=args.n_average, num_N=args.N,
                                   test_size=args.test_size)
end = time.time()
print(str(end - start) + ' seconds')
print('finish')

with open(f'results_mse_{args.ds}|{args.noise}|{args.n_average}|{args.n_feats}|{args.n_samples}_'
          f'mlp_test_size={args.test_size}.txt', 'w') as f:
    f.write(str(0) + '|' + str(np.mean(array_wob)) + '\n')
    for i, el in zip(range(1, len(array_wb)), array_wb):
        f.write(str(i) + '|' + str(np.mean(el)) + '\n')
    f.close()