def test_feature_importance_regression():
    """Test that Gini importance is calculated correctly.

    This test follows the example from [1]_ (pg. 373).

    .. [1] Friedman, J., Hastie, T., & Tibshirani, R. (2001). The elements
       of statistical learning. New York: Springer series in statistics.
    """
    california = fetch_california_housing()
    X, y = california.data, california.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    reg = GradientBoostingRegressor(loss='huber', learning_rate=0.1,
                                    max_leaf_nodes=6, n_estimators=100,
                                    random_state=0)
    reg.fit(X_train, y_train)
    sorted_idx = np.argsort(reg.feature_importances_)[::-1]
    sorted_features = [california.feature_names[s] for s in sorted_idx]

    # The most important feature is the median income by far.
    assert sorted_features[0] == 'MedInc'

    # The three subsequent features are the following. Their relative ordering
    # might change a bit depending on the randomness of the trees and the
    # train / test split.
    assert set(sorted_features[1:4]) == {'Longitude', 'AveOccup', 'Latitude'}
Example #2
def test_krige_housing():
    try:
        housing = fetch_california_housing()
    except PermissionError:
        # This can raise permission error on Appveyor
        pytest.skip('Failed to load california housing dataset')

    # take only first 1000
    p = housing['data'][:1000, :-2]
    x = housing['data'][:1000, -2:]
    target = housing['target'][:1000]

    p_train, p_test, y_train, y_test, x_train, x_test = \
        train_test_split(p, target, x, train_size=0.7,
                         random_state=10)

    for ml_model, krige_method in _methods():

        reg_kr_model = RegressionKriging(regression_model=ml_model,
                                         method=krige_method,
                                         n_closest_points=2)
        reg_kr_model.fit(p_train, x_train, y_train)
        if krige_method == 'ordinary':
            assert reg_kr_model.score(p_test, x_test, y_test) > 0.5
        else:
            assert reg_kr_model.score(p_test, x_test, y_test) > 0.0
Example #3
def load_data_target(name):
    """
    Loads data and target given the name of the dataset.
    """
    if name == "Boston":
        data = load_boston()
    elif name == "Housing":
        data = fetch_california_housing()
        dataset_size = 1000 # this is necessary so that SVR does not slow down too much
        data["data"] = data["data"][:dataset_size]
        data["target"] =data["target"][:dataset_size]
    elif name == "digits":
        data = load_digits()
    elif name == "Climate Model Crashes":
        try:
            data = fetch_mldata("climate-model-simulation-crashes")
        except HTTPError as e:
            url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00252/pop_failures.dat"
            data = urlopen(url).read().decode('utf-8').split('\n')[1:]
            data = [[float(v) for v in d.split()] for d in data if d.strip()]
            samples = np.array(data)
            data = dict()
            data["data"] = samples[:, :-1]
            data["target"] = np.array(samples[:, -1], dtype=np.int)
    else:
        raise ValueError("dataset not supported.")
    return data["data"], data["target"]
def getCaliforniaHousingData(housingFILE):

    print("\ngetCaliforniaHousingData():")

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    if not os.path.isfile(housingFILE):
        print("downloading California housing data set from Internet ...")
        from sklearn.datasets import fetch_california_housing
        with open(file = housingFILE, mode = 'wb') as myFile:
            pickle.dump(obj = fetch_california_housing(), file = myFile)
    else:
        print("loading California housing data set from drive ...")

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    with open(file = housingFILE, mode = 'rb') as myFile:
        housing = pickle.load(myFile)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    housingData   = housing.data
    housingTarget = housing.target

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return( housingData, housingTarget )
import numpy as np
import pandas as pd

from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.datasets import fetch_california_housing
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

N_SPLITS = 5

rng = np.random.RandomState(0)

X_full, y_full = fetch_california_housing(return_X_y=True)
# ~2k samples is enough for the purpose of the example.
# Remove the following two lines for a slower run with different error bars.
X_full = X_full[::10]
y_full = y_full[::10]
n_samples, n_features = X_full.shape

# Estimate the score on the entire dataset, with no missing values
br_estimator = BayesianRidge()
score_full_data = pd.DataFrame(
    cross_val_score(
        br_estimator, X_full, y_full, scoring='neg_mean_squared_error',
        cv=N_SPLITS
    ),
    columns=['Full Data']
)
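
# The original example continues by adding missing values and comparing imputers;
# a minimal sketch (not part of the snippet above) of that next step, assuming
# entries are dropped completely at random:
X_missing = X_full.copy()
missing_mask = rng.rand(n_samples, n_features) < 0.1  # drop ~10% of the entries
X_missing[missing_mask] = np.nan

score_iterative = cross_val_score(
    make_pipeline(IterativeImputer(random_state=0), br_estimator),
    X_missing, y_full, scoring='neg_mean_squared_error', cv=N_SPLITS
)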
#
# In the previous notebook, we presented the general cross-validation framework
# and how to assess if a predictive model is underfitting, overfitting, or
# generalizing. Besides these aspects, it is also important to understand how
# the different errors are influenced by the number of samples available.
#
# In this notebook, we will show this aspect by looking at the variability of
# the different errors.
#
# Let's first load the data and create the same model as in the previous
# notebook.

# %%
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing(as_frame=True)
data, target = housing.data, housing.target
target *= 100  # rescale the target in k$

# %% [markdown]
# ```{note}
# If you want a deeper overview regarding this dataset, you can refer to the
# Appendix - Datasets description section at the end of this MOOC.
# ```

# %%
from sklearn.tree import DecisionTreeRegressor

regressor = DecisionTreeRegressor()
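
# %% [markdown]
# The notebook is truncated here; as a minimal sketch (not part of the original
# material), the influence of the number of samples on the errors could be
# inspected with `learning_curve`, assuming a shuffled cross-validation strategy:

# %%
import numpy as np
from sklearn.model_selection import ShuffleSplit, learning_curve

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
train_sizes, train_scores, test_scores = learning_curve(
    regressor, data, target,
    train_sizes=np.linspace(0.1, 1.0, 5),
    cv=cv, scoring="neg_mean_absolute_error",
)
# Mean test error (in k$) for each training-set size
print(-test_scores.mean(axis=1))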

# %% [markdown]
You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

Copyright Brian Dolhansky 2014
[email protected]
"""

import numpy as np
from data_utils import split_train_test, RMSE
from linear_regression import LinearRegression
from sklearn import preprocessing
from sklearn.datasets import fetch_california_housing


print "Loading data..."
housing = fetch_california_housing(data_home='/home/bdol/data')
train_data, test_data, train_target, test_target = split_train_test(
    housing.data, housing.target
)

# Normalize the data
train_data = preprocessing.scale(train_data)
test_data = preprocessing.scale(test_data)

# Append bias feature
train_data = np.hstack((train_data, np.ones((train_data.shape[0], 1),
                                            dtype=train_data.dtype)))
test_data = np.hstack((test_data, np.ones((test_data.shape[0], 1),
                                          dtype=test_data.dtype)))

train_target = train_target[:, None]
You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

Copyright Brian Dolhansky 2014
[email protected]
"""

import numpy as np
from data_utils import split_train_test, RMSE
from linear_regression import LinearRegression
from sklearn import preprocessing
from sklearn.datasets import fetch_california_housing

print "Loading data..."
housing = fetch_california_housing(data_home='/home/bdol/data')
train_data, test_data, train_target, test_target = split_train_test(
    housing.data, housing.target)

# Normalize the data
train_data = preprocessing.scale(train_data)
test_data = preprocessing.scale(test_data)

# Append bias feature
train_data = np.hstack(
    (train_data, np.ones((train_data.shape[0], 1), dtype=train_data.dtype)))
test_data = np.hstack(
    (test_data, np.ones((test_data.shape[0], 1), dtype=test_data.dtype)))

train_target = train_target[:, None]
test_target = test_target[:, None]
Example #9
# cross-validation could be used.
#
# This exercise will make you implement the same search but using the class
# `GridSearchCV`.
#
# First, we will:
#
# * load the california housing dataset;
# * split the data into a training and testing set;
# * create a machine learning pipeline composed of a standard scaler to
#   normalize the data, and a ridge regression as a linear model.

# %%
from sklearn.datasets import fetch_california_housing

X, y = fetch_california_housing(as_frame=True, return_X_y=True)
X.head()

# %%
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# %%
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Ridge

ridge = make_pipeline(StandardScaler(), Ridge())
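
# %% [markdown]
# The exercise solution is not shown in this snippet; a minimal sketch (not the
# reference solution) of how `GridSearchCV` could be applied to this pipeline,
# assuming only the regularization strength `alpha` is tuned:

# %%
import numpy as np
from sklearn.model_selection import GridSearchCV

param_grid = {"ridge__alpha": np.logspace(-3, 3, num=7)}
search = GridSearchCV(ridge, param_grid=param_grid, cv=5)
search.fit(X_train, y_train)
print(search.best_params_, search.score(X_test, y_test))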

# %% [markdown]
Example #10
def test_rf_regression(datatype, split_algo, mode, column_info, max_features,
                       rows_sample):

    ncols, n_info = column_info
    use_handle = True

    if mode == 'unit':
        X, y = make_regression(n_samples=500,
                               n_features=ncols,
                               n_informative=n_info,
                               random_state=123)

    elif mode == 'quality':
        X, y = fetch_california_housing(return_X_y=True)

    else:
        X, y = make_regression(n_samples=100000,
                               n_features=ncols,
                               n_informative=n_info,
                               random_state=123)
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.8,
                                                        random_state=0)

    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle, n_streams=1)

    # Initialize and fit using cuML's random forest regression model
    cuml_model = curfr(max_features=max_features,
                       rows_sample=rows_sample,
                       n_bins=16,
                       split_algo=split_algo,
                       split_criterion=2,
                       min_rows_per_node=2,
                       seed=123,
                       n_streams=1,
                       n_estimators=50,
                       handle=handle,
                       max_leaves=-1,
                       max_depth=16,
                       accuracy_metric='mse')
    cuml_model.fit(X_train, y_train)
    # predict using FIL
    fil_preds = cuml_model.predict(X_test, predict_model="GPU")
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype)
    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)
    # Initialize, fit and predict using
    # sklearn's random forest regression model
    if mode != "stress":
        sk_model = skrfr(n_estimators=50,
                         max_depth=16,
                         min_samples_split=2,
                         max_features=max_features,
                         random_state=10)
        sk_model.fit(X_train, y_train)
        sk_predict = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_predict, convert_dtype=datatype)
        assert fil_r2 >= (sk_r2 - 0.07)
    assert fil_r2 >= (cu_r2 - 0.02)
Example #11
import tensorflow as tf
import numpy as np
from sklearn.datasets import fetch_california_housing

# Fetch the California housing dataset
housing = fetch_california_housing(data_home='./scikit_learn_data',
                                   download_if_missing=True)
# m rows, n columns
m, n = housing.data.shape
feature_names = housing.feature_names
# 20640 8
# ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
print(m, n)
print(feature_names)
print(housing.data[0:3], housing.target[0:3], type(housing.target[0:3]))

# np.c_ concatenates the two matrices column-wise
housing_data_plus_bias = np.c_[np.ones((m, 1)), housing.data]
print(housing_data_plus_bias.shape)
# Create two TensorFlow constant nodes, X and y, to hold the data and labels
X = tf.constant(housing_data_plus_bias, dtype=tf.float32, name='X')
# X = tf.constant(housing.data, dtype=tf.float32, name='X')
# Reshape the row vector into a column vector
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name='y')
# Use TensorFlow's matrix operations to solve for theta
XT = tf.transpose(X)
# The closed-form (normal equation) solution yields the optimum in one step
theta = tf.matmul(tf.matmul(tf.matrix_inverse(tf.matmul(XT, X)), XT), y)
# Tensor("MatMul_2:0", shape=(9, 1), dtype=float32)
print(theta)
Example #12
import time
if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument("--tabularpath", type=str, default='',
                        nargs="?", help="Path of Tabular Data, i.e./.../data.csv")
    # Hyper parameters for conversion
    parser.add_argument("--num_quantile", type=int, default=2, nargs="?",
                        help="q param in https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.qcut.html")
    parser.add_argument("--min_unique_val_per_column", type=int, default=2, nargs="?",
                        help="Apply Quantile-based discretization function on those columns having at least such "
                             "unique values.")
    args = parser.parse_args()
    # DASK can be applied.
    print('Tabular data is being read')
    try:
        df = pd.read_csv(args.tabularpath)
    except FileNotFoundError:
        print('File not found, we will use the California housing dataset from sklearn')
        from sklearn import datasets

        X, y = datasets.fetch_california_housing(return_X_y=True)
        df = pd.DataFrame(X)

    print('Original Tabular data: {0} by {1}'.format(*df.shape))
    print('Quantisation starts')
    X_transformed = QCUT(min_unique_val_per_column=args.min_unique_val_per_column,
                         num_quantile=args.num_quantile).transform(df)
    X_transformed.index = 'Event_' + X_transformed.index.astype(str)
    print('Graph data being generated')
    kg = GraphGenerator().transform(X_transformed)
Example #13
    def california_housing_dataset(batch_size, device):
        """
        This named constructor builds a DataSet from the California Housing dataset.
        """
        from sklearn.datasets import fetch_california_housing

        cal_housing = fetch_california_housing()

        non_outliers = np.ones(cal_housing.data.shape[0], dtype=bool)
        for idx in range(cal_housing.data.shape[1]):
            column = cal_housing.data[:, idx]
            cutoffs = np.percentile(column, (1.0, 99.0))
            non_outliers = np.logical_and(
                non_outliers,
                np.logical_and(column > cutoffs[0], column < cutoffs[1]))
        # cutoffs = np.percentile(cal_housing.target, (1.0, 99.0))
        # non_outliers = np.logical_and(
        #    non_outliers, np.logical_and(cal_housing.target > cutoffs[0],
        #    cal_housing.target < cutoffs[1])
        # )

        cal_housing.data = cal_housing.data[non_outliers]
        cal_housing.target = cal_housing.target[non_outliers]

        x_train, x_test, y_train, y_test = train_test_split(cal_housing.data,
                                                            cal_housing.target,
                                                            test_size=0.2,
                                                            random_state=0)

        non_outliers = np.ones(x_test.shape[0], dtype=bool)
        for idx in range(x_test.shape[1]):
            column = x_test[:, idx]
            cutoffs = np.percentile(column, (5.0, 95.0))
            non_outliers = np.logical_and(
                non_outliers,
                np.logical_and(column > cutoffs[0], column < cutoffs[1]))
        x_test = x_test[non_outliers]
        y_test = y_test[non_outliers]

        x_dimensions = cal_housing.feature_names

        train_size = x_train.shape[0]
        test_size = x_test.shape[0]

        y_train = y_train[..., np.newaxis]
        y_test = y_test[..., np.newaxis]

        params_desc = "train size: {}/test size: {}".format(
            train_size, test_size)

        return DataSets(
            x_train,
            y_train,
            x_test,
            y_test,
            x_dimensions,
            "Price",
            batch_size,
            "california housing dataset",
            params_desc,
            device,
        )
Example #14
import tensorflow as tf
import numpy as np
from sklearn.datasets import fetch_california_housing

# Download the dataset right away
housing = fetch_california_housing(download_if_missing=True)
# Get the number of rows and columns of X
m, n = housing.data.shape
print(m, n)
print(housing.data, housing.target)
print(housing.feature_names)

# Add an extra bias input feature (x0=1) to all training samples; since NumPy is used, this runs immediately
housing_data_plus_bias = np.c_[np.ones((m, 1)), housing.data]
# Create two TensorFlow constant nodes, X and y, to hold the data and labels
X = tf.constant(housing_data_plus_bias, dtype=tf.float32, name='X')
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name='y')
# Use TensorFlow's matrix operations to solve for theta
XT = tf.transpose(X)
# The closed-form (normal equation) solution yields the optimum in one step
theta = tf.matmul(tf.matmul(tf.matrix_inverse(tf.matmul(XT, X)), XT), y)
print(theta)

# with tf.Session() as sess:
#     theta_value = theta.eval()  # sess.run(theta)
#
# print(theta_value)
Example #15
import tensorflow as tf
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

# Multivariate linear regression in TensorFlow, solved with gradient descent
n_epochs = 10000
learning_rate = 0.01

housing = fetch_california_housing(data_home="D:/sklearn_data",
                                   download_if_missing=True)
m, n = housing.data.shape
housing_data_plus_bias = np.c_[np.ones((m, 1)), housing.data]
# Normalization can be done with TensorFlow, NumPy, or sklearn's StandardScaler
# StandardScaler performs variance and mean normalization by default; both speed up gradient descent
# The model you train is only as capable as the training set you build!
scaler = StandardScaler().fit(housing_data_plus_bias)
scaled_housing_data_plus_bias = scaler.transform(housing_data_plus_bias)

X = tf.constant(scaled_housing_data_plus_bias, dtype=tf.float32, name='X')
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name='y')

# random_uniform creates a graph node holding random values with the given shape and range, much like NumPy's rand()
theta = tf.Variable(tf.random_uniform([n + 1, 1], -1.0, 1.0), name='theta')
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")
# Gradient formula: (y_pred - y) * xj
gradients = 2 / m * tf.matmul(tf.transpose(X), error)
# For batch gradient descent the update is theta_new = theta - (learning_rate * gradients)
training_op = tf.assign(theta, theta - learning_rate * gradients)
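
# The snippet is cut off before the training loop; a minimal sketch (not part of
# the original) of how the graph above could be executed, assuming TF 1.x:
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        if epoch % 1000 == 0:
            print("Epoch", epoch, "MSE =", mse.eval())
        sess.run(training_op)
    best_theta = theta.eval()
    print(best_theta)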
Example #16
import tensorflow as tf
import numpy as np
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing(data_home="G:\ML\dataset\scikit_learn_data",
                                   download_if_missing=True)
m, n = housing.data.shape
print(m, n)
print(housing.target.shape)
print(housing.target)
#print(housing.data, housing.target)
print(housing.feature_names)
housing_data_plus_bias = np.c_[np.ones((m, 1)), housing.data]
y_pre = housing.target.reshape(-1, 1)
print(y_pre.shape)

X = tf.constant(housing_data_plus_bias, dtype=tf.float32, name='X')
Y = tf.constant(y_pre, dtype=tf.float32, name='y')
XT = tf.transpose(X)

theta = tf.matmul(tf.matmul(tf.matrix_inverse(tf.matmul(XT, X)), XT), Y)
with tf.Session() as sess:
    theta_value = theta.eval()
    print(theta_value)
Example #17
from sklearn.datasets import fetch_california_housing

cali = fetch_california_housing()  #bunch object
# print(cali.DESCR)

print(cali.data.shape)
print(cali.target.shape)
print(cali.feature_names)

import pandas as pd

pd.set_option("precision", 4)
pd.set_option("max_columns", 9)  #display up to 9 columns in DataFrame
pd.set_option("display.width", None)  # auto-detect the display width

cali_df = pd.DataFrame(cali.data, columns=cali.feature_names)
cali_df["MedHouseValue"] = pd.Series(cali.target)
print(cali_df.head())

sample_df = cali_df.sample(frac=0.1, random_state=17)

import matplotlib.pyplot as plt
import seaborn as sns

sns.set(font_scale=2)
sns.set_style("whitegrid")

for feature in cali.feature_names:
    plt.figure(figsize=(8, 4.5))  # 8"-by-4.5" figure
    sns.scatterplot(data=sample_df, x=feature, y="MedHouseValue",
                    hue="MedHouseValue", legend=False)
def main():

    assert not tf.executing_eagerly()

    now = datetime.utcnow().strftime('%Y%m%d%H%M%S')
    root_logdir = 'tf_logs'
    logdir = '{}/run-{}/'.format(root_logdir, now)

    housing = fetch_california_housing()
    m, n = housing.data.shape
    housing_data_plus_bias = np.c_[np.ones((m, 1)), housing.data]
    scaler = MinMaxScaler()
    housing_data_plus_bias_scaled = scaler.fit_transform(
        housing_data_plus_bias)
    X = housing_data_plus_bias_scaled
    print(X.dtype)
    y = housing.target.reshape(-1, 1)

    n_epochs = 100
    learning_rate = 0.01
    batch_size = 100
    n_batches = int(np.ceil(m / batch_size))
    grad = 'optimizer'

    X_batch = tf.compat.v1.placeholder(tf.float32,
                                       shape=(None, n + 1),
                                       name='X_batch')
    y_batch = tf.compat.v1.placeholder(tf.float32,
                                       shape=(None, 1),
                                       name='y_batch')

    theta = tf.Variable(tf.compat.v1.random_uniform([n + 1, 1], -1.0, 1.0),
                        name="theta")
    y_pred = tf.matmul(X_batch, theta, name="predictions")
    error = y_pred - y_batch
    mse = tf.reduce_mean(tf.square(error), name="mse")

    if grad == 'autodiff':
        gradients = tf.gradients(mse, [theta])[0]
        training_op = tf.compat.v1.assign(theta,
                                          theta - learning_rate * gradients)
    elif grad == 'optimizer':
        optimizer = tf.compat.v1.train.GradientDescentOptimizer(
            learning_rate=learning_rate)
        training_op = optimizer.minimize(mse)
    else:
        gradients = 2 / m * tf.matmul(tf.transpose(X_batch), error)
        training_op = tf.compat.v1.assign(theta,
                                          theta - learning_rate * gradients)

    init = tf.compat.v1.global_variables_initializer()

    mse_summary = tf.compat.v1.summary.scalar('MSE', mse)
    file_writer = tf.compat.v1.summary.FileWriter(
        logdir, tf.compat.v1.get_default_graph())

    # for saving the model
    #saver = tf.compat.v1.train.Saver()

    with tf.compat.v1.Session() as sess:
        sess.run(init)
        for epoch in range(n_epochs):
            #if epoch % 100 == 0:
            #print("Epoch: ", epoch, "MSE: ", mse.eval())
            #save_path = saver.save(sess, 'model.ckpt')
            for batch_index in range(n_batches):
                X_batch_fetched, y_batch_fetched = fetch_batch(
                    X, y, batch_index, batch_size)
                print('Epoch: {}, Batch: {}, MSE: {}'.format(
                    epoch, batch_index,
                    mse.eval(feed_dict={
                        X_batch: X_batch_fetched,
                        y_batch: y_batch_fetched
                    })))

                if batch_index % 10 == 0:
                    summary_str = mse_summary.eval(feed_dict={
                        X_batch: X_batch_fetched,
                        y_batch: y_batch_fetched
                    })
                    step = epoch * n_batches + batch_index
                    file_writer.add_summary(summary_str, step)

                sess.run(training_op,
                         feed_dict={
                             X_batch: X_batch_fetched,
                             y_batch: y_batch_fetched
                         })

        best_theta = theta.eval()
from matplotlib import pyplot as plt
from matplotlib import cm

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer

from sklearn.datasets import fetch_california_housing

print(__doc__)

dataset = fetch_california_housing()
X_full, y_full = dataset.data, dataset.target

# Take only 2 features to make visualization easier
# Feature 0 has a long tail distribution.
# Feature 5 has a few but very large outliers.

X = X_full[:, [0, 5]]

distributions = [
    ('Unscaled data', X),
    ('Data after standard scaling',
        StandardScaler().fit_transform(X)),
    ('Data after min-max scaling',
        MinMaxScaler().fit_transform(X)),
    ('Data after max-abs scaling',
Example #20
def fetch(*args, **kwargs):
    return fetch_california_housing(*args, download_if_missing=False, **kwargs)
Example #21
def get_dataset(dataset_name='breast_cancer'):
    """Retrieve one of the standard datasets in sklearn.

    :param dataset_name: the dataset name to use from sklearn. Valid values are 
        `breast_cancer`, `digits`, `iris`, `wine` for classification and 
        `boston`, `diabetes` and `california` for regression. Default is `breast_cancer`.
    :type dataset_name: str
    
    :return: Five variables are returned. First is the dataset itself without
        the target values; second includes the target values; third has
        all the categorical columns; fourth has all the integer columns and
        the last indicates whether it is a classification problem (True) or a
        regression problem (False).
    :rtype: np.array, np.array, np.array, np.array, Boolean
    """

    if dataset_name == 'breast_cancer':
        # loading the dataset
        X, y = load_breast_cancer(return_X_y=True)

        # informing categorical columns and their available values
        categorical_columns = {}
        integer_columns = []
        is_classification = True
    elif dataset_name == 'digits':
        # loading the dataset
        X, y = load_digits(return_X_y=True)

        # informing categorical columns and their available values
        categorical_columns = {}
        integer_columns = list(range(64))
        is_classification = True
    elif dataset_name == 'iris':
        # loading the dataset
        X, y = load_iris(return_X_y=True)

        # informing categorical columns and their available values
        categorical_columns = {}
        integer_columns = []
        is_classification = True
    elif dataset_name == 'wine':
        # loading the dataset
        X, y = load_wine(return_X_y=True)

        # informing categorical columns and their available values
        categorical_columns = {}
        integer_columns = [4, 12]
        is_classification = True
    elif dataset_name == 'boston':
        X, y = load_boston(return_X_y=True)

        # informing categorical columns and their available values
        categorical_columns = {3: [0, 1]}
        integer_columns = [8, 9]
        is_classification = False
    elif dataset_name == 'diabetes':
        # loading the dataset
        X, y = load_diabetes(return_X_y=True)

        # informing categorical columns and their available values
        categorical_columns = {1: [0.05068012, -0.04464164]}
        integer_columns = []
        is_classification = False
    elif dataset_name == 'california':
        # loading the dataset
        X, y = fetch_california_housing(return_X_y=True)

        # informing categorical columns and their available values
        categorical_columns = {}
        integer_columns = [1, 4]
        is_classification = False
    else:
        raise ValueError("dataset not supported.")

    return X, y, categorical_columns, integer_columns, is_classification
Example #22
from matplotlib import pyplot as plt
from matplotlib import cm

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer

from sklearn.datasets import fetch_california_housing

print(__doc__)

dataset = fetch_california_housing()
X_full, y_full = dataset.data, dataset.target

# Take only 2 features to make visualization easier
# Feature 0 has a long tail distribution.
# Feature 5 has a few but very large outliers.

X = X_full[:, [0, 5]]

distributions = [
    ('Unscaled data', X),
    ('Data after standard scaling',
        StandardScaler().fit_transform(X)),
    ('Data after min-max scaling',
        MinMaxScaler().fit_transform(X)),
    ('Data after max-abs scaling',
Example #23
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from scipy.stats import randint

X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# define the parameter space that will be searched over
param_distributions = {
    'n_estimators': randint(1, 5),
    'max_depth': randint(5, 10)
}

# now create a searchCV object and fit it to the data
search = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=0),
                            n_iter=5,
                            param_distributions=param_distributions,
                            random_state=0)
search.fit(X_train, y_train)

print(search.best_params_)

# the search object now acts like a normal random forest estimator
# with max_depth=9 and n_estimators=4
print(search.score(X_test, y_test))
Example #24
First, we'll load some packages.
'''
# -

import pandas as pd
import xgboost
import shap
from sklearn import datasets as ds

# + [markdown]
'''
Next, we'll load some data.  Let's use the California housing data set.
'''

# +
calif_house_data = ds.fetch_california_housing()

print('\n')
print(calif_house_data['data'])
print('\n')
print(calif_house_data['target'])
print('\n')
print(calif_house_data['feature_names'])
print('\n')
print(calif_house_data['DESCR'])
print('\n')
print('Features - # rows, # columns:', calif_house_data['data'].shape)
print('\n')
print('Target variable - # rows:', calif_house_data['target'].shape)

# + [markdown]
Example #25
from sklearn.datasets import fetch_california_housing


california = fetch_california_housing() # Bunch object
'''
print(california.DESCR)

print(california.data.shape)

print(california.target.shape)

print(california.feature_names)
'''
import pandas as pd

pd.set_option("precision", 4) # 4 digitit precision for floats.

pd.set_option("max_columns", 9) # display up to 9 columns in DataFrame outputs

pd.set_option("display.width", None) #auto-detect the display width for wrapping

#creates the initial DataFrame using the data in california.data and with the
# column names specified based on the features of the sample
california_df = pd.DataFrame(california.data, columns=california.feature_names)

# add a column to the dataframe for the median house values stored in california.target
california_df["MedHouseValue"] = pd.Series(california.target)

print(california_df.head())

# using the describe method of dataframes we can get some statistical information
Example #26
def load_dataset(name,
                 num_features=5,
                 random_state=42,
                 flatten=False,
                 show_corr_matrix=True,
                 show_subplots=True):
    '''
  Args:
    name (str): name of dataset ('mnist', 'fmnist', 'iris', 'breast_cancer', 'diabetes', 'housing')
    num_features (int): number of features to view in subplots, if None then num_features includes all features
    random_state (int): specify random state
    flatten (bool): returns images with shape (-1, 784) if True, else keeps the original shape (-1, 28, 28)
    show_corr_matrix (bool): whether to show correlation matrix
    show_subplots (bool): whether to show subplots comparing the num_features

  Returns:
    (X_train, y_train): training data (numpy arrays)
    (X_test, y_test): testing data (20% of total data)
    col_names (list): feature names
  '''
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    if name == 'mnist':
        (X_train, y_train), (X_test, y_test) = mnist.load_data()
        if flatten:
            X_train = X_train.reshape(-1, 784)
            X_test = X_test.reshape(-1, 784)
            X_train = X_train / 255.0
            X_test = X_test / 255.0
        return (X_train, y_train), (X_test, y_test)

    elif name == 'fashion_mnist' or name == 'fmnist':
        (X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()

        if flatten:
            X_train = X_train.reshape(-1, 784)
            X_test = X_test.reshape(-1, 784)
        return (X_train, y_train), (X_test, y_test)

    elif name == 'iris':
        data = datasets.load_iris()
        X = data.data
        y = data.target
        col_names = data.feature_names
        df = pd.DataFrame(X, columns=col_names)
        num = num_features
        if num_features == None:
            num = len(col_names)

        if show_subplots:
            plot_subplots(df, col_names, num)
        if show_corr_matrix:
            corr_img = correlations(df, col_names)

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=42)

    elif name == 'cancer' or name == 'breast_cancer':
        data = datasets.load_breast_cancer()
        X = data.data
        y = data.target
        col_names = data.feature_names
        df = pd.DataFrame(X, columns=col_names)
        num = num_features
        if num_features == None:
            num = len(col_names)
        if show_subplots:
            plot_subplots(df, col_names, num)
        if show_corr_matrix:
            corr_img = correlations(df, col_names)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=42)

    elif name == 'diabetes':
        data = datasets.load_diabetes()
        X = data.data
        y = data.target
        col_names = data.feature_names
        df = pd.DataFrame(X, columns=col_names)
        num = num_features
        if num_features == None:
            num = len(col_names)

        if show_subplots:
            plot_subplots(df, col_names, num)
        if show_corr_matrix:
            corr_img = correlations(df, col_names)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=42)

    elif name == 'house' or name == 'california_housing' or name == 'housing':
        data = datasets.fetch_california_housing()
        X = data.data
        y = data.target
        col_names = data.feature_names
        df = pd.DataFrame(X, columns=col_names)
        num = num_features
        if num_features == None:
            num = len(col_names)
        if show_subplots:
            plot_subplots(df, col_names, num)
        if show_corr_matrix:
            corr_img = correlations(df, col_names)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=42)
        if show_corr_matrix:
            plt.title("Correlation Matrix")
    return (X_train, y_train), (X_test, y_test), col_names
# regression MLP model
import tensorflow as tf
from sklearn import model_selection, preprocessing
from sklearn import datasets

# Using the sklearn california housing data
housing = datasets.fetch_california_housing()
X_train_full, X_test, y_train_full, y_test = model_selection.train_test_split(
    housing.data, housing.target)
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
    X_train_full, y_train_full)

# Standard scaling the data
scaler = preprocessing.StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

# Creating a simple model
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(30, activation="selu",
                          input_shape=X_train.shape[1:]),
    tf.keras.layers.Dense(10, activation="selu"),
    tf.keras.layers.Dense(1)
])

# Compiling using huber loss
model.compile(loss=tf.keras.losses.Huber(), optimizer="adam")

# training
history = model.fit(X_train_scaled, y_train, epochs=20,
                    validation_data=(X_valid_scaled, y_valid))
Example #28
                metrics = metric(torch.clamp(y_hat, min=0),
                                 torch.clamp(y[:, 1], min=0), **metric_params)
            else:
                metrics = metric(y_hat, y[:, 1], **metric_params)
            self.log(
                f"{tag}_{metric_str}",
                metrics,
                on_epoch=True,
                on_step=False,
                logger=True,
                prog_bar=True,
            )
        return metrics


dataset = fetch_california_housing(data_home="data", as_frame=True)
dataset.frame["HouseAgeBin"] = pd.qcut(dataset.frame["HouseAge"], q=4)
dataset.frame.HouseAgeBin = "age_" + dataset.frame.HouseAgeBin.cat.codes.astype(
    str)

test_idx = dataset.frame.sample(int(0.2 * len(dataset.frame)),
                                random_state=42).index
test = dataset.frame[dataset.frame.index.isin(test_idx)]
train = dataset.frame[~dataset.frame.index.isin(test_idx)]

epochs = 15
batch_size = 128
steps_per_epoch = int((len(train) // batch_size) * 0.9)
data_config = DataConfig(
    target=["HouseAgeBin"] + dataset.target_names,
    continuous_cols=[
Example #29
import pylab                                            as pl
import numpy                                            as np

from   matplotlib                  import pyplot        as plt
from   sompy.sompy                 import SOMFactory
from   sklearn.datasets            import fetch_california_housing
from   sompy.visualization.mapview import View2D
from   sompy.visualization.bmuhits import BmuHitsView


data  = fetch_california_housing()
descr = data.DESCR
names = data.feature_names + ["HouseValue"]

data  = np.column_stack([data.data, data.target])

print(descr)
print( "FEATURES: ", ", ".join(names))

sm    = SOMFactory().build(data, normalization = 'var', initialization='random', component_names=names)
sm.train(n_job=1, verbose=False, train_rough_len=2, train_finetune_len=5)

topographic_error  = sm.calculate_topographic_error()
quantization_error = np.mean(sm._bmu[1])

print ("Topographic error = %s; Quantization error = %s" % (topographic_error, quantization_error))

view2D  = View2D(10, 10, 'rand data', text_size=10)
view2D.show(sm, col_sz=4, which_dim='all', denormalize=True)

vhts    = BmuHitsView(10, 10, 'Hits Map', text_size=7)
Example #30
def test_run():
    X, y = fetch_california_housing(data_home=TEST_FOLDER, return_X_y=True)
    data = pd.DataFrame(X)
    data['target'] = y

    Pipeline = compose([('scaler', StandardScaler),
                        ('lin_reg', LinearRegression)])

    search_spaces = [
        SearchSpace(id='Linear Regression', model_class=LinearRegression),
        SearchSpace(id='Lasso', model_class=Lasso),
        SearchSpace(id='Pipeline',
                    model_class=Pipeline,
                    parameters_values=dict(
                        scaler__with_mean=[True, False],
                        scaler__with_std=[True, False],
                        lin_reg__fit_intercept=[True, False],
                        lin_reg__normalize=[True, False]))
    ]

    config = Config(local_dir=TEST_FOLDER,
                    problem_type='regression',
                    score_function=r2_score,
                    search_spaces=search_spaces,
                    ensemble_id='Ensemble',
                    stagnation=1)

    engine = Engine(config)

    train_data, test_data = train_test_split(data, test_size=0.2)

    train_data_original, test_data_original = train_data.copy(
    ), test_data.copy()

    engine.load_train_data(train_data, 'target')
    engine.load_test_data(test_data)

    if engine.is_running():
        raise AssertionError()

    engine.restart()

    sleep(2)

    if not engine.is_running():
        raise AssertionError()

    sleep(5)

    status = engine.request_status()

    if len(status.scores) != len(search_spaces) + 1 or \
            len(status.ensemble_weights) != len(search_spaces):
        raise AssertionError()

    if status.train_predictions.shape[0] != train_data.shape[0]:
        raise AssertionError()

    if status.test_predictions.shape[0] != test_data.shape[0]:
        raise AssertionError()

    for base_model in status.base_models.values():
        for feature in base_model['features']:
            if feature not in test_data.columns or feature not in train_data.columns:
                raise AssertionError()

    engine.interrupt()

    if engine.is_running():
        raise AssertionError()

    engine.clean_test_data(restart=True)

    sleep(5)

    if not engine.is_running():
        raise AssertionError()

    engine.shuffle_train_data(restart=True)

    sleep(5)

    status = engine.request_status()

    if status.test_predictions is not None:
        raise AssertionError()

    engine.interrupt()

    status.build_report()
    status.build_report(include_features=True)

    pd.testing.assert_frame_equal(train_data, train_data_original)
    pd.testing.assert_frame_equal(test_data, test_data_original)
Example #31
# models result in more powerful and robust models with less hassle.
#
# We will start by loading the california housing dataset. We recall that the
# goal in this dataset is to predict the median house value in some district
# in California based on demographic and geographic data.

# %% [markdown]
# ```{note}
# If you want a deeper overview regarding this dataset, you can refer to the
# Appendix - Datasets description section at the end of this MOOC.
# ```

# %%
from sklearn.datasets import fetch_california_housing

data, target = fetch_california_housing(as_frame=True, return_X_y=True)
target *= 100  # rescale the target in k$

# %% [markdown]
# ```{caution}
# Here and later, we use the name `data` and `target` to be explicit. In
# scikit-learn documentation, `data` is commonly named `X` and `target` is
# commonly called `y`.
# ```

# %% [markdown]
# We will check the statistical performance of a decision tree regressor with
# default parameters.

# %%
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeRegressor
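
# %%
# A minimal sketch (not the original notebook cell) of the cross-validation that
# the two imports above set up, using the default 5-fold strategy:
tree = DecisionTreeRegressor(random_state=0)
cv_results = cross_validate(tree, data, target,
                            scoring="neg_mean_absolute_error")
error = -cv_results["test_score"]
print(f"Mean absolute error: {error.mean():.2f} +/- {error.std():.2f} k$")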
Example #32
import tensorflow as tf
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

n_epochs = 100000
learning_rate = 0.001

housing = fetch_california_housing(
    data_home="C:/Users/28542/scikit_learn_data", download_if_missing=True)
m, n = housing.data.shape
print(m, n)
housing_data_plus_bias = np.c_[np.ones((m, 1)), housing.data]
# Normalization can be done with TensorFlow, NumPy, or sklearn's StandardScaler
# StandardScaler performs variance and mean normalization by default; both speed up gradient descent
# The model you train is only as capable as the training set you build!
scaler = StandardScaler().fit(housing_data_plus_bias)
scaled_housing_data_plus_bias = scaler.transform(housing_data_plus_bias)

X = tf.constant(scaled_housing_data_plus_bias, dtype=tf.float32, name='X')
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name='y')

# random_uniform creates a graph node holding random values with the given shape and range, much like NumPy's rand()
theta = tf.Variable(tf.random_uniform([n + 1, 1], -1.0, 1.0), name='theta')
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")
# Gradient formula: (y_pred - y) * xj
gradients = 2 / m * tf.matmul(tf.transpose(X), error)
# For batch gradient descent the update is theta_new = theta - (learning_rate * gradients)
training_op = tf.assign(theta, theta - learning_rate * gradients)
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import keras

housing = fetch_california_housing()

X_train_full, X_test, y_train_full, y_test = train_test_split(housing.data,
                                                              housing.target,
                                                              random_state=0)

X_train, X_valid, y_train, y_valid = train_test_split(X_train_full,
                                                      y_train_full)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

# basic version
# input = keras.layers.Input(shape=X_train.shape[1:])
# hidden1 = keras.layers.Dense(30, activation="relu")(input)
# hidden2 = keras.layers.Dense(30, activation="relu")(hidden1)
# concat = keras.layers.Concatenate()([input, hidden2])
# output = keras.layers.Dense(1)(concat)
# model = keras.models.Model(inputs=[input], outputs=[output])

# send a subset of the features through the wide path, and a different subset through the deep path
input_A = keras.layers.Input(shape=[5])
input_B = keras.layers.Input(shape=[6])
hidden1 = keras.layers.Dense(30, activation="relu")(input_B)
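
# The snippet ends here; a minimal sketch (not the original author's code) of how
# the wide & deep graph could be completed, assuming input_A feeds the wide path
# and input_B the deep path:
hidden2 = keras.layers.Dense(30, activation="relu")(hidden1)
concat = keras.layers.concatenate([input_A, hidden2])
output = keras.layers.Dense(1)(concat)
model = keras.models.Model(inputs=[input_A, input_B], outputs=[output])
model.compile(loss="mse", optimizer="sgd")

# The two inputs take (overlapping) column subsets of the scaled features:
X_train_A, X_train_B = X_train[:, :5], X_train[:, 2:]
X_valid_A, X_valid_B = X_valid[:, :5], X_valid[:, 2:]
history = model.fit((X_train_A, X_train_B), y_train, epochs=20,
                    validation_data=((X_valid_A, X_valid_B), y_valid))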
Example #34
def test_california_housing_oob():
    X, y = fetch_california_housing(return_X_y=True)
    run_regression_test(X, y, min_training_score=.79, grace=0.15, oob=True)
import tensorflow as tf
import numpy as np
from sklearn.datasets import fetch_california_housing

# Download the dataset right away
housing = fetch_california_housing(data_home="C:/Users/28542/scikit_learn_data", download_if_missing=False)
# Get the number of rows and columns of X
m, n = housing.data.shape
# Add an extra bias input feature (x0=1) to all training samples; since NumPy is used, this runs immediately
housing_data_plus_bias = np.c_[np.ones((m, 1)), housing.data]
# Create two TensorFlow constant nodes, X and y, to hold the data and labels
X = tf.constant(housing_data_plus_bias, dtype=tf.float32, name='X')
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name='y')
# Use TensorFlow's matrix operations to solve for theta
XT = tf.transpose(X)
# The closed-form (normal equation) solution yields the optimum in one step
theta = tf.matmul(tf.matmul(tf.matrix_inverse(tf.matmul(XT, X)), XT), y)
with tf.Session() as sess:
    theta_value = theta.eval()  # sess.run(theta)
    print(theta_value)

Example #36
def test_california_housing():
    X, y = fetch_california_housing(return_X_y=True)
    run_regression_test(X, y, ntrials=10, grace=0.19)
import numpy as np
import tensorflow as tf
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler


# TensorFlow computes gradients for us, but it also offers a more convenient way to solve the problem:
# it ships a variety of optimizers, including a gradient descent optimizer.
# Replace the corresponding lines in the earlier code and everything keeps working.

# Set the hyperparameters. Grid search simply enumerates combinations to find the
# set of hyperparameters that minimizes the loss function.
n_epochs = 1000
learning_rate = 0.01

# Load the data. All of it is handed to the X and y nodes at once, so the gradient
# descent below is batch gradient descent (BGD); for larger datasets, mini-batch GD is preferable.
housing = fetch_california_housing()
m, n = housing.data.shape
housing_data_plus_bias = np.c_[np.ones((m, 1)), housing.data]
# Normalization can be done with TensorFlow, NumPy, or sklearn's StandardScaler
scaler = StandardScaler().fit(housing_data_plus_bias)
scaled_housing_data_plus_bias = scaler.transform(housing_data_plus_bias)

# The X and y below could be switched to placeholders to enable mini-batch GD
# Build the computation graph
X = tf.constant(scaled_housing_data_plus_bias, dtype=tf.float32, name='X')
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name='y')

# random_uniform creates a graph node holding random values with the given shape and range, much like NumPy's rand()
theta = tf.Variable(tf.random_uniform([n + 1, 1], -1.0, 1.0), name='theta')
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
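
# The snippet is cut off here; a minimal sketch (not part of the original) of the
# optimizer-based training step that the comments above describe, assuming TF 1.x:
mse = tf.reduce_mean(tf.square(error), name="mse")
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(mse)

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        sess.run(training_op)
    print(theta.eval())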
import numpy as np
import pandas as pd

from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.datasets import fetch_california_housing
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

N_SPLITS = 5

rng = np.random.RandomState(0)

X_full, y_full = fetch_california_housing(return_X_y=True)
# ~2k samples is enough for the purpose of the example.
# Remove the following two lines for a slower run with different error bars.
X_full = X_full[::10]
y_full = y_full[::10]
n_samples, n_features = X_full.shape

# Estimate the score on the entire dataset, with no missing values
br_estimator = BayesianRidge()
score_full_data = pd.DataFrame(
    cross_val_score(
        br_estimator, X_full, y_full, scoring="neg_mean_squared_error", cv=N_SPLITS
    ),
    columns=["Full Data"],
)
Example #39
    parser.add_argument('--test_size', type=float, default=0.1, help="TestSize")

    args = parser.parse_args()
    print("\n" + "Arguments are: " + "\n")
    print(args)
    import time
    from tqdm import tqdm

    # Loading datasets

    if args.ds == 'boston':
        X, y = load_boston()['data'], load_boston()['target']
    elif args.ds == 'diabetes':
        X, y = load_diabetes()['data'], load_diabetes()['target']
    elif args.ds == 'cali':
        X, y = fetch_california_housing()['data'], fetch_california_housing()['target']
    else:
        X, y = make_regression(n_samples=args.n_samples, n_features=args.n_feats, noise=args.noise, random_state=0)
    start = time.time()
    array_wob, array_wb = experiment_1(X=X, y=y, n_train_iter=args.n_train_iter, n_average=args.n_average,
                                       num_N=args.N, test_size=args.test_size)
    end = time.time()
    print(str(end - start) + ' seconds')
    print('finish')
    with open(f'results_mse_{args.ds}|{args.noise}|{args.n_average}|{args.n_feats}|{args.n_samples}_'
              f'mlp_test_size={args.test_size}.txt', 'w') as f:
        f.write(str(0) + '|' + str(np.mean(array_wob)) + '\n')
        for i, el in zip(range(1, len(array_wb)), array_wb):
            f.write(str(i) + '|' + str(np.mean(el)) + '\n')
        f.close()
def fetch(*args, **kwargs):
    return fetch_california_housing(*args, download_if_missing=False, **kwargs)