def test_load_boston():
    res = load_boston()
    assert_equal(res.data.shape, (506, 13))
    assert_equal(res.target.size, 506)
    assert_equal(res.feature_names.size, 13)
    assert_true(res.DESCR)

    # test return_X_y option
    X_y_tuple = load_boston(return_X_y=True)
    bunch = load_boston()
    assert_true(isinstance(X_y_tuple, tuple))
    assert_array_equal(X_y_tuple[0], bunch.data)
    assert_array_equal(X_y_tuple[1], bunch.target)
def main():
    boston = datasets.load_boston()
    y = boston.target  # House prices
    mean = np.mean(y)
    y = y > mean  # y now means is_above_average_house_price

    fns = boston.feature_names
    # listed in feature_names order so the labels line up with the
    # columns selected by the boolean mask below
    predictors = np.array([
        'CRIM',  # Crime rate per capita
        'NOX',   # Nitric oxides concentration
    ])
    X_idx = np.in1d(fns, predictors)
    X = boston.data[:, X_idx]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=33)

    for p, x in zip(predictors, np.rollaxis(X, 1)):
        print('%s vs House price - srcc: %f, p_value: %f' % (
            (p, ) + stats.spearmanr(x, y)))

    model = GaussianNB()
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)
    matches = y_hat == y_test
    print('Success rate: %i / %i = %f' % (
        matches.sum(), matches.size, float(matches.sum()) / matches.size))
def boston():
    dataset = load_boston()
    X, y = dataset.data, dataset.target
    # X, y = make_regression(n_samples=100000, n_features=13)
    X = StandardScaler().fit_transform(X).astype(np.float32)
    y = y.reshape(-1, 1).astype(np.float32)
    return shuffle(X, y, random_state=42)
def test_continue_train(self):
    X, y = load_boston(True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    params = {
        'objective': 'regression',
        'metric': 'l1',
        'verbose': -1
    }
    lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, free_raw_data=False)
    init_gbm = lgb.train(params, lgb_train, num_boost_round=20)
    model_name = 'model.txt'
    init_gbm.save_model(model_name)
    evals_result = {}
    gbm = lgb.train(params, lgb_train,
                    num_boost_round=30,
                    valid_sets=lgb_eval,
                    verbose_eval=False,
                    # test custom eval metrics
                    feval=(lambda p, d: ('mae', mean_absolute_error(p, d.get_label()), False)),
                    evals_result=evals_result,
                    init_model='model.txt')
    ret = mean_absolute_error(y_test, gbm.predict(X_test))
    self.assertLess(ret, 3.5)
    self.assertAlmostEqual(evals_result['valid_0']['l1'][-1], ret, places=5)
    for l1, mae in zip(evals_result['valid_0']['l1'], evals_result['valid_0']['mae']):
        self.assertAlmostEqual(l1, mae, places=5)
    os.remove(model_name)
def get_cmap_scatter_plot():
    boston = datasets.load_boston()
    prices = boston['target']
    lower_status = boston['data'][:, -1]
    nox = boston['data'][:, 4]

    x, y = get_data_sources(x=lower_status, y=prices)
    x_mapper, y_mapper = get_mappers(x, y)

    color_source = ArrayDataSource(nox)
    color_mapper = dc.reverse(dc.RdYlGn)(
        DataRange1D(low=nox.min(), high=nox.max())
    )

    scatter_plot = ColormappedScatterPlot(
        index=x, value=y,
        index_mapper=x_mapper, value_mapper=y_mapper,
        color_data=color_source,
        color_mapper=color_mapper,
        marker='circle',
        title='Color represents nitric oxides concentration',
        render_method='bruteforce',
        **PLOT_DEFAULTS
    )

    add_axes(scatter_plot, x_label='Percent lower status in the population',
             y_label='Median house prices')

    return scatter_plot
def test_regressors_int():
    # test if regressors can cope with integer labels (by converting them to
    # float)
    regressors = all_estimators(type_filter='regressor')
    boston = load_boston()
    X, y = boston.data, boston.target
    X, y = shuffle(X, y, random_state=0)
    X = StandardScaler().fit_transform(X)
    y = np.random.randint(2, size=X.shape[0])
    for name, Reg in regressors:
        if Reg in dont_test or Reg in (CCA,):
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            # separate estimators to control random seeds
            reg1 = Reg()
            reg2 = Reg()
            set_random_state(reg1)
            set_random_state(reg2)

            if Reg in (_PLS, PLSCanonical, PLSRegression):
                y_ = np.vstack([y, 2 * y + np.random.randint(2, size=len(y))])
                y_ = y_.T
            else:
                y_ = y

            # fit
            reg1.fit(X, y_)
            pred1 = reg1.predict(X)
            reg2.fit(X, y_.astype(np.float))
            pred2 = reg2.predict(X)
            assert_array_almost_equal(pred1, pred2, 2, name)
def test_regression_with_custom_objective():
    tm._skip_if_no_sklearn()
    from sklearn.metrics import mean_squared_error
    from sklearn.datasets import load_boston
    from sklearn.cross_validation import KFold

    def objective_ls(y_true, y_pred):
        grad = (y_pred - y_true)
        hess = np.ones(len(y_true))
        return grad, hess

    boston = load_boston()
    y = boston['target']
    X = boston['data']
    kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf:
        xgb_model = xgb.XGBRegressor(objective=objective_ls).fit(
            X[train_index], y[train_index]
        )
        preds = xgb_model.predict(X[test_index])
        labels = y[test_index]
        assert mean_squared_error(preds, labels) < 25

    # Test that the custom objective function is actually used
    class XGBCustomObjectiveException(Exception):
        pass

    def dummy_objective(y_true, y_pred):
        raise XGBCustomObjectiveException()

    xgb_model = xgb.XGBRegressor(objective=dummy_objective)
    np.testing.assert_raises(XGBCustomObjectiveException, xgb_model.fit, X, y)
def load_data():
    """Load the Boston dataset."""
    boston = datasets.load_boston()
    return boston
def get_messy_data(df):
    r"""Function for testing... just messes up the input data
    with Nans and Infs.

    Parameters
    ----------
    df : a pandas data frame

    Returns
    -------
    df
        a messy pandas data frame.
    """
    # Put one nan in 5% of the rows, then one inf in another 5%.
    df = df.as_matrix()
    from sklearn import datasets
    rng = np.random.RandomState(2)
    dataset = datasets.load_boston()
    n_samples = df.shape[0]
    n_features = df.shape[1]

    missing_rate = 0.05
    n_missing_samples = int(np.floor(n_samples * missing_rate))
    missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples),
                                 np.ones(n_missing_samples)))
    rng.shuffle(missing_samples)
    missing_features = rng.randint(0, n_features, n_missing_samples)
    X_missing = df.copy()
    X_missing[np.where(missing_samples)[0], missing_features] = np.nan
    rng.shuffle(missing_samples)
    X_missing[np.where(missing_samples)[0], missing_features] = np.inf
    return X_missing
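A minimal usage sketch for get_messy_data, not part of the original snippet: it wraps the Boston data in a DataFrame purely for illustration and assumes an older pandas where DataFrame.as_matrix (used inside the function) is still available.

import numpy as np
import pandas as pd
from sklearn import datasets

boston = datasets.load_boston()
df = pd.DataFrame(boston.data, columns=boston.feature_names)

# returns a numpy array with a handful of NaN and Inf cells injected
messy = get_messy_data(df)
print(np.isnan(messy).sum(), np.isinf(messy).sum())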
def test_regressors_train():
    estimators = all_estimators()
    regressors = [(name, E) for name, E in estimators
                  if issubclass(E, RegressorMixin)]
    boston = load_boston()
    X, y = boston.data, boston.target
    X, y = shuffle(X, y, random_state=0)
    # TODO: test with intercept
    # TODO: test with multiple responses
    X = Scaler().fit_transform(X)
    y = Scaler().fit_transform(y)
    for name, Reg in regressors:
        if Reg in dont_test or Reg in meta_estimators:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            reg = Reg()
            if hasattr(reg, 'alpha'):
                reg.set_params(alpha=0.01)

            # raises error on malformed input for fit
            assert_raises(ValueError, reg.fit, X, y[:-1])
            # fit
            reg.fit(X, y)
            reg.predict(X)
            assert_greater(reg.score(X, y), 0.5)
def bokeh_plot(crime, zn, inidus, optradio, nox, rm, age, dis, rad, tax, ptratio, Bk, lstat):
    from django.conf import settings
    import os
    from sklearn.externals import joblib
    from sklearn import datasets
    from bokeh.plotting import figure, show, output_file
    from bokeh.resources import CDN
    from bokeh.embed import components

    clf = joblib.load(os.path.join(settings.PROJECT_ROOT, 'app', 'machine_SVR.pkl'))
    boston = datasets.load_boston()
    y = boston.target
    Y = SVR_fitting(crime, zn, inidus, optradio, nox, rm, age, dis, rad, tax, ptratio, Bk, lstat)
    predicted = clf.predict(boston.data)
    predict_y = Y

    p = figure(title="Boston dataset")
    p.xaxis.axis_label = 'Measured'
    p.yaxis.axis_label = 'Predicted'
    p.scatter(y, predicted)
    p.asterisk(x=predict_y, y=predict_y, size=20, color="#F0027F")

    script, div = components(p, CDN)
    return script, div
def main():
    # Boston dataset
    boston = datasets.load_boston()
    # number of rooms
    rooms = boston.data[:, 5]
    # house prices
    house_prices = boston.target

    # plot the relationship between number of rooms and house price
    plt.scatter(rooms, house_prices, color='r')

    # fit the line that minimizes the squared error with least squares
    # x = np.array([rooms], np.one(len(rooms))).T
    x = np.array([[v, 1] for v in rooms])  # add a bias term
    y = house_prices
    # print(np.ones_like(rooms))

    # least-squares fit of the best line
    (slope, bias), total_error, _, _ = np.linalg.lstsq(x, y)

    # plot the fitted line
    plt.plot(x[:, 0], slope * x[:, 0] + bias)

    # plt.xlabel('Number of rooms')
    # plt.ylabel('House price (in $1000s)')
    plt.grid()
    plt.xlabel('rooms')
    plt.ylabel('price')
    plt.show()
def test_boston_housing_regression_with_sample_weights():
    tm._skip_if_no_sklearn()
    from sklearn.metrics import mean_squared_error
    from sklearn.datasets import load_boston
    from sklearn.cross_validation import KFold

    boston = load_boston()
    y = boston['target']
    X = boston['data']
    sample_weight = np.ones_like(y, 'float')
    kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf:
        xgb_model = xgb.XGBRegressor().fit(
            X[train_index], y[train_index],
            sample_weight=sample_weight[train_index]
        )

        preds = xgb_model.predict(X[test_index])
        # test other params in XGBRegressor().fit
        preds2 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=3)
        preds3 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=0)
        preds4 = xgb_model.predict(X[test_index], output_margin=False, ntree_limit=3)
        labels = y[test_index]

        assert mean_squared_error(preds, labels) < 25
        assert mean_squared_error(preds2, labels) < 370
        assert mean_squared_error(preds3, labels) < 25
        assert mean_squared_error(preds4, labels) < 370
def demo(X=None, y=None, test_size=0.1):
    if X is None:
        boston = load_boston()
        X = pd.DataFrame(boston.data)
        y = pd.DataFrame(boston.target)

    base_estimator = DecisionTreeRegressor(max_depth=5)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print(X_train.shape)

    # If you want to compare with BaggingRegressor.
    # bench = BaggingRegressor(base_estimator=base_estimator, n_estimators=10,
    #                          max_samples=1, oob_score=True).fit(X_train, y_train)
    # print(bench.score(X_test, y_test))
    # print(mean_squared_error(bench.predict(X_test), y_test))

    clf = BasicSegmenterEG_FEMPO(ngen=30, init_sample_percentage=1, n_votes=10,
                                 n=10, base_estimator=base_estimator,
                                 unseen_x=X_test, unseen_y=y_test)
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))
    y = clf.predict(X_test)
    print(mean_squared_error(y, y_test))
    print(y.shape)
    return clf, X_test, y_test
def test_RFECV():
    from sklearn.datasets import load_boston
    from sklearn.datasets import load_breast_cancer
    from sklearn.datasets import load_iris
    from sklearn.feature_selection import RFECV

    # Regression
    X, y = load_boston(return_X_y=True)
    bst = xgb.XGBClassifier(booster='gblinear', learning_rate=0.1,
                            n_estimators=10, n_jobs=1,
                            objective='reg:squarederror',
                            random_state=0, verbosity=0)
    rfecv = RFECV(
        estimator=bst, step=1, cv=3, scoring='neg_mean_squared_error')
    rfecv.fit(X, y)

    # Binary classification
    X, y = load_breast_cancer(return_X_y=True)
    bst = xgb.XGBClassifier(booster='gblinear', learning_rate=0.1,
                            n_estimators=10, n_jobs=1,
                            objective='binary:logistic',
                            random_state=0, verbosity=0)
    rfecv = RFECV(estimator=bst, step=1, cv=3, scoring='roc_auc')
    rfecv.fit(X, y)

    # Multi-class classification
    X, y = load_iris(return_X_y=True)
    bst = xgb.XGBClassifier(base_score=0.4, booster='gblinear',
                            learning_rate=0.1, n_estimators=10, n_jobs=1,
                            objective='multi:softprob', random_state=0,
                            reg_alpha=0.001, reg_lambda=0.01,
                            scale_pos_weight=0.5, verbosity=0)
    rfecv = RFECV(estimator=bst, step=1, cv=3, scoring='neg_log_loss')
    rfecv.fit(X, y)
def load_extended_boston():
    boston = load_boston()
    X = boston.data

    X = MinMaxScaler().fit_transform(boston.data)
    X = PolynomialFeatures(degree=2, include_bias=False).fit_transform(X)
    return X, boston.target
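A hedged usage sketch for load_extended_boston, not part of the original snippet: it assumes the sklearn imports the function relies on (load_boston, MinMaxScaler, PolynomialFeatures) are available in the same module; the Ridge model and split are only an illustration.

from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

# 13 raw features expanded to 104 degree-2 polynomial terms
X, y = load_extended_boston()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
ridge = Ridge(alpha=1.0).fit(X_train, y_train)
print(X.shape, ridge.score(X_test, y_test))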
def load_boston_df(include_tgt=True, tgt_name="target", shuffle=False):
    """Loads the boston housing dataset into a dataframe with the
    target set as the "target" feature or whatever name
    is specified in ``tgt_name``.

    Parameters
    ----------
    include_tgt : bool, optional (default=True)
        Whether to include the target

    tgt_name : str, optional (default="target")
        The name of the target feature

    shuffle : bool, optional (default=False)
        Whether to shuffle the rows

    Returns
    -------
    X : Pandas ``DataFrame`` or ``H2OFrame``, shape=(n_samples, n_features)
        The loaded dataset
    """
    bo = load_boston()
    X = pd.DataFrame.from_records(data=bo.data, columns=bo.feature_names)

    if include_tgt:
        X[tgt_name] = bo.target

    return X if not shuffle else shuffle_dataframe(X)
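A short usage sketch, not part of the original: it assumes pandas, sklearn's load_boston, and the module's own shuffle_dataframe helper are importable where load_boston_df is defined.

df = load_boston_df(include_tgt=True, tgt_name="MEDV")
print(df.shape)                   # (506, 14): 13 features plus the target column
print(df.columns.tolist()[-1])    # 'MEDV'

X_only = load_boston_df(include_tgt=False)   # features only, shape (506, 13)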
def test_boston_housing_regression():
    from sklearn.metrics import mean_squared_error
    from sklearn.datasets import load_boston
    from sklearn.model_selection import KFold

    boston = load_boston()
    y = boston['target']
    X = boston['data']
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index])

        preds = xgb_model.predict(X[test_index])
        # test other params in XGBRegressor().fit
        preds2 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=3)
        preds3 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=0)
        preds4 = xgb_model.predict(X[test_index], output_margin=False, ntree_limit=3)
        labels = y[test_index]

        assert mean_squared_error(preds, labels) < 25
        assert mean_squared_error(preds2, labels) < 350
        assert mean_squared_error(preds3, labels) < 25
        assert mean_squared_error(preds4, labels) < 350
def overview():
    boston = load_boston()
    # feature index, name and description; MEDV (median value of
    # owner-occupied homes in $1000's) is the target, boston.target
    features = [
        [0, 'CRIM', "per capita crime rate by town"],
        [1, 'ZN', "proportion of residential land zoned for lots over 25,000 sq.ft."],
        [2, 'INDUS', "proportion of non-retail business acres per town"],
        [3, 'CHAS', "Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)"],
        [4, 'NOX', "nitric oxides concentration (parts per 10 million)"],
        [5, 'RM', "average number of rooms per dwelling"],
        [6, 'AGE', "proportion of owner-occupied units built prior to 1940"],
        [7, 'DIS', "weighted distances to five Boston employment centres"],
        [8, 'RAD', "index of accessibility to radial highways"],
        [9, 'TAX', "full-value property-tax rate per $10,000"],
        [10, 'PTRATIO', "pupil-teacher ratio by town"],
        [11, 'B', "1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town"],
        [12, 'LSTAT', "% lower status of the population"],
    ]

    plot_row = 4
    plot_col = 4
    plt.figure(figsize=(10, 10))

    for f in features:
        print('{}:\t{}'.format(f[1], f[2]))

    for feature in features:
        # plt.subplot(rows, cols, plot index)
        plt.subplot(plot_row, plot_col, feature[0] + 1)
        plt.scatter(boston.data[:, feature[0]], boston.target)
        plt.xlabel(feature[1])

    plt.tight_layout()
def base_stats():
    boston = load_boston()
    # print(boston.feature_names)
    # print(boston.DESCR)
    x = boston.data
    y = boston.target

    lr = LinearRegression()
    lr.fit(x, y)
    rmse = np.sqrt(lr.residues_ / len(x))
    print('RMSE: {}'.format(rmse))

    # plt.subplot(rows, cols, plot index)
    plt.subplot(2, 1, 1)
    plt.scatter(lr.predict(x), boston.target)
    plt.plot([0, 50], [0, 50], '-', color=(.9, .3, .3), lw=4)
    plt.xlabel('predicted')
    plt.ylabel('real')

    x = np.array([np.concatenate((v, [1])) for v in boston.data])
    y = boston.target
    s, total_error, _, _ = np.linalg.lstsq(x, y)
    rmse = np.sqrt(total_error[0] / len(x))
    print('Residual: {}'.format(rmse))

    plt.subplot(2, 1, 2)
    plt.plot(np.dot(x, s), boston.target, 'ro')
    plt.plot([0, 50], [0, 50], 'g-')
    plt.ylabel('real')
def main():
    # load the Boston dataset
    boston = datasets.load_boston()
    # number of rooms
    rooms = boston.data[:, 5]
    # house prices
    house_prices = boston.target
    plt.scatter(rooms, house_prices, color="r")

    # fit the line that minimizes the squared error with least squares
    x = np.array([[v, 1] for v in rooms])  # add a bias term
    y = house_prices
    (slope, bias), total_error, _, _ = np.linalg.lstsq(x, y)

    # plot the fitted line
    plt.plot(x[:, 0], slope * x[:, 0] + bias)

    # RMSE of the training error
    rmse = np.sqrt(total_error[0] / len(x))
    msg = "RMSE (training): {0}".format(rmse)
    print(msg)

    # show the plot
    plt.xlabel("Number of Room")
    plt.ylabel("Price of House ($1,000)")
    plt.grid()
    plt.show()
    plt.savefig("image.png")
def main(unused_argv):
    # Load dataset
    boston = datasets.load_boston()
    x, y = boston.data, boston.target

    # Split dataset into train / test
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        x, y, test_size=0.2, random_state=42)

    # Scale data (training set) to 0 mean and unit standard deviation.
    scaler = preprocessing.StandardScaler()
    x_train = scaler.fit_transform(x_train)

    # Build 2 layer fully connected DNN with 10, 10 units respectively.
    feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(
        x_train)
    regressor = tf.contrib.learn.DNNRegressor(
        feature_columns=feature_columns, hidden_units=[10, 10])

    # Fit
    regressor.fit(x_train, y_train, steps=5000, batch_size=1)

    # Transform
    x_transformed = scaler.transform(x_test)

    # Predict and score
    y_predicted = list(regressor.predict(x_transformed, as_iterable=True))
    score = metrics.mean_squared_error(y_predicted, y_test)

    print('MSE: {0:f}'.format(score))
def test_template(params={'objective': 'regression', 'metric': 'l2'},
                  X_y=load_boston(True), feval=mean_squared_error,
                  num_round=100, init_model=None, custom_eval=None,
                  early_stopping_rounds=10,
                  return_data=False, return_model=False):
    params['verbose'], params['seed'] = -1, 42
    X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
    lgb_train = lgb.Dataset(X_train, y_train, params=params)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params)
    if return_data:
        return lgb_train, lgb_eval
    evals_result = {}
    gbm = lgb.train(params, lgb_train,
                    num_boost_round=num_round,
                    valid_sets=lgb_eval,
                    valid_names='eval',
                    verbose_eval=False,
                    feval=custom_eval,
                    evals_result=evals_result,
                    early_stopping_rounds=early_stopping_rounds,
                    init_model=init_model)
    if return_model:
        return gbm
    else:
        return evals_result, feval(y_test, gbm.predict(X_test, gbm.best_iteration))
def generate_data(case, sparse=False):
    # Generate regression / classification data.
    bunch = None
    if case == 'regression':
        bunch = datasets.load_boston()
    elif case == 'classification':
        bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    X, y = shuffle(bunch.data, bunch.target)
    offset = int(X.shape[0] * 0.8)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]
    if sparse:
        X_train = csr_matrix(X_train)
        X_test = csr_matrix(X_test)
    else:
        X_train = np.array(X_train)
        X_test = np.array(X_test)
    y_test = np.array(y_test)
    y_train = np.array(y_train)
    data = {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    }
    return data
def test_rrf_vs_sklearn_reg(self):
    """Test R vs. sklearn on boston housing dataset. """
    from sklearn.datasets import load_boston
    from sklearn.cross_validation import train_test_split
    from sklearn.metrics import mean_squared_error
    from sklearn.ensemble import RandomForestRegressor

    boston = load_boston()
    X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target,
                                                        test_size=0.2, random_state=13)

    n_samples, n_features = X_train.shape
    mtry = int(np.floor(0.3 * n_features))
    # do 100 trees
    r_rf = RRFEstimatorR(**{'ntree': 100, 'nodesize': 1, 'replace': 0,
                            'mtry': mtry, 'corr.bias': False,
                            'sampsize': n_samples, 'random_state': 1234})
    r_rf.fit(X_train, y_train)
    y_pred = r_rf.predict(X_test)
    r_mse = mean_squared_error(y_test, y_pred)

    p_rf = RandomForestRegressor(n_estimators=100, min_samples_leaf=1,
                                 bootstrap=False, max_features=mtry,
                                 random_state=1)
    p_rf.fit(X_train, y_train)
    y_pred = p_rf.predict(X_test)
    p_mse = mean_squared_error(y_test, y_pred)
    print('%.4f vs %.4f' % (r_mse, p_mse))
    # should be roughly the same (7.6 vs. 7.2)
    np.testing.assert_almost_equal(r_mse, p_mse, decimal=0)
def get_bar_plot():
    boston = datasets.load_boston()
    prices = boston['target']

    ys, bin_edges = np.histogram(prices, bins=10)
    ys = ys.astype('d') / ys.sum()
    xs = (bin_edges[:-1] + bin_edges[1:]) / 2.0

    x, y = get_data_sources(x=xs, y=ys)
    x_mapper, y_mapper = get_mappers(x, y)

    # we need to make the range of the x coordinate a bit larger, otherwise
    # half of the first and last bar are cut
    delta = bin_edges[1] - bin_edges[0]
    x_mapper.range.low = xs[0] - delta / 2.
    x_mapper.range.high = xs[-1] + delta / 2.

    y_mapper.range.high += 0.02

    bar_plot = BarPlot(
        index=x, value=y,
        index_mapper=x_mapper, value_mapper=y_mapper,
        fill_color='blue',
        bar_width=3.0,
        **PLOT_DEFAULTS
    )

    add_axes(bar_plot, x_label='Median house prices', y_label='Frequency')

    return bar_plot
def get_data():
    data = load_boston()
    clf = LinearRegression()
    clf.fit(data.data, data.target)
    predicted = clf.predict(data.data)

    plt.figure(num=None, figsize=(14, 6), dpi=80, facecolor='w', edgecolor='k')
    plt.scatter(data.target, predicted)
    plt.plot([0, 50], [0, 50], '--k')
    plt.axis('tight')
    plt.xlabel('True price of Houses ($1000s)')
    plt.ylabel('Predicted price of Houses ($1000s)')

    img = StringIO.StringIO()
    plt.savefig(img, bbox_inches='tight')
    img.seek(0)
    plt.close()
    return img
def get_variable_size_scatter_plot():
    boston = datasets.load_boston()
    prices = boston['target']
    lower_status = boston['data'][:, -1]
    tax = boston['data'][:, 9]

    x, y = get_data_sources(x=lower_status, y=prices)
    x_mapper, y_mapper = get_mappers(x, y)

    # normalize between 0 and 10
    marker_size = tax / tax.max() * 10.

    scatter_plot = ScatterPlot(
        index=x, value=y,
        index_mapper=x_mapper, value_mapper=y_mapper,
        marker='circle',
        marker_size=marker_size,
        title='Size represents property-tax rate',
        **PLOT_DEFAULTS
    )
    scatter_plot.color = (0.0, 1.0, 0.3, 0.4)

    add_axes(scatter_plot, x_label='Percent lower status in the population',
             y_label='Median house prices')

    return scatter_plot
def get_jitter_plot():
    boston = datasets.load_boston()
    prices = boston['target']

    x, y = get_data_sources(y=prices)
    x_mapper, y_mapper = get_mappers(x, y)

    jitter_plot = JitterPlot(
        index=y,
        mapper=y_mapper,
        marker='circle',
        jitter_width=100,
        **PLOT_DEFAULTS
    )
    jitter_plot.line_width = 1.

    x_axis = PlotAxis(orientation='bottom',
                      title='Median house prices',
                      mapper=jitter_plot.mapper,
                      component=jitter_plot,
                      **AXIS_DEFAULTS)

    jitter_plot.underlays.append(x_axis)

    return jitter_plot
def load_boston():
    from sklearn.datasets import load_boston
    boston = load_boston()
    # print(boston.DESCR)
    # print(boston.feature_names)
    # CRIM    : per-capita crime rate
    # ZN      : proportion of residential land zoned for lots over 25,000 sq. ft.
    # INDUS   : proportion of non-retail business acres
    # CHAS    : Charles River dummy variable (1: bounds the river, 0: otherwise)
    # NOX     : NOx concentration
    # RM      : average number of rooms per dwelling
    # AGE     : proportion of units built before 1940
    # DIS     : weighted distances to five Boston employment centres
    # RAD     : index of accessibility to radial highways
    # TAX     : property-tax rate per $10,000
    # PTRATIO : pupil-teacher ratio by town
    # B       : 1000(Bk - 0.63)^2, where Bk is the proportion of blacks by town
    # LSTAT   : % of the population in lower-paid occupations
    # pp.pprint(boston.data)
    # print(np.array(boston.data).shape)
    # pp.pprint(boston.target)  # house prices

    X = boston.data
    y = boston.target

    return SklearnDataGenerator.shuffle(X, y)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

plt.style.use(style='ggplot')
plt.rcParams['figure.figsize'] = (10, 6)

train = load_boston(return_X_y=False)
data = pd.DataFrame(data=np.c_[train['data'], train['target']])
data = data.select_dtypes(include=[np.number]).interpolate().dropna()
print(data.info())

numeric_features = data.select_dtypes(include=[np.number])
corr = numeric_features.corr()
print(corr)

data = data.drop([2], axis=1)
X = data.drop([13], axis=1)
Y = data[13]
# print(X)
# print(Y)

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, random_state=42, test_size=.33)

from sklearn import linear_model
from sklearn.datasets import load_boston
import numpy as np
import wandb

wandb.init()

# Save hyperparameters
wandb.config.lr = 0.000001
wandb.config.epochs = 1

# Load Dataset
data, target = load_boston(return_X_y=True)

# Initialize model
weights = np.zeros(data.shape[1])
bias = 0

# Train Model
for _ in range(wandb.config.epochs):
    # shuffle features and targets with the same permutation so the pairs stay aligned
    perm = np.random.permutation(data.shape[0])
    data, target = data[perm], target[perm]
    for i in range(data.shape[0]):
        x = data[i, :]
        y = target[i]
        err = y - np.dot(weights, x)
        if err < 0:
            weights -= wandb.config.lr * x
            bias -= wandb.config.lr
        else:
            weights += wandb.config.lr * x
            bias += wandb.config.lr
# coding: utf-8
# Ali Nehrani

import numpy as np
import pandas as pd
from sklearn import (datasets, metrics, cluster, feature_selection, manifold,
                     decomposition, preprocessing, mixture)
from matplotlib import pyplot as plt
from IPython.core.debugger import Tracer
from sklearn.metrics import normalized_mutual_info_score
import pdb

# Load the boston dataset to variable
boston = datasets.load_boston()

resultDataFrame = pd.DataFrame(columns=['Cluster', 'NMIS Accuracy'])

df = pd.DataFrame(boston.data)
df.columns = boston.feature_names
df['target'] = boston.target

# boston data
X = boston.data
# print('boston data shape:', boston.data.shape)
print('boston data:', X)

# boston features
print('boston features:', boston.feature_names)
# print('boston target head', boston.data.head)

# boston target
import sys, os
sys.path.append(os.path.join(os.path.dirname(sys.executable), 'share',
                             'pydaal_examples', 'examples', 'python', 'source'))

from DecisionForest import Regression
from utils import printNumericTable
from daal.data_management import HomogenNumericTable
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import numpy as np

data = load_boston()
x = data.data
y = data.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.40, random_state=42)

trainData = HomogenNumericTable(x_train)
testData = HomogenNumericTable(x_test)
nD_y_train = y_train[:, np.newaxis]
trainDependentVariables = HomogenNumericTable(nD_y_train)
nD_y_test = y_test[:, np.newaxis]
testGroundTruth = HomogenNumericTable(nD_y_test)

'''
Instantiate Decision Forest object
Regression(nTrees = 100, observationsPerTreeFraction = 1, featuresPerNode=0, maxTreeDepth=0,
           minObservationsInLeafNodes=5, impurityThreshold=0, varImportance=None, resultsToCompute=0)
'''
# Instantiate Decision Forest Regression object
daal_DF = Regression(nTrees=100, maxTreeDepth=15, resultsToCompute=3)
# Training
trainingResult = daal_DF.training(trainData, trainDependentVariables)
# Prediction
pred_nT = daal_DF.predict(trainingResult, trainData)
# Serialize the training object
# -*- coding: utf-8 -*-

#%%
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import pandas as pd
from tensorflow import keras

#%%
from sklearn.datasets import load_boston
housing = load_boston()

#%%
from sklearn.model_selection import train_test_split

x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data, housing.target, random_state=7)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all, y_train_all, random_state=11)

#%%
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)

#%%
# subclass API
class WideDeepModel(keras.models.Model):
    def __init__(self):
        """Define the model layers"""
def main():
    data = load_boston()
    X_ = data['data']
    y_ = data['target']

    X_ = (X_ - np.mean(X_, axis=0)) / np.std(X_, axis=0)

    n_features = X_.shape[1]
    print("number of features", n_features)
    n_hidden = 15
    W1_ = np.random.randn(n_features, n_hidden)
    b1_ = np.random.randn(n_hidden)
    # b1_ = np.zeros(n_hidden)
    W2_ = np.random.randn(n_hidden, 1)
    b2_ = np.random.randn(1)
    # b2_ = np.zeros(1)

    X, y = Input(), Input()
    W1, b1 = Input(), Input()
    W2, b2 = Input(), Input()

    l1 = Linear(X, W1, b1)
    s1 = Sigmoid(l1)
    l2 = Linear(s1, W2, b2)
    cost = MSE(y, l2)

    feed_dict = {
        X: X_, y: y_,
        W1: W1_, b1: b1_,
        W2: W2_, b2: b2_
    }

    epochs = 1000
    m = X_.shape[0]
    batch_size = 11
    steps_per_epoch = m // batch_size

    graph = topological_sort(feed_dict)
    trainables = [W1, b1, W2, b2]

    print("Total number of examples = {}".format(m))

    loss_list = []
    for i in range(epochs):
        loss = 0
        for j in range(steps_per_epoch):
            # Step 1
            # Randomly sample a batch of examples
            X_batch, y_batch = resample(X_, y_, n_samples=batch_size)

            # Reset value of X and y Inputs
            X.value = X_batch
            y.value = y_batch

            # Step 2
            forward_and_backward(graph)

            # Step 3
            sgd_update(trainables)

            loss += graph[-1].value

        print("Epoch: {}, Loss: {:.3f}".format(i + 1, loss / steps_per_epoch))
        loss_list.append(loss / steps_per_epoch)

    plt.figure()
    plt.plot(loss_list)
    plt.show()
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# read data
dataset = load_boston()
X = dataset.data
y = dataset.target
print('data features number = ', str(X.shape[1]), ' feature')
print("----------------------------------------------------------------------")

# select best features
# model = LinearRegression(copy_X=True)
# FeatureSelectionMethod = SelectFromModel(estimator=model)
# X = FeatureSelectionMethod.fit_transform(X, y)
#
# print('new data features number = ', str(X.shape[1]), ' feature')
# print("----------------------------------------------------------------------")

# normalize data
normalizer = StandardScaler(copy=True, with_mean=True, with_std=True)
X = normalizer.fit_transform(X)

# split data to train, valid, test
X_train_valid, X_test, y_train_valid, y_test = train_test_split(
                                   random_state=0,
                                   n_nearest_features=5)
    iterative_impute_scores = get_scores_for_imputer(imputer,
                                                     X_missing,
                                                     y_missing)

    return ((full_scores.mean(), full_scores.std()),
            (zero_impute_scores.mean(), zero_impute_scores.std()),
            (mean_impute_scores.mean(), mean_impute_scores.std()),
            (iterative_impute_scores.mean(), iterative_impute_scores.std()))


results_diabetes = np.array(get_results(load_diabetes()))
mses_diabetes = results_diabetes[:, 0] * -1
stds_diabetes = results_diabetes[:, 1]

results_boston = np.array(get_results(load_boston()))
mses_boston = results_boston[:, 0] * -1
stds_boston = results_boston[:, 1]

n_bars = len(mses_diabetes)
xval = np.arange(n_bars)

x_labels = ['Full data',
            'Zero imputation',
            'Mean Imputation',
            'Multivariate Imputation']
colors = ['r', 'g', 'b', 'orange']

# plot diabetes results
plt.figure(figsize=(12, 6))
ax1 = plt.subplot(121)
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 15 09:12:13 2017

@author: arellave
"""

# from sklearn import datasets
import sklearn.datasets as d
import numpy as np

# print(datasets.load_*?)

# Loading a dataset
boston = d.load_boston()
print(boston.DESCR)

housing = d.fetch_california_housing()
print(housing.DESCR)

X, y = boston.data, boston.target

# print(datasets.make_*?)

# Creating a Dataset
reg_data = d.make_regression()

complex_reg_data = d.make_regression(1000, 10, 5, 2, 1.0)
print(complex_reg_data[0].shape)

classification_set = d.make_classification(weights=[0.1])
def boston_df():
    boston_data = load_boston()
    return pd.DataFrame(
        data=boston_data.data,
        columns=boston_data.feature_names).assign(MEDV=boston_data.target)
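A short usage sketch, not part of the original snippet, assuming pandas and sklearn's load_boston are imported as the function requires.

df = boston_df()
print(df.shape)  # (506, 14): 13 features plus the MEDV target column
print(df[['RM', 'LSTAT', 'MEDV']].describe())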
from sklearn import datasets
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

loaded_data = datasets.load_boston()
data_X = loaded_data.data  # no parentheses: this is just an attribute
data_y = loaded_data.target

model = LinearRegression()  # define the model; the default parameters are fine if you are unsure
model.fit(data_X, data_y)   # fit with the default settings, which already work well

print(model.predict(data_X[:4, :]))
print(data_y[:4])
# the code above trains on the dataset loaded from sklearn

X, y = datasets.make_regression(n_samples=100, n_features=1, n_targets=1, noise=1)
plt.scatter(X, y)
plt.show()
        # Evaluate the model performance.
        if dataset == 'flowers':
            accuracy = accuracy_score(y_test, predictions)
            print('Classifier accuracy {:.2f}'.format(accuracy))
        else:
            rmse = sqrt(mean_squared_error(y_test, predictions))
            print('Regression root mean squared error {:.2f}'.format(rmse))

    @staticmethod
    def get_model(dataset):
        """Train a classifier or a regression model with a sklearn algorithm.
        Note that there are MANY hyperparameters you pass into these models.
        Refer to the online sklearn docs for more information."""
        if dataset == 'flowers':
            return GradientBoostingClassifier(random_state=RANDOM_SEED)
        else:
            return GradientBoostingRegressor(random_state=RANDOM_SEED)


if __name__ == "__main__":
    # Get some sample data from sklearn datasets. Setting return_X_y to True will
    # constrain the output to be a tuple containing only the data and the targets.
    flower_data = datasets.load_iris(return_X_y=True)
    housing_data = datasets.load_boston(return_X_y=True)

    # Predict with the two models and the two datasets.
    predictor = StochasticGradientBoostingDemo()
    predictor.make_prediction(flower_data, 'flowers')
    predictor.make_prediction(housing_data, 'housing')
def X_boston():
    return datasets.load_boston().data[rows, :2]
def y_boston():
    return pd.Series(datasets.load_boston().target[rows], name='target')
#!/usr/bin/env python3
# _*_ coding:utf-8 _*_

import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
import graphviz
from xgboost import plot_tree
from sklearn.datasets import load_boston
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

data = load_boston()['data']
label = load_boston()['target']
data = np.delete(data, [1, 2, 3, 8, 9], axis=1)
x_train, x_test, y_train, y_test = \
    train_test_split(data, label, test_size=0.1)

dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test, y_test)

############################### origin API ##########################
param = {
    # General Parameters
    'booster': 'gbtree',  # booster
# [2] folder 03_dlfs
from Classes.NeuralNetwork import NeuralNetwork
from Classes.SGD import SGD
from Classes.Dense import Dense
from Classes.Sigmoid import Sigmoid
from Classes.Linear import Linear
from Classes.MeanSquaredError import MeanSquaredError
from Classes.Trainer import Trainer

import numpy as np
from numpy import ndarray

### test data ###
from sklearn.datasets import load_boston

boston = load_boston()
data = boston.data
target = boston.target

# Scaling the data
from sklearn.preprocessing import StandardScaler
s = StandardScaler()
data = s.fit_transform(data)


def to_2d_np(a: ndarray, type: str = "col") -> ndarray:
    assert a.ndim == 1, \
        "Input tensors must be 1 dimensional"

    if type == "col":
# sklearn imports needed by the code below (train_and_eval and
# tinymlLinearRegression are assumed to be defined elsewhere in the project)
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from tinyml.linear_model.SGDRegressor import SGDRegressor as tinymlSGDRegressor
from tinyml.ensemble.GradientBoostingRegressor import GradientBoostingRegressor as tinymlGradientBoostingRegressor
from tinyml.ensemble.RandomForestRegressor import RandomForestRegressor as tinymlRandomForestRegressor
from tinyml.ensemble.XGBRegressor import XGBRegressor as tinymlXGBRegressor
from tinyml.tree.DecisionTreeRegressor import DecisionTreeRegressor as tinymlDecisionTreeRegressor

from sklearn.linear_model import LinearRegression as sklearnLinearRegression
from sklearn.linear_model import SGDRegressor as sklearnSGDRegressor
from sklearn.tree import DecisionTreeRegressor as sklearnDecisonTreeRegressor
from sklearn.ensemble import RandomForestRegressor as sklearnRnadomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor as sklearnGradientBoostRegressor
from xgboost import XGBRegressor

if __name__ == '__main__':
    boston_X, boston_y = load_boston(return_X_y=True)
    boston_train_X, boston_test_X, boston_train_y, boston_test_y = train_test_split(
        boston_X, boston_y, test_size=0.3, random_state=0)
    data = boston_train_X, boston_train_y, boston_test_X, boston_test_y

    rmse_tinyml_linear_regression = train_and_eval(data, tinymlLinearRegression())
    print('tinyml LinearRegression:', rmse_tinyml_linear_regression)
    rmse_sklearn_linear_regression = train_and_eval(data, sklearnLinearRegression())
    print('sklearn LinearRegression:', rmse_sklearn_linear_regression)
    print('\n')

    std_scaler = StandardScaler()
    std_scaler.fit(boston_train_X)
def test_regression_boston():
    boston = load_boston()
    data = data_df_from_bunch(boston)
    er = SimpleRegressor()
    er.fit(data, target_col='target')
from xgboost import XGBClassifier, XGBRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score, accuracy_score

# --------------------------------------------------- #

# x, y = load_boston(return_X_y=True)
datasets = load_boston()
x = datasets.data
y = datasets['target']

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8,
                                                    shuffle=True, random_state=77)

# 2. model
model = XGBRegressor(n_estimators=1000, learning_rate=0.01, n_jobs=8)

# 3. training
model.fit(x_train, y_train, verbose=1, eval_metric=['rmse'],
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=20)

aaa = model.score(x_test, y_test)
print('score : ', aaa)

y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)
print('r2 : ', r2)
def test_shap_interactions(self, client: "Client") -> None:
    from sklearn.datasets import load_boston
    X, y = load_boston(return_X_y=True)
    params = {'objective': 'reg:squarederror'}
    self.run_shap_interactions(X, y, params, client)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn import datasets
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn import cross_validation
from sklearn.utils import shuffle

# Compute the relative (percentage) importance of each feature in the dataset.
# Useful when weighing and comparing the contribution of different indicators.

# Load housing data
housing_data = datasets.load_boston()

# Shuffle the data
X, y = shuffle(housing_data.data, housing_data.target, random_state=7)

# Split data into training and testing datasets
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.2, random_state=7)

# AdaBoost Regressor model
regressor = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                              n_estimators=400, random_state=7)
regressor.fit(X_train, y_train)

# Evaluate performance of AdaBoost regressor
y_pred = regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
evs = explained_variance_score(y_test, y_pred)
print('\nADABOOST REGRESSOR')
print('Mean squared error =', round(mse, 2))
def data_loader() -> np.ndarray:
    dataset = load_boston()
    x = dataset.data
    y = dataset.target[:, np.newaxis]
    print("Total samples in our dataset is: {}".format(x.shape[0]))
    return x, y
from sklearn.datasets import fetch_20newsgroups, load_boston

# news = fetch_20newsgroups(subset='all')
# print(news.data)
# print(news.target)

lb = load_boston()
print("feature values:")
print(lb.data)
print("target values:")
print(lb.target)
print(lb.DESCR)
from sklearn.datasets import load_iris
from sklearn.datasets import load_boston
from sklearn import tree
import pandas as pd
import tree_extract_rule

boston = load_boston()  # Load Dataset
boston_target = pd.Series(boston.target, name='target')  # target data
boston_class = pd.Series()  # create Series for the classes
for idx, i in enumerate(boston_target):
    if i <= 25:
        boston_class = boston_class.append(pd.Series(['low'], index=[idx]))
    if i > 25:
        boston_class = boston_class.append(pd.Series(['high'], index=[idx]))
boston_class.name = 'target'  # Series with classes
boston_data = pd.DataFrame(
    boston.data, columns=boston.feature_names)  # create DataFrame out of Dataset
liste = boston_data.columns  # read the column names for rule_extraction

blf = tree.DecisionTreeClassifier()  # create the tree, class_weight='balanced'
blf = blf.fit(boston_data, boston_class)  # train the tree
rules = tree_extract_rule.extract_rules(
    blf, liste, boston_data, boston_class)  # extract rules, target_class='high'
r = pd.DataFrame.from_dict(rules)
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 29 11:58:46 2020

@author: utkuk
"""

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split

X, y = load_boston(return_X_y=True)
boston = load_boston()
X_egitim, X_test, y_egitim, y_test = train_test_split(X, y, test_size=0.3, random_state=3)

# Linear model
lineerModel = LinearRegression()       # create the model object
lineerModel.fit(X_egitim, y_egitim)    # fit the model on the training data
lineer_egitim_r2 = lineerModel.score(
    X_egitim, y_egitim)                # R^2 of the model on the training data
lineer_test_r2 = lineerModel.score(
    X_test, y_test)                    # R^2 of the model on the test data
print('Linear: training data R2 score', lineer_egitim_r2)
print('Linear: test data R2 score', lineer_test_r2)
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold

from photonai.base import Hyperpipe, PipelineElement, OutputSettings

X, y = load_boston(True)

my_pipe = Hyperpipe(
    name="default_pipe",
    metrics=[
        "mean_absolute_error",
        "mean_squared_error",
        "pearson_correlation",
    ],  # the performance metrics of interest
    best_config_metric="mean_absolute_error",
    eval_final_performance=False,
    inner_cv=KFold(n_splits=10, shuffle=True, random_state=42),
    verbosity=2,
    output_settings=OutputSettings(plots=False, project_folder="./tmp/"),
)

# ADD ELEMENTS TO YOUR PIPELINE
my_pipe += PipelineElement("SimpleImputer", missing_values=np.nan, strategy="median")
my_pipe += PipelineElement("StandardScaler")
my_pipe += PipelineElement("GaussianProcessRegressor")

# NOW TRAIN YOUR PIPELINE
my_pipe.fit(X, y)
def load_data():
    boston = datasets.load_boston()
    X = boston.data
    y = boston.target
    features = boston.feature_names
    return X, y, features
class RegModelTest(unittest.TestCase):
    # Setup testing data
    x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8])
    y = np.array([-1, 0.2, 0.9, 2.1, 3.4, 4.2, 5.6, 6.5, 7.3])
    X_train, X_test, y_train, y_test = train_test_split(x.reshape(-1, 1), y, random_state=1512)
    boston = load_boston()
    boston_x_train, boston_x_test, boston_y_train, boston_y_test = train_test_split(
        boston.data, boston.target, random_state=1512)

    # Setup Linear data
    regression_single = RegressionModel(X_train, y_train)
    regression_single.ls_fit()
    regression_boston = RegressionModel(boston_x_train, boston_y_train)
    regression_boston.ls_fit()

    # Setup Ridge data
    ridge_single = RegressionModel(X_train, y_train)
    ridge_single.ridge_fit(alpha=0.6)
    ridge_boston = RegressionModel(boston_x_train, boston_y_train)
    ridge_boston.ridge_fit(alpha=0.6)

    # Setup Lasso data
    lasso_single = RegressionModel(X_train, y_train)
    lasso_single.lasso_fit(alpha=0.6)
    lasso_boston = RegressionModel(boston_x_train, boston_y_train)
    lasso_boston.lasso_fit(alpha=0.6)

    def test_all_fit(self):
        self.assertTrue(len(self.regression_single.coeffs))
        self.assertTrue(len(self.regression_boston.coeffs))
        self.assertTrue(len(self.ridge_single.coeffs))
        self.assertTrue(len(self.ridge_boston.coeffs))
        self.assertTrue(len(self.lasso_single.coeffs))
        self.assertTrue(len(self.lasso_boston.coeffs))

    def test_predict(self):
        """ Only one model (standard or ridge or lasso) is enough as the
        previous test checks if the coeffs array is not empty """
        self.regression_single.predict(self.X_test)
        self.assertTrue(len(self.regression_single.y_pred))
        self.regression_boston.predict(self.boston_x_test)
        self.assertTrue(len(self.regression_boston.y_pred))

    def test_score(self):
        """ Similarly, score calculation is the same across all models, so one
        is enough. Make a NumPy copy of prediction data (as y_test is a NumPy
        array) and check if their shapes are equal (otherwise the score
        function will not work) """
        pred_copy_simple = np.copy(self.regression_single.y_pred)
        pred_copy_boston = np.copy(self.regression_boston.y_pred)
        self.assertEqual(pred_copy_simple.shape, self.y_test.shape)
        self.assertEqual(pred_copy_boston.shape, self.boston_y_test.shape)
# regression
from xgboost import XGBClassifier, XGBRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.feature_selection import SelectFromModel  # selects feature columns
from sklearn.metrics import r2_score, accuracy_score

x, y = load_boston(return_X_y=True)  # scikit-learn returns x and y directly

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8,
                                                    shuffle=True, random_state=66)

model = XGBRegressor(n_jobs=8)
model.fit(x_train, y_train)
score = model.score(x_test, y_test)
print('r2: ', score)

thresholds = np.sort(
    model.feature_importances_)  # feature importance values, sorted in ascending order
print(thresholds)  # these values sum to 1 (13 columns)

# r2:  0.9221188601856797
# [0.00134153 0.00363372 0.01203115 0.01220458 0.01447935 0.01479119
#  0.0175432  0.03041655 0.04246345 0.0518254  0.06949984 0.30128643
#  0.42848358]