def test_cross_val_score_with_score_func_regression():
    """Check ``cross_val_score`` on a Ridge regressor under several scorers.

    The default estimator score, the explicit ``"r2"`` scorer and the
    explained-variance ``score_func`` are all expected to give the same
    per-fold values; the mean-squared-error scorer is expected to give
    negative values.
    """
    X, y = make_regression(n_samples=30, n_features=20, n_informative=5,
                           random_state=0)
    ridge = Ridge()

    # Default score of the Ridge regression estimator.
    default_scores = cval.cross_val_score(ridge, X, y, cv=5)
    assert_array_almost_equal(default_scores,
                              [0.94, 0.97, 0.97, 0.99, 0.92], 2)

    # R2 (coefficient of determination) must match the default score.
    explicit_r2 = cval.cross_val_score(ridge, X, y, scoring="r2", cv=5)
    assert_array_almost_equal(explicit_r2,
                              [0.94, 0.97, 0.97, 0.99, 0.92], 2)

    # Mean squared error is a loss, so the reported "scores" are negative.
    neg_mse = cval.cross_val_score(ridge, X, y, cv=5,
                                   scoring="mean_squared_error")
    assert_array_almost_equal(
        neg_mse,
        np.array([-763.07, -553.16, -274.38, -273.26, -1681.99]),
        2)

    # Explained variance, passed through ``score_func``. Warnings raised by
    # the call are recorded instead of shown (presumably a deprecation
    # warning for ``score_func`` - confirm against the library version).
    with warnings.catch_warnings(record=True):
        explained = cval.cross_val_score(
            ridge, X, y, cv=5, score_func=explained_variance_score)
    assert_array_almost_equal(explained,
                              [0.94, 0.97, 0.97, 0.99, 0.92], 2)
def test_multi_predict(self):
    """Check GPU predictions are repeatable and agree with CPU predictions.

    Two boosters are trained on the same training matrix with
    ``tree_method="gpu_hist"``, one per ``predictor`` setting. The GPU
    booster predicts twice on the test matrix; both runs must agree with
    each other and with the CPU booster's prediction.
    """
    from sklearn.datasets import make_regression
    from sklearn.model_selection import train_test_split

    n_rows = 1000
    X, y = make_regression(n_rows, random_state=rng)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=123)
    train_matrix = xgb.DMatrix(X_train, label=y_train)
    test_matrix = xgb.DMatrix(X_test)

    params = {"tree_method": "gpu_hist", "predictor": "gpu_predictor"}
    gpu_booster = xgb.train(params, train_matrix)

    # Same parameter dict, only the predictor is switched.
    params["predictor"] = "cpu_predictor"
    cpu_booster = xgb.train(params, train_matrix)

    first_gpu_run = gpu_booster.predict(test_matrix)
    second_gpu_run = gpu_booster.predict(test_matrix)
    cpu_run = cpu_booster.predict(test_matrix)

    assert np.allclose(first_gpu_run, second_gpu_run)
    assert np.allclose(first_gpu_run, cpu_run)
def test_make_regression():
    """Check shapes, sparsity of coefficients and noise of make_regression."""
    X, y, coef = make_regression(n_samples=100, n_features=10,
                                 n_informative=3, effective_rank=5,
                                 coef=True, bias=0.0, noise=1.0,
                                 random_state=0)

    assert_equal(X.shape, (100, 10), "X shape mismatch")
    assert_equal(y.shape, (100,), "y shape mismatch")
    assert_equal(coef.shape, (10,), "coef shape mismatch")

    n_nonzero = sum(coef != 0.0)
    assert_equal(n_nonzero, 3, "Unexpected number of informative features")

    # y should be approximately np.dot(X, coef) + bias + N(0, 1.0), so the
    # residual standard deviation should be close to the noise level.
    residual_std = np.std(y - np.dot(X, coef))
    assert_almost_equal(residual_std, 1.0, decimal=1)

    # Small number of features (the original author noted
    # ``n_informative=3`` next to this call).
    X, y = make_regression(n_samples=100, n_features=1)
    assert_equal(X.shape, (100, 1))
def testParallelPen(self):
    """Check that the penalty is infinite exactly when gamma exceeds tree size.

    ``parallelPen`` is run over a grid of ``setGamma`` values; the average
    size of the unpruned trees over the folds is then measured, and the
    returned penalties are checked on either side of that size.
    """
    numExamples = 100
    X, y = data.make_regression(numExamples)

    learner = DecisionTreeLearner(pruneType="CART", maxDepth=10, minSplit=2)

    paramDict = {}
    # ``numpy.int`` was only an alias of the builtin ``int`` and no longer
    # exists in current NumPy, so the builtin is used directly.
    paramDict["setGamma"] = numpy.array(
        numpy.round(2**numpy.arange(1, 10, 0.5) - 1), dtype=int)

    folds = 3
    alpha = 1.0
    Cvs = numpy.array([(folds - 1) * alpha])

    idx = Sampling.crossValidation(folds, X.shape[0])

    resultsList = learner.parallelPen(X, y, idx, paramDict, Cvs)

    learner, trainErrors, currentPenalties = resultsList[0]

    # A gamma this large is meant to leave the tree unpruned so its full
    # size can be measured below (assumption from the original comment).
    learner.setGamma(2**10)
    treeSize = 0
    # Work out the average size of the unpruned tree over the folds.
    for trainInds, testInds in idx:
        trainX = X[trainInds, :]
        trainY = y[trainInds]

        learner.learnModel(trainX, trainY)
        treeSize += learner.tree.size

    treeSize /= float(folds)

    gammas = paramDict["setGamma"]
    self.assertTrue(numpy.isinf(currentPenalties[gammas > treeSize]).all())
    self.assertFalse(numpy.isinf(currentPenalties[gammas < treeSize]).all())
def test_partial_dependence_helpers(est, method, target_feature):
    """Compare the partial-dependence helpers with a manual computation.

    Whatever ``_partial_dependence_brute`` or
    ``_partial_dependence_recursion`` returns must equal the average
    prediction obtained by overwriting the target feature with a fixed value
    for every sample. Since both helpers are held to the same reference,
    this also checks that they agree with each other.
    """
    X, y = make_regression(random_state=0)
    # The recursion method ignores the GBDT 'init' estimator (here the
    # average prediction) for technical reasons; centring y on zero keeps
    # that limitation from affecting the comparison.
    y = y - y.mean()
    est.fit(X, y)

    # The target feature is pinned to .5 and then to 123.
    features = np.array([target_feature], dtype=np.int32)
    grid = np.array([[.5], [123]])

    if method != 'brute':
        pdp = _partial_dependence_recursion(est, grid, features)
    else:
        pdp = _partial_dependence_brute(est, grid, features, X,
                                        response_method='auto')

    def _mean_prediction_with(value):
        # Average prediction once every sample has ``value`` in the target
        # feature column.
        pinned = X.copy()
        pinned[:, target_feature] = value
        return est.predict(pinned).mean()

    mean_predictions = [_mean_prediction_with(value) for value in (.5, 123)]

    pdp = pdp[0]  # (shape is (1, 2) so make it (2,))
    assert_allclose(pdp, mean_predictions, atol=1e-3)
def __init__(self, n_samples, n_features, n_informative, normalize_y=False,
             normalize=True, centerdata=True,
             transformation=NullTransformation(), fit_intercept=True):
    """Generate a synthetic regression problem and split it into train/test.

    The untouched split is kept in the ``*_orig`` attributes. When
    ``centerdata`` equals ``True`` the training data goes through
    ``center_data`` and the test data through ``self.center_test`` using the
    statistics returned for the training data; the labels are additionally
    passed to ``self.normalize_labels`` when ``normalize_y`` is truthy.
    Otherwise the raw split is stored as is.
    """
    self.n_samples = n_samples
    self.n_features = n_features

    features, targets = datasets.make_regression(
        n_samples=self.n_samples,
        n_features=self.n_features,
        n_informative=n_informative,
        shuffle=False,
        random_state=11)
    x_fit, x_eval, y_fit, y_eval = train_test_split(
        features, targets, test_size=0.33, random_state=0)

    # Keep the raw split around before any centring/normalisation.
    self.XTrain_orig, self.XTest_orig = x_fit, x_eval
    self.YTrain_orig, self.YTest_orig = y_fit, y_eval

    if centerdata == True:  # noqa: E712 - equality test kept from the original
        self.XTrain, y_fit, x_mean, y_mean, x_std = center_data(
            x_fit, y_fit, fit_intercept=fit_intercept, normalize=normalize)
        # The test split is centred with the training-split statistics.
        self.XTest, y_eval = self.center_test(
            x_eval, y_eval, x_mean, y_mean, x_std)
        if normalize_y:
            self.YTrain, self.YTest = self.normalize_labels(y_fit, y_eval)
        else:
            self.YTrain = y_fit
            self.YTest = y_eval
    else:
        self.XTrain = x_fit
        self.YTrain = y_fit
        self.XTest = x_eval
        self.YTest = y_eval

    self.transformation = transformation
def test_regression():
    """Check permutation importance with the 'r2' metric on an RBF SVR.

    The data is generated with two informative features and
    ``shuffle=False``; the test expects the first two importances to be
    clearly positive and the last two to be negligible.
    """
    X, y = make_regression(n_samples=1000, n_features=5, n_informative=2,
                           n_targets=1, random_state=123, shuffle=False)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=123)

    regressor = SVR(kernel='rbf')
    regressor.fit(X_train, y_train)

    imp_vals, imp_all = feature_importance_permutation(
        predict_method=regressor.predict,
        X=X_test,
        y=y_test,
        metric='r2',
        num_rounds=1,
        seed=123)

    n_columns = X_train.shape[1]
    assert imp_vals.shape == (n_columns, )
    assert imp_all.shape == (n_columns, 1)
    assert imp_vals[0] > 0.2
    assert imp_vals[1] > 0.2
    assert sum(imp_vals[3:]) <= 0.01
def test_regression_custom_mse():
    """Check permutation importance with a callable metric (MSE).

    After scaling the importances by their largest absolute value, the first
    feature is expected to sit at exactly -1.
    """
    X, y = make_regression(n_samples=1000, n_features=5, n_informative=2,
                           n_targets=1, random_state=123, shuffle=False)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=123)

    regressor = SVR(kernel='rbf', gamma='auto')
    regressor.fit(X_train, y_train)

    imp_vals, imp_all = feature_importance_permutation(
        predict_method=regressor.predict,
        X=X_test,
        y=y_test,
        metric=mean_squared_error,
        num_rounds=1,
        seed=123)

    largest_magnitude = np.abs(imp_vals).max()
    norm_imp_vals = imp_vals / largest_magnitude

    n_columns = X_train.shape[1]
    assert imp_vals.shape == (n_columns, )
    assert imp_all.shape == (n_columns, 1)
    assert norm_imp_vals[0] == -1.
def testRecursiveSetPrune(self):
    """Check the per-vertex test errors and indices set by recursiveSetPrune.

    A tree is learnt on the first 66% of the examples and the remaining
    examples are pushed down the tree with ``recursiveSetPrune``. Every
    vertex must then report the squared error of its own value on the test
    examples it holds, and the leaves together must hold every test index.
    """
    numExamples = 1000
    X, y = data.make_regression(numExamples)

    y = Standardiser().normaliseArray(y)

    # numpy.round returns a float, which cannot be used as a slice bound;
    # convert to a plain int before slicing.
    numTrain = int(numpy.round(numExamples * 0.66))

    trainX = X[0:numTrain, :]
    trainY = y[0:numTrain]
    testX = X[numTrain:, :]
    testY = y[numTrain:]

    learner = DecisionTreeLearner()
    learner.learnModel(trainX, trainY)

    # Seed the root with every test index, then let the learner push them
    # down the tree.
    rootId = (0,)
    learner.tree.getVertex(rootId).setTestInds(numpy.arange(testX.shape[0]))
    learner.recursiveSetPrune(testX, testY, rootId)

    for vertexId in learner.tree.getAllVertexIds():
        vertex = learner.tree.getVertex(vertexId)
        tempY = testY[vertex.getTestInds()]
        predY = numpy.ones(tempY.shape[0]) * vertex.getValue()
        error = numpy.sum((tempY - predY)**2)
        # assertAlmostEquals is a deprecated alias that newer unittest
        # versions no longer provide.
        self.assertAlmostEqual(error, vertex.getTestError())

    # Check leaf indices form all indices
    inds = numpy.array([])

    for vertexId in learner.tree.leaves():
        inds = numpy.union1d(
            inds, learner.tree.getVertex(vertexId).getTestInds())

    nptst.assert_array_equal(inds, numpy.arange(testY.shape[0]))
def regression_data():
    """Return a standardised float32 regression dataset ``(X, y)``.

    The target is reshaped to a single column before scaling, so both
    returned arrays are two-dimensional.
    """
    features, target = make_regression(
        1000, 20, n_informative=10, bias=0, random_state=0)

    features = features.astype(np.float32)
    target = target.astype(np.float32).reshape(-1, 1)

    scaled_features = StandardScaler().fit_transform(features)
    scaled_target = StandardScaler().fit_transform(target)
    return scaled_features, scaled_target
def test_shuffle():
    """Check that the ``shuffle`` parameter affects the training process."""
    X, y = make_regression(n_samples=50, n_features=5, n_targets=1,
                           random_state=0)

    def _first_layer_coefs(shuffle):
        # Train a one-unit network for a single iteration and return the
        # first-layer coefficients.
        net = MLPRegressor(hidden_layer_sizes=1, max_iter=1, batch_size=1,
                           random_state=0, shuffle=shuffle)
        net.fit(X, y)
        return net.coefs_[0]

    # Two networks with the same shuffle setting end up with identical
    # coefficients, whether shuffling is on or off.
    for flag in [True, False]:
        assert np.array_equal(_first_layer_coefs(flag),
                              _first_layer_coefs(flag))

    # Turning shuffling on for only one of them makes the coefficients
    # differ.
    assert not np.array_equal(_first_layer_coefs(True),
                              _first_layer_coefs(False))
def test_check_gcv_mode_error(mode):
    """Check that an unknown ``gcv_mode`` raises ValueError.

    The error is expected both from ``RidgeCV.fit`` and from a direct call
    to ``_check_gcv_mode``.
    """
    X, y = make_regression(n_samples=5, n_features=2)
    expected_message = "Unknown value for 'gcv_mode'"

    estimator = RidgeCV(gcv_mode=mode)
    with pytest.raises(ValueError, match=expected_message):
        estimator.fit(X, y)

    with pytest.raises(ValueError, match=expected_message):
        _check_gcv_mode(X, mode)
def test_check_gcv_mode_choice(sparse, mode, mode_n_greater_than_p,
                               mode_p_greater_than_n):
    """Check the mode picked by ``_check_gcv_mode`` for both orientations.

    The 5x2 matrix has more samples than features; its transpose has more
    features than samples.
    """
    dense, _ = make_regression(n_samples=5, n_features=2)
    X = sp.csr_matrix(dense) if sparse else dense

    cases = (
        (X, mode_n_greater_than_p),
        (X.T, mode_p_greater_than_n),
    )
    for matrix, expected_mode in cases:
        assert _check_gcv_mode(matrix, mode) == expected_mode
def test_multioutput_regression():
    """Check that an MLPRegressor fits a five-target regression problem.

    The score is measured on the training data itself.
    """
    X, y = make_regression(n_samples=200, n_targets=5)

    regressor = MLPRegressor(solver='lbfgs', hidden_layer_sizes=50,
                             max_iter=200, random_state=1)
    regressor.fit(X, y)

    training_score = regressor.score(X, y)
    assert_greater(training_score, 0.9)
def test_levenberg_marquardt(self):
    """Train a small Levenberg-Marquardt network and check its test error.

    Inputs and targets are both min-max scaled before the 85/15 split.
    """
    data, target = datasets.make_regression(n_samples=50, n_features=2)

    data_scaler = preprocessing.MinMaxScaler()
    target_scaler = preprocessing.MinMaxScaler()
    scaled_data = data_scaler.fit_transform(data)
    # The target is turned into a single column before scaling.
    scaled_target = target_scaler.fit_transform(target.reshape(-1, 1))

    x_train, x_test, y_train, y_test = train_test_split(
        scaled_data, scaled_target, train_size=0.85)

    network_layers = [
        layers.Input(2),
        layers.Sigmoid(6),
        layers.Sigmoid(1),
    ]
    lmnet = algorithms.LevenbergMarquardt(
        connection=network_layers,
        mu_update_factor=2,
        mu=0.1,
        verbose=False,
        show_epoch=1,
    )
    lmnet.train(x_train, y_train, epochs=4)

    error = lmnet.prediction_error(x_test, y_test)
    self.assertAlmostEqual(0.006, error, places=3)
def regr_data():
    """Return the ``make_regression`` output for a fixed 2000-sample problem."""
    dataset = make_regression(
        n_samples=2000,
        n_targets=1,
        n_informative=10,
        random_state=0,
    )
    return dataset
def svm_example(n_samples=10000, n_features=100):
    """Fit a default ``SVR`` on a freshly generated regression dataset.

    Nothing is returned; the function only performs the fit.
    """
    from sklearn.svm import SVR
    from sklearn.datasets import make_regression

    features, target = make_regression(n_samples, n_features)
    model = SVR()
    model.fit(features, target)
def test_with_pandas_df(self):
    """Fit ElasticNet from a pandas DataFrame/Series instead of arrays."""
    x, y = make_regression(random_state=561)

    frame = pd.DataFrame(x)
    frame['y'] = y
    predictors = frame.drop(['y'], axis=1)
    response = frame.y

    model = ElasticNet(n_splits=3, random_state=123)
    model = model.fit(predictors, response)
    sanity_check_regression(model, x)
def test_fit_continuous(self):
    """
    Fitting on a continuous target must be rejected: only binary or
    multiclass targets are allowed
    """
    X, y = make_regression()
    expected_message = "does not support target type"

    with pytest.raises(YellowbrickValueError, match=expected_message):
        visualizer = PrecisionRecallCurve(LinearSVC())
        visualizer.fit(X, y)
def get_weights_regression(min_weight, max_weight):
    """Build a regression dataset with missing values and sample weights.

    Returns ``(X, y, w)`` where each entry of ``X`` has been replaced by NaN
    when a uniform draw fell below 0.25, and ``w`` holds one weight per row
    drawn uniformly between ``min_weight`` and ``max_weight``. Every draw
    comes from a single ``RandomState(199)``, in the order data, mask,
    weights.
    """
    rng = np.random.RandomState(199)
    n_rows = 10000
    missing_rate = 0.25

    X, y = datasets.make_regression(n_rows, random_state=rng)

    # One uniform draw per cell, row by row, decides whether it is blanked.
    masked_rows = []
    for row in X:
        masked_rows.append(
            [np.nan if rng.uniform(0, 1) < missing_rate else value
             for value in row])
    X = np.array(masked_rows)

    weights = [rng.uniform(min_weight, max_weight) for _ in range(n_rows)]
    w = np.array(weights)
    return X, y, w
def test_multi_target_regression_one_target():
    """Check that MultiOutputRegressor refuses to fit a single-target ``y``.

    Only the first 50 samples are used for the fit attempt; the test split
    the original carved out was never read and has been dropped.
    """
    X, y = datasets.make_regression(n_targets=1)
    X_train, y_train = X[:50], y[:50]

    rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    assert_raises(ValueError, rgr.fit, X_train, y_train)
def test_lasso_cdregressor_pickle(self):
    """Check that a fitted ``_CDRegressor`` keeps its coefficients when pickled."""
    X, y = make_regression()

    original = _CDRegressor(fit_intercept=True)
    original.fit(X, y)

    # Round-trip through pickle in memory.
    restored = pickle.loads(pickle.dumps(original))

    np.testing.assert_array_equal(original.coef_, restored.coef_)
def test_same_predictions_regression(seed, min_samples_leaf, n_samples,
                                     max_leaf_nodes):
    """Make sure sklearn has the same predictions as lightgbm for easy targets.

    When the size of the trees is bounded and the number of samples is large
    enough, the structure of the trees found by LightGBM and sklearn should
    be exactly identical.

    Notes
    -----
    - Several candidate splits may have equal gains when the number of
      samples in a node is low (and because of float errors), so predictions
      on the test set can differ if the tree structure is not exactly the
      same. The test-set comparison is therefore only made when the number
      of samples is large enough and ``max_leaf_nodes`` is low enough.
    - To ignore discrepancies caused by small differences in the binning
      strategy, data is pre-binned if ``n_samples > 255``.
    """
    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 256

    X, y = make_regression(n_samples=n_samples, n_features=5,
                           n_informative=5, random_state=0)

    if n_samples > 255:
        # Bin the data, then cast to float32 so that the estimator does not
        # treat it as pre-binned.
        binned = _BinMapper(max_bins=max_bins).fit_transform(X)
        X = binned.astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=rng)

    est_sklearn = HistGradientBoostingRegressor(
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        n_iter_no_change=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # X must be treated as numerical data, not pre-binned data.
    X_train = X_train.astype(np.float32)
    X_test = X_test.astype(np.float32)

    def _mismatch_rate(samples, tolerance):
        # Fraction of samples on which the two libraries disagree by more
        # than ``tolerance``.
        from_lightgbm = est_lightgbm.predict(samples)
        from_sklearn = est_sklearn.predict(samples)
        return np.mean(abs(from_lightgbm - from_sklearn) > tolerance)

    # About 1% of the training predictions may differ at the 3rd decimal
    # (the threshold in use is .011).
    assert _mismatch_rate(X_train, 1e-3) < .011

    if max_leaf_nodes < 10 and n_samples >= 1000:
        # Less than 1% of the test predictions may differ at the 4th decimal.
        assert _mismatch_rate(X_test, 1e-4) < .01
def test_multioutput_regression():
    """Check that ELMRegressor fits a five-target problem for every activation.

    The score is measured on the training data itself.
    """
    X, y = make_regression(n_samples=200, n_targets=5,
                           random_state=random_state)

    for activation in ACTIVATION_TYPES:
        model = ELMRegressor(n_hidden=300, activation=activation,
                             random_state=random_state)
        model.fit(X, y)
        training_score = model.score(X, y)
        assert_greater(training_score, 0.95)
def test_score(self):
    """
    The score method must return the expected R2 value
    """
    X, y = make_regression(random_state=352)

    visualizer = AlphaSelection(RidgeCV())
    visualizer.fit(X, y)

    expected_r2 = pytest.approx(0.9999780266590336)
    assert visualizer.score(X, y) == expected_r2
def test_dynamic_classes(self):
    """Check that networks built with addons survive a dill round-trip.

    For each algorithm class the network is dumped to a temporary file and
    loaded back twice: once untrained (class name and addons must match)
    and once after training (input-layer weights and prediction error must
    match).
    """
    # Algorithm class -> extra keyword arguments needed to construct it.
    test_classes = {
        algorithms.GradientDescent: {},
        algorithms.MinibatchGradientDescent: {'batch_size': 10},
        algorithms.Momentum: {'momentum': 0.5},
    }

    for algorithm_class, algorithm_params in test_classes.items():
        optimization_classes = [algorithms.WeightDecay,
                                algorithms.SearchThenConverge]

        bpnet = algorithm_class(
            (3, 5, 1),
            addons=optimization_classes,
            verbose=False,
            **algorithm_params
        )
        data, target = datasets.make_regression(n_features=3, n_targets=1)

        # Both inputs and target are min-max scaled; the target is reshaped
        # to a single column first.
        data = preprocessing.MinMaxScaler().fit_transform(data)
        target_scaler = preprocessing.MinMaxScaler()
        target = target_scaler.fit_transform(target.reshape(-1, 1))

        with tempfile.NamedTemporaryFile() as temp:
            # First round-trip: untrained network.
            valid_class_name = bpnet.__class__.__name__
            dill.dump(bpnet, temp)
            # Rewind so the load reads what was just written.
            temp.file.seek(0)

            restored_bpnet = dill.load(temp)
            restored_class_name = restored_bpnet.__class__.__name__
            # Rewind again so the second dump below starts at the beginning
            # of the file.
            temp.file.seek(0)

            self.assertEqual(valid_class_name, restored_class_name)
            self.assertEqual(optimization_classes,
                             restored_bpnet.addons)

            # Second round-trip: trained network.
            bpnet.train(data, target, epochs=10)
            real_bpnet_error = bpnet.prediction_error(data, target)
            # Copy the weights so the comparison does not depend on the
            # live network object.
            updated_input_weight = (
                bpnet.input_layer.weight.get_value().copy()
            )

            dill.dump(bpnet, temp)
            temp.file.seek(0)

            restored_bpnet2 = dill.load(temp)
            temp.file.seek(0)
            restored_bpnet_error = restored_bpnet2.prediction_error(
                data, target
            )

            np.testing.assert_array_equal(
                updated_input_weight,
                restored_bpnet2.input_layer.weight.get_value()
            )
            # Error must be big, because we didn't normalize data
            # NOTE(review): data and target are min-max scaled above, so
            # the remark above looks stale - confirm.
            self.assertEqual(real_bpnet_error, restored_bpnet_error)
def setUp(self):
    """Prepare a scaled regression split and the network layout for the tests.

    ``self.data`` holds whatever ``train_test_split`` returns for the
    features and the min-max scaled target (75% used for training);
    ``self.connection`` is the layer-size tuple.
    """
    super(QuickPropTestCase, self).setUp()

    features, raw_target = datasets.make_regression(
        n_samples=1500,
        n_features=5,
        n_informative=5,
        n_targets=1,
        random_state=33)
    scaled_target = preprocessing.MinMaxScaler().fit_transform(raw_target)

    self.data = cross_validation.train_test_split(
        features, scaled_target, train_size=0.75)
    self.connection = (5, 10, 1)
def get_sparse():
    """Build a regression dataset whose feature matrix is a sparse CSR matrix.

    Each entry of ``X`` is set to 0.0 when a uniform draw falls below 0.75.
    Every draw comes from a single ``RandomState(199)``, data first and mask
    second.
    """
    from scipy import sparse

    rng = np.random.RandomState(199)
    n_rows = 5000
    zero_rate = 0.75

    X, y = datasets.make_regression(n_rows, random_state=rng)

    # One uniform draw per cell, row by row, decides whether it is zeroed.
    thinned_rows = []
    for row in X:
        thinned_rows.append(
            [0.0 if rng.uniform(0, 1) < zero_rate else value
             for value in row])

    X = sparse.csr_matrix(np.array(thinned_rows))
    return X, y
def test_multioutput_regression():
    """Check that the perceptron regressor fits a five-target problem.

    The score is measured on the training data itself.
    """
    X, y = make_regression(n_samples=200, n_targets=5)

    regressor = MultilayerPerceptronRegressor(
        algorithm='l-bfgs', hidden_layer_sizes=50, max_iter=200,
        random_state=1)
    regressor.fit(X, y)

    training_score = regressor.score(X, y)
    assert_greater(training_score, 0.9)
def build_regression(with_preprocessor=False):
    """Build a basic regression ``Dataset`` for tests, with or without a preprocessor.

    With a preprocessor the dataset carries the shuffled indices as data and
    the full ``X`` as the preprocessor; without one it carries the indexed
    rows of ``X`` directly and ``None`` in the preprocessor slot.
    """
    raw_X, raw_y = make_regression(n_samples=100, n_features=5,
                                   random_state=SEED)
    X, y = shuffle(raw_X, raw_y, random_state=SEED)
    indices = shuffle(np.arange(X.shape[0]), random_state=SEED).astype(int)

    if not with_preprocessor:
        return Dataset(X[indices], y[indices], None, X[indices])
    return Dataset(indices, y[indices], X, indices)
def test_multi_target_regression_one_target():
    """Check that MultiOutputRegressor refuses to fit a single-target ``y``."""
    X, y = datasets.make_regression(n_targets=1)

    base_estimator = GradientBoostingRegressor(random_state=0)
    rgr = MultiOutputRegressor(base_estimator)

    assert_raises(ValueError, rgr.fit, X, y)
if __name__ == '__main__':
    from sklearn.datasets import make_classification, make_regression
    from sklearn.metrics import mean_squared_error, accuracy_score
    import time
    from sklearn.model_selection import train_test_split

    # Repeatedly time a tiny regression run. ``i`` only labels the printed
    # line; the dataset size stays fixed.
    for i in range(50, 400, 10):
        T = 1000

        # A classification variant (make_classification -> XGBClassifier ->
        # accuracy_score, timed between t1 and t2) used to run here and is
        # currently disabled.

        t2 = time.time()
        X, y = make_regression(n_samples=T)
        X_train, X_test, y_train, y_test = train_test_split(X, y)

        clf = XGBRegressor(n_estimators=2, max_depth=30).train(
            X_train, y_train, (X_test, y_test))

        # The error is reported on the full dataset, not only the held-out
        # part.
        full_data_mse = mean_squared_error(y, clf.predict(X))
        print(full_data_mse)

        t3 = time.time()
        print('R', i, t3 - t2)
from sklearn.datasets import make_regression import matplotlib.pyplot as plt import numpy as np X, y= make_regression(n_samples=100, n_features=1, noise=0.4, bias=50) change this code def plotLine(theta0, theta1, X, y): max_x = np.max(X) + 100 min_x = np.min(X) - 100 xplot = np.linspace(min_x, max_x, 1000) yplot = theta0 + theta1 * xplot plt.plot(xplot, yplot, color='#58b970', label='Regression Line') plt.scatter(X,y) plt.axis([-10, 10, 0, 200]) plt.show() def hypothesis(theta0, theta1, x): return theta0 + (theta1*x) def cost(theta0, theta1, X, y):
from time import time from sklearn.datasets import make_regression from sklearn.model_selection import train_test_split from sklearn.model_selection import GridSearchCV from sklearn.model_selection import ShuffleSplit from sklearn.svm import SVR from sklearn.preprocessing import StandardScaler, MinMaxScaler from my_library import print_gscv_score from my_library import print_score from my_library import yyplot print(__doc__) start = time() X, y = make_regression(n_samples=100, n_features=2, n_informative=2) scaler = MinMaxScaler() scaler = StandardScaler() scaler.fit(X) X = scaler.transform(X) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) mod = SVR() # search range # range_c = 2**np.arange( -5 11, dtype=float) # range_e = 2**np.arange( -10, 1, dtype=float) # range_g = 2**np.arange( -20, 11, dtype=float) # 196.29 seconds range_c = 2**np.arange(-5, 11, dtype=float) range_e = 2**np.arange(-10, 1, dtype=float)
def test_make_reg():
    """Check that ``apply_toy_on`` returns a truthy result on a 1000-sample set."""
    features, target = make_regression(1000)
    outcome = apply_toy_on(features, target)
    assert outcome
""" Test the data module. """ import pytest import numpy as np from sklearn.datasets import make_regression from sklearn.linear_model import LinearRegression from imblearn.pipeline import Pipeline from ...preprocessing import FeatureSelector, RowSelector X, y = make_regression(n_features=10) @pytest.mark.parametrize('indices', [([0, 1, 2]), ([5]), (np.arange(0, 10))]) def test_feature_selector(indices): selector = FeatureSelector(indices=indices) X_t = selector.fit_transform(X, y) assert X_t.shape[0] == X.shape[0] assert X_t.shape[1] == len(indices) assert np.array_equal(X_t, X[:, indices]) def test_default_feature_selector(): selector = FeatureSelector() X_t = selector.fit_transform(X, y) assert np.array_equal(X_t, X) def test_feature_selector_pipeline_integration(): pipeline = Pipeline([('selector', FeatureSelector(indices=[0, 2])),
def compute_bench(samples_range, features_range):
    """Time LARS and lasso path solvers over a grid of problem sizes.

    For every ``(n_samples, n_features)`` pair a dataset is generated and
    four variants are timed: ``lars_path_gram`` (Gram matrix computed inside
    the timed section), ``lars_path``, and ``lasso_path`` with and without
    ``precompute``.

    Returns a mapping from benchmark label to the list of elapsed times in
    seconds, one entry per grid point, in iteration order.
    """
    results = defaultdict(list)
    max_it = len(samples_range) * len(features_range)
    it = 0

    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            banner = '===================='
            print(banner)
            print('Iteration %03d of %03d' % (it, max_it))
            print(banner)

            # 'effective_rank' could also be set to None here.
            dataset_kwargs = {
                'n_samples': n_samples,
                'n_features': n_features,
                'n_informative': n_features // 10,
                'effective_rank': min(n_samples, n_features) / 10,
                'bias': 0.0,
            }
            print("n_samples: %d" % n_samples)
            print("n_features: %d" % n_features)
            X, y = make_regression(**dataset_kwargs)

            def _lars_with_gram():
                # Building the Gram matrix and X^T y is part of the timing.
                gram = np.dot(X.T, X)  # precomputed Gram matrix
                Xy = np.dot(X.T, y)
                lars_path_gram(Xy=Xy, Gram=gram, n_samples=y.size,
                               method='lasso')

            def _lars_without_gram():
                lars_path(X, y, method='lasso')

            def _lasso_with_gram():
                lasso_path(X, y, precompute=True)

            def _lasso_without_gram():
                lasso_path(X, y, precompute=False)

            benchmarks = (
                ('lars_path (with Gram)', _lars_with_gram),
                ('lars_path (without Gram)', _lars_without_gram),
                ('lasso_path (with Gram)', _lasso_with_gram),
                ('lasso_path (without Gram)', _lasso_without_gram),
            )

            for label, run in benchmarks:
                # Collect garbage before each run so earlier allocations do
                # not leak into the measurement.
                gc.collect()
                print("benchmarking %s:" % label, end='')
                sys.stdout.flush()
                tstart = time()
                run()
                delta = time() - tstart
                print("%0.3fs" % delta)
                results[label].append(delta)

    return results
'''
Linear regression
Independent variable - continuous, dependent variable - continuous
Regression analysis is a method of deriving the linear regression equation
that minimizes the residual sum of squares over the data
'''
import statsmodels.api as sm
from sklearn.datasets import make_regression
import numpy as np

np.random.seed(8)

# Method 1: use make_regression. No model is built; this only produces a
# dataset for practicing linear regression.
# NOTE: the numeric values quoted in the comments below are the ones the
# original author recorded from their own run.
x, y, coef = make_regression(n_samples=50, n_features=1, bias=100, coef=True)  # number of samples, independent variables, intercept, slope
print(x[:5])  # sample independent-variable data, e.g. x = -0.67283393
print(y[:5])  # sample dependent-variable data, e.g. y = 99.40304252 (= the value predicted by the model)
print(coef)  # 0.8872285585150852, the slope
# With y = wx + b: 0.8872285585150852 * x + 100 = 99.40304252
yhat = 0.8872285585150852 * -0.67283393 + 100  # actual value
print('yhat :', yhat)
yhat = 0.8872285585150852 * 1.1395335 + 100
print('yhat :', yhat)

new_x = 0.5
pred_yhat = 0.8872285585150852 * new_x + 100
print('pred_yhat :', pred_yhat, '\n\n')  # linear-regression prediction for the new value new_x
# Because the predictions made from the existing data differ little from the
# actual values, the prediction for a new x is considered trustworthy.

# Method 2: use LinearRegression(); a model is created
from sklearn.exceptions import NotFittedError from sklearn.compose import make_column_transformer # To use this experimental feature, we need to explicitly ask for it: from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES from sklearn.ensemble._hist_gradient_boosting.loss import LeastSquares from sklearn.ensemble._hist_gradient_boosting.loss import BinaryCrossEntropy from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper from sklearn.utils import shuffle X_classification, y_classification = make_classification(random_state=0) X_regression, y_regression = make_regression(random_state=0) X_multi_classification, y_multi_classification = make_classification( n_classes=3, n_informative=3, random_state=0) def _make_dumb_dataset(n_samples): """Make a dumb dataset to test early stopping.""" rng = np.random.RandomState(42) X_dumb = rng.randn(n_samples, 1) y_dumb = (X_dumb[:, 0] > 0).astype('int64') return X_dumb, y_dumb @pytest.mark.parametrize( 'GradientBoosting, X, y', [(HistGradientBoostingClassifier, X_classification, y_classification),
3) Criar a função de erro (loss) e o otimizador 4) Criar o loop de treinamento - forward pass: calcular a predição e o erro - backward pass: calcular os gradientes - update weights: ajuste dos pesos do modelo ''' #preparar dados import torch import torch.nn as nn import numpy as np from sklearn import datasets import matplotlib.pyplot as plt X_np, y_np = datasets.make_regression(n_samples=100, n_features=1, noise=20, random_state=1) X = torch.from_numpy(X_np.astype(np.float32)) y = torch.from_numpy(y_np.astype(np.float32)) y = y.view(y.shape[0], 1) #Criar modelo n_sample, n_features = X.shape model = nn.Linear(n_features, n_features) #Função de erro criterion = nn.MSELoss() #Otimizador
from sklearn import datasets
import matplotlib.pyplot as plt
# Kept for older Matplotlib releases, where importing Axes3D is what
# registers the '3d' projection used below.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

# make_regression data: two features and a single target
X, y = datasets.make_regression(n_samples=100, n_features=2,
                                n_informative=10, n_targets=1, bias=0.0,
                                effective_rank=None, tail_strength=0.5,
                                noise=0.0, shuffle=True, coef=False,
                                random_state=None)
print('X = ')
print(X)
print('y = ')
print(y)

fig = plt.figure()
# Create the 3-D axes through the figure so they are actually attached to
# it; constructing Axes3D(fig) directly no longer adds the axes on current
# Matplotlib and leaves the window empty.
ax = fig.add_subplot(111, projection='3d')
ax.set_xlabel("X1")
ax.set_ylabel("X2")
# The vertical axis shows the target y, not a third feature.
ax.set_zlabel("y")
ax.plot(X[:, 0], X[:, 1], y, marker="o", linestyle='None')
plt.show()
import pandas as pd import seaborn as sns import matplotlib.pyplot as plt from sklearn.datasets import make_regression from sklearn.preprocessing import StandardScaler from sklearn.model_selection import train_test_split from tensorflow.keras.models import Sequential from tensorflow.keras.layers import Dense from tensorflow.keras.optimizers import Adam X, y = make_regression(n_samples=100, n_features=1, n_informative=1, effective_rank=1, tail_strength=1, noise=3, random_state=1) X_scal = StandardScaler().fit_transform(X) y_scal = StandardScaler().fit_transform(y.reshape(-1, 1)) X_train, X_test, y_train, y_test = train_test_split(X_scal, y_scal, test_size=0.3, random_state=1) model = Sequential() model.add(Dense(10, activation='linear', input_dim=X.shape[1])) model.add(Dense(1, activation='linear'))
# %% [markdown] # ## LinReg with PyTorch and Gradent Descent # # Previously, we had to do some math to calculate the optimal $\hat\beta$. # PyTorch calculates the gradients for us automatically (more on that later) # and we can use some version of gradient desctent to find our $\hat\beta$. # %% from sklearn.datasets import make_regression n_features = 1 n_samples = 100 X, y = make_regression( n_samples=n_samples, n_features=n_features, noise=10, ) dom_np = np.linspace(X.min(), X.max(), 20) dom = torch.from_numpy(dom_np).unsqueeze(-1).float() fix, ax = plt.subplots() ax.plot(X, y, ".") # %% X = torch.from_numpy(X).float() y = torch.from_numpy(y).float().unsqueeze(-1) X.shape, y.shape # %%
from sklearn.tree.tests.test_tree import assert_is_subtree # toy sample X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] y = [-1, -1, -1, 1, 1, 1] # (X, y), n_targets <-- as expected in the output of partial_dep() binary_classification_data = (make_classification(n_samples=50, random_state=0), 1) multiclass_classification_data = (make_classification(n_samples=50, n_classes=3, n_clusters_per_class=1, random_state=0), 3) regression_data = (make_regression(n_samples=50, random_state=0), 1) multioutput_regression_data = (make_regression(n_samples=50, n_targets=2, random_state=0), 2) # iris iris = load_iris() @pytest.mark.filterwarnings("ignore:A Bunch will be returned") @pytest.mark.parametrize('Estimator, method, data', [ (GradientBoostingClassifier, 'auto', binary_classification_data), (GradientBoostingClassifier, 'auto', multiclass_classification_data), (GradientBoostingClassifier, 'brute', binary_classification_data), (GradientBoostingClassifier, 'brute', multiclass_classification_data), (GradientBoostingRegressor, 'auto', regression_data), (GradientBoostingRegressor, 'brute', regression_data),
def test_group_lasso_paspal():
    """Smoke-test ``group_lasso_overlap_paspal`` on a small regression problem.

    Nothing is asserted; the test passes if the call does not raise.
    """
    from sklearn.datasets import make_regression

    X, y, coef = make_regression(n_features=10, coef=True, n_informative=5)
    third_argument = np.ones(10)  # one entry per feature
    group_lasso_overlap_paspal(X, y, third_argument, 0.1)
------------------------------------------------- File Name:class Description : lightGBM 回归实例 with early-stopping sklearn API Email : [email protected] Date:2018/3/20 """ from lightgbm import LGBMRegressor from sklearn.datasets import make_regression from sklearn.metrics import mean_squared_error from sklearn.model_selection import GridSearchCV, train_test_split # 获取数据 data, target = make_regression(n_samples=1000, n_features=10, n_targets=1, n_informative=8, noise=0.1, random_state=12, bias=1.2) # 分割数据集 X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2) # 模型构建 gbm = LGBMRegressor(objective='regression', num_leaves=31, learning_rate=0.05, n_estimators=500)
# -*- coding: utf-8 -*- """ Created on Thu Aug 17 11:24:21 2017 @author: arellave """ from sklearn.datasets import make_regression import numpy as np reg_data, reg_target = make_regression(n_samples=200, n_features=500, n_informative=10, noise=2) from sklearn.linear_model import Lars lars = Lars(n_nonzero_coefs=10) lars.fit(reg_data, reg_target) print(np.sum(lars.coef_ != 0)) train_n = 100 lars_12 = Lars(n_nonzero_coefs=12) lars_12.fit(reg_data[:train_n], reg_target[:train_n]) lars_500 = Lars() lars_500.fit(reg_data[:train_n], reg_target[:train_n]) #Printing squared error print( np.mean(
# Akaike information criterion (AIC) for a linear regression model
from math import log
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


def calculate_aic(n, mse, num_params):
    """Return ``n * ln(mse) + 2 * num_params``.

    ``n`` is the number of observations, ``mse`` the mean squared error of
    the fit and ``num_params`` the number of fitted parameters.
    """
    fit_term = n * log(mse)
    complexity_penalty = 2 * num_params
    return fit_term + complexity_penalty


# Synthetic two-feature dataset
X, y = make_regression(n_samples=100, n_features=2, noise=0.1)

# Fit the model on all of the data
model = LinearRegression()
model.fit(X, y)

# One parameter per coefficient, plus one for the intercept
num_params = len(model.coef_) + 1
print(f'Number of parameters: {num_params:d}')

# Error on the training set itself
yhat = model.predict(X)
mse = mean_squared_error(y, yhat)
print(f'MSE: {mse:.3f}')

# AIC of the fitted model
aic = calculate_aic(len(y), mse, num_params)
print(f'AIC: {aic:.3f}')
import numpy as np from sklearn import datasets import matplotlib.pyplot as plt X, y = datasets.make_regression( n_samples=60, n_features=1, noise=10, random_state=42) # put 1 for bias X = np.concatenate([np.ones((60, 1)), X], -1) X_train, y_train = X[:50], y[:50] X_test, y_test = X[50:], y[50:] # linear regression ### fill in this part ### w = # compute test error pred_test = np.dot(X_test, w) test_err = ((y_test - pred_test)**2).mean() print('MSE: %.4f' % (test_err)) # plot X_ = np.linspace(X.min(), X.max(), 100) X_ = np.concatenate([np.ones((100, 1)), X_[:,None]], -1) plt.plot(X_[:,1], np.dot(X_, w), 'k') plt.scatter(X_train[:,1], y_train, edgecolor='b', facecolor='white', label='train') plt.scatter(X_test[:,1], y_test, edgecolor='r', facecolor='white', label='test')
import numpy as np
from matplotlib import pyplot as plt
from sklearn import linear_model, datasets

n_samles = 1000
n_outliers = 50

# Single informative feature with Gaussian noise; coef=True additionally
# returns the coefficient of the underlying linear model.
X, y, coef = datasets.make_regression(
    n_samples=n_samles,
    n_features=1,
    n_informative=1,
    noise=10,
    coef=True,
    random_state=0,
)

# Add outlier data: overwrite the first n_outliers rows with seeded random
# points (X drawn before y, so the random stream is consumed in that order)
np.random.seed(0)
outlier_X = 3 + 0.5 * np.random.normal(size=(n_outliers, 1))
outlier_y = -3 + 10 * np.random.normal(size=n_outliers)
X[:n_outliers] = outlier_X
y[:n_outliers] = outlier_y

# Fit line using all data
lr = linear_model.LinearRegression().fit(X, y)

# Robustly fit linear model with RANSAC algorithm
ransac = linear_model.RANSACRegressor().fit(X, y)
inlier_mask = ransac.inlier_mask_
outlier_mask = ~inlier_mask

# Predict data of estimated models
feature_names_fruits = ['height', 'width', 'mass', 'color_score']
target_names_fruits = ['apple', 'mandarin', 'orange', 'lemon']

# full feature table and labels
X_fruits = fruits[feature_names_fruits]
y_fruits = fruits['fruit_label']

# two-column view (height and width only) with the same labels
X_fruits_2d = fruits[['height', 'width']]
y_fruits_2d = fruits['fruit_label']

# synthetic dataset for simple regression
from sklearn.datasets import make_regression
X_R1, y_R1 = make_regression(
    n_samples=100,
    n_features=1,
    n_informative=1,
    bias=150.0,
    noise=30,
    random_state=0,
)
plt.figure()
plt.title('Sample regression problem with one input variable')
plt.scatter(X_R1, y_R1, marker='o', s=50)
plt.show()

# synthetic dataset for more complex regression
from sklearn.datasets import make_friedman1
X_F1, y_F1 = make_friedman1(n_samples=100, n_features=7, random_state=0)
plt.figure()
plt.title('Complex regression problem with one input variable')
# only the third feature column is plotted against the target
plt.scatter(X_F1[:, 2], y_F1, marker='o', s=50)
plt.show()
"font.serif": ["Times", "Palatino", "serif"] }) plt.rcParams["text.usetex"] = True set_style() random_state = 414 saving_fig = False # set to True to save images # dataset = "synthetic_unco" # Fig a dataset = "synthetic" # Fig b if dataset is "synthetic": n_samples, n_features = (500, 5000) X, y = make_regression(n_samples=n_samples, n_features=n_features, random_state=random_state) if dataset is "synthetic_unco": n_samples, n_features = (30, 50) X, y = make_sparse_uncorrelated(n_samples=n_samples, n_features=n_features, random_state=random_state) X = X.astype(float) y = y.astype(float) X = np.asfortranarray(X) y = np.asfortranarray(y) n_samples, n_features = X.shape X = np.asfortranarray(X)
# make_classification is called below, so it is imported alongside
# make_regression (no import of it is visible in this part of the file).
from sklearn.datasets import make_classification, make_regression
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import utils

# Creating Dataset
# X/y: classification data; X2/y2: regression data. Both use 1000 samples,
# 4 features (2 informative), a fixed seed and no shuffling.
X, y = make_classification(n_samples=1000, n_features=4, n_informative=2,
                           n_redundant=0, random_state=0, shuffle=False)
X2, y2 = make_regression(n_samples=1000, n_features=4, n_informative=2,
                         random_state=0, shuffle=False)

"""
Notes:
    Neural Networks perform best when the data is normlized
"""

#-----------------------------------------------------------------------------#
# Neural Network: Binary Class #
#-----------------------------------------------------------------------------#

# Initializing Model
model = Sequential()

# Input Layer & Hidden Layer 1
## Terminology
# Univariate means there is only one variable (aka feature).
# X is usually the independent variable and is often capitalized to denote that.

## Typical dependencies
from sklearn.datasets import make_regression  # generator for synthetic regression data

## How to create a random dataset
X, y = make_regression(
    n_samples=20,
    n_features=1,
    noise=4,
    bias=100.0,
    random_state=0,
)
# X holds the input feature(s) and y the target values.
# n_samples: how many samples the dataset contains.
# n_features: how many input (independent) variables each sample has,
#   i.e. the number of columns of X.
# random_state: seed for the generator, so the exact same dataset can be
#   recreated later (useful when results must be reproducible).
# noise: how far the points scatter around the underlying line; the higher
#   the noise, the larger the spread.
# bias: the intercept of the underlying line (the value of y at x = 0).
plt.scatter(X, y)  # a good way to visualize your dataset before using it

## How to make an S-curve dataset
from sklearn.datasets import make_s_curve
data, color = make_s_curve(n_samples=100, random_state=0)
plt.scatter(data[:, 0], color)  # to see the data

## How to fit a LinearRegression model to the data
from sklearn.linear_model import LinearRegression
model = LinearRegression().fit(X, y)  # fit() returns the fitted estimator itself
# Attributes learned during fitting can now be inspected. Note the trailing
# underscore in their names: it marks attributes that only exist after the
# model has been fit to the data.
Created on Sun Jul 14 10:12:41 2019

@author: Snake
"""
from __future__ import print_function
from sklearn import datasets
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Part 1: linear regression on the Boston housing data.
# NOTE(review): load_boston is no longer shipped in recent scikit-learn
# releases -- confirm the pinned version before running.
loaded_data = datasets.load_boston()
data_X = loaded_data.data
data_y = loaded_data.target

model = LinearRegression()
model.fit(data_X, data_y)

# Print predictions for the first four rows next to their true targets.
print(model.predict(data_X[:4, :]))
print(data_y[:4])

# Part 2: one-feature synthetic regression problem (unseeded, so the data
# differs on every run), fitted and plotted with its regression line.
X, y = datasets.make_regression(n_samples=100, n_features=1, n_targets=1, noise=10)
model2 = LinearRegression()
model2.fit(X, y)
y_predict = model2.predict(X)
plt.scatter(X, y)
plt.plot(X, y_predict, color='r')
plt.show()
## 5. Generating Regression Data ##

from sklearn.datasets import make_regression
import pandas as pd

# make_regression returns an (inputs, targets) pair; wrap each half in pandas
data = make_regression(n_samples=100, n_features=3, random_state=1)
features, labels = pd.DataFrame(data[0]), pd.Series(data[1])

## 6. Fitting A Linear Regression Neural Network ##

from sklearn.datasets import make_regression
import numpy as np

data = make_regression(n_samples=100, n_features=3, random_state=1)
features, labels = pd.DataFrame(data[0]), pd.Series(data[1])
# constant column of ones appended as an explicit bias input
features['bias'] = 1

from sklearn.linear_model import SGDRegressor


def train(features, labels):
    """Fit an SGDRegressor on (features, labels) and return its coefficients."""
    return SGDRegressor().fit(features, labels).coef_


def feedforward(features, weights):
    """Return the linear predictions ``np.dot(features, weights.T)``."""
    transposed = weights.T
    return np.dot(features, transposed)
import numpy as np
import torch
import torch.nn as nn
from sklearn import datasets
import matplotlib.pyplot as plt

'''
1. Prepare Data
2. Create model
3. Loss and optimizer
4. Traning loop
'''
# Let's prepare regression dataset
X_numpy, Y_numpy = datasets.make_regression(n_samples=100, n_features=1, random_state=1, noise=20)

# Convert the NumPy arrays to float32 torch tensors.
X = torch.from_numpy(X_numpy.astype(np.float32))
Y = torch.from_numpy(Y_numpy.astype(np.float32))
# Reshape the targets into a column: one row per sample, one value per row.
Y = Y.view(Y.shape[0], 1)

n_samples, n_features = X.shape

# Let's define the model
input_size = n_features
output_size = 1


# NOTE(review): the class body continues beyond this excerpt; only the start
# of __init__ is visible here.
class LinearRegression(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LinearRegression, self).__init__()
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Create a data set for analysis
x, y = make_regression(n_samples=500, n_features=1, noise=25, random_state=0)

# Split the data set into testing and training data
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

# Plot the data
sns.set_style("darkgrid")
# x and y are passed by keyword: newer seaborn releases no longer accept them
# positionally. x_test has a single feature column (n_features=1), so it is
# flattened to 1-D for plotting.
sns.regplot(x=x_test.ravel(), y=y_test, fit_reg=False)

# Remove ticks from the plot
plt.xticks([])
plt.yticks([])

plt.tight_layout()
plt.show()
from __future__ import print_function from threading import Thread, current_thread from functools import wraps import sys import warnings from sklearn.datasets import make_classification, make_regression from .export_utils import expr_to_tree, generate_pipeline_code from deap import creator # generate a small data set for a new pipeline, in order to check if the pipeline # has unsuppported combinations in params pretest_X, pretest_y = make_classification(n_samples=50, n_features=10, random_state=42) pretest_X_reg, pretest_y_reg = make_regression(n_samples=50, n_features=10, random_state=42) def convert_mins_to_secs(time_minute): """Convert time from minutes to seconds""" # time limit should be at least 1 second return max(int(time_minute * 60), 1) class TimedOutExc(RuntimeError): """ Raised when a timeout happens """
def test_regressor_chains():
    """Check RegressorChain predictions on a seeded 3-target regression stream.

    The learner is pre-trained on 150 samples, then fed 5000 further samples
    one at a time; every 100th sample is predicted before it is learned and
    the prediction is compared with stored reference values.
    """
    X_reg, y_reg = make_regression(random_state=112, n_targets=3, n_samples=5150)
    stream = DataStream(X_reg, y_reg)
    stream.prepare_for_use()

    estimator = SGDRegressor(random_state=112, max_iter=10)
    learner = RegressorChain(base_estimator=estimator, random_state=112)

    # Initial batch so the learner is fitted before the first prediction.
    X, y = stream.next_sample(150)
    learner.partial_fit(X, y)

    max_samples = 5000
    wait_samples = 100
    predictions = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples (predict before training on the sample)
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(list(learner.predict(X)[0]))
        learner.partial_fit(X, y)

    expected_predictions = [
        [-21.932581119953333, 1265662295936.5574, 7.5406725414072326e+22],
        [-97.17297744582125, 5438576501559.791, -1.1370581201037737e+24],
        [-60.06308622605051, 26421144038311.047, 1.3207650552720094e+25],
        [-285.32687352244847, 8881551118262.033, -1.1322856827798374e+24],
        [-115.80322693771457, -24997431307818.508, 2.85747306174037e+24],
        [-12.184193815918672, 3510562166726.0283, -4.8590562435597834e+23],
        [-94.99008392491476, 4794062761133.606, -1.8849188211946465e+24],
        [66.35576182871232, -8147485653396.883, -7.492944375995595e+23],
        [-52.145505628056995, -1013810481101.9043, -4.5310283013446384e+23],
        [16.715060622072958, 562391244392.6193, 3.3789644409962397e+22],
        [96.32219400190282, -20397346086007.85, 1.558245298240083e+24],
        [-281.8168065846582, 118681520215938.52, 4.815807486956294e+25],
        [-135.62679760307105, 20260866750185.832, 1.605753540523006e+24],
        [0.07932047636460954, -708539394047.3298, -3.61482684929158e+22],
        [-292.1646176261883, -11162615183157.55, -8.674643964570704e+23],
        [-176.92746747754094, -29231218161585.13, 1.411600743825668e+24],
        [-348.0498644784687, -100615393132365.25, 9.759683002046948e+23],
        [30.948974669258675, -1199287119275.6328, 2.0866927007519847e+23],
        [214.0020659569134, -24437173206276.543, 9.450880718880671e+23],
        [153.98931593720746, 32675842205528.723, -1.7246747286222668e+24],
        [99.39074016354951, -11385065116243.611, 1.0770253102805811e+24],
        [127.81660709796127, 16929726964275.697, 7.14820947257164e+24],
        [40.45505653639006, -14311951591200.725, -9.33193290094133e+23],
        [117.52219878440611, 17952367624051.36, 4.5651719663788677e+23],
        [75.53942801239991, -9231543699137.594, 3.2317133158453914e+24],
        [31.795193207760704, -4084783706153.4004, -4.188095047309216e+23],
        [68.5318978502461, 5735810247065.921, 1.7284713503779943e+24],
        [65.18438567482129, -13298743450357.943, -1.4367047198923567e+24],
        [-116.63952028337805, -344127767223.9295, 2.3925104169428623e+22],
        [-76.81599010889556, 8711205431447.733, -1.1575305916673031e+24],
        [263.1077717649874, 32146618104196.434, -7.240279466740839e+24],
        [-94.07597099457413, -8216681977657.527, 2.3785728690780553e+24],
        [-175.78429788635424, -368856885004.46, -5.7200993095587195e+22],
        [59.648477499483285, -1752783828320.242, 2.1429953624557326e+23],
        [71.68447202426032, -27151271800666.492, 9.367463190825582e+24],
        [-189.96629636835922, -27090727476080.18, -3.8659883994544866e+24],
        [-240.7920206809074, 15406047062899.537, 2.0609123388035027e+24],
        [-105.80996634043589, -1518636404558.1646, -1.4166487855869706e+23],
        [-164.02527753963858, -61386039046571.125, -2.179071650432624e+25],
        [52.451759456657975, -988509747123.6125, -7.334899319683594e+22],
        [68.37044139814127, -7434200892467.581, -7.535677215142279e+23],
        [164.9457843624521, -9474550940989.51, -1.3512944635293625e+24],
        [189.34401690407307, -14349556896444.508, 1.0732760415617274e+24],
        [0.8944005517286119, 463945767759.78735, -1.9938544157612443e+22],
        [71.7856433565235, -9804063257174.584, 4.7874862540754335e+23],
        [-5.450502769025279, 281585481223.33276, 2.1974700575843552e+22],
        [248.00190755589915, -81874135462745.58, -2.6532557110860303e+25],
        [-113.86249490223707, 2634310697909.643, 1.580428629322546e+23],
        [-35.92856878407447, -5410985463428.589, 2.522168862637753e+23],
    ]

    predicted = np.array(predictions)
    expected = np.array(expected_predictions)
    # Compare the arrays element-wise. The previous check reduced both sides
    # with .all() first, so it only compared two booleans and could not fail
    # on wrong values.
    assert predicted.shape == expected.shape
    assert np.allclose(predicted, expected)

    assert isinstance(learner.predict(X), np.ndarray)