def FriedmanDataset_1():
    d_train = datasets.make_friedman1(240, 10, 1)
    d_test = datasets.make_friedman1(1000, 10)

    # add Gaussian noise; the (240, 1) column broadcasts across all 10 features
    features_train = d_train[0] + np.random.normal(0, 1, 240).reshape((240, 1))
    target_train = d_train[1]

    features_test = d_test[0]
    target_test = d_test[1]

    return features_train, target_train, features_test, target_test
def makedata():
    n_points = 500  # points
    X, y = make_friedman1(n_samples=n_points, n_features=5, noise=1.0,
                          random_state=100)
    return train_test_split(X, y, test_size=0.5, random_state=3)
def make_data_weights_biases(neurons, twolayers):
    X, y = make_friedman1(n_samples=1000, n_features=5, noise=0.0,
                          random_state=None)

    W_0 = np.random.rand(X.shape[1], neurons)
    b_0 = np.zeros((1, neurons))

    if twolayers:
        W_1 = np.random.rand(neurons, neurons)
        W_2 = np.random.rand(neurons, 1)
        b_1 = np.zeros((1, neurons))
        b_2 = np.zeros((1, 1))
    else:
        W_1 = np.random.rand(neurons, 1)
        b_1 = np.zeros((1, 1))
        W_2 = None
        b_2 = None

    print("X rows: " + repr(X.shape[0]) + ", " + "X columns: " + repr(X.shape[1]))
    print("Y rows: " + repr(y.shape))
    print("W_0: " + repr(W_0) + ", " + "W_1: " + repr(W_1) +
          "b_0: " + repr(b_0) + "b_1: " + repr(b_1))

    if twolayers:
        print("W_0: " + repr(W_0) + ", " + "W_1: " + repr(W_1) +
              "W_2: " + repr(W_2) + "b_0: " + repr(b_0) +
              "b_1: " + repr(b_1) + "b_2: " + repr(b_2))
        return X, y, W_0, W_1, W_2, b_0, b_1, b_2
    else:
        print("W_0: " + repr(W_0) + ", " + "W_1: " + repr(W_1) +
              "b_0: " + repr(b_0) + "b_1: " + repr(b_1))
        return X, y, W_0, W_1, b_0, b_1
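# A minimal sketch (not part of the snippet above) of one way the weights and
# biases it returns could be consumed: a plain NumPy forward pass with a ReLU
# hidden layer and a linear output. The activation choice and the forward()
# helper are assumptions added here for illustration only.
import numpy as np


def forward(X, W_0, b_0, W_1, b_1, W_2=None, b_2=None):
    """Forward pass matching the one- or two-hidden-layer shapes built above."""
    h = np.maximum(0, X @ W_0 + b_0)      # (n_samples, neurons)
    if W_2 is None:
        return h @ W_1 + b_1              # single hidden layer -> (n_samples, 1)
    h = np.maximum(0, h @ W_1 + b_1)      # second hidden layer
    return h @ W_2 + b_2                  # (n_samples, 1)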
def test_data():
    X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=.30,
                                                    random_state=0)
    return xtrain, xtest, ytrain, ytest
def test_regression_synthetic():
    """Test on synthetic regression datasets used in Leo Breiman,
    "Bagging Predictors". Machine Learning 24(2): 123-140 (1996).
    """
    random_state = check_random_state(1)
    regression_params = {'n_estimators': 100, 'max_depth': 4,
                         'min_samples_split': 2, 'learning_rate': 0.1,
                         'loss': 'ls'}

    # Friedman1
    X, y = datasets.make_friedman1(n_samples=1200, random_state=random_state,
                                   noise=1.0)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    clf = GradientBoostingRegressor()
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    assert mse < 5.0, "Failed on Friedman1 with mse = %.4f" % mse

    # Friedman2
    X, y = datasets.make_friedman2(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    clf = GradientBoostingRegressor(**regression_params)
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    assert mse < 1700.0, "Failed on Friedman2 with mse = %.4f" % mse

    # Friedman3
    X, y = datasets.make_friedman3(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    clf = GradientBoostingRegressor(**regression_params)
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    assert mse < 0.015, "Failed on Friedman3 with mse = %.4f" % mse
def error_curves(estimator, parameter, parameter_values, n_repeat=100):
    all_train_errors = []
    all_test_errors = []

    for i in range(n_repeat):
        X, y = make_friedman1(n_samples=200)
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            train_size=0.7)

        train_errors = []
        test_errors = []

        for j, p in enumerate(parameter_values):
            est = estimator(**{parameter: p})
            est.fit(X_train, y_train)

            train_errors.append(
                mean_squared_error(y_train, est.predict(X_train)))
            test_errors.append(
                mean_squared_error(y_test, est.predict(X_test)))

        all_train_errors.append(train_errors)
        all_test_errors.append(test_errors)

    return all_train_errors, all_test_errors
def test_fwls_regressor(self):
    feature_func = lambda x: np.ones(x.shape)
    bclf = LinearRegression()
    clfs = [RandomForestRegressor(n_estimators=50, random_state=1),
            GradientBoostingRegressor(n_estimators=25, random_state=1),
            Ridge(random_state=1)]

    # Friedman1
    X, y = datasets.make_friedman1(n_samples=1200, random_state=1, noise=1.0)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    sr = FWLSRegressor(bclf, clfs, feature_func, n_folds=3, verbose=0,
                       oob_score_flag=True)
    sr.fit(X_train, y_train)
    mse = mean_squared_error(y_test, sr.predict(X_test))
    assert_less(mse, 6.0)
def setUp(self):
    # Friedman1
    self.X, self.y = datasets.make_friedman1(n_samples=500, random_state=1,
                                             noise=1.0)
    self.X_train, self.y_train = self.X[:400], self.y[:400]
    self.X_test, self.y_test = self.X[400:], self.y[400:]
def __init__(self, numFeatures, numSamples, randomSeed):
    """
    :param numFeatures: total number of features to be used (at least 5)
    :param numSamples: number of samples in dataset
    :param randomSeed: random seed value used for reproducible results
    """
    self.numFeatures = numFeatures
    self.numSamples = numSamples
    self.randomSeed = randomSeed

    # generate test data:
    self.X, self.y = datasets.make_friedman1(n_samples=self.numSamples,
                                             n_features=self.numFeatures,
                                             noise=self.NOISE,
                                             random_state=self.randomSeed)

    # divide the data to a training set and a validation set:
    self.X_train, self.X_validation, self.y_train, self.y_validation = \
        model_selection.train_test_split(self.X, self.y,
                                         test_size=self.VALIDATION_SIZE,
                                         random_state=self.randomSeed)

    # print(self.X_train)
    # print(self.y_train)
    # np.savetxt('testX.out', (self.X_train))
    # np.savetxt('testY.out', (self.y_train))

    self.regressor = GradientBoostingRegressor(random_state=self.randomSeed)
def _create_test_data(self):
    X, y = datasets.make_friedman1(n_samples=20, random_state=13)
    X = pd.DataFrame(X)
    Y = Response.from_array(y / y.max())
    Z = Partition(size=X.shape[0], folds=5, reps=1, total_size=X.shape[0])
    Z.set(max_reps=1, max_folds=0)
    return Container(X), Y, Z
def test_fit(self):
    n_samples = 10000
    test_size = 0.2
    max_depth = 3
    lr = 0.1
    n_est = 100

    X, y = make_friedman1(n_samples=n_samples)
    n, m = X.shape
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size)

    model = GBM(distribution="gaussian",
                n_estimators=n_est,
                learning_rate=lr,
                max_depth=max_depth)
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)

    mse_gbm = np.mean((y_test - y_hat)**2)
    mse_baseline = np.mean((y_test - np.mean(y_train))**2)
    self.assertTrue(mse_gbm < mse_baseline)
def run():
    """Run profiling."""
    lc = LayerGenerator().get_sequential('stack', False, False)
    cm = CMLog(verbose=False)
    cm.monitor()
    sleep(5)

    t1 = int(np.floor(perf_counter() - cm._t0) * 10)
    sleep(0.1)

    x, z = make_friedman1(int(5 * 1e6))
    sleep(5)

    t2 = int(np.floor(perf_counter() - cm._t0) * 10)
    sleep(0.1)

    lc.fit(x, z)
    t3 = int(np.floor(perf_counter() - cm._t0) * 10)
    sleep(5)

    while not hasattr(cm, 'cpu'):
        cm.collect()
        sleep(1)

    return cm, t1, t2, t3
def test():
    X, y = make_friedman1(n_samples=1000)
    n, m = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    reg_params = {"subsample": 0.8, "max_depth": 4}
    rf = RandomForest(base_estimator=RegTree,
                      base_params=reg_params,
                      n_estimators=20)

    print("\n")
    print("-----------------------------------------------------")

    # Fit
    rf.fit(X_train, y_train)

    # Predict
    y_hat_default = rf.predict(X_test)
    y_hat_script = np.zeros(y_test.shape[0])
    for i, x in enumerate(X_test):
        y_hat_script[i] = apply_randomforest(x, rf.dump())

    # Error (compare absolute differences so negative gaps do not count as matches)
    match_rate = np.mean(np.abs(y_hat_default - y_hat_script) < 1e-12)
    print("match_rate: {0:.5f} %".format(match_rate * 100))

    print("-----------------------------------------------------")
    print("\n")
def main():
    dir = sys.argv[1]
    output_csv = dir + '/friedman1/friedman1_prep.csv'
    names = ["x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "y"]

    (X, y) = data.make_friedman1(n_samples=10000, random_state=123456,
                                 noise=1.0)
    y = np.matrix(y).T
    df = pd.DataFrame(np.append(X, y, axis=1), columns=names)
    df = scale(df)[1]

    # TODO Transform box-cox.
    lambdas = {
        'x1': 0.73772299748812553,
        'x10': 0.81728280581171431,
        'x2': 0.80698183857607453,
        'x3': 0.73814877672198154,
        'x4': 0.65907211104558194,
        'x5': 0.88664969513868797,
        'x6': 0.78156577216859524,
        'x7': 0.73707418190834051,
        'x8': 0.77589583265069417,
        'x9': 0.80351813801046301
    }
    df = transform_cox(df, lambdas)

    df.to_csv(output_csv, index=False)
def make_regression_dataset(dataset, n_rows, n_cols):
    np.random.seed(137)
    if dataset == 'reg1':
        X, y = make_regression(n_rows, n_cols, n_informative=2, n_targets=1,
                               random_state=137)
    elif dataset == 'reg2':
        X, y = make_regression(n_rows, n_cols, n_informative=2, n_targets=1,
                               random_state=137, noise=10)
    elif dataset == 'Friedman':
        X, y = make_friedman1(n_samples=n_rows, n_features=n_cols, noise=0.0,
                              random_state=137)
    else:
        raise ValueError('Wrong option for dataset: ', dataset)

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    dtype = np.float32
    X = X.astype(dtype)
    y = y.astype(dtype)

    X_train, X_test, y_train, y_test = train_test_split(X, y)
    return X_train, X_test, y_train, y_test
def test_simulated_annealing():
    """
    This test creates a dataset in which 5 features are used to compute `y`
    and the remaining 5 features are independent of `y`. The test should
    select the first 5 feature columns (those used to compute `y`) more
    often than the second set of 5 independent features.
    """
    X, y = make_friedman1(n_samples=200, n_features=10, random_state=10)

    N = 10
    results = np.zeros((N, X.shape[1]))
    for n in range(0, N):
        results[n] = simulated_annealing(scorer, X, y, bools=True)

    assert results.sum(axis=0)[0] >= results.sum(axis=0)[5]
    assert results.sum(axis=0)[1] >= results.sum(axis=0)[6]
    # Omit feature 2 because its signal is weaker and harder to detect
    assert results.sum(axis=0)[3] >= results.sum(axis=0)[8]
    assert results.sum(axis=0)[4] >= results.sum(axis=0)[9]

    # Test output is non-empty
    features = simulated_annealing(scorer, X, y)
    assert len(features) > 0
    features = simulated_annealing(scorer, X, y, bools=True)
    assert len(features) > 0

    # Test outputs are correct types
    features = simulated_annealing(scorer, X, y)
    assert isinstance(features[0], np.int64)
    features = simulated_annealing(scorer, X, y, bools=True)
    assert isinstance(features[0], np.bool_)
def gradient_boosting(features_values_temp, rows_temp, columns_temp,
                      prediction_values_temp, kernel, threshold):
    # kernel: linear, poly, rbf, sigmoid, precomputed

    rows = 0
    while rows_temp > 0:
        rows = rows + 1
        rows_temp = rows_temp - 1

    columns = 0
    while columns_temp > 0:
        columns = columns + 1
        columns_temp = columns_temp - 1

    features_values = [x for x in features_values_temp]
    prediction_values = [y for y in prediction_values_temp]
    rotated = convert_list_to_matrix(features_values, rows, columns)
    scores = np.array(prediction_values)

    threshold = float(threshold)

    # try to change to the model for which the test is going to run
    # (lasso, ridge, etc.)
    estimator = SVR(kernel=kernel)

    X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
    X_train, X_test = X[:200], X[200:]
    y_train, y_test = y[:200], y[200:]
    est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                    max_depth=1, random_state=0,
                                    loss='ls').fit(X_train, y_train)
    return mean_squared_error(y_test, est.predict(X_test))
def generate_without_missing_values(data='simple', n_samples=200,
                                    n_features=2, random_state=0):
    """Generate canonical regression data."""
    assert data in ['simple', 'linear', 'quadratic', 'friedman']

    np.random.seed(random_state)

    mean = np.ones(n_features)
    ro = .5
    cov = ro * np.ones((n_features, n_features)) + \
        (1 - ro) * np.eye(n_features)
    X = np.random.multivariate_normal(mean, cov, size=n_samples)

    epsilon = 0.1 * np.random.randn(n_samples)

    if data == 'simple':
        y = X[:, 0] + epsilon
    if data == 'linear':
        beta = [1, 2] + list(np.random.randn(n_features - 2))
        y = X.dot(beta) + epsilon
    if data == 'quadratic':
        y = X[:, 0] * X[:, 0] + epsilon
    if data == 'friedman':
        # X is no longer Gaussian here
        X, y = make_friedman1(n_samples=n_samples,
                              n_features=max(5, n_features),
                              noise=0.1, random_state=random_state)

    return X, y
def generate_friedman1(seed):
    (X, y) = data.make_friedman1(n_samples=2000, random_state=seed, noise=1.0)

    # transform values to DataMatrix/DataVector types
    X = sg.DataMatrix(X)
    y = sg.DataVector(y)

    return (X, y)
def test_regression():
    X, y = make_friedman1(n_samples=100000, noise=5)
    n, m = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

    models = {
        "palobst": PaloBoost(
            distribution="gaussian",
            n_estimators=100,
            learning_rate=1.0,
            max_depth=4,
            subsample=0.5,
        ),
        "gbm": GBM(
            distribution="gaussian",
            n_estimators=100,
            learning_rate=1.0,
            max_depth=4,
            subsample=0.5,
        ),
        "sklearn": GradientBoostingRegressor(
            n_estimators=100, learning_rate=1.0, max_depth=4, subsample=0.5),
    }

    print("\n")
    print("# Test Regression")
    print("-----------------------------------------------------")
    print(" model_name train_time predict_time rmse ")
    print("-----------------------------------------------------")
    print(" {0:12} {1:12} {2:12} {3:.5f}".format(
        "baseline", "-", "-", np.std(y_test)))

    for name, model in models.items():
        # Fit
        start = time.time()
        model.fit(X_train, y_train)
        time_fit = time.time() - start

        # Predict
        start = time.time()
        y_hat = model.predict(X_test)
        time_pred = time.time() - start

        # Error
        rmse = np.sqrt(np.mean((y_test - y_hat)**2))

        print(" {0:12} {1:.5f} sec {2:.5f} sec {3:.5f}".format(
            name, time_fit, time_pred, rmse))

    print("-----------------------------------------------------")
    print("\n")
def test():
    from sklearn.datasets import make_friedman1
    from sklearn.svm import SVR

    X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
    estimator = SVR(kernel="linear")
    selector = RFECVp(estimator, step=1, cv=5)
    selector = selector.fit(X, y)
    print(selector.support_)  # doctest: +NORMALIZE_WHITESPACE
    print(selector.ranking_)
def friedman1(n_samples=100, n_features=10, noise=0.0, random_state=None):
    return datasets.make_friedman1(n_samples=n_samples,
                                   n_features=n_features,
                                   noise=noise,
                                   random_state=random_state)
def test_make_friedman1():
    X, y = make_friedman1(n_samples=5, n_features=10, noise=0.0,
                          random_state=0)

    assert_equal(X.shape, (5, 10), "X shape mismatch")
    assert_equal(y.shape, (5,), "y shape mismatch")

    assert_array_almost_equal(
        y,
        10 * np.sin(np.pi * X[:, 0] * X[:, 1])
        + 20 * (X[:, 2] - 0.5) ** 2
        + 10 * X[:, 3]
        + 5 * X[:, 4]
    )
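# A minimal sketch (not taken from any of the snippets here) that re-derives
# the Friedman #1 target the test above checks: features are uniform on [0, 1]
# and y = 10*sin(pi*x0*x1) + 20*(x2 - 0.5)**2 + 10*x3 + 5*x4, plus optional
# Gaussian noise. Only the first five columns carry signal; extra columns are
# pure noise inputs. The helper name and RNG handling are assumptions.
import numpy as np


def friedman1_like(n_samples=100, n_features=10, noise=0.0, rng=None):
    rng = np.random.default_rng(rng)
    X = rng.uniform(size=(n_samples, n_features))  # requires n_features >= 5
    y = (10 * np.sin(np.pi * X[:, 0] * X[:, 1])
         + 20 * (X[:, 2] - 0.5) ** 2
         + 10 * X[:, 3]
         + 5 * X[:, 4]
         + noise * rng.standard_normal(n_samples))
    return X, y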
def create_complex_regression_dataset(plot=False):
    from sklearn.datasets import make_friedman1
    X, y = make_friedman1(n_samples=100, n_features=7, random_state=0)
    if plot:
        plt.figure()
        plt.title("Complex regression problem with one input variable")
        plt.scatter(X[:, 2], y, marker="o", s=50)
        plt.show()
    return X, y
def test_rfe_importance_getter_validation(importance_getter, err_type,
                                          Selector):
    X, y = make_friedman1(n_samples=50, n_features=10, random_state=42)
    estimator = LinearSVR()
    log_estimator = TransformedTargetRegressor(
        regressor=estimator, func=np.log, inverse_func=np.exp
    )

    with pytest.raises(err_type):
        model = Selector(log_estimator, importance_getter=importance_getter)
        model.fit(X, y)
def test_select_from_model_pls(PLSEstimator):
    """Check the behaviour of SelectFromModel with PLS estimators.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12410
    """
    X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
    estimator = PLSEstimator(n_components=1)
    model = make_pipeline(SelectFromModel(estimator), estimator).fit(X, y)
    assert model.score(X, y) > 0.5
def load_toy_dataset():
    X, Y = make_friedman1(n_samples=200, n_features=15)
    # X = [
    #     [1, 1, 1, 1, 1],
    #     [2, 2, 2, 2, 2],
    #     [3, 3, 3, 3, 3],
    # ]
    # Y = [1.1, 2.2, 3.3]
    return np.asarray(X), np.asarray(Y)
def get_friedman():
    n_samples = 10000
    noise = 5
    X, y = make_friedman1(n_samples=n_samples, noise=noise)
    poly = PolynomialFeatures(degree=2, include_bias=False,
                              interaction_only=True)
    X = poly.fit_transform(X)
    print(X.shape)
    return X, y
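# Quick sanity check (added for illustration, not part of the snippet above):
# with degree=2, interaction_only=True and include_bias=False,
# PolynomialFeatures keeps the n original columns plus one column per pair of
# features, so the default 10 Friedman features expand to 10 + 10*9/2 = 55.
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

n = 10
expected_cols = n + n * (n - 1) // 2   # 55
X_demo = np.random.rand(4, n)
X_expanded = PolynomialFeatures(degree=2, include_bias=False,
                                interaction_only=True).fit_transform(X_demo)
assert X_expanded.shape[1] == expected_cols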
def test_rfe_pls(ClsRFE, PLSEstimator):
    """Check the behaviour of RFE with PLS estimators.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12410
    """
    X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
    estimator = PLSEstimator(n_components=1)
    selector = ClsRFE(estimator, step=1).fit(X, y)
    assert selector.score(X, y) > 0.5
def make_sample():
    """
    Return (X_train, X_test, y_train, y_test)
    """
    X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
    X_train, X_test = X[:200], X[200:]
    y_train, y_test = y[:200], y[200:]
    result = (X_train, X_test, y_train, y_test)
    return result
def test():
    X, y = make_friedman1(n_samples=10000)
    # X, y = make_friedman2(n_samples=100000)
    # X, y = make_friedman3(n_samples=100000)
    n, m = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    models = {
        "bonsai-reg": RegTree(max_depth=3),
        "bonsai-xgb": XGBTree(max_depth=3),
        "sklearn": DecisionTreeRegressor(max_depth=3),
    }

    print("\n")
    print("-----------------------------------------------------")
    print(" model_name train_time predict_time rmse ")
    print("-----------------------------------------------------")
    print(" {0:12} {1:12} {2:12} {3:.5f}".format(
        "baseline", "-", "-", np.std(y_test)))

    for name, model in models.items():
        # Fit
        start = time.time()
        model.fit(X_train, y_train)
        time_fit = time.time() - start

        # Predict
        start = time.time()
        y_hat = model.predict(X_test)
        time_pred = time.time() - start

        # Error
        rmse = np.sqrt(np.mean((y_test - y_hat)**2))

        print(" {0:12} {1:.5f} sec {2:.5f} sec {3:.5f}".format(
            name, time_fit, time_pred, rmse))

    print("-----------------------------------------------------")
    print("\n")

    print("-----------------------------------------------------")
    print(" model_name feature_importances_ ")
    print("-----------------------------------------------------")

    for name, model in models.items():
        f_cnt = Counter(
            {i: v for i, v in enumerate(model.feature_importances_)})
        fi = ", ".join(
            ["{}:{:.3f}".format(i, v) for i, v in f_cnt.most_common(4)])
        print(" {0:12} {1}".format(name, fi))

    print("-----------------------------------------------------")
    print("\n")
def test_recursive_feature_elimination():
    """
    This test creates a dataset in which 5 features are used to compute `y`
    and the remaining 5 features are independent of `y`. The test should
    select the 5 feature columns used to compute `y`.
    """
    X, y = make_friedman1(n_samples=200, n_features=10, random_state=10)

    features = recursive_feature_elimination(scorer, X, y,
                                             n_features_to_select=4)
    assert features == [0, 1, 3, 4]

    # Test with an n_features_to_select other than half of the total number
    # of features, to ensure the logic that stops feature elimination works
    # correctly.
    features = recursive_feature_elimination(scorer, X, y,
                                             n_features_to_select=3)
    assert len(features) == 3

    # Retest with column names
    X, y = make_friedman1(n_samples=200, n_features=10, random_state=10)
    X = pd.DataFrame(X, columns=[
        'zero', 'one', 'two', 'three', 'four',
        'five', 'six', 'seven', 'eight', 'nine'
    ])
    features = recursive_feature_elimination(scorer, X, y,
                                             n_features_to_select=4)
    assert features == ['zero', 'one', 'three', 'four']
def poly():
    plt.figure()
    plt.title('Complex regression problem with one input variable')
    X_F1, y_F1 = make_friedman1(n_samples=100, n_features=7, random_state=0)
    plt.scatter(X_F1[:, 2], y_F1, marker='o', s=50)
    plt.show()

    X_train, X_test, y_train, y_test = train_test_split(X_F1, y_F1,
                                                        random_state=0)
    linreg = LinearRegression().fit(X_train, y_train)
    print('linear model coeff (w): {}'.format(linreg.coef_))
    print('linear model intercept (b): {:.3f}'.format(linreg.intercept_))
    print('R-squared score (training): {:.3f}'.format(
        linreg.score(X_train, y_train)))
    print('R-squared score (test): {:.3f}'.format(
        linreg.score(X_test, y_test)))

    print('\nNow we transform the original input data to add polynomial '
          'features up to degree 2 (quadratic)\n')
    poly = PolynomialFeatures(degree=2)
    X_F1_poly = poly.fit_transform(X_F1)

    X_train, X_test, y_train, y_test = train_test_split(X_F1_poly, y_F1,
                                                        random_state=0)
    linreg = LinearRegression().fit(X_train, y_train)
    print('(poly deg 2) linear model coeff (w):\n{}'.format(linreg.coef_))
    print('(poly deg 2) linear model intercept (b): {:.3f}'.format(
        linreg.intercept_))
    print('(poly deg 2) R-squared score (training): {:.3f}'.format(
        linreg.score(X_train, y_train)))
    print('(poly deg 2) R-squared score (test): {:.3f}\n'.format(
        linreg.score(X_test, y_test)))

    print('\nAddition of many polynomial features often leads to '
          'overfitting, so we often use polynomial features in combination '
          'with regression that has a regularization penalty, like ridge '
          'regression.\n')
    X_train, X_test, y_train, y_test = train_test_split(X_F1_poly, y_F1,
                                                        random_state=0)
    linreg = Ridge().fit(X_train, y_train)
    print('(poly deg 2 + ridge) linear model coeff (w):\n{}'.format(
        linreg.coef_))
    print('(poly deg 2 + ridge) linear model intercept (b): {:.3f}'.format(
        linreg.intercept_))
    print('(poly deg 2 + ridge) R-squared score (training): {:.3f}'.format(
        linreg.score(X_train, y_train)))
    print('(poly deg 2 + ridge) R-squared score (test): {:.3f}'.format(
        linreg.score(X_test, y_test)))
def genFriedman(self, i=1, N=240, D=10):
    if i not in range(1, 4):
        raise Exception('not a correct dataset')

    if i == 1:
        X, Y = datasets.make_friedman1(N, D)
    # Note: make_friedman2/make_friedman3 always have 4 features; their
    # second positional argument is `noise`, not the number of features.
    if i == 2:
        X, Y = datasets.make_friedman2(N, D)
    if i == 3:
        X, Y = datasets.make_friedman3(N, D)
    return X, Y
def generate_baseline_data(include_cat):
    # positional args: n_samples=NUM_SAMPLES, n_features=5, noise=100, random_state=1
    X, y = datasets.make_friedman1(NUM_SAMPLES, 5, 100, 1)

    # convert to a binomial
    prob = 1 / (1 + np.exp(-y))
    y = np.random.binomial(1, prob)
    print('Event rate = {0:4.4f}'.format(np.sum(y) / NUM_SAMPLES))

    data = np.hstack((y.reshape(-1, 1), X))
    data = pd.DataFrame(data, columns=['y', 'x0', 'x1', 'x2', 'x3', 'x4'])

    if include_cat is True:
        data['c'] = data.apply(lambda row: 'A' if row.y == 1 else 'B', axis=1)

    return data
def rf_fear_test_home(n=10, n_trees=10):
    cblparallel.start_port_forwarding()

    # Data
    X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
    X_train, X_test = X[:200], X[200:]
    y_train, y_test = y[:200], y[200:]

    # Params
    # local_temp_path = os.path.abspath('../temp/')
    # remote_temp_path = 'python/'

    # Write data file locally
    # data_file = mkstemp_safe(cblparallel.config.LOCAL_TEMP_PATH, '.p')
    data_file = mkstemp_safe(cblparallel.config.HOME_TEMP_PATH, '.p')
    with open(data_file, 'w') as f:
        pickle.dump((X_train, y_train, X_test), f)

    # Prepare code
    scripts = [reduced_tree_code % {'data_file': os.path.join(
                                        cblparallel.config.REMOTE_TEMP_PATH,
                                        os.path.split(data_file)[-1]),
                                    'n_trees': n_trees,
                                    'random_state': i * n_trees,
                                    'output_file': '%(output_file)s',
                                    'flag_file': '%(flag_file)s'}
               for i in range(n)]

    # Submit to fear
    with cblparallel.fear(via_gate=True) as fear:
        fear.copy_to(data_file,
                     os.path.join(cblparallel.config.REMOTE_TEMP_PATH,
                                  os.path.split(data_file)[-1]))
        output_files = cblparallel.run_batch_on_fear(scripts, max_jobs=1000)
        fear.rm(os.path.join(cblparallel.config.REMOTE_TEMP_PATH,
                             os.path.split(data_file)[-1]))

    # Kill local data file
    os.remove(data_file)

    # Now do something with the output
    estimators = []
    predictions = []
    for output_file in output_files:
        with open(output_file, 'r') as f:
            # (estimator, prediction) = pickle.load(f)
            prediction = np.genfromtxt(output_file, delimiter=',')
        os.remove(output_file)
        # estimators.append(estimator)
        predictions.append(prediction)
    # ens = EnsembleRegressor(estimators)
    # return RMSE(X_test, y_test, ens)
    ens_pred = np.mean(predictions, axis=0)
    return RMSE_y(y_test, ens_pred)
def rf_fear_test(n=10, n_trees=1000):
    # Data
    X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
    X_train, X_test = X[:200], X[200:]
    y_train, y_test = y[:200], y[200:]

    # Params
    local_temp_path = os.path.abspath('../temp/')
    remote_temp_path = 'python/'

    # Write data file locally
    data_file = mkstemp_safe(local_temp_path, '.p')
    with open(data_file, 'w') as f:
        pickle.dump((X_train, y_train, X_test), f)

    # Prepare code
    scripts = [tree_code % {'data_file': os.path.split(data_file)[-1],
                            'n_trees': n_trees,
                            'random_state': i * n_trees,
                            'output_file': '%(output_file)s',
                            'flag_file': '%(flag_file)s'}
               for i in range(n)]

    # Submit to fear
    with pyfear.fear() as fear:
        fear.copy_to(data_file, os.path.join(remote_temp_path,
                                             os.path.split(data_file)[-1]))
        output_files = pyfear.run_python_jobs(scripts, local_temp_path,
                                              remote_temp_path, fear)
        fear.rm(os.path.join(remote_temp_path, os.path.split(data_file)[-1]))

    # Kill local data file
    os.remove(data_file)

    # Now do something with the output
    estimators = []
    predictions = []
    for output_file in output_files:
        with open(output_file, 'r') as f:
            # (estimator, prediction) = pickle.load(f)
            prediction = np.genfromtxt(output_file, delimiter=',')
        os.remove(output_file)
        # estimators.append(estimator)
        predictions.append(prediction)
    # ens = EnsembleRegressor(estimators)
    # return RMSE(X_test, y_test, ens)
    ens_pred = np.mean(predictions, axis=0)
    return RMSE_y(y_test, ens_pred)
def test_staged_predict():
    # Test whether the staged predictions eventually give
    # the same prediction as ``predict``.
    X, y = datasets.make_friedman1(n_samples=1200, random_state=1, noise=1.0)
    X_train, y_train = X[:200], y[:200]
    X_test = X[200:]
    clf = GradientBoostingRegressor()

    # test raise ValueError if not fitted
    assert_raises(ValueError,
                  lambda X: np.fromiter(clf.staged_predict(X),
                                        dtype=np.float64),
                  X_test)

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # test if prediction for last stage equals ``predict``
    for y in clf.staged_predict(X_test):
        assert_equal(y.shape, y_pred.shape)
    assert_array_equal(y_pred, y)
def test_regression_synthetic():
    # Test on synthetic regression datasets used in Leo Breiman,
    # "Bagging Predictors". Machine Learning 24(2): 123-140 (1996).
    random_state = check_random_state(1)
    regression_params = {'n_estimators': 100, 'max_depth': 4,
                         'min_samples_split': 2, 'learning_rate': 0.1,
                         'loss': 'ls'}

    # Friedman1
    X, y = datasets.make_friedman1(n_samples=1200, random_state=random_state,
                                   noise=1.0)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    for presort in True, False:
        clf = GradientBoostingRegressor(presort=presort)
        clf.fit(X_train, y_train)
        mse = mean_squared_error(y_test, clf.predict(X_test))
        assert_less(mse, 5.0)

    # Friedman2
    X, y = datasets.make_friedman2(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    for presort in True, False:
        regression_params['presort'] = presort
        clf = GradientBoostingRegressor(**regression_params)
        clf.fit(X_train, y_train)
        mse = mean_squared_error(y_test, clf.predict(X_test))
        assert_less(mse, 1700.0)

    # Friedman3
    X, y = datasets.make_friedman3(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    for presort in True, False:
        regression_params['presort'] = presort
        clf = GradientBoostingRegressor(**regression_params)
        clf.fit(X_train, y_train)
        mse = mean_squared_error(y_test, clf.predict(X_test))
        assert_less(mse, 0.015)
def test_regressor(self):
    X, y = datasets.make_friedman1(n_samples=1200, random_state=1, noise=1.0)
    X_train, y_train = X[:200], y[:200]
    index = [i for i in range(200)]

    rf = RandomForestRegressor()
    jrf = JoblibedRegressor(rf, "rfr", cache_dir='')
    jrf.fit(X_train, y_train, index)
    prediction = jrf.predict(X_train, index)
    mse = mean_squared_error(y_train, prediction)
    assert_less(mse, 6.0)

    rf = RandomForestRegressor(n_estimators=20)
    jrf = JoblibedRegressor(rf, "rfr", cache_dir='')
    jrf.fit(X_train, y_train, index)
    prediction2 = jrf.predict(X_train, index)
    assert_allclose(prediction, prediction2)
def local_forest_test(n=10, n_trees=10):
    # Data
    X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
    X_train, X_test = X[:200], X[200:]
    y_train, y_test = y[:200], y[200:]

    # Params
    # local_temp_path = os.path.abspath('../temp/')
    # remote_temp_path = 'python/'

    # Write data file locally
    data_file = mkstemp_safe(cblparallel.config.HOME_TEMP_PATH, '.p')
    with open(data_file, 'w') as f:
        pickle.dump((X_train, y_train, X_test), f)

    # Prepare code
    scripts = [reduced_tree_code % {'data_file': data_file,
                                    'n_trees': n_trees,
                                    'random_state': i * n_trees,
                                    'output_file': '%(output_file)s',
                                    'flag_file': '%(flag_file)s'}
               for i in range(n)]

    # Run batch in parallel
    output_files = cblparallel.run_batch_locally(scripts)

    # Kill local data file
    os.remove(data_file)

    # Now do something with the output
    estimators = []
    predictions = []
    for output_file in output_files:
        with open(output_file, 'r') as f:
            # (estimator, prediction) = pickle.load(f)
            prediction = np.genfromtxt(output_file, delimiter=',')
        os.remove(output_file)
        # estimators.append(estimator)
        predictions.append(prediction)
    # ens = EnsembleRegressor(estimators)
    # return RMSE(X_test, y_test, ens)
    ens_pred = np.mean(predictions, axis=0)
    return RMSE_y(y_test, ens_pred)
def test_rfe_min_step():
    n_features = 10
    X, y = make_friedman1(n_samples=50, n_features=n_features, random_state=0)
    n_samples, n_features = X.shape
    estimator = SVR(kernel="linear")

    # Test when floor(step * n_features) <= 0
    selector = RFE(estimator, step=0.01)
    sel = selector.fit(X, y)
    assert_equal(sel.support_.sum(), n_features // 2)

    # Test when step is between (0, 1) and floor(step * n_features) > 0
    selector = RFE(estimator, step=0.20)
    sel = selector.fit(X, y)
    assert_equal(sel.support_.sum(), n_features // 2)

    # Test when step is an integer
    selector = RFE(estimator, step=5)
    sel = selector.fit(X, y)
    assert_equal(sel.support_.sum(), n_features // 2)
def test_stacked_regressor(self):
    bclf = LinearRegression()
    clfs = [RandomForestRegressor(n_estimators=50, random_state=1),
            GradientBoostingRegressor(n_estimators=25, random_state=1),
            Ridge(random_state=1)]

    # Friedman1
    X, y = datasets.make_friedman1(n_samples=1200, random_state=1, noise=1.0)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    sr = StackedRegressor(bclf, clfs, n_folds=3, verbose=0,
                          oob_score_flag=True)
    sr.fit(X_train, y_train)
    mse = mean_squared_error(y_test, sr.predict(X_test))
    assert_less(mse, 6.0)
def create_reg_syn_data(self, reps=1, rows=None):
    """Create synthetic data using the friedman1 sample generator.

    Returns
    -------
    X : pd.DataFrame
        The input as a pd.DataFrame.
    Y : np.ndarray
        The targets as an ndarray.
    Z : Partition
        The partition object holding 5 folds.
    """
    if rows is None:
        rows = 1000
    X, y = make_friedman1(n_samples=rows, random_state=13)
    X = pd.DataFrame(data=X, columns=map(unicode, range(X.shape[1])))
    Y = Response.from_array(y)
    Z = Partition(size=X.shape[0], folds=5, reps=reps, total_size=X.shape[0])
    Z.set(max_reps=reps, max_folds=0)
    X = Container(dataframe=X)
    return X, Y, Z
import numpy as np

from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_raises_regex
from sklearn.utils.testing import assert_allclose
from sklearn.utils.testing import assert_warns_message
from sklearn.utils.testing import assert_no_warnings

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, Lasso

from sklearn import datasets

friedman = datasets.make_friedman1(random_state=0)


def test_transform_target_regressor_error():
    X, y = friedman
    # provide a transformer and functions at the same time
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      transformer=StandardScaler(),
                                      func=np.exp, inverse_func=np.log)
    assert_raises_regex(ValueError, "'transformer' and functions"
                        " 'func'/'inverse_func' cannot both be set.",
                        regr.fit, X, y)
    # fit with sample_weight with a regressor which does not support it
    sample_weight = np.ones((y.shape[0],))
    regr = TransformedTargetRegressor(regressor=Lasso(),
                                      transformer=StandardScaler())
#!/usr/bin/env python
# typical usage
import supylearner as sl
from sklearn import datasets, svm, linear_model, neighbors
import numpy as np

# generate dataset
np.random.seed(100)
X, y = datasets.make_friedman1(1000)

ols = linear_model.LinearRegression()
elnet = linear_model.ElasticNetCV(l1_ratio=.1)
ridge = linear_model.RidgeCV()
lars = linear_model.LarsCV()
lasso = linear_model.LassoCV()
nn = neighbors.KNeighborsRegressor()
svm1 = svm.SVR(kernel='rbf')
svm2 = svm.SVR(kernel='poly')

lib = [ols, elnet, ridge, lars, lasso, nn, svm1, svm2]
libnames = ["OLS", "ElasticNet", "Ridge", "LARS", "LASSO", "kNN",
            "SVM rbf", "SVM poly"]

sl_inst = sl.SuperLearner(lib, libnames, loss="L2")

sl_inst.fit(X, y)

sl_inst.summarize()

sl.cv_superlearner(sl_inst, X, y, K=5)
def rf_r_test(n=10):
    X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
    X_train, X_test = X[:200], X[200:]
    y_train, y_test = y[:200], y[200:]
    ens = EnsembleRegressor([RandomForestRegressor(n_estimators=1,
                                                   max_depth=None,
                                                   min_samples_split=1,
                                                   random_state=i)
                             for i in range(n)]).fit(X_train, y_train)
    return RMSE(X_test, y_test, ens)
def friedman1(n_samples=20000):
    """Generated data."""
    (data, target) = datasets.make_friedman1(n_samples=n_samples)
    return DatasetFactory.Dataset(data=data, target=target)
from sklearn.datasets import make_friedman1
from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt
from rfs import FFS

if __name__ == '__main__':
    n_features = 100
    n_samples = 5000
    X, y = make_friedman1(n_samples=n_samples, n_features=n_features,
                          random_state=0)
    linear = LinearRegression()
    score = []
    n_lst = np.arange(1, 20, 1)

    # Record R^2 for each feature-ranking step of forward feature selection
    for i in n_lst:
        selector = FFS(linear, i, step=1, verbose=0)
        selector.fit(X, y)
        score.append(selector.score(X, y))

    plt.plot(n_lst, score, label="score")
    # plt.plot(test_sizes, test_error, label="test")
    plt.legend()
    plt.xlabel('number of features selected')
    plt.ylabel('R^2')
    plt.show()
def test_vs_linear_model(N=10, M=10, informative=5):
    # random.seed(1)

    def run(X, y):
        def fit_predict(name):
            lin.fit(X_train, y_train.ravel())
            y_pred = lin.predict(X_test)
            mae = mean_absolute_error(y_test, y_pred)
            mse = mean_squared_error(y_test, y_pred)
            alpha = lin.alpha_ if 'alpha_' in dir(lin) else np.nan
            print("%s: mae=%f mse=%f alpha=%f" % (name, mae, mse, alpha))

        X_train = X[:N, :]
        y_train = y[:N]
        X_test = X[N:, :]
        y_test = y[N:]
        alphas = [10 * (0.1**i) for i in range(10)]
        print("X_train:", X_train.shape, "X_test:", X_test.shape)

        lin = linear_model.RidgeCV(alphas=alphas)
        fit_predict("ridge")
        lin = linear_model.LassoCV(alphas=alphas)
        fit_predict("lasso")

        lin_r = linear_model.RidgeCV(alphas=alphas).fit(X_train, y_train)
        lin_l = linear_model.LassoCV(alphas=alphas).fit(X_train, y_train)
        lin = LinearMAE(l1=lin_l.alpha_, l2=lin_r.alpha_, verbose=0,
                        opt='CG', maxiter=300)
        fit_predict("LinearMAE")

        lin = RandomForestRegressor(n_estimators=100, max_depth=12,
                                    n_jobs=-1, verbose=0,
                                    random_state=3465343)
        fit_predict("RFRegressor")

        lin = GradientBoostingRegressor(n_estimators=100, loss='lad',
                                        verbose=0, max_depth=12,
                                        learning_rate=0.1, subsample=1.0,
                                        random_state=3465343)
        fit_predict("GBRegressor")

    # for noise in [0.01, 0.1]:
    for noise in [0.1]:
        print("\nLinear Problem: noise=%.2f%%\n========" % (noise * 100,))
        a = np.random.sample(M)
        N2 = N * 2
        X = np.reshape(np.random.sample(N2 * M), (N2, M))
        y = np.dot(X, a) + np.random.sample(N2) * noise
        run(X, y)

        print("\nRegression Problem: noise=%.2f%%\n========" % (noise * 100,))
        X, y = make_regression(n_samples=N * 2, n_features=M,
                               n_informative=informative, n_targets=1,
                               bias=0.0, effective_rank=None,
                               tail_strength=0.5, noise=noise, shuffle=True,
                               coef=False, random_state=None)
        run(X, y)

        print("\nRegression Problem, effective_rank=5 noise=%.2f%%\n========"
              % (noise * 100,))
        X, y = make_regression(n_samples=N * 2, n_features=M,
                               n_informative=informative, n_targets=1,
                               bias=0.0, effective_rank=5, tail_strength=0.5,
                               noise=noise, shuffle=True, coef=False,
                               random_state=None)
        run(X, y)

        print("\nFriedman1 Problem noise=%.2f%%\n========" % (noise * 100,))
        X, y = make_friedman1(n_samples=N * 2, n_features=M, noise=noise,
                              random_state=None)
        run(X, y)
def with_best_first(cls, max_leaf_nodes):
    return partial(cls, max_leaf_nodes=max_leaf_nodes)


def uniform_dataset(args):
    X = np.random.random(size=(args.num_examples, args.num_features))
    y = np.random.choice([-1, 1], size=args.num_examples)
    return (X, y)


DATASETS = {
    "uniform": uniform_dataset,
    "hastie": lambda args: datasets.make_hastie_10_2(
        n_samples=args.num_examples),
    "friedman1": lambda args: datasets.make_friedman1(
        n_samples=args.num_examples, n_features=args.num_features),
    "friedman2": lambda args: datasets.make_friedman2(
        n_samples=args.num_examples, noise=args.noise),
    "friedman3": lambda args: datasets.make_friedman3(
        n_samples=args.num_examples, noise=args.noise),
    "make_regression": lambda args: datasets.make_regression(
        n_samples=args.num_examples,
        n_features=args.num_features,
        n_informative=args.num_informative)
}

ENSEMBLE_REGRESSORS = [
    ("GB-D1", with_depth(ensemble.GradientBoostingRegressor, 1)),
    ("GB-D3", with_depth(ensemble.GradientBoostingRegressor, 3)),
    ("GB-B10", with_best_first(ensemble.GradientBoostingRegressor, 10)),
    ("RF-D1", with_depth(ensemble.RandomForestRegressor, 1)),
        for t in self.trees:
            np.random.sample()
            t.fit(X, Y)

    def predict(self, X):
        Y = []
        trees = self.trees
        for x in X:
            Y.append(median([t.predict_one(x) for t in trees]))
        return Y"""


if __name__ == '__main__':
    from sklearn.metrics import r2_score
    from sklearn.datasets import make_friedman1
    from sklearn.tree import DecisionTreeRegressor

    X, Y = make_friedman1(10000, 100)
    X_train, Y_train = X[:9000], Y[:9000]
    X_test, Y_test = X[9000:], Y[9000:]

    clf = Forest(50, 10, .7, 1)  # Regressor(10)
    clf2 = RandomForestRegressor(50, max_depth=10)
    clf.fit(X_train, Y_train)
    clf2.fit(X_train, Y_train)
    pred = clf.predict(X_test)
    pred2 = clf2.predict(X_test)
    print(r2_score(Y_test, pred))
    print(r2_score(Y_test, pred2))
if __name__ == "__main__":
    pdata = np.recfromcsv('/mindhive/gablab/sad/PY_STUDY_DIR/Block/volsurf/l2output/social/split_halves/regression/lsasDELTA/6mm/allsubs.csv', names=True)

    subject_num = len(pdata.subject)

    # initialize dependent variable
    y = pdata.lsas_pre - pdata.lsas_post

    # if you change the number here, also modify the assignments below
    # (and vice versa)
    ind_variables_num = 4

    # initialize design matrix
    X = np.zeros([subject_num, ind_variables_num])
    X[:, 0] = pdata.lsas_pre
    X[:, 1] = pdata.classtype - 2
    X[:, 2] = pdata.age
    X[:, 3] = pdata.sex - 1

    print("running FS")

    from sklearn.datasets import make_friedman1
    X1, y1 = make_friedman1(n_samples=50, n_features=10, random_state=0)
    estimator = SVR(kernel="linear")
    selector1 = RFE(estimator, 3, step=1)
    selector1 = selector1.fit(X, y)

    selector, Reg = do_FS(X, y)
# 1.11.4.3. Fitting additional weak-learners
from sklearn.metrics import mean_squared_error
from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor

X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
X_train, X_test = X[:200], X[200:]
y_train, y_test = y[:200], y[200:]

est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                max_depth=1, random_state=0,
                                loss='ls').fit(X_train, y_train)

_ = est.set_params(n_estimators=200, warm_start=True)  # set warm_start and the new number of trees
_ = est.fit(X_train, y_train)  # fit 100 additional trees to est
print(mean_squared_error(y_test, est.predict(X_test)))  # 3.84...
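# A small follow-up sketch (not part of the original snippet): with
# warm_start=True already set, the same pattern can be repeated to keep
# growing the ensemble in further increments. The 300-tree figure is an
# arbitrary choice for illustration, and no resulting error is quoted here.
_ = est.set_params(n_estimators=300)   # request another 100 trees
_ = est.fit(X_train, y_train)          # trees 201-300 are added; earlier trees are kept
mse_300 = mean_squared_error(y_test, est.predict(X_test))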