def test_f_regression_select():
    """Smoke-test ``f_regression_select`` on a wide and on a narrow dataset.

    The returned selections are not inspected; the test only checks that
    both calls complete without raising.
    """
    # print() with a single string argument produces the same output on
    # Python 2 and Python 3; the bare print statement is Python 2 only.
    print("==> a lot of features")
    X, y = make_regression(n_samples=20000, n_features=200,
                           n_informative=150, shuffle=False, random_state=0)
    f_regression_select(X, y, verbose=2)

    print("==> few ones")
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           noise=0.5, shuffle=False, random_state=0)
    f_regression_select(X, y, verbose=1)

    print("tests ok")
def test_mbsgd_regressor_default(datatype, nrows, column_info):
    """Compare default ``cumlMBSGRegressor`` and ``SGDRegressor`` r2 scores.

    ``column_info`` is a ``(n_features, n_informative)`` pair. Both models
    are fitted on the same 80% split; a score gap above 0.02 is reported
    as an expected failure instead of a hard failure.
    """
    ncols, n_info = column_info
    features, target = make_regression(n_samples=nrows, n_features=ncols,
                                       n_informative=n_info, random_state=0)
    features = features.astype(datatype)
    target = target.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, train_size=0.8, random_state=0)

    cu_model = cumlMBSGRegressor()
    cu_model.fit(X_train, y_train)
    cu_pred = cu_model.predict(X_test).to_array()

    skl_model = SGDRegressor()
    skl_model.fit(X_train, y_train)
    skl_pred = skl_model.predict(X_test)

    cu_r2 = r2_score(cu_pred, y_test, convert_dtype=datatype)
    skl_r2 = r2_score(skl_pred, y_test, convert_dtype=datatype)

    try:
        assert abs(cu_r2 - skl_r2) <= 0.02
    except AssertionError:
        pytest.xfail("failed due to AssertionError error, "
                     "fix will be merged soon")
def test_mbsgd_regressor(datatype, lrate, input_type, penalty, nrows, ncols):
    """Check ``cumlMBSGRegressor`` against ``SGDRegressor`` on an 80/20 split.

    Both models are fitted with the same learning-rate schedule and penalty
    and must reach r2 scores within 0.02 of each other. ``input_type`` is
    accepted for the test parametrization but is not used in the body.
    """
    train_rows = int(nrows * 0.8)
    X, y = make_regression(n_samples=nrows, n_features=ncols, random_state=0)
    X_test = np.array(X[train_rows:, :], dtype=datatype)
    X_train = np.array(X[:train_rows, :], dtype=datatype)
    y_train = np.array(y[:train_rows, ], dtype=datatype)
    y_test = np.array(y[train_rows:, ], dtype=datatype)

    cu_mbsgd_regressor = cumlMBSGRegressor(learning_rate=lrate, eta0=0.005,
                                           epochs=100, fit_intercept=True,
                                           batch_size=2, tol=0.0,
                                           penalty=penalty)
    cu_mbsgd_regressor.fit(X_train, y_train)
    cu_pred = cu_mbsgd_regressor.predict(X_test).to_array()

    skl_sgd_regressor = SGDRegressor(learning_rate=lrate, eta0=0.005,
                                     max_iter=100, fit_intercept=True,
                                     tol=0.0, penalty=penalty,
                                     random_state=0)
    skl_sgd_regressor.fit(X_train, y_train)
    skl_pred = skl_sgd_regressor.predict(X_test)

    cu_r2 = r2_score(cu_pred, y_test)
    skl_r2 = r2_score(skl_pred, y_test)
    # Compare the absolute gap: a signed difference would let the test pass
    # when the first score is arbitrarily lower than the second.
    assert abs(cu_r2 - skl_r2) <= 0.02
def generate_datasets(n_train, n_test, n_features, noise=0.1, verbose=False):
    """Build a standardized train/test regression dataset.

    Returns ``(X_train, y_train, X_test, y_test)``. Scalers are fitted on
    the training part only and then applied to the test part.
    """
    if verbose:
        print('Generating dataset ...')

    data, target, _ = make_regression(n_samples=n_train + n_test,
                                      n_features=n_features,
                                      noise=noise, coef=True)

    seed = 0
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, train_size=n_train, test_size=n_test,
        random_state=seed)
    X_train, y_train = shuffle(X_train, y_train, random_state=seed)

    feature_scaler = StandardScaler()
    X_train = feature_scaler.fit_transform(X_train)
    X_test = feature_scaler.transform(X_test)

    # The scaler is fed a single-column 2-D view of the targets; the column
    # is extracted again afterwards.
    target_scaler = StandardScaler()
    y_train = target_scaler.fit_transform(y_train[:, None])[:, 0]
    y_test = target_scaler.transform(y_test[:, None])[:, 0]

    gc.collect()
    if verbose:
        print('Ok')
    return X_train, y_train, X_test, y_test
def test_csr_sparse_center_data():
    """The first output of ``sparse_center_data`` stays CSR for CSR input."""
    X, y = make_regression()
    # Zero every entry below 2.5 before converting to a sparse matrix.
    X[X < 2.5] = 0.0
    csr_input = sparse.csr_matrix(X)
    centered, y, _, _, _ = sparse_center_data(csr_input, y, True)
    assert_equal(centered.getformat(), 'csr')
def main(args):
    '''
    Fit the selected regressor on a synthetic one-feature dataset and plot it.

    :param args: parsed arguments; ``vars(args)`` must provide the keys
        'algorithm', 'dark_mode', 'resolution', 'n_samples' and 'noise'
    :return: None
    '''
    options = vars(args)
    algo = options['algorithm']
    dark_mode = options['dark_mode']
    resolution = options['resolution']
    n_samples = options['n_samples']
    noise = options['noise']

    X, Y = make_regression(n_samples=n_samples, n_features=1, noise=noise)

    # Algorithm name -> estimator instance; an unknown name raises KeyError.
    estimators = {
        'decisiontree': DecisionTreeRegressor(),
        'adaboost': AdaBoostRegressor(),
        'randomforest': RandomForestRegressor(),
        'kneighbors': KNeighborsRegressor(),
        'extratrees': ExtraTreesRegressor(),
        'svr': SVR(kernel='linear'),
        'mlp': MLPRegressor()
    }
    reg = estimators[algo]
    reg.fit(X, Y)

    # Flatten the single feature column to 1-D for give_best_fit.
    flat_x = np.reshape(X, (len(X)))
    x_best, y_best = give_best_fit(flat_x, reg, reso=resolution)
    graph(X, Y, x_best, y_best, reg, dark_mode=dark_mode)
def generate_dataset(n_train, n_test, n_features, noise=0.1, verbose=False):
    """Return a standardized ``(X_train, y_train, X_test, y_test)`` split.

    ``n_train + n_test`` samples are generated; ``n_train`` of them form the
    training part. Scalers are fitted on the training part only.
    """
    if verbose:
        print("generating dataset...")

    data, target, _ = make_regression(n_samples=n_train + n_test,
                                      n_features=n_features,
                                      noise=noise, coef=True)

    seed = 13
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, train_size=n_train, random_state=seed)
    X_train, y_train = shuffle(X_train, y_train, random_state=seed)

    feature_scaler = StandardScaler()
    X_train = feature_scaler.fit_transform(X_train)
    X_test = feature_scaler.transform(X_test)

    # Targets go through the scaler as a single 2-D column and are
    # flattened back afterwards.
    target_scaler = StandardScaler()
    y_train = target_scaler.fit_transform(y_train[:, None])[:, 0]
    y_test = target_scaler.transform(y_test[:, None])[:, 0]

    gc.collect()
    if verbose:
        print("ok")
    return X_train, y_train, X_test, y_test
def test_select_percentile_regression():
    # SelectPercentile(percentile=25) on 20 unshuffled features, 5 of them
    # informative, must keep exactly the first 5 and agree with the
    # equivalent GenericUnivariateSelect configuration.
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0)

    selector = SelectPercentile(f_regression, percentile=25)
    X_r = selector.fit(X, y).transform(X)
    assert_best_scores_kept(selector)

    generic = GenericUnivariateSelect(f_regression, mode='percentile',
                                      param=25)
    X_r2 = generic.fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)

    support = selector.get_support()
    expected = np.zeros(20)
    expected[:5] = 1
    assert_array_equal(support, expected)

    # inverse_transform must restore kept columns and zero the others.
    X_masked = X.copy()
    X_masked[:, np.logical_not(support)] = 0
    assert_array_equal(X_masked, selector.inverse_transform(X_r))

    # Check inverse_transform respects dtype
    assert_array_equal(X_masked.astype(bool),
                       selector.inverse_transform(X_r.astype(bool)))
def single_fdr(alpha, n_informative, random_state):
    # Run SelectFdr once on a noisy dataset and return the observed false
    # discovery rate (0. when no uninformative feature was selected).
    X, y = make_regression(n_samples=150, n_features=20,
                           n_informative=n_informative, shuffle=False,
                           random_state=random_state, noise=10)

    with warnings.catch_warnings(record=True):
        # Warnings can be raised when no features are selected
        # (low alpha or very noisy data)
        selector = SelectFdr(f_regression, alpha=alpha)
        X_r = selector.fit(X, y).transform(X)
        generic = GenericUnivariateSelect(f_regression, mode='fdr',
                                          param=alpha)
        X_r2 = generic.fit(X, y).transform(X)

    assert_array_equal(X_r, X_r2)

    # With shuffle=False the leading n_informative columns are the
    # informative ones; anything selected beyond them is a false positive.
    support = selector.get_support()
    n_false = np.sum(support[n_informative:] == 1)
    n_true = np.sum(support[:n_informative] == 1)

    if n_false == 0:
        return 0.
    return n_false / (n_true + n_false)
def test_mutual_info_regression():
    # mutual_info_regression must pick the first 2 of 10 features in both
    # k-best and percentile mode, matching GenericUnivariateSelect.
    X, y = make_regression(n_samples=100, n_features=10, n_informative=2,
                           shuffle=False, random_state=0, noise=10)

    # Test in KBest mode.
    kbest = SelectKBest(mutual_info_regression, k=2)
    X_r = kbest.fit(X, y).transform(X)
    assert_best_scores_kept(kbest)
    generic = GenericUnivariateSelect(mutual_info_regression, mode='k_best',
                                      param=2)
    X_r2 = generic.fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    expected = np.zeros(10)
    expected[:2] = 1
    assert_array_equal(kbest.get_support(), expected)

    # Test in Percentile mode.
    by_percentile = SelectPercentile(mutual_info_regression, percentile=20)
    X_r = by_percentile.fit(X, y).transform(X)
    generic = GenericUnivariateSelect(mutual_info_regression,
                                      mode='percentile', param=20)
    X_r2 = generic.fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    expected = np.zeros(10)
    expected[:2] = 1
    assert_array_equal(by_percentile.get_support(), expected)
def test_csr_preprocess_data():
    """The first output of ``_preprocess_data`` stays CSR for CSR input."""
    X, y = make_regression()
    # Zero every entry below 2.5 before converting to a sparse matrix.
    X[X < 2.5] = 0.0
    csr_input = sparse.csr_matrix(X)
    processed, y, _, _, _ = _preprocess_data(csr_input, y, True)
    assert processed.getformat() == 'csr'
def test_mbsgd_regressor(datatype, lrate, input_type, penalty, nrows,
                         column_info):
    """Check ``cumlMBSGRegressor`` against ``SGDRegressor`` r2 scores.

    ``column_info`` is a ``(n_features, n_informative)`` pair. Both models
    share learning-rate schedule and penalty and must score within 0.02 of
    each other. ``input_type`` is not used in the body.
    """
    ncols, n_info = column_info
    features, target = make_regression(n_samples=nrows, n_features=ncols,
                                       n_informative=n_info, random_state=0)
    features = features.astype(datatype)
    target = target.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, train_size=0.8, random_state=0)

    cu_model = cumlMBSGRegressor(learning_rate=lrate, eta0=0.005,
                                 epochs=100, fit_intercept=True,
                                 batch_size=2, tol=0.0, penalty=penalty)
    cu_model.fit(X_train, y_train)
    cu_pred = cu_model.predict(X_test).to_array()

    skl_model = SGDRegressor(learning_rate=lrate, eta0=0.005, max_iter=100,
                             fit_intercept=True, tol=0.0, penalty=penalty,
                             random_state=0)
    skl_model.fit(X_train, y_train)
    skl_pred = skl_model.predict(X_test)

    cu_r2 = r2_score(cu_pred, y_test, convert_dtype=datatype)
    skl_r2 = r2_score(skl_pred, y_test, convert_dtype=datatype)
    assert abs(cu_r2 - skl_r2) <= 0.02
def create_regression():
    """Fit a line by gradient descent, print the scipy fit, and plot.

    The intercept and slope printed come from ``stats.linregress``; the
    plotted line uses the parameters returned by ``gradient_descent``.
    """
    x, y = make_regression(
        n_samples=100, n_features=1, n_informative=1, random_state=0, noise=35
    )
    # learning rate
    alpha = 1
    # convergence criteria
    ep = 1e-12
    # max iterations
    max_iter = 20

    theta0, theta1, _ = gradient_descent(alpha, x, y, ep, max_iter)
    slope, intercept = stats.linregress(x[:, 0], y)[:2]
    # Format first, then print: applying % to the result of print() fails
    # on Python 3 because print() returns None.
    print('intercept = %s slope = %s' % (intercept, slope))

    # The prediction does not depend on a loop index, so compute it once.
    y_predict = theta0 + theta1 * x

    pylab.plot(x, y, 'o')
    pylab.plot(x, y_predict, '-')
    pylab.show()
    print("Done.")
def test_f_regression():
    # f_regression must give positive F values and p-values in (0, 1),
    # small for the 5 informative features, and dense and sparse inputs
    # must agree both with and without centering.
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0)

    F, pv = f_regression(X, y)
    assert_true((F > 0).all())
    assert_true((pv > 0).all())
    assert_true((pv < 1).all())
    assert_true((pv[:5] < 0.05).all())
    assert_true((pv[5:] > 1.e-4).all())

    # First with centering, then again without; compare with sparse.
    X_sparse_args = (True, False)
    for center in X_sparse_args:
        F, pv = f_regression(X, y, center=center)
        F_sparse, pv_sparse = f_regression(sparse.csr_matrix(X), y,
                                           center=center)
        assert_array_almost_equal(F_sparse, F)
        assert_array_almost_equal(pv_sparse, pv)
def test_invalid_percentile():
    # Percentiles outside [0, 100] must make fit() raise ValueError, for
    # SelectPercentile and for GenericUnivariateSelect in percentile mode.
    X, y = make_regression(n_samples=10, n_features=20, n_informative=2,
                           shuffle=False, random_state=0)

    invalid_selectors = [
        SelectPercentile(percentile=-1),
        SelectPercentile(percentile=101),
        GenericUnivariateSelect(mode="percentile", param=-1),
        GenericUnivariateSelect(mode="percentile", param=101),
    ]
    for selector in invalid_selectors:
        assert_raises(ValueError, selector.fit, X, y)
def test_mutual_info_regression():
    # Both selection modes must keep exactly the first two features and
    # match the corresponding GenericUnivariateSelect configuration.
    X, y = make_regression(n_samples=100, n_features=10, n_informative=2,
                           shuffle=False, random_state=0, noise=10)

    def expected_support():
        mask = np.zeros(10)
        mask[:2] = 1
        return mask

    # Test in KBest mode.
    selector = SelectKBest(mutual_info_regression, k=2)
    reduced = selector.fit(X, y).transform(X)
    assert_best_scores_kept(selector)
    reduced_generic = GenericUnivariateSelect(
        mutual_info_regression, mode='k_best', param=2
    ).fit(X, y).transform(X)
    assert_array_equal(reduced, reduced_generic)
    assert_array_equal(selector.get_support(), expected_support())

    # Test in Percentile mode.
    selector = SelectPercentile(mutual_info_regression, percentile=20)
    reduced = selector.fit(X, y).transform(X)
    reduced_generic = GenericUnivariateSelect(
        mutual_info_regression, mode='percentile', param=20
    ).fit(X, y).transform(X)
    assert_array_equal(reduced, reduced_generic)
    assert_array_equal(selector.get_support(), expected_support())
def prepare_data(mydata = True):
    '''
    Build a design matrix, targets and a parameter vector for a linear fit.

    With ``mydata`` true, three noiseless points are produced from
    ``theta = [[1], [2]]``; otherwise 100 noisy points come from
    ``make_regression``. In both cases a leading column of ones is
    inserted into X.

    dim(X)     -> (m, 2)   each row is a training point, columns are x_0, x_1
    dim(Y)     -> (m, 1) when mydata is true, (m,) otherwise
    dim(theta) -> (2, 1)   theta[0][0] pairs with x_0, theta[1][0] with x_1
    '''
    # NOTE(review): SOURCE was collapsed onto one line, so it is not visible
    # whether the all-ones theta belongs to the else branch only (assumed
    # here) or overrides theta for both branches -- confirm with callers.
    if mydata:
        num_trainingpoint = 3
        X = np.array([range(num_trainingpoint)]).T
        theta = np.array([[1], [2]])
        m, n = np.shape(X)
        X = np.c_[np.ones(m), X]  # insert column
        Y = X.dot(theta)
    else:
        X, Y = make_regression(n_samples=100, n_features=1, n_informative=1,
                               random_state=0, noise=35)
        m, n = np.shape(X)
        X = np.c_[np.ones(m), X]  # insert column
        theta = np.ones(shape=(2, 1))
    return X, Y, theta
def generate_dataset(n_train, n_test, n_features, noise=0.1, verbose=False):
    """Return a standardized ``(X_train, y_train, X_test, y_test)`` split.

    The first ``n_train`` generated samples form the training part and are
    shuffled with NumPy's global generator seeded to 13. Features and
    targets are standardized with training-set mean and std.
    """
    if verbose:
        print("generating dataset...")

    data, target, _ = make_regression(n_samples=n_train + n_test,
                                      n_features=n_features,
                                      noise=noise, coef=True)

    X_train, X_test = data[:n_train], data[n_train:]
    y_train, y_test = target[:n_train], target[n_train:]

    # Shuffle the training rows; this reseeds the global NumPy generator.
    order = np.arange(n_train)
    np.random.seed(13)
    np.random.shuffle(order)
    X_train = X_train[order]
    y_train = y_train[order]

    x_std = X_train.std(axis=0)
    x_mean = X_train.mean(axis=0)
    X_train = (X_train - x_mean) / x_std
    X_test = (X_test - x_mean) / x_std

    y_std = y_train.std(axis=0)
    y_mean = y_train.mean(axis=0)
    y_train = (y_train - y_mean) / y_std
    y_test = (y_test - y_mean) / y_std

    gc.collect()
    if verbose:
        print("ok")
    return X_train, y_train, X_test, y_test
def single_fdr(alpha, n_informative, random_state):
    # One SelectFdr run on noisy data; returns the observed false discovery
    # rate, or 0.0 when no uninformative feature was selected.
    dataset_options = dict(
        n_samples=150,
        n_features=20,
        n_informative=n_informative,
        shuffle=False,
        random_state=random_state,
        noise=10,
    )
    X, y = make_regression(**dataset_options)

    with warnings.catch_warnings(record=True):
        # Warnings can be raised when no features are selected
        # (low alpha or very noisy data)
        fdr_filter = SelectFdr(f_regression, alpha=alpha)
        X_r = fdr_filter.fit(X, y).transform(X)
        X_r2 = (
            GenericUnivariateSelect(f_regression, mode="fdr", param=alpha)
            .fit(X, y)
            .transform(X)
        )

    assert_array_equal(X_r, X_r2)

    # With shuffle=False the informative features come first.
    mask = fdr_filter.get_support()
    false_hits = np.sum(mask[n_informative:] == 1)
    true_hits = np.sum(mask[:n_informative] == 1)

    if false_hits == 0:
        return 0.0
    return false_hits / (true_hits + false_hits)
def test_select_percentile_regression():
    """
    SelectPercentile at 25% must keep exactly the 5 informative features
    of a 20-feature problem, agree with GenericUnivariateSelect, and
    round-trip through inverse_transform.
    """
    X, y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    percentile_filter = SelectPercentile(f_regression, percentile=25)
    reduced = percentile_filter.fit(X, y).transform(X)
    assert_best_scores_kept(percentile_filter)

    reduced_generic = GenericUnivariateSelect(
        f_regression, mode='percentile', param=25).fit(X, y).transform(X)
    assert_array_equal(reduced, reduced_generic)

    mask = percentile_filter.get_support()
    expected_mask = np.zeros(20)
    expected_mask[:5] = 1
    assert_array_equal(mask, expected_mask)

    # Dropped columns come back as zeros after inverse_transform.
    zero_filled = X.copy()
    zero_filled[:, np.logical_not(mask)] = 0
    assert_array_equal(zero_filled,
                       percentile_filter.inverse_transform(reduced))

    # Check inverse_transform respects dtype
    assert_array_equal(
        zero_filled.astype(bool),
        percentile_filter.inverse_transform(reduced.astype(bool)))
def test_regression_squared_loss():
    """Training MSE of the squared-loss SGDRegressor matches 4.913."""
    X, y = make_regression(n_samples=100, n_features=10, n_informative=8,
                           random_state=0)
    model = SGDRegressor(loss="squared", penalty="l2",
                         learning_rate="constant", eta0=1e-2,
                         random_state=0)
    model.fit(X, y)
    residuals = model.predict(X) - y
    mse = np.mean(residuals ** 2)
    assert_almost_equal(mse, 4.913, 3)
def test_regression_squared_loss_multiple_output():
    """Training MSE with two identical target columns matches 4.541."""
    X, y = make_regression(n_samples=100, n_features=10, n_informative=8,
                           random_state=0)
    model = SGDRegressor(loss="squared", penalty="l2",
                         learning_rate="constant", eta0=1e-2,
                         random_state=0, max_iter=10)

    # Duplicate the single target into a two-column target matrix.
    Y = np.zeros((len(y), 2))
    for column in range(2):
        Y[:, column] = y

    model.fit(X, Y)
    residuals = model.predict(X) - Y
    assert_almost_equal(np.mean(residuals ** 2), 4.541, 3)
def test_get_feature_coefficients(self):
    """
    Smoke-test FeatureSelector.get_feature_coefficients on synthetic data.

    The result is printed, not asserted.
    """
    features, target, _ = make_regression(n_samples=10000, n_features=100,
                                          noise=0.1, coef=True)
    selector = FeatureSelector(pd.DataFrame(features), pd.DataFrame(target))
    coefficients = selector.get_feature_coefficients(norm_prior=0)
    print(coefficients)
def make_dataset(request):
    """Build an 80/20 regression split from ``request.param``.

    ``request.param`` unpacks to ``(nrows, ncols, n_info, datatype)``.
    Returns ``(nrows, datatype, X_train, X_test, y_train, y_test)``.
    """
    nrows, ncols, n_info, datatype = request.param
    features, target = make_regression(n_samples=nrows, n_informative=n_info,
                                       n_features=ncols, random_state=0)
    features = features.astype(datatype)
    target = target.astype(datatype)
    split = train_test_split(features, target, train_size=0.8,
                             random_state=10)
    X_train, X_test, y_train, y_test = split
    return nrows, datatype, X_train, X_test, y_train, y_test
def test_get_feature_importances(self):
    """
    Smoke-test FeatureSelector.get_feature_importances on synthetic data.

    The result is printed, not asserted.
    """
    features, target, _ = make_regression(n_samples=10000, n_features=100,
                                          noise=0.1, coef=True)
    selector = FeatureSelector(pd.DataFrame(features), pd.DataFrame(target))
    importances = selector.get_feature_importances(n_estimators=10)
    print(importances)
def test_invalid_percentile():
    # fit() must raise ValueError for percentiles below 0 or above 100.
    X, y = make_regression(n_samples=10, n_features=20,
                           n_informative=2, shuffle=False, random_state=0)

    out_of_range = (-1, 101)
    for bad_value in out_of_range:
        assert_raises(ValueError,
                      SelectPercentile(percentile=bad_value).fit, X, y)
    for bad_value in out_of_range:
        assert_raises(ValueError,
                      GenericUnivariateSelect(mode='percentile',
                                              param=bad_value).fit, X, y)
def main():
    """Fit a one-feature linear model with ``grad_desc_vector`` and print it."""
    # load the dataset to the two variables
    X, y = make_regression(n_samples=100, n_features=1, n_informative=1,
                           random_state=0, noise=35)
    # Prepend a column of ones to the feature matrix.
    m = np.shape(X)[0]
    X = np.c_[np.ones(m), X]
    # get the slope
    theta = grad_desc_vector(X, y, 0.001, 1500)
    # print() with one argument works on Python 2 and 3 alike; the bare
    # print statement is a SyntaxError on Python 3.
    print(theta)
def make_regression_data(num_examples=100, train_test_ratio=0.5,
                         num_features=2, sd_noise=1.0,
                         use_feature_hashing=False, feature_bins=4,
                         start_feature_num=1, random_state=1234567890):
    # Generate synthetic regression data and wrap it in two FeatureSet
    # objects; returns (train_fs, test_fs, weightdict), where weightdict
    # maps feature names to the generating coefficients.
    X, y, weights = make_regression(n_samples=num_examples,
                                    n_features=num_features,
                                    noise=sd_noise,
                                    random_state=random_state,
                                    coef=True)

    # one ID per example, numbered from 1
    ids = ['EXAMPLE_{}'.format(n) for n in range(1, num_examples + 1)]

    # feature names are numbered from start_feature_num
    last_feature_num = start_feature_num + num_features
    feature_names = ['f{:02d}'.format(n)
                     for n in range(start_feature_num, last_feature_num)]

    # one {feature name: value} dictionary per example
    features = [dict(zip(feature_names, row)) for row in X]
    weightdict = dict(zip(feature_names, weights))

    # the leading part is training data, the remainder is test data
    cut = int(round(train_test_ratio * num_examples))
    train_features, test_features = features[:cut], features[cut:]
    train_y, test_y = y[:cut], y[cut:]
    train_ids, test_ids = ids[:cut], ids[cut:]

    # a FeatureHasher is only created when feature hashing is requested
    if use_feature_hashing:
        vectorizer = FeatureHasher(n_features=feature_bins)
    else:
        vectorizer = None

    train_fs = FeatureSet('regression_train', train_ids, labels=train_y,
                          features=train_features, vectorizer=vectorizer)
    test_fs = FeatureSet('regression_test', test_ids, labels=test_y,
                         features=test_features, vectorizer=vectorizer)
    return (train_fs, test_fs, weightdict)
def test_f_regression():
    """
    f_regression must return positive F values and p-values in (0, 1),
    below 0.05 for the 5 informative features and above 1e-4 for the rest.
    """
    X, Y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    F, pv = f_regression(X, Y)
    informative_pv, other_pv = pv[:5], pv[5:]

    assert (F > 0).all()
    assert (pv > 0).all()
    assert (pv < 1).all()
    assert (informative_pv < 0.05).all()
    assert (other_pv > 1.0e-4).all()
def regression(self):
    """Scatter-plot a noisy one-feature dataset with its generating line."""
    # Import from the public package path: the private
    # ``sklearn.datasets.samples_generator`` module was deprecated and
    # later removed from scikit-learn.
    from sklearn.datasets import make_regression
    # X holds the sample features, Y the sample outputs and coef the
    # regression coefficient; 200 samples with 1 feature each.
    X, Y, coef = make_regression(n_samples=200, n_features=1, noise=20,
                                 coef=True)
    # Plot
    plt.scatter(X, Y, color='orange')
    plt.plot(X, X * coef, color='blue', linewidth=2)
    plt.xticks(())
    plt.yticks(())
    plt.show()
def test_select_percentile_regression_full():
    # Asking for the 100th percentile must select every feature.
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0)

    selector = SelectPercentile(f_regression, percentile=100)
    X_r = selector.fit(X, y).transform(X)
    assert_best_scores_kept(selector)

    generic = GenericUnivariateSelect(f_regression, mode="percentile",
                                      param=100)
    X_r2 = generic.fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)

    everything_selected = np.ones(20)
    assert_array_equal(selector.get_support(), everything_selected)
def test_linear_regression_multiple_outcome(random_state=0):
    # Fitting two identical target columns must give the same predictions
    # as fitting the single target, once per column.
    X, y = make_regression(random_state=random_state)
    Y = np.vstack((y, y)).T
    n_features = X.shape[1]

    model = LinearRegression()

    model.fit((X), Y)
    assert model.coef_.shape == (2, n_features)
    multi_pred = model.predict(X)

    model.fit(X, y)
    single_pred = model.predict(X)

    stacked_single = np.vstack((single_pred, single_pred)).T
    assert_array_almost_equal(stacked_single, multi_pred, decimal=3)
def test_f_regression():
    """
    Sanity-check f_regression: positive F values, p-values strictly inside
    (0, 1), small for the 5 informative features and not tiny for the rest.
    """
    X, Y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0)
    F, pv = f_regression(X, Y)

    checks = (
        F > 0,
        pv > 0,
        pv < 1,
        pv[:5] < 0.05,
        pv[5:] > 1.e-4,
    )
    for condition in checks:
        assert condition.all()
def test_linear_regression_multiple_outcome(random_state=0):
    "Two-column fit must match the single-column fit repeated twice"
    X, y = make_regression(random_state=random_state)

    two_targets = np.vstack((y, y)).T
    n_features = X.shape[1]

    regressor = LinearRegression(fit_intercept=True)
    regressor.fit((X), two_targets)
    assert_equal(regressor.coef_.shape, (2, n_features))
    pred_two = regressor.predict(X)

    regressor.fit(X, y)
    pred_one = regressor.predict(X)
    assert_array_almost_equal(np.vstack((pred_one, pred_one)).T, pred_two,
                              decimal=3)
def test_regression_big():
    """On 200000 samples MALSS must end with one scored algorithm."""
    features, target = make_regression(n_samples=200000, n_features=10,
                                       n_informative=5, noise=30.0,
                                       random_state=0)
    frame = pd.DataFrame(features)
    series = pd.Series(target)

    cls = MALSS(frame, series, 'regression', n_jobs=3)
    cls.execute()
    # cls.make_report('test_regression_big')

    algorithms = cls.algorithms
    assert len(algorithms) == 1
    assert algorithms[0].best_score is not None
def test_linear_regression_multiple_outcome(random_state=0):
    # A fit on two copies of y must predict the same as a fit on y alone.
    X, y = make_regression(random_state=random_state)
    Y = np.vstack((y, y)).T
    expected_coef_shape = (2, X.shape[1])

    estimator = LinearRegression(fit_intercept=True)

    estimator.fit((X), Y)
    assert_equal(estimator.coef_.shape, expected_coef_shape)
    Y_pred = estimator.predict(X)

    estimator.fit(X, y)
    y_pred = estimator.predict(X)

    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred,
                              decimal=3)
def test_select_kbest_regression():
    # SelectKBest(k=5) must keep exactly the 5 informative features and
    # agree with GenericUnivariateSelect in k_best mode.
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0, noise=10)

    selector = SelectKBest(f_regression, k=5)
    X_r = selector.fit(X, y).transform(X)
    assert_best_scores_kept(selector)

    generic = GenericUnivariateSelect(f_regression, mode="k_best", param=5)
    X_r2 = generic.fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)

    expected = np.zeros(20)
    expected[:5] = 1
    assert_array_equal(selector.get_support(), expected)
def make_test_regression(n_features=30, n_informative=5, n_samples=5000):
    """Return a synthetic regression problem as a DataFrame.

    Returns ``(df, predictors, target)``: ``df`` has one column per feature
    named ``p0``, ``p1``, ... plus a ``y`` column; ``predictors`` is the
    list of feature column names and ``target`` is ``'y'``.
    """
    import pandas as pd
    X, y = make_regression(n_samples=n_samples, n_features=n_features,
                           n_informative=n_informative, noise=0.5,
                           shuffle=False, random_state=None)
    # The former ``if False:`` debugging branch could never run and has
    # been dropped.
    predictors = ["p{}".format(i) for i in range(X.shape[1])]
    target = 'y'
    df = pd.DataFrame(np.c_[X, y], columns=predictors + [target])
    return df, predictors, target
def test_select_fwe_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the fwe heuristic: all 5 informative features are selected and
    # fewer than 2 of the remaining ones.
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0)

    univariate_filter = SelectFwe(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_regression, mode="fwe",
                                   param=0.01).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)

    support = univariate_filter.get_support()
    # Use the builtin bool: the np.bool alias was deprecated and then
    # removed from NumPy.
    assert_array_equal(support[:5], np.ones((5,), dtype=bool))
    assert_less(np.sum(support[5:] == 1), 2)
def test_select_percentile_regression_full():
    # With percentile=100 no feature may be dropped.
    X, y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    full_filter = SelectPercentile(f_regression, percentile=100)
    reduced = full_filter.fit(X, y).transform(X)
    assert_best_scores_kept(full_filter)

    reduced_generic = GenericUnivariateSelect(
        f_regression, mode='percentile', param=100).fit(X, y).transform(X)
    assert_array_equal(reduced, reduced_generic)

    mask = full_filter.get_support()
    assert_array_equal(mask, np.ones(20))
def main():
    """Fit a line with ``grad_desc`` and print slope, intercept, iterations."""
    # load the dataset to the two variables
    x, y = make_regression(n_samples=100, n_features=1, n_informative=1,
                           random_state=0, noise=35)

    # criteria for the gradient descent
    learning_rate = 0.1
    convergence_criteria = 0.01

    # get the slope
    slope, intercept, iterations = grad_desc(x, y, learning_rate,
                                             convergence_criteria, 1000)

    # Each message is one pre-built string, so print() emits the same text
    # on Python 2 and 3; the bare print statement is Python 2 only.
    print('slope: ' + str(slope))
    print('intercept: ' + str(intercept))
    print('number of iterations: ' + str(iterations))
def test():
    """Fit a line with ``gradient_descent_2`` and plot it over the data."""
    x, y = make_regression(n_samples=100, n_features=1, n_informative=1,
                           random_state=0, noise=35)
    m, n = np.shape(x)
    x = np.c_[np.ones(m), x]  # insert column
    alpha = 0.01  # learning rate
    theta = gradient_descent_2(alpha, x, y, 1000)

    # plot
    # Column 0 of x is the constant bias column, so the line is evaluated
    # on column 1 only. The prediction does not depend on a loop index and
    # is computed once.
    # NOTE(review): assumes theta[0] is the intercept and theta[1] the
    # slope, as the original expression did -- confirm against
    # gradient_descent_2.
    feature = x[:, 1]
    y_predict = theta[0] + theta[1] * feature
    pylab.plot(feature, y, 'o')
    pylab.plot(feature, y_predict, 'k-')
    pylab.show()
    print("Done!")
def generate_dataset(n_train, n_test, n_features, noise=0.1):
    """Return a standardized ``(X_train, X_test, y_train, y_test)`` split.

    The first ``int(n_train)`` generated samples form the training part.
    Scalers are fitted on the training part only. Targets are returned as
    1-D arrays.
    """
    X, y = make_regression(n_samples=int(n_train + n_test),
                           n_features=int(n_features),
                           noise=noise, random_state=101)

    # Slice bounds must be integers; the sample count above is already
    # coerced, so do the same for the split point.
    n_train = int(n_train)
    X_train, X_test = X[:n_train], X[n_train:]
    y_train, y_test = y[:n_train], y[n_train:]

    X_scaler = StandardScaler()
    X_train = X_scaler.fit_transform(X_train)
    X_test = X_scaler.transform(X_test)

    # StandardScaler works on 2-D input: scale the targets as a single
    # column and flatten the result back to 1-D.
    y_scaler = StandardScaler()
    y_train = y_scaler.fit_transform(y_train[:, None])[:, 0]
    y_test = y_scaler.transform(y_test[:, None])[:, 0]
    return X_train, X_test, y_train, y_test
def test_select_fdr_regression():
    """
    SelectFdr at alpha=0.01 must keep exactly the 5 informative features
    and agree with GenericUnivariateSelect in fdr mode.
    """
    X, Y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    fdr_filter = SelectFdr(f_regression, alpha=0.01)
    reduced = fdr_filter.fit(X, Y).transform(X)

    generic = GenericUnivariateSelect(f_regression, mode="fdr", param=0.01)
    reduced_generic = generic.fit(X, Y).transform(X)
    assert_array_equal(reduced, reduced_generic)

    mask = fdr_filter.get_support()
    expected_mask = np.zeros(20)
    expected_mask[:5] = 1
    assert_array_equal(mask, expected_mask)
def main():
    """Generate data, fit it with ``gradient_descent`` and plot the line."""
    # generate data
    X, y = make_regression(n_samples=100, n_features=1, n_informative=1,
                           random_state=0, noise=35)
    shape_report = "X.shape = {} \ny.shape = {}".format(X.shape, y.shape)
    print(shape_report)

    alpha = 0.003
    ep = 0.001
    theta = gradient_descent(X, y, alpha, ep)

    theta_report = 'theta0 = {} \ntheta1 = {}'.format(theta[0], theta[1])
    print(theta_report)

    plot_regression_line(X, y, theta)
def test_regression_big():
    """Fit MALSS on 200000 samples; expect one scored algorithm."""
    from sklearn.metrics import mean_squared_error

    features, target = make_regression(n_samples=200000, n_features=10,
                                       n_informative=5, noise=30.0,
                                       random_state=0)
    frame = pd.DataFrame(features)
    series = pd.Series(target)

    cls = MALSS('regression').fit(frame, series, 'test_regression_big')
    cls.generate_module_sample()

    # The training error is printed only, not asserted.
    predictions = cls.predict(frame)
    print(mean_squared_error(series, predictions))

    assert len(cls.algorithms) == 1
    assert cls.algorithms[0].best_score is not None
def make_regression_data(num_examples=100, train_test_ratio=0.5,
                         num_features=2, sd_noise=1.0,
                         use_feature_hashing=False, feature_bins=4,
                         start_feature_num=1, random_state=1234567890):
    # Build train and test FeatureSet objects from make_regression output
    # and return them with a {feature name: generating weight} dictionary.
    X, y, weights = make_regression(n_samples=num_examples,
                                    n_features=num_features,
                                    noise=sd_noise,
                                    random_state=random_state,
                                    coef=True)

    # example IDs, numbered from 1
    ids = []
    for example_num in range(1, num_examples + 1):
        ids.append('EXAMPLE_{}'.format(example_num))

    # feature names, numbered from start_feature_num
    feature_names = []
    for feature_num in range(start_feature_num,
                             start_feature_num + num_features):
        feature_names.append('f{:02d}'.format(feature_num))

    # each row becomes a {feature name: value} dictionary
    features = []
    for row in X:
        features.append(dict(zip(feature_names, row)))

    weightdict = dict(zip(feature_names, weights))

    # everything before the split index is training data
    split_at = int(round(train_test_ratio * num_examples))
    train_slice = slice(None, split_at)
    test_slice = slice(split_at, None)

    # feature hashing is optional
    vectorizer = None
    if use_feature_hashing:
        vectorizer = FeatureHasher(n_features=feature_bins)

    train_fs = FeatureSet('regression_train', ids[train_slice],
                          labels=y[train_slice],
                          features=features[train_slice],
                          vectorizer=vectorizer)
    test_fs = FeatureSet('regression_test', ids[test_slice],
                         labels=y[test_slice],
                         features=features[test_slice],
                         vectorizer=vectorizer)
    return (train_fs, test_fs, weightdict)
def test_preprocess_copy_data_no_checks(is_sparse, to_copy):
    """With ``check_input=False``, ``copy`` alone decides memory sharing.

    The output of ``_preprocess_data`` must not share memory with the
    input when ``to_copy`` is true and must share it otherwise. For sparse
    input the comparison is made on the ``.data`` buffers.
    """
    X, y = make_regression()
    X[X < 2.5] = 0.0

    if is_sparse:
        X = sparse.csr_matrix(X)

    X_, y_, _, _, _ = _preprocess_data(X, y, True, copy=to_copy,
                                       check_input=False)

    if is_sparse:
        out_buffer, in_buffer = X_.data, X.data
    else:
        out_buffer, in_buffer = X_, X

    if to_copy:
        assert not np.may_share_memory(out_buffer, in_buffer)
    else:
        assert np.may_share_memory(out_buffer, in_buffer)
def test_f_regression():
    # f_regression must give positive F values and p-values in (0, 1),
    # small for the 5 informative features; without centering, dense and
    # sparse inputs must agree.
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0)

    F, pv = f_regression(X, y)
    for condition in (F > 0, pv > 0, pv < 1, pv[:5] < 0.05,
                      pv[5:] > 1.e-4):
        assert_true(condition.all())

    # again without centering, compare with sparse
    F, pv = f_regression(X, y, center=False)
    X_sparse = sparse.csr_matrix(X)
    F_sparse, pv_sparse = f_regression(X_sparse, y, center=False)
    assert_array_almost_equal(F_sparse, F)
    assert_array_almost_equal(pv_sparse, pv)
def test_select_heuristics_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the fpr, fdr or fwe heuristics: each mode must reproduce the
    SelectFpr output, keep all 5 informative features and select fewer
    than 3 of the others.
    """
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0, noise=10)

    univariate_filter = SelectFpr(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)

    for mode in ['fdr', 'fpr', 'fwe']:
        X_r2 = GenericUnivariateSelect(
            f_regression, mode=mode, param=0.01).fit(X, y).transform(X)
        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        # Use the builtin bool: the np.bool alias was deprecated and then
        # removed from NumPy.
        assert_array_equal(support[:5], np.ones((5, ), dtype=bool))
        assert_less(np.sum(support[5:] == 1), 3)
def load_data(n_samples=1024, n_features=1, n_informative=1, n_targets=1,
              random_state=1987, bias=13.17):
    """
    Using sklearn package to generate (X, y, theta) where
    theta = (theta_0, theta_1, ..., theta_{n_features})^T
    are parameters of the linear model

    Input:
        See sklearn.datasets.make_regression for more details
    Output:
        X ~ n_samples * (n_features+1) including the additional 1 vector
        y ~ n_samples * n_targets
        theta ~ (n_features+1) * 1
    Usage:
        (X, y, theta) = load_data( ... )
    """
    # Import from the public package path: the private
    # ``sklearn.datasets.samples_generator`` module was deprecated and
    # later removed from scikit-learn.
    from sklearn.datasets import make_regression
    X, y, theta = make_regression(n_samples=n_samples,
                                  n_features=n_features,
                                  n_informative=n_informative,
                                  n_targets=n_targets,
                                  random_state=random_state,
                                  bias=bias, coef=True,)
    # Put the bias in front of the coefficients and a matching column of
    # ones in front of the features.
    theta = np.insert(theta, 0, bias)
    X = np.insert(X, 0, 1, axis=1)
    return (X, y, theta)
def compute_bench(alpha, n_samples, n_features, precompute):
    """Time ``Lasso`` and ``LassoLars`` fits over a grid of problem sizes.

    For every ``(ns, nf)`` pair from ``n_samples`` x ``n_features`` a
    dataset is generated and column-normalized, and both estimators are
    fitted on it. Returns ``(lasso_results, lars_lasso_results)``, two
    lists of fit durations in iteration order.
    """
    lasso_results = []
    lars_lasso_results = []

    # Every pair is benchmarked, so the total is the product of the two
    # lengths (the previous max(...) under-reported it).
    n_iterations = len(n_samples) * len(n_features)

    it = 0
    for ns in n_samples:
        for nf in n_features:
            it += 1
            # Each message is a single pre-formatted string, so print()
            # emits the same text on Python 2 and 3.
            print('==================')
            print('Iteration %s of %s' % (it, n_iterations))
            print('==================')
            n_informative = nf // 10
            X, Y, _ = make_regression(n_samples=ns, n_features=nf,
                                      n_informative=n_informative,
                                      noise=0.1, coef=True)

            X /= np.sqrt(np.sum(X ** 2, axis=0))  # Normalize data

            gc.collect()
            print("- benching Lasso")
            clf = Lasso(alpha=alpha, fit_intercept=False,
                        precompute=precompute)
            tstart = time()
            clf.fit(X, Y)
            lasso_results.append(time() - tstart)

            gc.collect()
            print("- benching LassoLars")
            clf = LassoLars(alpha=alpha, fit_intercept=False,
                            normalize=False, precompute=precompute)
            tstart = time()
            clf.fit(X, Y)
            lars_lasso_results.append(time() - tstart)

    return lasso_results, lars_lasso_results
from sklearn.metrics import mean_squared_error from sklearn.datasets.samples_generator import make_regression if __name__ == "__main__": list_n_samples = np.linspace(100, 10000, 5).astype(np.int) list_n_features = [10, 100, 1000] n_test = 1000 noise = 0.1 alpha = 0.01 sgd_results = np.zeros((len(list_n_samples), len(list_n_features), 2)) elnet_results = np.zeros((len(list_n_samples), len(list_n_features), 2)) ridge_results = np.zeros((len(list_n_samples), len(list_n_features), 2)) for i, n_train in enumerate(list_n_samples): for j, n_features in enumerate(list_n_features): X, y, coef = make_regression( n_samples=n_train + n_test, n_features=n_features, noise=noise, coef=True) X_train = X[:n_train] y_train = y[:n_train] X_test = X[n_train:] y_test = y[n_train:] print("=======================") print("Round %d %d" % (i, j)) print("n_features:", n_features) print("n_samples:", n_train) # Shuffle data idx = np.arange(n_train) np.random.seed(13)