def task1(sample_size, dimension_size):
    """Train an SGD regressor on synthetic data and return its test MSE.

    A ``make_sparse_uncorrelated`` dataset with ``sample_size`` rows and
    ``dimension_size`` features is generated, 70% of it is used to fit an
    ``SGDRegressor(max_iter=1000)`` and the mean squared error on the
    remaining 30% is returned.
    """
    X, y = datasets.make_sparse_uncorrelated(
        n_samples=sample_size, n_features=dimension_size)
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.3, train_size=0.7)
    reg = linear_model.SGDRegressor(max_iter=1000)
    reg.fit(X_train, y_train)
    y_predict = reg.predict(X_test)
    return metrics.mean_squared_error(y_test, y_predict)


# Every single run, over all dimension sizes, is still recorded in these
# module-level lists so that later code can keep using them.
time_list = list()
error_list = list()

# One loop over all dimension sizes replaces two copy-pasted loops.
for r in [100, 1000, 2000, 100000, 250000, 500000]:
    # Per-dimension measurements. The report below used to be computed from
    # the cumulative lists, so the figures printed for one `r` silently
    # included the runs of every previous `r`.
    run_errors = []
    run_times = []
    for i in range(0, 5):
        start = time.time()
        run_errors.append(task1(10000, r))
        stop = time.time()
        run_times.append(stop - start)
    error_list.extend(run_errors)
    time_list.extend(run_times)
    # Mean error and total wall-clock time of the 5 runs for this `r` only.
    print(
        str(r) + ": Error:" + str(sum(run_errors) / len(run_errors)) +
        "-time:" + str(sum(run_times)))
def generators_for_regression_datasets(self):
    """
    Show the regression dataset generator ``make_sparse_uncorrelated``
    (sparse random linear combination of random features, with noise).

    Emits a debug-level banner through ``logging`` and prints whatever
    ``datasets.make_sparse_uncorrelated()`` returns when called with its
    default arguments.
    """
    logging.debug('----------------- Generators for regression -----------')
    generated = datasets.make_sparse_uncorrelated()
    print('sparse_uncorrelated ', generated)
def test_linear_regression_positive_vs_nonpositive():
    """A ``positive=True`` fit must differ from a ``positive=False`` fit.

    Both models are fitted on the same ``make_sparse_uncorrelated`` data and
    the mean squared difference of their coefficients has to exceed 1e-3.
    """
    X, y = make_sparse_uncorrelated(random_state=0)

    constrained = LinearRegression(positive=True)
    constrained.fit(X, y)

    unconstrained = LinearRegression(positive=False)
    unconstrained.fit(X, y)

    coef_gap = constrained.coef_ - unconstrained.coef_
    assert np.mean(coef_gap ** 2) > 1e-3
def get_data(dataset_name):
    """Load or generate the data matrix for ``dataset_name``.

    Recognised names are ``"lfw_people"``, ``"20newsgroups"``,
    ``"olivetti_faces"``, ``"rcv1"``, ``"CIFAR"``, ``"SVHN"``,
    ``"low rank matrix"``, ``"uncorrelated matrix"`` and
    ``"big sparse matrix"``; any other name is passed to ``fetch_openml``.

    Returns
    -------
    The data matrix ``X``, or ``None`` when the CIFAR / SVHN folder check
    reports that the dataset should be skipped.
    """
    print("Getting dataset: %s" % dataset_name)

    # NOTE(review): the CIFAR branch used to test the result of
    # handle_missing_dataset() against "skip" while the SVHN branch tested
    # against 0, so at most one of the two guards could ever fire. The helper
    # is not visible here; both sentinels are accepted in both branches until
    # its actual return value is confirmed.
    skip_markers = ("skip", 0)

    if dataset_name == "lfw_people":
        X = fetch_lfw_people().data
    elif dataset_name == "20newsgroups":
        # Only the first 100000 columns are kept.
        X = fetch_20newsgroups_vectorized().data[:, :100000]
    elif dataset_name == "olivetti_faces":
        X = fetch_olivetti_faces().data
    elif dataset_name == "rcv1":
        X = fetch_rcv1().data
    elif dataset_name == "CIFAR":
        if handle_missing_dataset(CIFAR_FOLDER) in skip_markers:
            return
        # Stack the five training batches data_batch_1 .. data_batch_5.
        X1 = [
            unpickle("%sdata_batch_%d" % (CIFAR_FOLDER, i + 1))
            for i in range(5)
        ]
        X = np.vstack(X1)
        del X1
    elif dataset_name == "SVHN":
        if handle_missing_dataset(SVHN_FOLDER) in skip_markers:
            return
        X1 = sp.io.loadmat("%strain_32x32.mat" % SVHN_FOLDER)["X"]
        # The last axis indexes the images; flatten each one into a row of
        # 32 * 32 * 3 values.
        X2 = [X1[:, :, :, i].reshape(32 * 32 * 3) for i in range(X1.shape[3])]
        X = np.vstack(X2)
        del X1
        del X2
    elif dataset_name == "low rank matrix":
        X = make_low_rank_matrix(
            n_samples=500,
            n_features=int(1e4),
            effective_rank=100,
            tail_strength=0.5,
            random_state=random_state,
        )
    elif dataset_name == "uncorrelated matrix":
        X, _ = make_sparse_uncorrelated(
            n_samples=500, n_features=10000, random_state=random_state)
    elif dataset_name == "big sparse matrix":
        sparsity = int(1e6)
        size = int(1e6)
        small_size = int(1e4)
        # sparsity / 10 normal draws, each repeated 10 times, give `sparsity`
        # stored values.
        data = np.random.normal(0, 1, int(sparsity / 10))
        data = np.repeat(data, 10)
        # Row and column positions are both drawn from [0, small_size).
        row = np.random.uniform(0, small_size, sparsity)
        col = np.random.uniform(0, small_size, sparsity)
        X = sp.sparse.csr_matrix((data, (row, col)), shape=(size, small_size))
        del data
        del row
        del col
    else:
        X = fetch_openml(dataset_name, parser="auto").data
    return X
def task2_sampling(sample_size, dimension_size, new_sample_size):
    """Fit an SGD regressor on a subsample and report its error and timing.

    A ``make_sparse_uncorrelated`` dataset of ``sample_size`` rows and
    ``dimension_size`` features is generated and ``new_sample_size`` rows are
    drawn from it without replacement. The subsample is split 70/30, an
    ``SGDRegressor(max_iter=1000)`` is trained on the larger part and scored
    on the smaller one.

    Returns a ``(mse, seconds)`` tuple; the timing covers model creation,
    fitting, prediction and the error computation only.
    """
    full_X, full_y = datasets.make_sparse_uncorrelated(
        sample_size, dimension_size)
    sub_X, sub_y = resample(
        full_X, full_y, n_samples=new_sample_size, replace=False)
    split = model_selection.train_test_split(
        sub_X, sub_y, test_size=0.3, train_size=0.7)
    train_X, test_X, train_y, test_y = split

    began = time.time()
    model = linear_model.SGDRegressor(max_iter=1000)
    model.fit(train_X, train_y)
    predicted = model.predict(test_X)
    mse = metrics.mean_squared_error(test_y, predicted)
    ended = time.time()

    return mse, ended - began
def task2_pca(sample_size, dimension_size, component_size):
    """Fit an SGD regressor on PCA-reduced data and report error and timing.

    A ``make_sparse_uncorrelated`` dataset of ``sample_size`` rows and
    ``dimension_size`` features is generated and projected onto
    ``component_size`` principal components (the PCA is fitted on the whole
    dataset, before splitting). The projected data is split 70/30, an
    ``SGDRegressor(max_iter=1000)`` is trained on the larger part and scored
    on the smaller one.

    Returns a ``(mse, seconds)`` tuple; the timing excludes data generation,
    the PCA and the split.
    """
    raw_X, targets = datasets.make_sparse_uncorrelated(
        sample_size, dimension_size)

    reducer = decomposition.PCA(n_components=component_size)
    reducer.fit(raw_X)
    reduced_X = reducer.transform(raw_X)

    split = model_selection.train_test_split(
        reduced_X, targets, test_size=0.3, train_size=0.7)
    train_X, test_X, train_y, test_y = split

    began = time.time()
    model = linear_model.SGDRegressor(max_iter=1000)
    model.fit(train_X, train_y)
    predicted = model.predict(test_X)
    mse = metrics.mean_squared_error(test_y, predicted)
    ended = time.time()

    return mse, ended - began
def test_linear_regression_positive_multiple_outcome(random_state=0):
    """Non-negative regression with two identical target columns.

    The two-column fit must produce a ``(2, n_features)`` coefficient array
    with no negative entry, and its predictions must match the single-target
    predictions duplicated into two columns.
    """
    rng = check_random_state(random_state)
    X, y = make_sparse_uncorrelated(random_state=rng)
    two_targets = np.vstack((y, y)).T
    feature_count = X.shape[1]

    model = LinearRegression(positive=True)

    # Fit on both target columns at once.
    model.fit(X, two_targets)
    assert model.coef_.shape == (2, feature_count)
    assert np.all(model.coef_ >= 0.0)
    multi_pred = model.predict(X)

    # Refit on the single target and compare against the stacked result.
    model.fit(X, y.ravel())
    single_pred = model.predict(X)
    expected = np.vstack((single_pred, single_pred)).T
    assert_allclose(expected, multi_pred)
def test_linear_regression_sparse_multiple_outcome(random_state=0):
    """Linear regression on sparse input with two identical target columns.

    The two-column fit must produce a ``(2, n_features)`` coefficient array,
    and its predictions must agree to 3 decimals with the single-target
    predictions duplicated into two columns.
    """
    rng = check_random_state(random_state)
    dense_X, y = make_sparse_uncorrelated(random_state=rng)
    X = sparse.coo_matrix(dense_X)
    two_targets = np.vstack((y, y)).T
    feature_count = X.shape[1]

    model = LinearRegression()

    # Fit on both target columns at once.
    model.fit(X, two_targets)
    assert model.coef_.shape == (2, feature_count)
    multi_pred = model.predict(X)

    # Refit on the single target and compare against the stacked result.
    model.fit(X, y.ravel())
    single_pred = model.predict(X)
    expected = np.vstack((single_pred, single_pred)).T
    assert_array_almost_equal(expected, multi_pred, decimal=3)
def test_linear_regression_sparse_multiple_outcome(setup, random_state=0):
    """Fitting on sparse input must be rejected with NotImplementedError.

    Both the two-column target and the flat single target are tried; each
    ``fit`` call has to raise with the message
    "Does not support sparse input!".
    """
    rng = check_random_state(random_state)
    dense_X, y = make_sparse_uncorrelated(random_state=rng)
    sparse_X = sparse.coo_matrix(dense_X)
    two_targets = np.vstack((y, y)).T

    model = LinearRegression()
    expected_msg = re.escape("Does not support sparse input!")

    # Multi-output target first, then the single-output target.
    for target in (two_targets, y.ravel()):
        with pytest.raises(NotImplementedError, match=expected_msg):
            model.fit(sparse_X, target)
def test_residualize_linear():
    """Sanity check: linear residuals are orthogonal to the confounds.

    For three random sample counts and, for each of them, three random
    confound counts, a dataset is generated and split into features and
    confounds; after residualizing, the product of the transposed residuals
    with the confounds must be (almost) zero.
    """
    # The original note says make_sparse_uncorrelated needs at least 4
    # features; 6 keeps a margin.
    min_dim = 6
    max_dim = 100

    for n_samples in np.random.randint(20, 500, 3):
        for num_confounds in np.random.randint(min_dim, max_dim, 3):
            total_features = min_dim + num_confounds + 1
            train_all, train_y = make_sparse_uncorrelated(
                n_samples=n_samples, n_features=total_features)
            train_X, train_confounds = splitter_X_confounds(
                train_all, num_confounds)

            residualizer = Residualize(model='linear')
            residualizer.fit(train_X, train_confounds)
            residual_train_X = residualizer.transform(
                train_X, train_confounds)

            # residual_train_X and train_confounds must be orthogonal now!
            cross_product = residual_train_X.T.dot(train_confounds)
            assert_almost_equal(cross_product, 0)
def test_make_sparse_uncorrelated():
    """``make_sparse_uncorrelated`` returns arrays of the requested shape."""
    sample_count, feature_count = 5, 10
    X, y = make_sparse_uncorrelated(
        n_samples=sample_count, n_features=feature_count, random_state=0)

    # One row per sample and one column per feature; one target per sample.
    assert_equal(X.shape, (sample_count, feature_count), "X shape mismatch")
    assert_equal(y.shape, (sample_count,), "y shape mismatch")
def test_make_sparse_uncorrelated():
    """Check the shapes of the arrays built by ``make_sparse_uncorrelated``."""
    expected_X_shape = (5, 10)
    expected_y_shape = (5,)

    X, y = make_sparse_uncorrelated(
        n_samples=expected_X_shape[0],
        n_features=expected_X_shape[1],
        random_state=0,
    )

    assert X.shape == expected_X_shape, "X shape mismatch"
    assert y.shape == expected_y_shape, "y shape mismatch"
from sklearn import datasets
import matplotlib.pyplot as plt

# Generate a make_sparse_uncorrelated dataset: 100 samples, 10 features,
# no fixed seed.
X, y = datasets.make_sparse_uncorrelated(
    n_samples=100,
    n_features=10,
    random_state=None,
)

# Print each array under its own heading line.
print('X = ', X, sep='\n')
print('y = ', y, sep='\n')
# Run each listed `datasets` function in order and print what it returns.
# The attribute is looked up by name only when its turn comes, so a function
# missing from `datasets` fails at its own step, after the earlier ones have
# already printed. make_sparse_uncorrelated appears twice, as before.
for _func_name, _func_args in (
    ("make_hastie_10_2", ()),
    ("make_moons", ()),
    ("make_multilabel_classification", ()),
    ("make_regression", ()),
    ("make_sparse_spd_matrix", ()),
    ("make_sparse_uncorrelated", ()),
    ("make_sparse_uncorrelated", ()),
    ("make_swiss_roll", ()),
    ("mldata_filename", ('iris.txt',)),
):
    print("the output of %s() :: " % _func_name,
          getattr(datasets, _func_name)(*_func_args))
# Experiment configuration.
random_state = 414
saving_fig = False  # set to True to save images

# dataset = "synthetic_unco"  # Fig a
dataset = "synthetic"  # Fig b

# Compare the dataset name by value. The identity test (`is`) on string
# literals that was used here only works by accident of string interning and
# raises a SyntaxWarning on Python 3.8+.
if dataset == "synthetic":
    n_samples, n_features = (500, 5000)
    X, y = make_regression(n_samples=n_samples, n_features=n_features,
                           random_state=random_state)
elif dataset == "synthetic_unco":
    n_samples, n_features = (30, 50)
    X, y = make_sparse_uncorrelated(n_samples=n_samples,
                                    n_features=n_features,
                                    random_state=random_state)
else:
    # Fail here with a clear message instead of a NameError on X below.
    raise ValueError("unknown dataset: %r" % (dataset,))

# Work on float, Fortran-ordered copies (the conversion was written twice
# before; once is enough).
X = X.astype(float)
y = y.astype(float)
X = np.asfortranarray(X)
y = np.asfortranarray(y)
n_samples, n_features = X.shape

# Scale every column of X to unit Euclidean norm and standardise y.
X /= np.linalg.norm(X, axis=0)
y = (y - y.mean()) / y.std()

# Hold out 30% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=random_state)
# -*- encoding: utf-8 -*- """ 8.5.1 线性回归 """ from sklearn.datasets import make_sparse_uncorrelated from sklearn.linear_model import LinearRegression, Ridge from sklearn.model_selection import train_test_split as tsplit from sklearn import metrics import matplotlib.pyplot as plt import numpy as np X, y = make_sparse_uncorrelated(n_samples=100, n_features=4) X_train, X_test, y_train, y_test = tsplit(X, y, test_size=0.1) reg = LinearRegression() # 实例化最小二乘法线性回归模型 reg.fit(X_train, y_train) # 训练 y_pred = reg.predict(X_test) # 预测 print(y_pred) # 预测结果 print(y_test) # 实际结果 print(metrics.mean_squared_error(y_test, y_pred)) # 均方误差 print(metrics.r2_score(y_test, y_pred)) # 复相关系数 print(metrics.median_absolute_error(y_test, y_pred)) # 中位数绝对误差 plt.rcParams['font.sans-serif'] = ['FangSong'] plt.rcParams['axes.unicode_minus'] = False plt.subplot(121) plt.title('残差图') plt.plot(y_pred - y_test, 'o')