def test_load_diabetes():
    res = load_diabetes()
    assert_equal(res.data.shape, (442, 10))
    assert_equal(res.target.size, 442)

    # test return_X_y option
    X_y_tuple = load_diabetes(return_X_y=True)
    bunch = load_diabetes()
    assert_true(isinstance(X_y_tuple, tuple))
    assert_array_equal(X_y_tuple[0], bunch.data)
    assert_array_equal(X_y_tuple[1], bunch.target)
def ModelSelectionTest01():
    from sklearn import datasets, svm
    import numpy as np

    digits = datasets.load_digits()
    X_digits = digits.data
    Y_digits = digits.target

    svc = svm.SVC(C=1, kernel='linear')
    score = svc.fit(X_digits[:-100], Y_digits[:-100]).score(X_digits[-100:],
                                                            Y_digits[-100:])
    # print(score)

    X_folds = np.array_split(X_digits, 3)
    Y_folds = np.array_split(Y_digits, 3)
    # print(len(X_folds[0]))
    scores = list()
    for k in range(3):
        X_train = list(X_folds)  # X_folds is a list with 3 elements
        X_test = X_train.pop(k)  # the test fold is the k-th element of the list
        X_train = np.concatenate(X_train)  # concatenate the remaining folds, i.e. X_train minus X_test
        # print(len(X_train))
        Y_train = list(Y_folds)
        Y_test = Y_train.pop(k)
        Y_train = np.concatenate(Y_train)
        scores.append(svc.fit(X_train, Y_train).score(X_test, Y_test))
    # print(scores)

    from sklearn import cross_validation
    k_fold = cross_validation.KFold(n=6, n_folds=3)
    for train_indices, test_indices in k_fold:
        print(train_indices, test_indices)

    k_fold = cross_validation.KFold(len(X_digits), n_folds=3)
    scores = [svc.fit(X_digits[train], Y_digits[train]).score(X_digits[test],
                                                              Y_digits[test])
              for train, test in k_fold]
    # print(scores)

    scores = cross_validation.cross_val_score(svc, X_digits, Y_digits,
                                              cv=k_fold, n_jobs=1)
    # print(scores)

    from sklearn.grid_search import GridSearchCV
    gammas = np.logspace(-6, -1, 10)
    clf = GridSearchCV(estimator=svc, param_grid=dict(gamma=gammas), n_jobs=1)
    clf.fit(X_digits[:1000], Y_digits[:1000])
    print(clf.best_score_)
    print(clf.best_estimator_.gamma)

    from sklearn import linear_model, datasets
    lasso = linear_model.LassoCV()  # unlike Lasso, LassoCV picks its alpha by cross-validation
    diabetes = datasets.load_diabetes()
    X_diabetes = diabetes.data
    Y_diabetes = diabetes.target
    lasso.fit(X_diabetes, Y_diabetes)
    print(lasso.alpha_)
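# A minimal sketch contrasting Lasso and LassoCV on the same data, expanding on
# the comment in the snippet above; the alpha value 0.1 is an arbitrary
# illustration, not a recommended setting.
from sklearn import datasets, linear_model

X, y = datasets.load_diabetes(return_X_y=True)

# Lasso uses a fixed regularization strength that the caller must supply.
lasso_fixed = linear_model.Lasso(alpha=0.1).fit(X, y)

# LassoCV tries a grid of alphas internally and keeps the best one,
# exposing it afterwards as `alpha_`.
lasso_cv = linear_model.LassoCV(cv=5).fit(X, y)
print(lasso_fixed.alpha, lasso_cv.alpha_)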
def supervisedTest02():
    import numpy as np
    from sklearn import datasets

    diabetes = datasets.load_diabetes()
    diabetes_X_train = diabetes.data[:-20]
    diabetes_X_test = diabetes.data[-20:]
    diabetes_Y_train = diabetes.target[:-20]
    diabetes_Y_test = diabetes.target[-20:]

    from sklearn import linear_model
    regr = linear_model.LinearRegression(copy_X=True, fit_intercept=True,
                                         normalize=False)
    regr.fit(diabetes_X_train, diabetes_Y_train)
    # print(regr.coef_)  # one coefficient per feature; the intercept is stored separately in intercept_

    mean_err = np.mean((regr.predict(diabetes_X_test) - diabetes_Y_test) ** 2)
    score = regr.score(diabetes_X_test, diabetes_Y_test)  # R^2 on the held-out test data
    print(mean_err)
    print(score)
    print(len(diabetes.data))     # number of samples
    print(len(diabetes.data[0]))  # number of features
def load_diabetes_data():
    """
    Load the diabetes data set from scikit-learn.

    Args:
        None

    Returns:
        diabetes_X_train: Training features for the diabetes data set
        diabetes_X_test: Test set features for the diabetes data set
        diabetes_y_train: Target variables of the training set
        diabetes_y_test: Target variables of the test set
    """
    diabetes = datasets.load_diabetes()
    diabetes_X, diabetes_y = diabetes.data, diabetes.target

    # Split the data set as
    # 70 % -> Training set
    # 30 % -> Test set
    limit = int(0.7 * len(diabetes_y))  # slice indices must be integers
    diabetes_X_train = diabetes_X[:limit]
    diabetes_X_test = diabetes_X[limit:]
    diabetes_y_train = diabetes_y[:limit]
    diabetes_y_test = diabetes_y[limit:]
    return diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test
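# A minimal usage sketch for load_diabetes_data above, assuming
# `from sklearn import datasets, linear_model` is in scope; the
# LinearRegression choice is illustrative, not part of the original helper.
X_train, X_test, y_train, y_test = load_diabetes_data()
model = linear_model.LinearRegression()
model.fit(X_train, y_train)
print(model.score(X_test, y_test))  # R^2 on the held-out 30 %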
def test_simple_grnn(self):
    dataset = datasets.load_diabetes()
    x_train, x_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, train_size=0.7
    )

    x_train_before = x_train.copy()
    x_test_before = x_test.copy()
    y_train_before = y_train.copy()

    grnnet = algorithms.GRNN(std=0.1, verbose=False)
    grnnet.train(x_train, y_train)

    result = grnnet.predict(x_test)
    error = metrics.mean_absolute_error(result, y_test)

    old_result = result.copy()
    self.assertAlmostEqual(error, 46.3358, places=4)

    # Test problem with variable links
    np.testing.assert_array_equal(x_train, x_train_before)
    np.testing.assert_array_equal(x_test, x_test_before)
    np.testing.assert_array_equal(y_train, y_train_before)

    x_train[:, :] = 0
    result = grnnet.predict(x_test)
    np.testing.assert_array_almost_equal(result, old_result)
def test_levenberg_marquardt(self):
    dataset = datasets.load_diabetes()
    data, target = dataset.data, dataset.target

    data_scaler = preprocessing.MinMaxScaler()
    target_scaler = preprocessing.MinMaxScaler()

    x_train, x_test, y_train, y_test = train_test_split(
        data_scaler.fit_transform(data),
        target_scaler.fit_transform(target.reshape(-1, 1)),
        train_size=0.85
    )

    # Network
    lmnet = algorithms.LevenbergMarquardt(
        connection=[
            layers.SigmoidLayer(10),
            layers.SigmoidLayer(40),
            layers.OutputLayer(1),
        ],
        mu_increase_factor=2,
        mu=0.1,
        show_epoch=10,
        use_bias=False,
        verbose=False,
    )
    lmnet.train(x_train, y_train, epochs=100)
    y_predict = lmnet.predict(x_test)

    error = rmsle(target_scaler.inverse_transform(y_test),
                  target_scaler.inverse_transform(y_predict).round())
    self.assertAlmostEqual(0.4372, error, places=4)
def linearReg():
    from sklearn import datasets
    diabetes = datasets.load_diabetes()
    diabetes_X_train = diabetes.data[:-20]
    diabetes_X_test = diabetes.data[-20:]
    diabetes_y_train = diabetes.target[:-20]
    diabetes_y_test = diabetes.target[-20:]

    from sklearn import linear_model
    regr = linear_model.LinearRegression()
    regr.fit(diabetes_X_train, diabetes_y_train)
    print(regr.coef_)

    import numpy as np
    print(np.mean((regr.predict(diabetes_X_test) - diabetes_y_test) ** 2))  # test MSE
    print(regr.score(diabetes_X_test, diabetes_y_test))  # test R^2

    X = np.c_[.5, 1].T
    y = [.5, 1]
    test = np.c_[0, 2].T
    regr = linear_model.LinearRegression()

    import pylab as pl
    pl.figure()
    np.random.seed(0)
    for _ in range(6):
        this_X = .1 * np.random.normal(size=(2, 1)) + X
        regr.fit(this_X, y)
        pl.plot(test, regr.predict(test))
        pl.scatter(this_X, y, s=3)
def test_linearsvr_fit_sampleweight():
    # check correct result when sample_weight is 1
    # check that SVR(kernel='linear') and LinearSVC() give
    # comparable results
    diabetes = datasets.load_diabetes()
    n_samples = len(diabetes.target)
    unit_weight = np.ones(n_samples)
    lsvr = svm.LinearSVR(C=1e3).fit(diabetes.data, diabetes.target,
                                    sample_weight=unit_weight)
    score1 = lsvr.score(diabetes.data, diabetes.target)

    lsvr_no_weight = svm.LinearSVR(C=1e3).fit(diabetes.data, diabetes.target)
    score2 = lsvr_no_weight.score(diabetes.data, diabetes.target)

    assert_allclose(np.linalg.norm(lsvr.coef_),
                    np.linalg.norm(lsvr_no_weight.coef_), 1, 0.0001)
    assert_almost_equal(score1, score2, 2)

    # check that fit(X) = fit([X1, X2, X3], sample_weight=[n1, n2, n3]) where
    # X = X1 repeated n1 times, X2 repeated n2 times and so forth
    random_state = check_random_state(0)
    random_weight = random_state.randint(0, 10, n_samples)
    lsvr_unflat = svm.LinearSVR(C=1e3).fit(diabetes.data, diabetes.target,
                                           sample_weight=random_weight)
    score3 = lsvr_unflat.score(diabetes.data, diabetes.target,
                               sample_weight=random_weight)

    X_flat = np.repeat(diabetes.data, random_weight, axis=0)
    y_flat = np.repeat(diabetes.target, random_weight, axis=0)
    lsvr_flat = svm.LinearSVR(C=1e3).fit(X_flat, y_flat)
    score4 = lsvr_flat.score(X_flat, y_flat)

    assert_almost_equal(score3, score4, 2)
def test_Lasso_Path(self):
    diabetes = datasets.load_diabetes()
    X = diabetes.data
    y = diabetes.target
    X /= X.std(axis=0)

    df = pdml.ModelFrame(diabetes)
    df.data /= df.data.std(axis=0, ddof=False)
    self.assert_numpy_array_almost_equal(df.data.values, X)

    eps = 5e-3

    expected = lm.lasso_path(X, y, eps, fit_intercept=False)
    result = df.lm.lasso_path(eps=eps, fit_intercept=False)
    self.assert_numpy_array_almost_equal(expected[0], result[0])
    self.assert_numpy_array_almost_equal(expected[1], result[1])
    self.assert_numpy_array_almost_equal(expected[2], result[2])

    expected = lm.enet_path(X, y, eps=eps, l1_ratio=0.8, fit_intercept=False)
    result = df.lm.enet_path(eps=eps, l1_ratio=0.8, fit_intercept=False)
    self.assert_numpy_array_almost_equal(expected[0], result[0])
    self.assert_numpy_array_almost_equal(expected[1], result[1])
    self.assert_numpy_array_almost_equal(expected[2], result[2])

    expected = lm.enet_path(X, y, eps=eps, l1_ratio=0.8, positive=True,
                            fit_intercept=False)
    result = df.lm.enet_path(eps=eps, l1_ratio=0.8, positive=True,
                             fit_intercept=False)
    self.assert_numpy_array_almost_equal(expected[0], result[0])
    self.assert_numpy_array_almost_equal(expected[1], result[1])
    self.assert_numpy_array_almost_equal(expected[2], result[2])

    expected = lm.lars_path(X, y, method='lasso', verbose=True)
    result = df.lm.lars_path(method='lasso', verbose=True)
    self.assert_numpy_array_almost_equal(expected[0], result[0])
    self.assert_numpy_array_almost_equal(expected[1], result[1])
    self.assert_numpy_array_almost_equal(expected[2], result[2])
def test_hessian_diagonal(self):
    dataset = datasets.load_diabetes()
    data, target = dataset.data, dataset.target

    input_scaler = preprocessing.StandardScaler()
    target_scaler = preprocessing.StandardScaler()

    x_train, x_test, y_train, y_test = cross_validation.train_test_split(
        input_scaler.fit_transform(data),
        target_scaler.fit_transform(target.reshape(-1, 1)),
        train_size=0.8
    )

    nw = algorithms.HessianDiagonal(
        connection=[
            layers.SigmoidLayer(10),
            layers.SigmoidLayer(20),
            layers.OutputLayer(1)
        ],
        step=1.5,
        shuffle_data=False,
        verbose=False,
        min_eigenvalue=1e-10
    )
    nw.train(x_train, y_train, epochs=10)
    y_predict = nw.predict(x_test)

    error = rmsle(target_scaler.inverse_transform(y_test),
                  target_scaler.inverse_transform(y_predict).round())
    self.assertAlmostEqual(0.5032, error, places=4)
def test_mixture_of_experts(self):
    dataset = datasets.load_diabetes()
    data, target = asfloat(dataset.data), asfloat(dataset.target)
    insize, outsize = data.shape[1], 1

    input_scaler = preprocessing.MinMaxScaler((-1, 1))
    output_scaler = preprocessing.MinMaxScaler()

    x_train, x_test, y_train, y_test = cross_validation.train_test_split(
        input_scaler.fit_transform(data),
        output_scaler.fit_transform(target.reshape(-1, 1)),
        train_size=0.8
    )

    n_epochs = 10
    scaled_y_test = output_scaler.inverse_transform(y_test)
    scaled_y_test = scaled_y_test.reshape((y_test.size, 1))

    # -------------- Train single GradientDescent -------------- #
    bpnet = algorithms.GradientDescent(
        (insize, 20, outsize),
        step=0.1,
        verbose=False
    )
    bpnet.train(x_train, y_train, epochs=n_epochs)
    network_output = bpnet.predict(x_test)
    network_error = rmsle(output_scaler.inverse_transform(network_output),
                          scaled_y_test)

    # -------------- Train ensemble -------------- #
    moe = algorithms.MixtureOfExperts(
        networks=[
            algorithms.Momentum(
                (insize, 20, outsize),
                step=0.1,
                batch_size=1,
                verbose=False
            ),
            algorithms.Momentum(
                (insize, 20, outsize),
                step=0.1,
                batch_size=1,
                verbose=False
            ),
        ],
        gating_network=algorithms.Momentum(
            layers.Softmax(insize) > layers.Output(2),
            step=0.1,
            verbose=False
        )
    )
    moe.train(x_train, y_train, epochs=n_epochs)
    ensemble_output = moe.predict(x_test)
    ensemble_error = rmsle(
        output_scaler.inverse_transform(ensemble_output),
        scaled_y_test
    )
    self.assertGreater(network_error, ensemble_error)
def test_pipeline(self):
    dataset = datasets.load_diabetes()
    target_scaler = preprocessing.MinMaxScaler()
    target = dataset.target.reshape(-1, 1)

    x_train, x_test, y_train, y_test = train_test_split(
        dataset.data,
        target_scaler.fit_transform(target),
        train_size=0.85
    )

    network = algorithms.GradientDescent(
        connection=[
            layers.Input(10),
            layers.Sigmoid(25),
            layers.Sigmoid(1),
        ],
        show_epoch=100,
        verbose=False,
    )
    pipeline = Pipeline([
        ('min_max_scaler', preprocessing.MinMaxScaler()),
        ('gd', network),
    ])
    pipeline.fit(x_train, y_train, gd__epochs=50)
    y_predict = pipeline.predict(x_test)

    error = rmsle(target_scaler.inverse_transform(y_test),
                  target_scaler.inverse_transform(y_predict).round())
    self.assertAlmostEqual(0.48, error, places=2)
def gmm_clustering():
    conversion = {
        0: 2,
        1: 0,
        2: 1,
    }
    g = mixture.GMM(n_components=3)
    iris_data = datasets.load_iris()
    diabetes_data = datasets.load_diabetes()  # loaded but not used below
    data = iris_data

    # Generate random observations with two modes centered on 0
    # and 10 (generated for reference, not used below).
    np.random.seed(0)
    obs = np.concatenate((np.random.randn(100, 1),
                          10 + np.random.randn(300, 1)))

    g.fit(data.data)
    print("Target classification")
    print(data.target)

    results = g.predict(data.data)
    results = [conversion[item] for item in results]
    print("\nResults")
    print(np.array(results))

    compare = [results[i] == data.target[i] for i in range(len(results))]
    accuracy_count = [item for item in compare if item]
    print("\nAccuracy: {:.0%}".format(float(len(accuracy_count)) / len(compare)))
    print(max(data.target))
def main():
    diabetes = datasets.load_diabetes()

    # Use only one feature
    diabetes_X = diabetes.data[:, np.newaxis, 2]
    diabetes_X = scale(diabetes_X)
    diabetes_y = scale(diabetes.target)

    diabetes_X_train = diabetes_X[:-20]
    diabetes_X_test = diabetes_X[-20:]
    # diabetes_y_train = diabetes.target[:-20]
    # diabetes_y_test = diabetes.target[-20:]
    diabetes_y_train = diabetes_y[:-20]
    diabetes_y_test = diabetes_y[-20:]

    # regr = linear_model.LinearRegression()
    regr = LinearRegression(n_iter=50, fit_alg="batch")
    # regr = LinearRegressionNormal()
    regr.fit(diabetes_X_train, diabetes_y_train)
    # regr.fit(np.array([[0, 0], [1, 1], [2, 2]]), np.array([0, 1, 2]))
    # print(regr.predict(np.array([[3, 3]])))

    # print('Coefficients: \n', regr.coef_)
    # print("Residual sum of squares: %.2f"
    #       % np.mean((regr.predict(diabetes_X_test) - diabetes_y_test) ** 2))
    print("Variance score: %.2f" % regr.score(diabetes_X_test, diabetes_y_test))
def test_grid_search(self):
    def scorer(network, X, y):
        result = network.predict(X)
        return rmsle(result[:, 0], y)

    dataset = datasets.load_diabetes()
    x_train, x_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, train_size=0.7
    )

    grnnet = algorithms.GRNN(std=0.5, verbose=False)
    grnnet.train(x_train, y_train)
    error = scorer(grnnet, x_test, y_test)
    self.assertAlmostEqual(0.513, error, places=3)

    random_search = model_selection.RandomizedSearchCV(
        grnnet,
        param_distributions={'std': np.arange(1e-2, 0.1, 1e-4)},
        n_iter=10,
        scoring=scorer,
        random_state=self.random_seed
    )
    random_search.fit(dataset.data, dataset.target)
    scores = random_search.cv_results_

    best_score = min(scores['mean_test_score'])
    self.assertAlmostEqual(0.4266, best_score, places=3)
def test_grid_search(self):
    def scorer(network, X, y):
        result = network.predict(X)
        return rmsle(result, y)

    dataset = datasets.load_diabetes()
    x_train, x_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, train_size=0.7
    )

    grnnet = algorithms.GRNN(std=0.5, verbose=False)
    grnnet.train(x_train, y_train)
    error = scorer(grnnet, x_test, y_test)
    self.assertAlmostEqual(0.513, error, places=3)

    random_search = grid_search.RandomizedSearchCV(
        grnnet,
        param_distributions={'std': np.arange(1e-2, 1, 1e-4)},
        n_iter=10,
        scoring=scorer
    )
    random_search.fit(dataset.data, dataset.target)
    scores = random_search.grid_scores_

    best_score = min(scores, key=itemgetter(1))
    self.assertAlmostEqual(0.452, best_score[1], places=3)
def test_pipeline(self):
    dataset = datasets.load_diabetes()
    target_scaler = preprocessing.MinMaxScaler()
    target = dataset.target.reshape(-1, 1)

    x_train, x_test, y_train, y_test = train_test_split(
        dataset.data,
        target_scaler.fit_transform(target),
        train_size=0.85
    )

    network = algorithms.Backpropagation(
        connection=[
            layers.SigmoidLayer(10),
            layers.SigmoidLayer(40),
            layers.OutputLayer(1),
        ],
        use_bias=True,
        show_epoch=100,
        verbose=False,
    )
    pipeline = Pipeline([
        ('min_max_scaler', preprocessing.MinMaxScaler()),
        ('backpropagation', network),
    ])
    pipeline.fit(x_train, y_train, backpropagation__epochs=1000)
    y_predict = pipeline.predict(x_test)

    error = rmsle(target_scaler.inverse_transform(y_test),
                  target_scaler.inverse_transform(y_predict).round())
    self.assertAlmostEqual(0.4481, error, places=4)
def get_data(n_clients):
    """
    Import the dataset via sklearn, shuffle, and split train/test.
    Return training data and targets for `n_clients` and a holdout test set.
    """
    print("Loading data")
    diabetes = load_diabetes()
    y = diabetes.target
    X = diabetes.data

    # Add a constant column to emulate an intercept
    X = np.c_[X, np.ones(X.shape[0])]
    # The features are already preprocessed

    # Shuffle
    perm = np.random.permutation(X.shape[0])
    X, y = X[perm, :], y[perm]

    # Select the test set at random
    test_size = 50
    test_idx = np.random.choice(X.shape[0], size=test_size, replace=False)
    train_idx = np.ones(X.shape[0], dtype=bool)
    train_idx[test_idx] = False
    X_test, y_test = X[test_idx, :], y[test_idx]
    X_train, y_train = X[train_idx, :], y[train_idx]

    # Split the training data among multiple clients.
    # The selection is not at random. We simulate the fact that each client
    # sees a potentially very different sample of patients.
    X, y = [], []
    step = int(X_train.shape[0] / n_clients)
    for c in range(n_clients):
        X.append(X_train[step * c: step * (c + 1), :])
        y.append(y_train[step * c: step * (c + 1)])

    return X, y, X_test, y_test
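# A minimal usage sketch for get_data above, assuming numpy (as np) and
# load_diabetes are in scope; three clients is an arbitrary choice.
X_clients, y_clients, X_test, y_test = get_data(n_clients=3)
for i, (Xc, yc) in enumerate(zip(X_clients, y_clients)):
    print("client %d: %d samples, %d features" % (i, Xc.shape[0], Xc.shape[1]))
print("holdout test set:", X_test.shape)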
def test_simple_grnn(self):
    dataset = datasets.load_diabetes()
    x_train, x_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, train_size=0.7
    )

    x_train_before = x_train.copy()
    x_test_before = x_test.copy()
    y_train_before = y_train.copy()

    grnnet = algorithms.GRNN(std=0.1, verbose=False)
    grnnet.train(x_train, y_train)

    result = grnnet.predict(x_test)
    error = rmsle(result, y_test)

    old_result = result.copy()
    self.assertAlmostEqual(error, 0.4245, places=4)

    # Test problem with variable links
    np.testing.assert_array_equal(x_train, x_train_before)
    np.testing.assert_array_equal(x_test, x_test_before)
    np.testing.assert_array_equal(y_train, y_train_before)

    x_train[:, :] = 0
    result = grnnet.predict(x_test)
    np.testing.assert_array_almost_equal(result, old_result)
def test_ElasticnetWeights():
    """Test elastic net with a different weight for each predictor.

    alpha: a vector of weights; a small value means prior knowledge,
    1 means no prior knowledge.
    """
    # Has 10 features
    diabetes = datasets.load_diabetes()
    # pprint(diabetes)
    print("Size of data: {}".format(diabetes.data.shape))

    X = diabetes.data
    y = diabetes.target
    X /= X.std(axis=0)  # Standardize data (easier to set the l1_ratio parameter)

    eps = 5e-3  # the smaller it is, the longer the path
    alphas = np.arange(2, 4, 0.2)
    alphas = np.append(alphas, 2.27889)  # best alpha from CV

    # Compute the regularization path using the lasso
    alphas_lasso, coefs_lasso, _ = lasso_path(X, y, eps, fit_intercept=False,
                                              alphas=alphas)

    # Compute the regularization path using the elastic net
    alphas_enet, coefs_enet, _ = enet_path(
        X, y, eps=eps, l1_ratio=0.8, fit_intercept=False, alphas=alphas)

    # ElasticNetCV
    num_predict = X.shape[1]
    alphas = np.zeros(num_predict)
    alphas.fill(1)
    val = 0.1
    alphas[2] = val
    alphas[3] = val
    alphas[6] = val

    enetCV_alpha, enetCV_coef = runPrintResults(X, y, None, "EnetCV")
    runPrintResults(X, y, alphas, "EnetCVWeight 1")
    # print("coefs_enet: {}".format(coefs_enet[:, -1]))
    # print("coefs_lasso: {}".format(coefs_lasso[:, -1]))

    # Display results
    plt.figure(1)
    ax = plt.gca()
    ax.set_prop_cycle(color=2 * ['b', 'r', 'g', 'c', 'k'])  # set_color_cycle was removed in matplotlib 2.0
    l1 = plt.plot(alphas_lasso, coefs_lasso.T)
    l2 = plt.plot(alphas_enet, coefs_enet.T, linestyle='--')

    # repeat alpha for x-axis values for plotting
    enetCV_alphaVect = [enetCV_alpha] * num_predict
    l3 = plt.scatter(enetCV_alphaVect, enetCV_coef, marker='x')

    plt.xlabel('alpha')
    plt.ylabel('coefficients')
    plt.title('Lasso and Elastic-Net Paths')
    plt.legend((l1[-1], l2[-1]), ('Lasso', 'Elastic-Net'), loc='upper right')
    plt.axis('tight')
    plt.savefig("fig/lassoEnet")
def bagging_regression():
    dataset = load_diabetes()
    x = dataset.data
    y = dataset.target

    sample_parameter = {
        'n_jobs': -1,
        'min_samples_leaf': 2,   # integer count (scikit-learn reads a float as a fraction)
        'n_estimators': 500,
        'max_features': 0.55,
        'criterion': 'mse',
        'min_samples_split': 4,  # integer count (scikit-learn reads a float as a fraction)
        'model': 'RFREG',
        'max_depth': 4,
    }
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                        random_state=42)

    clf_layer = mlc.layer.layer.RegressionLayer()
    print("single prediction")
    # y_train_predict, y_test_predict = clf_layer.predict(x_train, y_train, x_test, sample_parameter)
    # print(y_test_predict)
    y_train_predict_proba, y_test_predict_proba = clf_layer.predict(
        x_train, y_train, x_test, sample_parameter)
    # print(y_test_predict_proba)
    print(evaluate_function(y_test, y_test_predict_proba, 'mean_squared_error'))

    print("multi ensemble prediction")
    multi_bagging_clf = mlc.layer.layer.RegressionBaggingLayer()
    y_train_predict_proba, y_test_predict_proba = multi_bagging_clf.predict(
        x_train, y_train, x_test, sample_parameter, times=5)
    print(evaluate_function(y_test, y_test_predict_proba, 'mean_squared_error'))
def load_data(self, shuffled=True):
    samples = load_diabetes()
    if shuffled:
        # shuffling X and y with the same random_state keeps them aligned
        self.X = shuffle(samples.data, random_state=self.SEED)
        self.y = shuffle(samples.target, random_state=self.SEED)
    else:
        self.X, self.y = samples.data, samples.target
    self.n_features = len(self.X[0])
def test_regression_plot_3d(self):
    df = pdml.ModelFrame(datasets.load_diabetes())
    df.data = df.data[[0, 2]]
    df.fit(df.linear_model.LinearRegression())

    ax = df.plot_estimator()

    from mpl_toolkits.mplot3d import Axes3D
    self.assertIsInstance(ax, Axes3D)
def test_gridsearch(self):
    import sklearn.grid_search as gs

    tuned_parameters = {'statsmodel': [sm.OLS, sm.GLS]}
    diabetes = datasets.load_diabetes()
    cv = gs.GridSearchCV(base.StatsModelsRegressor(sm.OLS), tuned_parameters,
                         cv=5, scoring=None)
    fitted = cv.fit(diabetes.data, diabetes.target)
    self.assertTrue(fitted.best_estimator_.statsmodel is sm.OLS)
def cross_validated_estimators():
    lasso = linear_model.LassoCV()
    diabetes = datasets.load_diabetes()
    X_diabetes = diabetes.data
    y_diabetes = diabetes.target
    print(lasso.fit(X_diabetes, y_diabetes))
    # The estimator automatically chose its regularization parameter:
    print(lasso.alpha_)
def feature_correlation_pearson(
        path="images/feature_correlation_pearson.png"):
    data = datasets.load_diabetes()
    X, y = data['data'], data['target']
    feature_names = np.array(data['feature_names'])

    visualizer = FeatureCorrelation(labels=feature_names)
    visualizer.fit(X, y)
    visualizer.poof(outpath=path, clear_figure=True)
def test_regression_scores():
    diabetes = load_diabetes()
    X, y = diabetes.data, diabetes.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = Ridge()
    clf.fit(X_train, y_train)
    score1 = SCORERS['r2'](clf, X_test, y_test)
    score2 = r2_score(y_test, clf.predict(X_test))
    assert_almost_equal(score1, score2)
def get_data():
    diabetes = datasets.load_diabetes()
    x = diabetes.data
    y = diabetes.target

    # Note: the diabetes dataset only has 442 samples, so slicing to 447
    # keeps the full dataset.
    cases_num = 447
    x = x[:cases_num, :]
    y = y[:cases_num]

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    return x_train, y_train, x_test, y_test
def test_load_diabetes():
    res = load_diabetes()
    assert_equal(res.data.shape, (442, 10))
    assert_equal(res.target.size, 442)
    assert_equal(len(res.feature_names), 10)
    assert_true(res.DESCR)

    # test return_X_y option
    check_return_X_y(res, partial(load_diabetes))
def create_diabetes():
    diabetes_data = datasets.load_diabetes()
    x = diabetes_data.data
    y = diabetes_data.target
    for i in range(x.shape[1]):
        xi = array_functions.normalize(x[:, i])
        yi = array_functions.normalize(y)
        array_functions.plot_2d(xi, yi)
    assert False
# Virgile Fritsch <*****@*****.**>
#
# License: BSD 3 clause

import numpy as np
import pytest

from sklearn import datasets
from sklearn.covariance import empirical_covariance, EmpiricalCovariance, \
    ShrunkCovariance, shrunk_covariance, \
    LedoitWolf, ledoit_wolf, ledoit_wolf_shrinkage, OAS, oas
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_warns

X, _ = datasets.load_diabetes(return_X_y=True)
X_1d = X[:, 0]
n_samples, n_features = X.shape


def test_covariance():
    # Tests Covariance module on a simple dataset.
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    emp_cov = empirical_covariance(X)
    assert_array_almost_equal(emp_cov, cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(emp_cov), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm='spectral'), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm='frobenius'), 0)
    assert_almost_equal(cov.error_norm(emp_cov, scaling=False), 0)
    assert_raises(ValueError, clf.predict, sparse.lil_matrix(X))

    Xt = np.array(X).T
    clf.fit(np.dot(X, Xt), Y)
    assert_raises(ValueError, clf.predict, X)

    clf = svm.SVC()
    clf.fit(X, Y)
    assert_raises(ValueError, clf.predict, Xt)


@pytest.mark.parametrize(
    'Estimator, data',
    [(svm.SVC, datasets.load_iris(return_X_y=True)),
     (svm.NuSVC, datasets.load_iris(return_X_y=True)),
     (svm.SVR, datasets.load_diabetes(return_X_y=True)),
     (svm.NuSVR, datasets.load_diabetes(return_X_y=True)),
     (svm.OneClassSVM, datasets.load_iris(return_X_y=True))])
def test_svm_gamma_error(Estimator, data):
    X, y = data
    est = Estimator(gamma='auto_deprecated')
    err_msg = "When 'gamma' is a string, it should be either 'scale' or 'auto'"
    with pytest.raises(ValueError, match=err_msg):
        est.fit(X, y)


def test_unicode_kernel():
    # Test that a unicode kernel name does not cause a TypeError
    clf = svm.SVC(kernel='linear', probability=True)
    clf.fit(X, Y)
    clf.predict_proba(T)
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the dataset
diabetes = datasets.load_diabetes()  # diabetes is a dict-like Bunch object

# Unpack the data
data = diabetes['data']
target = diabetes['target']
feature_names = diabetes['feature_names']
# print(data.shape)
# print(target.shape)
# print(feature_names)
print(data)

df = pd.DataFrame(data, columns=feature_names)
# print(df.head())
# print(df.info())

train_X, test_X, train_Y, test_Y = train_test_split(data, target,
                                                    train_size=0.8,
                                                    test_size=0.2)
model = LinearRegression()
model.fit(train_X, train_Y)
# LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)  # repr echoed by fit()

# plt.figure(figsize=(12, 25))
# for i, col in enumerate(df.columns):
#     train_X = df.loc[:, col].reshape(-1, 1)
import pandas as pd
import scipy

from sklearn.datasets import load_diabetes
from sklearn.metrics import make_scorer

from julearn.scoring import register_scorer
from julearn import run_cross_validation
from julearn.utils import configure_logging

###############################################################################
# Set the logging level to info to see extra information
configure_logging(level='INFO')

###############################################################################
# Load the diabetes data from sklearn as a pandas dataframe
features, target = load_diabetes(return_X_y=True, as_frame=True)

###############################################################################
# The dataset contains ten variables (age, sex, body mass index, average blood
# pressure, and six blood serum measurements, s1-s6) for diabetes patients, and
# a quantitative measure of disease progression one year after baseline, which
# is the target we are interested in predicting.

print('Features: \n', features.head())  # type: ignore
print('Target: \n', target.describe())  # type: ignore

###############################################################################
# Let's combine features and target together in one dataframe and define X
# and y
data_diabetes = pd.concat([features, target], axis=1)  # type: ignore
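###############################################################################
# A minimal sketch of how X and y might then be passed to run_cross_validation,
# assuming julearn's interface of feature-column names, a target-column name,
# the combined dataframe, and a model string; 'ridge' is an illustrative choice.
X = list(features.columns)  # the ten diabetes feature columns
y = 'target'                # the disease-progression column added above
scores = run_cross_validation(X=X, y=y, data=data_diabetes, model='ridge',
                              problem_type='regression')
print(scores['test_score'].mean())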
def setUp(self):
    self.v = verbosity
    self.clf = Feat(verbosity=verbosity, n_threads=1)

    diabetes = load_diabetes()
    self.X = diabetes.data
    self.y = diabetes.target
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

columns = "age sex bmi map tc ldl hdl tch ltg glu".split()  # Declare the column names
# datasets = "/home/hadoop/hadoop/hadoop_working/DataScience/code/sklearn/diabetic.txt"
diabetes = datasets.load_diabetes()  # Load the diabetes dataset from sklearn
df = pd.DataFrame(diabetes.data, columns=columns)  # Load the dataset as a pandas data frame
y = diabetes.target  # define the target variable (dependent variable) as y

# create training and testing vars
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# fit a model
lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)
predictions = lm.predict(X_test)
print(predictions[0:5])

## The line / model
plt.scatter(y_test, predictions)
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error

diabetes_data = datasets.load_diabetes()
# ['data', 'target', 'DESCR', 'feature_names', 'data_filename', 'target_filename']
# print(diabetes_data.keys())
# print(diabetes_data.data)   # This prints the entire data array
# print(diabetes_data.DESCR)

# Below we're selecting one label and one feature:
# this takes the third column and converts it into a numpy column vector (array of arrays)
# diabetes_X = diabetes_data.data[:, np.newaxis, 2]
diabetes_X = diabetes_data.data  # This selects all the features
# print(diabetes_X)

# Now we're doing the train/test split
diabetes_X_train = diabetes_X[:-20]  # everything except the last 20 rows, for training
diabetes_X_test = diabetes_X[-20:]   # the last 20 rows, for testing
diabetes_y_train = diabetes_data.target[:-20]  # the corresponding labels for the train features
diabetes_y_test = diabetes_data.target[-20:]   # same for the test features

model = linear_model.LinearRegression()
model.fit(diabetes_X_train, diabetes_y_train)
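# A short follow-up sketch: mean_squared_error is imported above but never
# used, so this is one plausible way the example might continue.
diabetes_y_pred = model.predict(diabetes_X_test)
print("Mean squared error:",
      mean_squared_error(diabetes_y_test, diabetes_y_pred))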
# Importing the packages
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_diabetes

# Importing the data
# Ten variables related to age, sex, body mass index, mean blood pressure
# and six serum measurements were obtained for each of the 442 diabetes patients.
df = load_diabetes()

# Viewing the dataset's features:
print(df.feature_names)

# Viewing the data:
print(df.data[0:5, ])

# Defining the dependent/independent variables.
X = df.data
y = df.target

# Splitting the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# # The input dataset used for this example is diabetes. Information about this
# dataset is available at [sklearn diabetes dataset](https://scikit-learn.org/stable/datasets/toy_dataset.html#diabetes-dataset).
#
# # The goal of the model is to predict, from 10 input variables covering
# patient information (`age, sex, body mass index, average blood pressure` and
# 6 `blood serum` measurements), a target that is a quantitative measure of
# disease progression one year after treatment.

# In[1]:

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.datasets import load_diabetes

X, y = load_diabetes(return_X_y=True)
features = load_diabetes()['feature_names']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                    random_state=42)

# In[2]:

import numpy as np
import matplotlib.pyplot as plt

# Vary alpha from 0.1 up to 100
n_alphas = 200
alphas = 1 / np.logspace(1, -2, n_alphas)
coefs = []
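# A plausible continuation, assuming the classic ridge-path pattern this cell
# is setting up: fit one Ridge model per alpha and collect the coefficients.
for a in alphas:
    ridge = Ridge(alpha=a)
    ridge.fit(X_train, y_train)
    coefs.append(ridge.coef_)

plt.semilogx(alphas, coefs)  # one curve per feature
plt.xlabel('alpha')
plt.ylabel('coefficient value')
plt.show()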
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# 1. Data
dataset = load_diabetes()
x_train, x_test, y_train, y_test = train_test_split(dataset.data, dataset.target,
                                                    train_size=0.8,
                                                    random_state=32)

model = RandomForestRegressor(max_depth=4)
model.fit(x_train, y_train)

# 4. Evaluate, predict
acc = model.score(x_test, y_test)
print(model.feature_importances_)
print("acc :", acc)

df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
new_data = []
feature = []
a = np.percentile(model.feature_importances_, q=25)
for i in range(len(dataset.data[0])):
    if model.feature_importances_[i] > a:
        new_data.append(df.iloc[:, i])
import matplotlib.pyplot as plot
import numpy
from sklearn import datasets, linear_model, metrics
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# Loading diabetes Dataset
diabetesdataset = datasets.load_diabetes()
X = diabetesdataset.data
Y = diabetesdataset.target  # a continuous target, so SVC treats every distinct value as its own class

# Training and Testing Data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,
                                                    random_state=21)

# Cross-Validation
# C = 1.0 --> SVM Regularization Parameter
# Poly Kernel
clf = SVC(kernel='poly', degree=4, C=1.0, gamma=0.1).fit(X_train, Y_train)
y_pred = clf.predict(X_test)
print("poly Kernel Accuracy :", metrics.accuracy_score(Y_test, y_pred) * 100)

# Increasing Random State increases accuracy
import numpy as np

from sklearn.datasets import load_diabetes
from sklearn.utils.testing import assert_almost_equal

from lightning.ranking import PRank
from lightning.ranking import KernelPRank

bunch = load_diabetes()
X, y = bunch.data, bunch.target
y = np.round(y, decimals=-2)


def test_prank():
    est = PRank(n_iter=10, shuffle=False, random_state=0)
    est.fit(X, y)
    assert_almost_equal(est.score(X, y), 41.86, 2)

    est = PRank(n_iter=10, shuffle=True, random_state=0)
    est.fit(X, y)
    assert_almost_equal(est.score(X, y), 71.04, 2)


def test_prank_linear_kernel():
    est = KernelPRank(kernel="linear", n_iter=10, shuffle=False,
                      random_state=0)
    est.fit(X, y)
    assert_almost_equal(est.score(X, y), 41.86, 2)
def train_data():
    bunch = load_diabetes()
    X, y = bunch.data, bunch.target
    y = np.round(y, decimals=-2)  # bucket the continuous target to the nearest hundred
    return X, y
        n_neighbors=1, mode='distance', metric='euclidean',
        include_self=False, n_jobs=-1)
    closest_distances = kg.toarray()[np.where(kg.toarray() > 0)]
    eps = closest_distances.max()

    clustering = DBSCAN(eps=eps, min_samples=2, leaf_size=30,
                        n_jobs=-1).fit(ball_allies_adv)
    labels = clustering.labels_
    # justif = int(labels[0] == labels[1])
    # return justif
    return int(labels[0] == labels[1])


if (__name__ == '__main__'):
    from sklearn.datasets import load_diabetes
    from sklearn.ensemble import RandomForestClassifier

    X, y = load_diabetes().data, load_diabetes().target
    y = 2 * ((y > y.mean()).astype(int)) - 1

    clf = RandomForestClassifier(n_estimators=10)
    clf = clf.fit(X[:100], y[:100])
    print(evaluation_test(X[0], clf, y[0], X[1:], y[1:]))

    # from sklearn.model_selection import train_test_split
    # X = load_diabetes().data
    # X_tr, X_ts = train_test_split(X, test_size=0.1, random_state=0)
    # lof = myLocalOutlierFactor(n_neighbors=1, metric='minkowski', p=2)
    # lof = lof.fit(X_tr)
    # print('LocalReachabilityDensity: \n {}'.format(lof.local_reachability_density(X_tr[:10])))
    # print('LocalOutlierFactor: \n {}'.format(lof.local_outlier_factor(X_ts)))
    # Estimate the score after chained imputation of the missing values
    estimator = make_pipeline(
        make_union(ChainedImputer(missing_values=0, random_state=0),
                   MissingIndicator(missing_values=0)),
        RandomForestRegressor(random_state=0, n_estimators=100))
    chained_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                            scoring='neg_mean_squared_error')

    return ((full_scores.mean(), full_scores.std()),
            (zero_impute_scores.mean(), zero_impute_scores.std()),
            (mean_impute_scores.mean(), mean_impute_scores.std()),
            (chained_impute_scores.mean(), chained_impute_scores.std()))


results_diabetes = np.array(get_results(load_diabetes()))
mses_diabetes = results_diabetes[:, 0] * -1
stds_diabetes = results_diabetes[:, 1]

results_boston = np.array(get_results(load_boston()))
mses_boston = results_boston[:, 0] * -1
stds_boston = results_boston[:, 1]

n_bars = len(mses_diabetes)
xval = np.arange(n_bars)

x_labels = ['Full data',
            'Zero imputation',
            'Mean Imputation',
            'Chained Imputation']
colors = ['r', 'g', 'b', 'orange']
between the observed responses in the dataset and the responses predicted by
the linear approximation.

The coefficients, the residual sum of squares and the variance score are
also calculated.

"""
print(__doc__)

import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Load the diabetes dataset.
benhTieuDuong = datasets.load_diabetes()

# Use only one feature.
benhTieuDuong_X = benhTieuDuong.data[:, np.newaxis, 2]

# Split the data into training/testing sets.
benhTieuDuong_X_train = benhTieuDuong_X[:-20]
benhTieuDuong_X_test = benhTieuDuong_X[-20:]

# Split the targets into training/testing sets.
benhTieuDuong_y_train = benhTieuDuong.target[:-20]  # 0 -> size - 20
benhTieuDuong_y_test = benhTieuDuong.target[-20:]   # size - 20 -> size

# Create the linear regression object.
linearRegression = linear_model.LinearRegression()
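# A plausible continuation following the classic scikit-learn example this
# snippet mirrors: train, predict, and report the imported metrics.
linearRegression.fit(benhTieuDuong_X_train, benhTieuDuong_y_train)
benhTieuDuong_y_pred = linearRegression.predict(benhTieuDuong_X_test)

print('Coefficients:', linearRegression.coef_)
print('Mean squared error: %.2f'
      % mean_squared_error(benhTieuDuong_y_test, benhTieuDuong_y_pred))
print('Variance score: %.2f' % r2_score(benhTieuDuong_y_test,
                                        benhTieuDuong_y_pred))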
import numpy as np

from sklearn.datasets import load_diabetes
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils.testing import assert_almost_equal

from ivalice.ranking import LambdaMART

data = load_diabetes()
X, y = data.data, data.target
y /= (y.max() - y.min())


def test_lambda_mart_ndcg():
    for gains in ("linear", "exponential"):
        reg = DecisionTreeRegressor()
        lm = LambdaMART(reg, n_estimators=10, max_rank=10, gains=gains)
        lm.fit(X, y)
        ndcg = lm.score(X, y)
        assert_almost_equal(ndcg, 1.0)
def Main():
    import argparse
    import numpy as np
    from sklearn.datasets import load_diabetes
    from chainer import cuda, Variable, FunctionSet, optimizers
    import chainer.functions as F

    parser = argparse.ArgumentParser(description='Chainer example: regression')
    parser.add_argument('--gpu', '-g', default=-1, type=int,
                        help='GPU ID (negative value indicates CPU)')
    args = parser.parse_args()

    batchsize = 13
    n_epoch = 100
    n_units = 30

    # Prepare dataset
    print('fetch diabetes dataset')
    diabetes = load_diabetes()
    data = diabetes['data'].astype(np.float32)
    target = diabetes['target'].astype(np.float32).reshape(
        len(diabetes['target']), 1)

    N = batchsize * 30  # Number of training data
    x_train, x_test = np.split(data, [N])
    y_train, y_test = np.split(target, [N])
    N_test = y_test.size
    print('Num of samples for train:', len(y_train))
    print('Num of samples for test:', len(y_test))

    # Dump data for plot:
    fp1 = open('/tmp/smpl_train.dat', 'w')
    for x, y in zip(x_train, y_train):
        fp1.write('%s #%i# %s\n' % (' '.join(map(str, x)), len(x) + 1,
                                    ' '.join(map(str, y))))
    fp1.close()

    # Dump data for plot:
    fp1 = open('/tmp/smpl_test.dat', 'w')
    for x, y in zip(x_test, y_test):
        fp1.write('%s #%i# %s\n' % (' '.join(map(str, x)), len(x) + 1,
                                    ' '.join(map(str, y))))
    fp1.close()

    # Prepare multi-layer perceptron model
    model = FunctionSet(l1=F.Linear(10, n_units),
                        l2=F.Linear(n_units, n_units),
                        l3=F.Linear(n_units, 1))
    if args.gpu >= 0:
        cuda.init(args.gpu)
        model.to_gpu()

    # Neural net architecture
    def forward(x_data, y_data, train=True):
        x, t = Variable(x_data), Variable(y_data)
        h1 = F.dropout(F.relu(model.l1(x)), train=train)
        h2 = F.dropout(F.relu(model.l2(h1)), train=train)
        y = model.l3(h2)
        return F.mean_squared_error(y, t), y

    # Setup optimizer
    optimizer = optimizers.AdaDelta(rho=0.9)
    optimizer.setup(model.collect_parameters())

    # Learning loop
    for epoch in range(1, n_epoch + 1):
        print('epoch', epoch)

        # training
        perm = np.random.permutation(N)
        sum_loss = 0
        for i in range(0, N, batchsize):
            x_batch = x_train[perm[i:i + batchsize]]
            y_batch = y_train[perm[i:i + batchsize]]
            if args.gpu >= 0:
                x_batch = cuda.to_gpu(x_batch)
                y_batch = cuda.to_gpu(y_batch)

            optimizer.zero_grads()
            loss, pred = forward(x_batch, y_batch)
            loss.backward()
            optimizer.update()
            sum_loss += float(cuda.to_cpu(loss.data)) * batchsize

        print('train mean loss={}'.format(sum_loss / N))

        '''
        # testing per batch
        sum_loss = 0
        preds = []
        for i in range(0, N_test, batchsize):
            x_batch = x_test[i:i + batchsize]
            y_batch = y_test[i:i + batchsize]
            if args.gpu >= 0:
                x_batch = cuda.to_gpu(x_batch)
                y_batch = cuda.to_gpu(y_batch)
            loss, pred = forward(x_batch, y_batch, train=False)
            preds.extend(cuda.to_cpu(pred.data))
            sum_loss += float(cuda.to_cpu(loss.data)) * batchsize
        pearson = np.corrcoef(np.asarray(preds).reshape(len(preds),),
                              np.asarray(y_test).reshape(len(preds),))
        #'''

        #'''
        # testing all data
        preds = []
        x_batch = x_test[:]
        y_batch = y_test[:]
        if args.gpu >= 0:
            x_batch = cuda.to_gpu(x_batch)
            y_batch = cuda.to_gpu(y_batch)
        loss, pred = forward(x_batch, y_batch, train=False)
        preds = cuda.to_cpu(pred.data)
        sum_loss = float(cuda.to_cpu(loss.data)) * len(y_test)
        pearson = np.corrcoef(
            np.asarray(preds).reshape(len(preds), ),
            np.asarray(y_test).reshape(len(preds), ))
        #'''

        print('test mean loss={}, corrcoef={}'.format(sum_loss / N_test,
                                                      pearson[0][1]))

        # Dump data for plot:
        fp1 = open('/tmp/nn_test%04i.dat' % epoch, 'w')
        for x, y in zip(x_test, preds):
            fp1.write('%s #%i# %s\n' % (' '.join(map(str, x)), len(x) + 1,
                                        ' '.join(map(str, y))))
        fp1.close()
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

from ELM import ELMRegressor

test_maes_dictionary = dict()

plt.style.use('ggplot')
sns.set_context("talk")
np.random.seed(0)

## DATA PREPROCESSING
X, y = load_diabetes(return_X_y=True)  # return_X_y gives (data, target) directly
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=2)

stdScaler_data = StandardScaler()
X_train = stdScaler_data.fit_transform(X_train)
X_test = stdScaler_data.transform(X_test)

stdScaler_target = StandardScaler()
y_train = stdScaler_target.fit_transform(y_train.reshape(-1, 1)).ravel()  # /max(y_train)
y_test = stdScaler_target.transform(y_test.reshape(-1, 1)).ravel()  # /max(y_train)

max_y_train = max(abs(y_train))
y_train = y_train / max_y_train
y_test = y_test / max_y_train
def main():
    EPSILON = 1e-4

    X, y = datasets.load_diabetes(return_X_y=True)

    rng = np.random.RandomState(42)
    X = np.c_[X, rng.randn(X.shape[0], 14)]  # add some bad features

    # normalize data as done by Lars to allow for comparison
    X /= np.sqrt(np.sum(X ** 2, axis=0))

    # #########################################################################
    # LassoLarsIC: least angle regression with BIC/AIC criterion
    model_bic = LassoLarsIC(criterion='bic')
    t1 = time.time()
    model_bic.fit(X, y)
    t_bic = time.time() - t1
    alpha_bic_ = model_bic.alpha_

    model_aic = LassoLarsIC(criterion='aic')
    model_aic.fit(X, y)
    alpha_aic_ = model_aic.alpha_

    def plot_ic_criterion(model, name, color):
        criterion_ = model.criterion_
        plt.semilogx(model.alphas_ + EPSILON, criterion_, '--', color=color,
                     linewidth=3, label='%s criterion' % name)
        plt.axvline(model.alpha_ + EPSILON, color=color, linewidth=3,
                    label='alpha: %s estimate' % name)
        plt.xlabel(r'$\alpha$')
        plt.ylabel('criterion')

    plt.figure()
    plot_ic_criterion(model_aic, 'AIC', 'b')
    plot_ic_criterion(model_bic, 'BIC', 'r')
    plt.legend()
    plt.title('Information-criterion for model selection (training time %.3fs)'
              % t_bic)

    # #########################################################################
    # LassoCV: coordinate descent

    # Compute paths
    print("Computing regularization path using the coordinate descent lasso...")
    t1 = time.time()
    model = LassoCV(cv=20).fit(X, y)
    t_lasso_cv = time.time() - t1

    # Display results
    plt.figure()
    ymin, ymax = 2300, 3800
    plt.semilogx(model.alphas_ + EPSILON, model.mse_path_, ':')
    plt.plot(model.alphas_ + EPSILON, model.mse_path_.mean(axis=-1), 'k',
             label='Average across the folds', linewidth=2)
    plt.axvline(model.alpha_ + EPSILON, linestyle='--', color='k',
                label='alpha: CV estimate')

    plt.legend()
    plt.xlabel(r'$\alpha$')
    plt.ylabel('Mean square error')
    plt.title('Mean square error on each fold: coordinate descent '
              '(train time: %.2fs)' % t_lasso_cv)
    plt.axis('tight')
    plt.ylim(ymin, ymax)

    # #########################################################################
    # LassoLarsCV: least angle regression

    # Compute paths
    print("Computing regularization path using the Lars lasso...")
    t1 = time.time()
    model = LassoLarsCV(cv=20).fit(X, y)
    t_lasso_lars_cv = time.time() - t1

    # Display results
    plt.figure()
    plt.semilogx(model.cv_alphas_ + EPSILON, model.mse_path_, ':')
    plt.semilogx(model.cv_alphas_ + EPSILON, model.mse_path_.mean(axis=-1),
                 'k', label='Average across the folds', linewidth=2)
    plt.axvline(model.alpha_, linestyle='--', color='k', label='alpha CV')
    plt.legend()
    plt.xlabel(r'$\alpha$')
    plt.ylabel('Mean square error')
    plt.title('Mean square error on each fold: Lars (train time: %.2fs)'
              % t_lasso_lars_cv)
    plt.axis('tight')
    plt.ylim(ymin, ymax)

    plt.show()
def diabetes():
    return load_diabetes()
Training a pipeline
+++++++++++++++++++
"""
from pyquickhelper.helpgen.graphviz_helper import plot_graphviz
import numpy
from onnxruntime import InferenceSession
from sklearn.datasets import load_diabetes
from sklearn.ensemble import (GradientBoostingRegressor,
                              RandomForestRegressor, VotingRegressor)
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from skl2onnx import to_onnx
from mlprodict.onnxrt import OnnxInference

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Train regressors
reg1 = GradientBoostingRegressor(random_state=1, n_estimators=5)
reg2 = RandomForestRegressor(random_state=1, n_estimators=5)
reg3 = LinearRegression()

ereg = Pipeline(steps=[
    ('voting', VotingRegressor([('gb', reg1), ('rf', reg2), ('lr', reg3)])),
])
ereg.fit(X_train, y_train)

#################################
# Converts the model
# ++++++++++++++++++
def test_predict(self):
    diabetes = datasets.load_diabetes()
    estimator = base.StatsModelsRegressor(sm.OLS)
    with self.assertRaisesRegexp(
            ValueError, 'StatsModelsRegressor is not fitted to data'):
        estimator.predict(diabetes.data)
from sys import platform
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor,
                              GradientBoostingClassifier,
                              GradientBoostingRegressor)
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier, plot_importance
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from xgboost.sklearn import XGBRegressor

# 1. Data
dataset = load_diabetes()  # avoid shadowing sklearn's imported `datasets` module
x_train, x_test, y_train, y_test = train_test_split(dataset.data,
                                                    dataset.target,
                                                    train_size=0.8,
                                                    random_state=104)

#2
# model = GradientBoostingRegressor(max_depth=4)
model = XGBRegressor(n_jobs=-1, use_label_encoder=False)

#3
model.fit(x_train, y_train, eval_metric='rmse')  # a regression metric (mlogloss is for classification)

#4
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error

diabetes = datasets.load_diabetes()  # load the diabetes dataset

print(diabetes.keys())  # shows the keys of the dataset:
# (['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename'])
print(diabetes.data)   # shows the data in numpy array form
print(diabetes.DESCR)  # gives a description of the dataset

# diabetes_X = diabetes.data[:, np.newaxis, 3]  # ':' takes all rows of the column at index 3 and puts it in a numpy array
# print(diabetes_X)  # simple linear regression

diabetes_X = diabetes.data  # all features, i.e. multiple regression (we can't plot this)

diabetes_X_train = diabetes_X[:-30]  # all feature rows except the last 30, used for training
diabetes_X_test = diabetes_X[-30:]   # the last 30 feature rows, used to test the model
diabetes_Y_train = diabetes.target[:-30]  # the labels for the training set
diabetes_Y_test = diabetes.target[-30:]   # the labels for the test set

model = linear_model.LinearRegression()  # create a linear regression model
model.fit(diabetes_X_train, diabetes_Y_train)  # train the model

diabetes_Y_predicted = model.predict(diabetes_X_test)  # predict on the test set
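# A short follow-up sketch: mean_squared_error is imported above but never
# used, so this is one plausible way the example might continue.
print("Mean squared error:",
      mean_squared_error(diabetes_Y_test, diabetes_Y_predicted))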
from sklearn.datasets import load_diabetes
from sklearn import linear_model
import matplotlib.pyplot as plt
import pandas

diabetes = load_diabetes()
diabetes.keys()
print(diabetes.DESCR)

tabela = pandas.DataFrame(diabetes.data)
tabela.columns = diabetes.feature_names
tabela.head(10)

tabela['Taxa'] = diabetes.target
print(tabela.head(10))

X = tabela[["bmi", "s3"]]
X_t = X[:-20]
X_v = X[-20:]
print(X_t["bmi"])

y_t = tabela["Taxa"][:-20]
y_v = tabela["Taxa"][-20:]

regr = linear_model.LinearRegression()

# train the model
regr.fit(X_t, y_t)

# make the prediction
y_pred = regr.predict(X_v)
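# A plausible continuation using the matplotlib import above: compare the
# predictions against the held-out targets.
plt.scatter(y_v, y_pred)
plt.xlabel('observed Taxa')
plt.ylabel('predicted Taxa')
plt.show()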
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model  # we use the built-in diabetes dataset from sklearn
from sklearn.metrics import mean_squared_error

diabetes = datasets.load_diabetes()  # this loads the dataset from sklearn
# print(diabetes.keys())  # shows the keys of the dataset
# the line above shows this ---->
# dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename'])
# print(diabetes.data)
# print(diabetes.DESCR)  # shows the dataset description

diabetes_x = diabetes.data[:, np.newaxis, 2]  # ':' means all rows, and we take only the column at index 2

# for the x axis
diabetes_x_Train = diabetes_x[:-30]  # everything except the last 30 rows, to train our algorithm
diabetes_x_Test = diabetes_x[-30:]   # the last 30 rows, for testing
# we can also pick any split we like, e.g. 10 for test and 10 for train, in the same way

# for the y axis
diabetes_y_train = diabetes.target[:-30]
diabetes_y_test = diabetes.target[-30:]

# the linear model
model = linear_model.LinearRegression()  # imported LinearRegression
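# A plausible continuation in the spirit of the comments above: fit the model,
# predict on the held-out rows, and report the imported metric.
model.fit(diabetes_x_Train, diabetes_y_train)
diabetes_y_predicted = model.predict(diabetes_x_Test)
print("Mean squared error:",
      mean_squared_error(diabetes_y_test, diabetes_y_predicted))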
def __init__(self):
    self._diabetes = datasets.load_diabetes()
    self._shrink_x = np.c_[.5, 1].T
    self._shrink_y = [.5, 1]
    self._shrink_t = np.c_[0, 2].T
    self._alphas = np.logspace(-4, -1, 6)
The coefficients, the residual sum of squares and the variance score are
also calculated.

"""
print(__doc__)

# Code source: Jaques Grobler
# License: BSD 3 clause

import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Load the diabetes dataset
diabetes = datasets.load_diabetes()

# Use only one feature
diabetes_X = diabetes.data[:, np.newaxis, 2]

# Split the data into training/testing sets
diabetes_X_train = diabetes_X[:-20]
diabetes_X_test = diabetes_X[-20:]

# Split the targets into training/testing sets
diabetes_y_train = diabetes.target[:-20]
diabetes_y_test = diabetes.target[-20:]

# Create linear regression object
regr = linear_model.LinearRegression()
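# A plausible continuation following the classic scikit-learn example this
# docstring describes: train, predict, and report the imported metrics.
regr.fit(diabetes_X_train, diabetes_y_train)
diabetes_y_pred = regr.predict(diabetes_X_test)

print('Coefficients: \n', regr.coef_)
print('Mean squared error: %.2f'
      % mean_squared_error(diabetes_y_test, diabetes_y_pred))
print('Variance score: %.2f' % r2_score(diabetes_y_test, diabetes_y_pred))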
import numpy as np
from sklearn import linear_model, metrics
from sklearn import datasets
from sklearn.metrics import r2_score

diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)

diabetes_X_train = diabetes_X[:-20]
diabetes_y_train = diabetes_y[:-20]
diabetes_X_test = diabetes_X[-20:]
diabetes_y_test = diabetes_y[-20:]

regr = linear_model.LinearRegression()
regr.fit(diabetes_X_train, diabetes_y_train)
print(regr.coef_)

diabetes_y_pred = regr.predict(diabetes_X_test)
mean_square_error = metrics.mean_squared_error(diabetes_y_test,
                                               diabetes_y_pred)
print('mean square error: {}'.format(mean_square_error))
print('r2 score: {}'.format(r2_score(diabetes_y_test, diabetes_y_pred)))