def svm_cross_validate_category(X, y, category, C, penalty, sample_weights,
                                loss='epsilon_insensitive', epsilon=0.1):
    # loss and epsilon were undefined in the original; exposed here as keyword
    # arguments with assumed defaults.
    clf_svm_1 = SGDRegressor(loss=loss, penalty=penalty, epsilon=epsilon, alpha=C, shuffle=True)
    clf_svm_2 = SGDRegressor(loss=loss, penalty=penalty, epsilon=epsilon, alpha=C, shuffle=True)
    N = len(category)  # N was undefined in the original
    cv_indices = generate_cv_indices(category)
    train_ids = cv_indices[0:N]
    test_ids = cv_indices[N:2 * N]
    clf_svm_1.fit(X[train_ids, :], y[train_ids], sample_weight=sample_weights[train_ids])
    clf_svm_2.fit(X[test_ids, :], y[test_ids], sample_weight=sample_weights[test_ids])
    score = np.zeros(2)
    score[0] = clf_svm_1.score(X[test_ids, :], y[test_ids])
    score[1] = clf_svm_2.score(X[train_ids, :], y[train_ids])
    mean_score = np.mean(score)
    # SGDRegressor no longer has decision_function; predict returns the same
    # linear output.
    y_1 = clf_svm_1.predict(X[test_ids, :])
    y_2 = clf_svm_2.predict(X[train_ids, :])
    u, indices = np.unique(category, return_inverse=True)
    auc = np.zeros((2, len(u)))
    for i in range(len(u)):
        i_inds = indices == i
        if np.sum(test_ids & i_inds) != 0:
            fpr, tpr, thresholds = metrics.roc_curve(y[test_ids & i_inds], y_1[i_inds[test_ids]], pos_label=1)
            auc[0, i] = metrics.auc(fpr, tpr)
        if np.sum(train_ids & i_inds) != 0:
            fpr, tpr, thresholds = metrics.roc_curve(y[train_ids & i_inds], y_2[i_inds[train_ids]], pos_label=1)
            auc[1, i] = metrics.auc(fpr, tpr)
    mean_auc = np.mean(auc, axis=0)
    print("Finished running category cross-validation")
    return mean_auc
def svm_cross_validate(X, y, category, C, penalty, sample_weights,
                       loss='epsilon_insensitive', epsilon=0.1):
    # loss and epsilon were undefined in the original; exposed as assumed defaults.
    clf_svm_1 = SGDRegressor(loss=loss, penalty=penalty, epsilon=epsilon, alpha=C, shuffle=True)
    clf_svm_2 = SGDRegressor(loss=loss, penalty=penalty, epsilon=epsilon, alpha=C, shuffle=True)
    N = len(category)  # N was undefined in the original
    cv_indices = generate_cv_indices_unbalanced(category)
    train_ids = cv_indices[0:N]
    test_ids = cv_indices[N:2 * N]
    clf_svm_1.fit(X[train_ids, :], y[train_ids], sample_weight=sample_weights[train_ids])
    clf_svm_2.fit(X[test_ids, :], y[test_ids], sample_weight=sample_weights[test_ids])
    score = np.zeros(2)
    score[0] = clf_svm_1.score(X[test_ids, :], y[test_ids])
    score[1] = clf_svm_2.score(X[train_ids, :], y[train_ids])
    mean_score = np.mean(score)
    # SGDRegressor no longer has decision_function; predict is equivalent here.
    y_1 = clf_svm_1.predict(X[test_ids, :])
    y_2 = clf_svm_2.predict(X[train_ids, :])
    auc = np.zeros(2)
    fpr, tpr, thresholds = metrics.roc_curve(y[test_ids], y_1, pos_label=1)
    auc[0] = metrics.auc(fpr, tpr)
    fpr, tpr, thresholds = metrics.roc_curve(y[train_ids], y_2, pos_label=1)
    auc[1] = metrics.auc(fpr, tpr)
    mean_auc = np.mean(auc)
    print("Finished running standard cross-validation")
    return mean_auc
class RBFSamplerSGDRegressorEstimator(BaseEstimator, TransformerMixin):
    def __init__(self, gamma=1.0, n_components=100, random_state=None, **kwargs):
        kwargs['random_state'] = random_state
        self.rbf_sampler = RBFSampler(gamma=gamma, n_components=n_components, random_state=random_state)
        self.sgdregressor = SGDRegressor(**kwargs)

    def fit(self, X, y):
        X = self.rbf_sampler.fit_transform(X)
        self.sgdregressor.fit(X, y)
        return self

    def transform(self, X, y=None):
        return np.sqrt(self.rbf_sampler.n_components) / np.sqrt(2.) * self.rbf_sampler.transform(X)

    def predict(self, X):
        return self.sgdregressor.predict(self.transform(X))

    def score(self, X, y):
        return self.sgdregressor.score(self.transform(X), y)
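# Hypothetical usage sketch for the estimator above (not part of the original
# source): fit the RBF-approximated SGD regressor on synthetic data. Extra
# keyword arguments are forwarded to SGDRegressor by the class's __init__.
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.uniform(-3, 3, size=(200, 2))
y_demo = np.sin(X_demo[:, 0]) + 0.1 * rng.randn(200)

est = RBFSamplerSGDRegressorEstimator(gamma=0.5, n_components=300,
                                      random_state=0, max_iter=1000, tol=1e-3)
est.fit(X_demo, y_demo)
print(est.score(X_demo, y_demo))  # R^2 on the training data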
def linear_model2():
    """
    Linear regression via gradient descent (SGD) on the Boston housing data.
    :return: None
    """
    # 1. Load the data
    boston = load_boston()
    # 2. Basic preprocessing
    # 2.1 Train/test split
    x_train, x_test, y_train, y_test = train_test_split(boston.data, boston.target, test_size=0.2)
    # 3. Feature engineering: standardization
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)  # reuse training statistics; fit_transform here would leak test data
    # 4. Machine learning (linear regression via SGD)
    estimator = SGDRegressor(max_iter=1000, learning_rate="constant", eta0=0.001)
    estimator.fit(x_train, y_train)
    print("Model intercept:\n", estimator.intercept_)
    # 5. Model evaluation
    # 5.1 Predictions and R^2 score
    y_pre = estimator.predict(x_test)
    print("Predictions:\n", y_pre)
    score = estimator.score(x_test, y_test)
    print("R^2 score:\n", score)
    # 5.2 Mean squared error
    ret = mean_squared_error(y_test, y_pre)
    print("Mean squared error:\n", ret)
class support_vector_machine:
    # Despite the name, this wraps SGDRegressor with its default squared-error
    # loss; pass loss='epsilon_insensitive' for SVM-style regression.
    _model = None

    def __init__(self):
        self._model = SGDRegressor()

    def train(self, data_x, data_y):
        self._model.fit(data_x, data_y)
        joblib.dump(self._model, 'svm_model.pickle')

    def predict(self, X):
        return self._model.predict(X)

    def score(self, X, y):
        return self._model.score(X, y)

    def load_model(self, path):
        path = os.path.join(os.path.dirname(os.path.abspath(__file__)), path)
        print(path)
        self._model = joblib.load(path)
        return self._model

    def get_model(self):
        return self._model
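# Hypothetical round-trip sketch for the wrapper above (not from the original
# source); assumes joblib, os, and SGDRegressor are imported as the class
# requires, and that this runs as a script so __file__ is defined.
import numpy as np

X_demo = np.random.RandomState(1).randn(100, 3)
y_demo = X_demo @ np.array([1.0, -2.0, 0.5])

svm = support_vector_machine()
svm.train(X_demo, y_demo)         # fits and dumps svm_model.pickle
print(svm.score(X_demo, y_demo))  # R^2 on the training data
reloaded = svm.load_model('svm_model.pickle')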
def runSGD(X_train, X_test, y_train, y_test, dataname):
    all_epsilon = [0.001, 0.1, 0.5, 0.9]
    best_model = None
    max_score = 0
    for epsilon in all_epsilon:
        regressor = SGDRegressor(loss='epsilon_insensitive', epsilon=epsilon)
        regressor.fit(X_train, y_train)
        y_pred = regressor.predict(X_test)
        plt.scatter(y_test, y_pred)
        plt.plot([y_test.min(), y_test.max()], [y_pred.min(), y_pred.max()], 'r', lw=2)
        score = regressor.score(X_test, y_test)
        if score > max_score:
            best_model = regressor
            max_score = score  # the original never updated max_score, so best_model was just the last model to beat 0
        plt.title('SGD - {0}\n epsilon ={1} \nScore = {2:.3f} '.format(str(dataname), epsilon, score))
        plt.xlabel('Actual')
        plt.ylabel('Predict')
        plt.savefig('runSGD_{}_{}.png'.format(strftime("%H_%M_%S", gmtime()), epsilon))
        plt.close()
    return best_model
def sgd(X, y, weight, X_test=False):
    from sklearn.linear_model import SGDRegressor
    from sklearn.preprocessing import StandardScaler
    # X_train, X_test, y_train, y_test, weight_train, weight_test = train_test_split(
    #     X, y, weight, test_size=0.2, random_state=0)  # from sklearn.model_selection
    clf = SGDRegressor(loss="huber", max_iter=100, penalty="l1")  # n_iter was renamed max_iter
    # clf = LogisticRegression(max_iter=100)
    X_train = X
    y_train = y
    scaler = StandardScaler(with_mean=False)
    scaler.fit(X_train)  # don't cheat: fit only on training data
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)  # apply the same transformation to the test data
    clf.fit(X_train, y_train, sample_weight=weight)
    print(clf.score(X_train, y_train, weight))
    y_pred = clf.predict(X_test)
    import joblib  # sklearn.externals.joblib was removed; use the standalone package
    import scipy.io as sio
    joblib.dump(clf, 'models/sgd_.pkl')
    sio.savemat('predict_y_forward.mat', {'y': y_pred})
def predict(self, df):
    # get time frame
    time_frame = settings.time_frame
    # copy of data
    df_copy = df.copy()
    from sklearn.linear_model import SGDRegressor
    from sklearn.metrics import mean_absolute_error, mean_squared_error
    # partition data
    X_train, y_train, X_val, y_val, X_test, y_test = self.partition(df_copy)
    # normalize features
    X_train_std, X_val_std, X_test_std = self.feature_scale(X_train, X_val, X_test)
    # instance of the SGD linear regressor
    lr = SGDRegressor()
    # fit model
    lr.fit(X_train_std, y_train)
    # predictions on validation set
    predictions = lr.predict(X_val_std)
    # R^2 score
    score = lr.score(X_val_std, y_val)
    # RMSE on the validation set
    test_error = mean_squared_error(y_val, predictions) ** .5
    print(test_error)
def test_call_fit_with_arguments_score_does_not_accept():
    mlflow.sklearn.autolog()
    from sklearn.linear_model import SGDRegressor

    assert "intercept_init" in _get_arg_names(SGDRegressor.fit)
    assert "intercept_init" not in _get_arg_names(SGDRegressor.score)

    mock_obj = mock.Mock()

    def mock_score(self, X, y, sample_weight=None):  # pylint: disable=unused-argument
        mock_obj(X, y, sample_weight)
        return 0

    assert inspect.signature(SGDRegressor.score) == inspect.signature(mock_score)
    SGDRegressor.score = mock_score

    model = SGDRegressor()
    X, y = get_iris()

    with mlflow.start_run() as run:
        model.fit(X, y, intercept_init=0)
        mock_obj.assert_called_once_with(X, y, None)

    run_id = run.info.run_id
    params, metrics, tags, artifacts = get_run_data(run_id)
    assert params == truncate_dict(stringify_dict_values(model.get_params(deep=True)))
    assert {TRAINING_SCORE: model.score(X, y)}.items() <= metrics.items()
    assert tags == get_expected_class_tags(model)
    assert MODEL_DIR in artifacts
    assert_predict_equal(load_model_by_run_id(run_id), model, X)
def SGD(x, y, test_x, test_y, loss="squared_loss", penalty="l1", alpha=0.0001,
        tol=0.001, random_state=1, eta0=0.01, learning_rate='optimal',
        power_t=0.25, max_iter=1000):
    sr = SGDRegressor(loss=loss, penalty=penalty, alpha=alpha, tol=tol,
                      random_state=random_state, eta0=eta0,
                      learning_rate=learning_rate, power_t=power_t,
                      max_iter=max_iter)
    sr.partial_fit(x, y)
    y_pred_undersample = sr.predict(test_x)
    # threshold the regression output at 0.5 to get binary labels
    y_pred_undersample[y_pred_undersample > 0.5] = 1
    y_pred_undersample[y_pred_undersample <= 0.5] = 0
    i = sr.n_iter_
    Score = sr.score(test_x, test_y)
    F1 = f1_score(test_y, y_pred_undersample)
    P = precision_score(test_y, y_pred_undersample)
    R = recall_score(test_y, y_pred_undersample)
    tn, fp, fn, tp = confusion_matrix(test_y, y_pred_undersample).ravel()
    return Score, F1, P, R, tn, fp, fn, tp, i
def SGDTrain():
    model = SGDRegressor()
    data_generator = get_batch(x_train_standard, y_train)
    sgd_curve_x = []
    sgd_curve_y = []
    for i in range(epochs):  # one partial_fit on a mini-batch per epoch
        x, y = next(data_generator)
        model.partial_fit(x, y)
        sgd_curve_x.append(i)
        sgd_curve_y.append(model.score(x_test_standard, y_test))
    predicted = model.predict(x_test_standard)
    plt.title('SGD result (4000 epochs)')
    plt.scatter(y_test, predicted, color='y', marker='o')
    plt.plot(y_test, y_test, color='g')
    plt.xlabel('True value')
    plt.ylabel('Predicted value')
    # plt.savefig('./4000_1.png')
    plt.show()
    print('SGD RMSE:', np.sqrt(mean_squared_error(y_test, predicted)))
    return sgd_curve_x, sgd_curve_y
def mylinear():
    '''
    Linear regression to predict Boston house prices.
    :return:
    '''
    # load the data
    lb = load_boston()
    # split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(lb.data, lb.target, test_size=0.25)
    # standardize: both the features and the targets need scaling, so use two scalers
    std_x = StandardScaler()
    x_train = std_x.fit_transform(x_train)
    x_test = std_x.transform(x_test)
    std_y = StandardScaler()
    y_train = std_y.fit_transform(y_train.reshape(-1, 1))
    y_test = std_y.transform(y_test.reshape(-1, 1))

    # estimator 1: normal-equation solution
    lr = LinearRegression()
    lr.fit(x_train, y_train)
    print(lr.coef_)
    y_lr_predict = std_y.inverse_transform(lr.predict(x_test))
    print("Normal-equation predictions for the test-set houses:", y_lr_predict)
    print("R^2 score:", lr.score(x_test, y_test))
    print("Normal-equation MSE:", mean_squared_error(std_y.inverse_transform(y_test), y_lr_predict))

    # estimator 2: gradient descent (SGD)
    sgd = SGDRegressor()
    sgd.fit(x_train, y_train.ravel())  # SGDRegressor expects a 1-D target
    print(sgd.coef_)
    y_sgd_predict = std_y.inverse_transform(sgd.predict(x_test))
    print("SGD predictions for the test-set houses:", y_sgd_predict)
    print("R^2 score:", sgd.score(x_test, y_test.ravel()))
    print("SGD MSE:", mean_squared_error(std_y.inverse_transform(y_test), y_sgd_predict))

    # estimator 3: ridge regression
    rd = Ridge()
    rd.fit(x_train, y_train)
    print(rd.coef_)
    y_rd_predict = std_y.inverse_transform(rd.predict(x_test))
    print("Ridge predictions for the test-set houses:", y_rd_predict)
    print("R^2 score:", rd.score(x_test, y_test))
    print("Ridge MSE:", mean_squared_error(std_y.inverse_transform(y_test), y_rd_predict))
def linear():
    # load and split the data
    lb = load_boston()
    x_train, x_test, y_train, y_test = train_test_split(lb.data, lb.target, test_size=0.25, random_state=24)
    # standardize both the features and the targets
    std_x = StandardScaler()
    x_train = std_x.fit_transform(x_train)
    x_test = std_x.transform(x_test)
    # the targets must first become a 2-D column array before standardization
    y_train = np.array([y_train])
    y_test = np.array([y_test])
    std_y = StandardScaler()
    y_train = std_y.fit_transform(y_train.T)
    y_test = std_y.transform(y_test.T)

    print('-------- Normal equation --------')
    lr = LinearRegression()
    lr.fit(x_train, y_train)
    # print("Predictions:", lr.predict(x_test))
    # print("Ground truth:", y_test)
    # print("Coefficients:", lr.coef_)
    print("Test-set R^2:", lr.score(x_test, y_test))
    # if the targets were not standardized, use:
    # print("Normal-equation MSE:", mean_squared_error(y_test, lr.predict(x_test)))
    # with standardized targets, invert the scaling before computing the error
    lr_predict = std_y.inverse_transform(lr.predict(x_test))
    print("Normal-equation MSE:", mean_squared_error(std_y.inverse_transform(y_test), lr_predict))

    print('-------- SGD gradient descent --------')
    sgd = SGDRegressor()
    sgd.fit(x_train, y_train.ravel())  # SGDRegressor expects a 1-D target
    print("Test-set R^2:", sgd.score(x_test, y_test.ravel()))
    print("Coefficients:", sgd.coef_)
    sgd_predict = std_y.inverse_transform(sgd.predict(x_test))
    print("SGD MSE:", mean_squared_error(std_y.inverse_transform(y_test), sgd_predict))

    print('-------- Ridge regression --------')
    r = Ridge(alpha=3.0)
    r.fit(x_train, y_train)
    print("Test-set R^2:", r.score(x_test, y_test))
    print("Coefficients:", r.coef_)
    r_predict = std_y.inverse_transform(r.predict(x_test))
    print("Ridge MSE:", mean_squared_error(std_y.inverse_transform(y_test), r_predict))
def trainModel(ModelType, X, y):
    if ModelType == SGDRegressor:
        model = SGDRegressor(loss='epsilon_insensitive', max_iter=100)
    else:
        model = ModelType()
    model.fit(X, y)
    accuracy = model.score(X, y)  # R^2 on the training data
    print("Model training score: {}".format(accuracy))
    return model
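# Hypothetical call sketch for trainModel (not in the original source):
# exercise both the SGDRegressor branch and the generic branch on toy data.
import numpy as np
from sklearn.linear_model import SGDRegressor, LinearRegression

rng = np.random.RandomState(0)
X_demo = rng.randn(150, 4)
y_demo = X_demo @ rng.randn(4) + 0.05 * rng.randn(150)

sgd_model = trainModel(SGDRegressor, X_demo, y_demo)
ols_model = trainModel(LinearRegression, X_demo, y_demo)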
def run():
    iterations = 10001
    learning_rate = 0.01
    X_train, Y_train = readTrainingData()
    X_test = readTestingData()
    scaler = StandardScaler()
    scaler.fit(X_train)  # the original called fit_transform and discarded the result
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    clf = SGDRegressor(max_iter=100)  # n_iter was renamed max_iter
    # clf = AdaGradRegressor(n_iter=100)
    clf.fit(X_train, Y_train)
    print(clf.score(X_train, Y_train))
    predict = clf.predict(X_test)
    write_to_file(predict)
def linear_regression(features, values):
    """
    Perform linear regression given a data set with an arbitrary number of features.
    """
    clf = SGDRegressor(max_iter=100)  # n_iter was renamed max_iter
    clf.fit(features, values)
    print(clf.score(features, values))
    intercept = clf.intercept_
    params = clf.coef_
    return intercept, params
def train(training_pandas_data, test_pandas_data, label_col, feat_cols,
          alpha, l1_ratio, max_iter, tol, training_data_path, test_data_path):
    print("train: " + training_data_path)
    print("test: " + test_data_path)
    print("alpha: ", alpha)
    print("l1-ratio: ", l1_ratio)
    print("max_iter: ", max_iter)
    print("tol: ", tol)
    print("label-col: " + label_col)
    for col in feat_cols:
        print("feat-cols: " + col)

    # Split the data into training/test labels and features.
    trainingLabels = training_pandas_data[label_col].values
    trainingFeatures = training_pandas_data[feat_cols].values
    testLabels = test_pandas_data[label_col].values
    testFeatures = test_pandas_data[feat_cols].values

    # We will use an SGD model.
    en = SGDRegressor(alpha=alpha, l1_ratio=l1_ratio, warm_start=True, max_iter=max_iter, tol=tol)

    # Train the model.
    en.fit(trainingFeatures, trainingLabels)

    # Compute the model's scores.
    test_rmse = mean_squared_error(testLabels, en.predict(testFeatures)) ** 0.5
    r2_score_training = en.score(trainingFeatures, trainingLabels)
    r2_score_test = en.score(testFeatures, testLabels)
    print("Test RMSE:", test_rmse)
    print("Training set score:", r2_score_training)
    print("Test set score:", r2_score_test)

    # Log the RMSE and R^2 scores.
    mlflow.log_metric("Test RMSE", test_rmse)
    mlflow.log_metric("Train R2", r2_score_training)
    mlflow.log_metric("Test R2", r2_score_test)

    # Save the model as an artifact (assumes `from mlflow import sklearn`).
    sklearn.log_model(en, "model")
def SGDRegressor_test():
    sgdr = SGDRegressor(max_iter=1000)
    sgdr.fit(X_train, y_train.ravel())
    sgdr_y_predict = sgdr.predict(X_test)
    print("SGD regressor default score: {}".format(sgdr.score(X_test, y_test)))
    print("R-squared: {}".format(r2_score(y_test, sgdr_y_predict)))
    print("MAE: {}".format(
        mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(sgdr_y_predict))))
    print("MSE: {}".format(
        mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(sgdr_y_predict))))
def sgdregressor_sk(x_train, y_train, x_test, y_test, epochs):
    # create the SGD regressor with the best hyperparameters
    regressor = SGDRegressor(eta0=2, power_t=0.3, max_iter=epochs)
    # fit the data
    regressor.fit(x_train, y_train)
    # predict the prices
    y_pred = regressor.predict(x_test)
    # get the accuracy as R^2
    acc = regressor.score(x_test, y_test)
    return y_pred, acc
def linear_regression(features, values):
    """
    Perform linear regression given a data set with an arbitrary number of features.
    """
    model = SGDRegressor(max_iter=100)  # n_iter was renamed max_iter
    model.fit(features, values)
    print('SCORE: ', model.score(features, values))
    intercept = model.intercept_
    params = model.coef_
    return intercept, params
def lineaReg():
    boston = loadDataSet()
    X = boston.data
    y = boston.target.reshape((len(boston.target), 1))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
    print('The max target value is: ', np.max(boston.target))
    print('The min target value is: ', np.min(boston.target))
    print('The average target value is: ', np.mean(boston.target))

    ss_x = StandardScaler()
    ss_y = StandardScaler()
    X_train = ss_x.fit_transform(X_train)
    X_test = ss_x.transform(X_test)
    y_train = ss_y.fit_transform(y_train)
    y_test = ss_y.transform(y_test)

    lr = LinearRegression()
    lr.fit(X_train, y_train)
    lr_y_predict = lr.predict(X_test)

    sgdr = SGDRegressor()
    sgdr.fit(X_train, y_train.ravel())  # SGDRegressor expects a 1-D target
    sgdr_y_predict = sgdr.predict(X_test)

    # evaluation metrics for the regression problem
    print('default score of LR: ', lr.score(X_test, y_test))
    print('R-squared of LR: ', r2_score(y_test, lr_y_predict))
    print('Mean squared error of LR: ',
          mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(lr_y_predict)))
    print('Mean absolute error of LR: ',
          mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(lr_y_predict)))
    print('------------------------------------------------------------------')
    print('default score of SGDR: ', sgdr.score(X_test, y_test.ravel()))
    print('R-squared of SGDR: ', r2_score(y_test, sgdr_y_predict))
    print('Mean squared error of SGDR: ',
          mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(sgdr_y_predict)))
    print('Mean absolute error of SGDR: ',
          mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(sgdr_y_predict)))
    return None
def sgd(pd, pl, qd, ql):
    params = {'loss': ['squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
              'alpha': expon(scale=1),
              'epsilon': expon(scale=1),
              'l1_ratio': uniform(),
              'penalty': ['l2', 'l1', 'elasticnet']}
    clf = SGDRegressor()
    # clf = RandomizedSearchCV(clf, params, n_jobs=2, n_iter=10, verbose=10)
    print("Training SGD regressor")  # the random search above is disabled, so this is a plain fit
    clf.fit(pd, pl)
    print("Score: " + str(clf.score(qd, ql)))
    return clf
def sgd_regression(self, x, y, prediction_set):
    '''
    Performs SGD regression on x and y and returns the fit results.
    Attributes available on the fitted regressor:
    coef_, intercept_, average_coef_, average_intercept_ : array, shape (n_features,)
    n_iter_ : int
    '''
    regr = SGDRegressor(max_iter=1000, tol=1e-3)
    # defaults: SGDRegressor(alpha=0.0001, average=False, early_stopping=False,
    #     epsilon=0.1, eta0=0.01, fit_intercept=True, l1_ratio=0.15,
    #     learning_rate='invscaling', loss='squared_loss', max_iter=1000,
    #     n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=None,
    #     shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
    #     warm_start=False)
    model = regr.fit(x, y)
    if isinstance(prediction_set, np.ndarray):
        y_pred = model.predict(prediction_set)
        return [regr.score(x, y), y_pred]
    else:
        return [regr.score(x, y)]
def player_prediction(name):
    data = pd.read_csv('../resources/newMERGED.csv', sep=',', encoding='utf-8', index_col=0)
    model = data[['player_id', 'name', 'season', 'pos', 'round', 'team_rank',
                  'opponent_team_rank', 'team_pot', 'opp_pot', 'concede_pot',
                  'opp_concede_pot', 'prev_points', 'form_points', 'total_points',
                  'long_form', 'ict_form']]
    # copy() avoids pandas' SettingWithCopyWarning on the in-place edits below
    MidfielderModal = model.loc[model['pos'] == 'Midfielder'].copy()
    MidfielderModal.drop('pos', axis=1, inplace=True)
    MidfielderModal.sort_values(['season', 'round'], ascending=True, inplace=True)
    # MidfielderModal.to_csv('../temp/MIDFIELDERS.csv', sep=',', encoding='utf-8')
    players = MidfielderModal[7959:]
    keys = MidfielderModal['round']
    values = pd.cut(MidfielderModal['round'], 3, labels=[1, 2, 3])
    dictionary = dict(zip(keys, values))
    MidfielderModal['round'] = values
    X = MidfielderModal.drop(['total_points', 'season', 'player_id', 'name'], axis=1)
    y = MidfielderModal[['total_points']]
    X_train = X[:7958]
    X_test = X[7959:]
    y_train = y[:7958]
    y_test = y[7959:]
    regression_model = SGDRegressor()
    regression_model.fit(X_train, y_train.values.ravel())  # SGDRegressor expects a 1-D target
    score = regression_model.score(X_test, y_test)
    y_pred = regression_model.predict(X_test)
    testing = pd.concat([X_test, y_test], axis=1)
    testing['Predicted'] = np.round(y_pred, 1)
    testing['Prediction_Error'] = testing['total_points'] - testing['Predicted']
    testing['player_id'] = players.player_id
    testing['name'] = players.name
    print(testing[testing['name'] == name])
def SGD(x_train, x_test, y_train, y_test):
    # Gradient descent: a self-correcting linear model. The default learning
    # rate (0.01) sets how fast you slide down the bowl; the gradient gives
    # the direction toward the bottom.
    sgd = SGDRegressor()
    sgd.fit(x_train, y_train)
    predict = sgd.predict(x_test)
    score = sgd.score(x_test, y_test)
    print(predict)
    print(score)
    k = sgd.coef_
    b = sgd.intercept_
    return k, b
def runSGDRegressor(self):
    lm = SGDRegressor(loss='squared_loss', penalty='l2', fit_intercept=True)
    print("SGDRegressor\n")
    lm.fit(self.m_X_train, self.m_y_train)
    predictY = lm.predict(self.m_X_test)
    score = lm.score(self.m_X_test, self.m_y_test)
    predictTraingY = lm.predict(self.m_X_train)
    self.displayPredictPlot(predictY)
    self.displayResidualPlot(predictY, predictTraingY)
    self.dispalyModelResult(lm, predictY, score)
def SGDRegressor_pred(X_train, X_test, y_train_normalized, y_train_mean, y_test):
    # The learning rate schedule:
    # ---constant:   eta = eta0 (stays at the initial value)
    # ---optimal:    eta = 1.0 / (t + t0)
    # ---invscaling: eta = eta0 / pow(t, power_t) [default]
    clf = SGDRegressor(alpha=0.0001, eta0=0.001, max_iter=150,  # n_iter was renamed max_iter
                       fit_intercept=False, shuffle=True, verbose=0)
    clf = clf.fit(X_train, y_train_normalized)
    # Convert back by adding the training mean (sklearn's scalers could handle
    # both the encoding and the decoding instead).
    predictions_train = clf.predict(X_train) + y_train_mean
    predictions = clf.predict(X_test) + y_train_mean
    score_test = clf.score(X_test, y_test)
    return predictions, predictions_train, score_test
def SGD(task, data, split=0.3, lr="optimal", alpha=0.0001, seed=42):
    task = task.lower()
    X = data["X"]
    y = data["y"]
    if task in ("r", "reg", "regression"):
        sgd = SGDRegressor(learning_rate=lr, alpha=alpha, random_state=seed)
    elif task in ("c", "classify", "classification"):
        sgd = SGDClassifier(learning_rate=lr, alpha=alpha, random_state=seed)
    else:
        raise NameError('Task should be either regression or classification')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split, random_state=seed)
    sgd.fit(X_train, y_train)
    # score() is accuracy for the classifier and R^2 for the regressor;
    # the original prints mislabeled this model as "Boosting"
    train_score = sgd.score(X_train, y_train)
    print("SGD Training Score: " + str(train_score * 100) + "%")
    test_score = sgd.score(X_test, y_test)
    print("SGD Testing Score: " + str(test_score * 100) + "%")
    return sgd
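# Hypothetical call sketch for the dispatcher above (not from the original
# source): run both the regression and the classification branch on toy data.
# Assumes SGDRegressor, SGDClassifier, and train_test_split are imported as
# the function requires.
import numpy as np

rng = np.random.RandomState(42)
X_demo = rng.randn(200, 5)
reg_data = {"X": X_demo, "y": X_demo @ rng.randn(5)}
clf_data = {"X": X_demo, "y": (X_demo[:, 0] > 0).astype(int)}

sgd_reg = SGD("regression", reg_data)
sgd_clf = SGD("classification", clf_data)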
def main():
    # set up our data and remove unwanted instances
    with open('auto_mobile_data.csv') as file:
        data = list(csv.reader(file))
    data = setup.remove_inst(data)
    # create a set of attributes to skip
    skip_atr = setup.skip_attribute(data)
    # add attributes with no correlation to skip_atr (see scatter_plots\analysis.txt)
    # remove: symboling, losses, car height, bore, stroke, compression ratio, peak rpm
    skip_atr.update({0, 1, 12, 18, 19, 20, 22, 25})
    # dictionary of the mean of values of attributes
    mean_nums = setup.missing_values(data)
    # arrange the x and y data: x is a 2-D list and y is 1-D
    x = []
    y = [float(i[len(i) - 1]) for i in data]
    for i in data:
        thing = [i[val] if i[val] != '?' else mean_nums[val] for val in range(len(i))]
        x.append([float(thing[val]) for val in range(len(thing)) if val not in skip_atr])
    # split our data into training and testing data
    train_x, train_y, test_x, test_y = train_test(x, y)
    # prepare data for regression
    x = np.array(train_x)
    y = np.array(train_y)
    x = minmax_scale(x)
    y = minmax_scale(y)
    # fit data to our model
    sgd = SGDRegressor().fit(x, y)
    # score the model with unseen data to detect over/underfitting
    test_x = minmax_scale(test_x)
    test_y = minmax_scale(test_y)
    print('score of model: ', sgd.score(test_x, test_y))
def consider_SGD():
    performances: List[ExperimentResult] = []
    for rnd in range(3):
        for penal in ["l1", "l2", "elasticnet"]:
            for los in ["squared_loss", "huber"]:
                params = {
                    "random_state": rnd,
                    "penalty": penal,
                    "max_iter": 100,
                    "loss": los,
                }
                f = SGDRegressor(**params)
                f.fit(X_train, y_train)
                vali_acc = f.score(X_vali, y_vali)
                result = ExperimentResult(vali_acc, params, f)
                performances.append(result)
    # keep the best configuration (the original returned min, which picks the worst)
    return max(performances, key=lambda result: result.vali_acc)
def SGD_boston():
    boston = load_boston()
    x = boston.data
    y = boston.target
    train_x, test_x, train_y, test_y = \
        train_test_split(x, y, test_size=.25)
    std_s = StandardScaler()
    train_x = std_s.fit_transform(train_x)
    test_x = std_s.transform(test_x)  # reuse training statistics; refitting on the test set leaks information
    sgd = SGDRegressor()
    sgd.fit(train_x, train_y)
    score = sgd.score(test_x, test_y)
    predict_y = sgd.predict(test_x)
    print(score)
    print(predict_y[:20])
    print(test_y[:20])
    # print(sgd.coef_)
    # print(sgd.intercept_)
    return None
def SGDRegressionExample():
    import numpy as np
    from sklearn.datasets import load_boston
    from sklearn.linear_model import SGDRegressor
    from sklearn.model_selection import cross_val_score, train_test_split  # sklearn.cross_validation was removed
    from sklearn.preprocessing import StandardScaler

    data = load_boston()
    X_train, X_test, y_train, y_test = train_test_split(data.data, data.target)
    X_scaler = StandardScaler()
    y_scaler = StandardScaler()
    X_train = X_scaler.fit_transform(X_train)
    y_train = y_scaler.fit_transform(y_train.reshape(-1, 1)).ravel()  # scalers need 2-D input
    X_test = X_scaler.transform(X_test)
    y_test = y_scaler.transform(y_test.reshape(-1, 1)).ravel()
    regressor = SGDRegressor(loss='squared_loss')
    scores = cross_val_score(regressor, X_train, y_train, cv=5)
    print('Cross validation r-squared scores:', scores)
    print('Average cross validation r-squared score:', np.mean(scores))
    regressor.fit(X_train, y_train)  # SGDRegressor has no fit_transform
    print('Test set r-squared score', regressor.score(X_test, y_test))
def get_sgd(X_train, X_test, y_train, y_test):
    temp_max_itr = 100000
    dest_eta = 1e-5
    dest_tol = 1e-3
    temp_coef = 0.01
    dest_coef = temp_coef
    dest_intercept = 0.0
    max_score = -1000  # `max` shadowed the builtin in the original
    while temp_coef <= 2.0:
        temp_intercept = 0.0
        while temp_intercept <= 50.0:
            sgd = SGDRegressor(random_state=15, max_iter=temp_max_itr, eta0=dest_eta,
                               tol=dest_tol, n_iter_no_change=6)
            sgd.fit(X_train, y_train, coef_init=temp_coef, intercept_init=temp_intercept)
            scr = sgd.score(X_test, y_test)
            # keep the initialization that scored best so far
            if max_score < scr:
                max_score = scr
                dest_coef = temp_coef
                dest_intercept = temp_intercept
            temp_intercept += 1.0
        temp_coef += 0.1
    # return a fresh (unfitted) regressor along with the best initializations
    sgd1 = SGDRegressor(random_state=15, max_iter=temp_max_itr, eta0=dest_eta,
                        tol=dest_tol, n_iter_no_change=6)
    return sgd1, dest_coef, dest_intercept
class SGDRegressionModel(RegressionModel):
    def __init__(self, train_data):
        RegressionModel.__init__(self, train_data)
        self.model = SGDRegressor()

    def train(self, x=None, y=None):
        x = x if x is not None else self.train_x
        y = y if y is not None else self.train_y
        self.model.fit(x, y)

    def predict(self, x_in):
        return self.model.predict(x_in)

    def evaluate(self, x_in, y_out):
        return self.model.score(x_in, y_out)

    def save(self, filename):
        joblib.dump(self.model, filename)

    def load(self, filename):
        self.model = joblib.load(filename)
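# Hypothetical usage sketch (not from the original source). It assumes the
# RegressionModel base class accepts train_data and exposes it as
# self.train_x / self.train_y; adjust to the real base-class contract.
import numpy as np

rng = np.random.RandomState(7)
X_demo, y_demo = rng.randn(100, 3), rng.randn(100)

sgd_model = SGDRegressionModel(train_data=(X_demo, y_demo))
sgd_model.train(X_demo, y_demo)      # or sgd_model.train() to use the stored data
sgd_model.save('sgd_model.joblib')
sgd_model.load('sgd_model.joblib')
print(sgd_model.evaluate(X_demo, y_demo))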
def test_both_fit_and_score_contain_sample_weight(sample_weight_passed_as):
    mlflow.sklearn.autolog()
    from sklearn.linear_model import SGDRegressor

    # ensure that we use an appropriate model for this test
    assert "sample_weight" in _get_arg_names(SGDRegressor.fit)
    assert "sample_weight" in _get_arg_names(SGDRegressor.score)

    mock_obj = mock.Mock()

    def mock_score(self, X, y, sample_weight=None):  # pylint: disable=unused-argument
        mock_obj(X, y, sample_weight)
        return 0

    assert inspect.signature(SGDRegressor.score) == inspect.signature(mock_score)
    SGDRegressor.score = mock_score

    model = SGDRegressor()
    X, y = get_iris()
    sample_weight = abs(np.random.randn(len(X)))

    with mlflow.start_run() as run:
        if sample_weight_passed_as == "positional":
            model.fit(X, y, None, None, sample_weight)
        elif sample_weight_passed_as == "keyword":
            model.fit(X, y, sample_weight=sample_weight)
        mock_obj.assert_called_once_with(X, y, sample_weight)

    run_id = run.info.run_id
    params, metrics, tags, artifacts = get_run_data(run_id)
    assert params == truncate_dict(stringify_dict_values(model.get_params(deep=True)))
    assert {TRAINING_SCORE: model.score(X, y)}.items() <= metrics.items()
    assert tags == get_expected_class_tags(model)
    assert MODEL_DIR in artifacts
    assert_predict_equal(load_model_by_run_id(run_id), model, X)
def SGDDemo():
    import numpy as np
    from sklearn.datasets import load_boston
    from sklearn.linear_model import SGDRegressor
    from sklearn.model_selection import cross_val_score, train_test_split  # sklearn.cross_validation was removed
    from sklearn.preprocessing import StandardScaler

    data = load_boston()
    X_train, X_test, y_train, y_test = train_test_split(data.data, data.target)
    X_scaler = StandardScaler()
    y_scaler = StandardScaler()
    X_train = X_scaler.fit_transform(X_train)
    y_train = y_scaler.fit_transform(y_train.reshape(-1, 1)).ravel()  # scalers need 2-D input
    X_test = X_scaler.transform(X_test)
    y_test = y_scaler.transform(y_test.reshape(-1, 1)).ravel()
    regressor = SGDRegressor(loss='squared_loss')
    scores = cross_val_score(regressor, X_train, y_train, cv=5)
    print("Cross validation r-sqr ", np.mean(scores))
    regressor.fit(X_train, y_train)  # SGDRegressor has no fit_transform
    print("TEST score :", regressor.score(X_test, y_test))
Y_scaler = StandardScaler()
X_train = X_scaler.fit_transform(X_train)
Y_train = Y_scaler.fit_transform(Y_train.reshape(-1, 1)).ravel()  # scalers need 2-D input
X_test = X_scaler.transform(X_test)
Y_test = Y_scaler.transform(Y_test.reshape(-1, 1)).ravel()
print(X_train[0:5])
print(len(X_train))
print(Y_test)

clf = SGDRegressor(loss="squared_loss")
scores = cross_val_score(clf, X_train, Y_train, cv=5)
print(scores)
print(np.mean(scores))
clf.fit(X_train, Y_train)  # SGDRegressor has no fit_transform
pred = clf.predict(X_test)
print(clf.score(X_test, Y_test))
# correlation(X_train,Y_train)
# feature_selection(X_train,Y_train)
scatter_plot(X_train, Y_train)
import numpy as np
from sklearn.datasets import load_boston
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import cross_val_score, train_test_split  # sklearn.cross_validation was removed
from sklearn.preprocessing import StandardScaler

data = load_boston()
print(data)
x_train, x_test, y_train, y_test = train_test_split(data.data, data.target)
x_scaler = StandardScaler()
y_scaler = StandardScaler()
x_train = x_scaler.fit_transform(x_train)
y_train = y_scaler.fit_transform(y_train.reshape(-1, 1)).ravel()  # scalers need 2-D input
x_test = x_scaler.transform(x_test)
y_test = y_scaler.transform(y_test.reshape(-1, 1)).ravel()
regressor = SGDRegressor(loss='squared_loss')
scores = cross_val_score(regressor, x_train, y_train, cv=5)
print('Cross Validation r-squared scores:', scores)
print('Average cross validation r-squared score', np.mean(scores))
regressor.fit(x_train, y_train)  # SGDRegressor has no fit_transform
print('Test set r-squared score', regressor.score(x_test, y_test))
X_test = X[test]
y_train = y[train]
y_test = y[test]
# iris = datasets.load_iris()
# print(iris.data)
# X = iris.data[:,0:3]
# y = iris.data[:,3]
start_time = time.time()
sgd = SGDRegressor(alpha=0.01, average=False, epsilon=0.1, eta0=0.01,
                   fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
                   loss='squared_loss', max_iter=1000,  # n_iter was renamed max_iter
                   penalty='l2', power_t=0.25, random_state=None, shuffle=True,
                   verbose=0, warm_start=False)
sgd.fit(X_train.astype('float64'), y_train)
elapsed_time = time.time() - start_time
print("Time %s" % elapsed_time)
print(sgd.coef_, sgd.intercept_)
print("R^2 score %s" % sgd.score(X_test, y_test))

start_time = time.time()
# ParallelSGDRegressor is a custom class; it keeps the older n_iter keyword
psgd = ParallelSGDRegressor(alpha=0.01, average=False, epsilon=0.1, eta0=0.01,
                            fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
                            loss='squared_loss', n_iter=1000, penalty='l2',
                            power_t=0.25, random_state=None, shuffle=True,
                            verbose=0, warm_start=False)
psgd.fit(X_train.astype('float64'), y_train)
elapsed_time = time.time() - start_time
print("Time %s" % elapsed_time)
print(psgd.coef_, psgd.intercept_)
print("R^2 score %s" % psgd.score(X_test, y_test))
import numpy as np
from sklearn.datasets import load_boston
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import cross_val_score, train_test_split  # sklearn.cross_validation was removed
from sklearn.preprocessing import StandardScaler

# load and split data
data = load_boston()
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target)

# scale the features (transform, not fit_transform, on the test split to avoid leakage)
X_scaler = StandardScaler()
y_scaler = StandardScaler()
X_train = X_scaler.fit_transform(X_train)
y_train = y_scaler.fit_transform(y_train.reshape(-1, 1)).ravel()  # scalers need 2-D input
X_test = X_scaler.transform(X_test)
y_test = y_scaler.transform(y_test.reshape(-1, 1)).ravel()

# train
regressor = SGDRegressor(loss='squared_loss')
scores = cross_val_score(regressor, X_train, y_train, cv=5)
print('Cross validation r-squared scores: {0}'.format(scores))
print('Average cross validation r-squared score: {0}'.format(np.mean(scores)))
regressor.fit(X_train, y_train)  # SGDRegressor has no fit_transform
print('Test set r-squared score: {0}'.format(regressor.score(X_test, y_test)))
# we won't use 'casual' and 'registered' in this tutorial
df.drop(['registered', 'casual'], axis=1, inplace=True)
df.drop(['datetime'], axis=1, inplace=True)

from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed

# X is our feature matrix without 'count'
X = df.drop(['count'], axis=1)
# y is the target 'count'
y = df['count']
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.70, random_state=2)

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor

# fit the scaler on the training split only, then reuse it for the test split
scaler = StandardScaler().fit(X_train)
X_train_transformed = scaler.transform(X_train)
X_test_transformed = scaler.transform(X_test)
clf = SGDRegressor()
# train the model
clf.fit(X_train_transformed, y_train)
# print out predicted values
print(clf.predict(X_test_transformed))
# print out how well the model fits the data
print("Model Score: ", clf.score(X_train_transformed, y_train))

from sklearn.pipeline import make_pipeline

clf = make_pipeline(StandardScaler(), SGDRegressor())
clf.fit(X_train, y_train)
print("Model Score: ", clf.score(X_train, y_train))
for idx in range(int(np.ceil(n_trainSamples / mini_batch))):
    x_batch = train[ind[idx * mini_batch: min((idx + 1) * mini_batch, n_trainSamples)]]
    y_batch = train_target[ind[idx * mini_batch: min((idx + 1) * mini_batch, n_trainSamples)]]
    # score the batch before fitting it (validation) and after (training)
    if idx > 0:
        validationScore.append(clf.score(x_batch, y_batch))
    clf.partial_fit(x_batch, y_batch)
    if idx > 0:
        trainScore.append(clf.score(x_batch, y_batch))
plt.plot(trainScore, label="train score")
plt.plot(validationScore, label="validation score")
plt.xlabel("Mini_batch")
plt.ylabel("Score")
plt.legend(loc='best')
plt.title(title)

sgd_regressor = SGDRegressor(penalty='l2', alpha=0.001)
plot_learning(sgd_regressor, "SGDRegressor")

test = test_subset.drop(['EbayID', 'Price', 'SellerName'], axis=1)
test = scaler.fit_transform(test)
test_target = test_subset['Price']
print("SGD regressor prediction result on testing data: %f" % sgd_regressor.score(test, test_target))
plt.show()
X_scaler = StandardScaler()
y_scaler = StandardScaler()
X_train = X_scaler.fit_transform(X_train)
y_train = y_scaler.fit_transform(y_train.reshape(-1, 1)).ravel()  # scalers need 2-D input
X_test = X_scaler.transform(X_test)
y_test = y_scaler.transform(y_test.reshape(-1, 1)).ravel()

regressor = SGDRegressor(loss='squared_loss')
score = cross_val_score(regressor, X_train, y_train, cv=5)
print(score)
print(np.mean(score))
regressor.fit(X_train, y_train)  # SGDRegressor has no fit_transform
print(regressor.score(X_test, y_test))
y.append(float(s_data[4]))
y2.append(float(s_data[1]))

pprint.pprint("Training the supervised learning model... Fit on training data")
print("=========================================")
try:
    clf = SGDRegressor(loss="huber")
    pprint.pprint(clf.fit(X, y))
except:
    raise
try:
    clf2 = SGDRegressor(loss="huber")
    pprint.pprint(clf2.fit(X, y2))
except:
    raise
print("=========================================")
print("Model testing itself! Confidence score on the training data used to construct:", clf.score(X, y))
pprint.pprint("Ready to predict")
print("=========================================")
pprint.pprint("Testing with test data...")
test_data = list()
test_diff = list()
predict_diff = list()
for index in test_indices:
    tmp = data[index][1:5]
    my_tmp = [float(item) for item in tmp]
    test_data.append(my_tmp)
class Model(object):
    def __init__(self, params):
        self.model_class = params['class']
        self.model = {}
        self.feature_constructor = None
        self.all_possible_decisions = []
        self.X = []
        self.y = []
        self.buffer = 0

    def initialize(self):
        if self.model_class == 'scikit':
            # n_iter was renamed max_iter in SGDRegressor
            self.model = SGDRegressor(loss='squared_loss', alpha=0.1, max_iter=10, shuffle=True, eta0=0.0001)
            # FeatureHasher's non_negative flag was removed from scikit-learn
            self.feature_constructor = FeatureHasher(n_features=200, dtype=np.float64, input_type='dict')
        elif self.model_class in ('lookup', 'lookup_table'):  # the original checked 'lookup' here but 'lookup_table' elsewhere
            self.model = {}

    def clean_buffer(self):
        self.X = []
        self.y = []
        self.buffer = 0

    def return_design_matrix(self, all_decision_states, reward=None):
        if self.model_class == 'lookup_table':
            return all_decision_states, reward
        elif self.model_class == 'scikit':
            X, y = [], []
            for decision_state in all_decision_states:
                information, decision_taken = decision_state
                tr = {}
                tr['-'.join([str(information[1]), decision_taken])] = 1
                tr['-'.join([str(information[0]), decision_taken])] = 1
                tr['-'.join([str(information[0]), str(information[1]), decision_taken])] = 1
                X.append(tr)
                y.extend([reward])
            X = self.feature_constructor.transform(X).toarray()
            return X, y

    def fit(self, X, y):
        if self.model_class == 'scikit':
            # X, y = self.shuffle_data(X, y)
            self.model.partial_fit(X, y)
            print(self.model.score(X, y))
        if self.model_class == 'lookup_table':
            for decision_state in X:
                if decision_state not in self.model:
                    for d in self.all_possible_decisions:
                        self.model[(decision_state[0], d)] = DecisionState()
                self.model[decision_state].count += 1
                # incremental mean update of the value estimate
                updated_value = self.model[decision_state].value_estimate + \
                    (1.0 / self.model[decision_state].count) * (y - self.model[decision_state].value_estimate)
                self.model[decision_state].value_estimate = updated_value

    def predict(self, X):
        if self.model_class == 'scikit':
            return self.model.predict(X)
        if self.model_class == 'lookup_table':
            if X not in self.model:
                for d in self.all_possible_decisions:
                    self.model[(X[0], d)] = DecisionState()
            return self.model[X].value_estimate

    @staticmethod
    def shuffle_data(a, b):
        assert len(a) == len(b)
        p = np.random.permutation(len(a))
        return a[p], b[p]
import numpy as np
from sklearn.datasets import load_boston
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import cross_val_score, train_test_split  # sklearn.cross_validation was removed
from sklearn.preprocessing import StandardScaler

data = load_boston()
# print(data)
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target)
X_scaler = StandardScaler()
y_scaler = StandardScaler()
# print(X_train)
X_train = X_scaler.fit_transform(X_train)
y_train = y_scaler.fit_transform(y_train.reshape(-1, 1)).ravel()  # scalers need 2-D input
# print(X_train)
X_test = X_scaler.transform(X_test)  # transform only: refitting on the test split leaks information
y_test = y_scaler.transform(y_test.reshape(-1, 1)).ravel()
regressor = SGDRegressor(loss='squared_loss')
scores = cross_val_score(regressor, X_train, y_train, cv=5)
print(X_train.shape)
print("CV ", scores)
regressor.fit(X_train, y_train)  # SGDRegressor has no fit_transform
print("Test r-sq", regressor.score(X_test, y_test))
def fit_SGD(features_train, labels_train, features_pred):
    model = SGDRegressor()
    model.fit(features_train, labels_train)
    labels_pred = model.predict(features_pred)
    print("SGD - coefficient of determination R^2 of the prediction: ", model.score(features_train, labels_train))
    return labels_pred