def predict_age():
    # Split rows with known ages (training) from rows with missing ages.
    mask = ~np.isnan(train["Age"])
    age_train = train[mask]
    age_test = train[~mask]

    # Encode categorical features for the rows with known ages.
    features = [
        embarked_enc.transform(age_train["Embarked"]),
        sex_enc.transform(age_train["Sex"]),
        title_enc.transform(age_train["Title"]),
        pclass_enc.transform(age_train["Pclass"]),
    ]
    X = np.hstack(features)
    y = np.asarray(age_train["Age"])

    age_clf = SGDRegressor()
    age_clf.fit(X, y)

    # Encode the same features for the rows with missing ages, then predict.
    features = [
        embarked_enc.transform(age_test["Embarked"]),
        sex_enc.transform(age_test["Sex"]),
        title_enc.transform(age_test["Title"]),
        pclass_enc.transform(age_test["Pclass"]),
    ]
    ages = age_clf.predict(np.hstack(features))

    # Write the predicted ages back into the original frame.
    j = 0
    for i in range(len(train)):
        if not mask[i]:
            train.loc[i, "Age"] = ages[j]
            j += 1
def ls_sklearn_sgd(x, y):
    # Parameter estimation by sklearn SGD.
    sgd = SGDRegressor(fit_intercept=True)
    sgd.fit(x.reshape((-1, 1)), y)  # reshape to a single-feature column vector
    beta_0_sk = sgd.intercept_
    beta_1_sk = sgd.coef_[0]
    return beta_0_sk, beta_1_sk
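# A quick, illustrative check of ls_sklearn_sgd above: on synthetic data with a
# known line, the SGD estimates should land near the closed-form least-squares
# fit from np.polyfit. The data, seed, and coefficients here are assumptions
# made purely for this sketch, not part of the original code.
import numpy as np

def check_ls_sklearn_sgd():
    rng = np.random.RandomState(0)
    x = rng.uniform(-1, 1, 200)
    y = 2.0 + 3.0 * x + rng.normal(scale=0.1, size=200)
    beta_0_sgd, beta_1_sgd = ls_sklearn_sgd(x, y)
    beta_1_ols, beta_0_ols = np.polyfit(x, y, deg=1)  # returns [slope, intercept]
    print(beta_0_sgd, beta_0_ols)  # both should be close to 2.0
    print(beta_1_sgd, beta_1_ols)  # both should be close to 3.0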
def sgd(X, y, weight, X_test=None):
    from sklearn.linear_model import SGDRegressor
    from sklearn.preprocessing import StandardScaler
    # from sklearn.model_selection import train_test_split
    # X_train, X_test, y_train, y_test, weight_train, weight_test = train_test_split(
    #     X, y, weight, test_size=0.2, random_state=0)

    clf = SGDRegressor(loss="huber", max_iter=100, penalty="l1")
    # clf = LogisticRegression(max_iter=100)
    X_train = X
    y_train = y

    scaler = StandardScaler(with_mean=False)
    scaler.fit(X_train)  # Don't cheat - fit only on training data
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)  # apply same transformation to test data

    clf.fit(X_train, y_train, sample_weight=weight)
    print(clf.score(X_train, y_train, weight))
    y_pred = clf.predict(X_test)

    import joblib
    import scipy.io as sio
    joblib.dump(clf, 'models/sgd_.pkl')
    sio.savemat('predict_y_forward.mat', {'y': y_pred})
def predict(self, df):
    from sklearn.linear_model import SGDRegressor
    from sklearn.metrics import mean_squared_error

    # get time frame
    time_frame = settings.time_frame
    # copy of data
    df_copy = df.copy()
    # partition data
    X_train, y_train, X_val, y_val, X_test, y_test = self.partition(df_copy)
    # normalize features
    X_train_std, X_val_std, X_test_std = self.feature_scale(X_train, X_val, X_test)
    # instance of an SGD linear regressor
    lr = SGDRegressor()
    # fit model
    lr.fit(X_train_std, y_train)
    # predictions on validation set
    predictions = lr.predict(X_val_std)
    # R^2 score
    score = lr.score(X_val_std, y_val)
    # RMSE on the validation set
    test_error = mean_squared_error(y_val, predictions) ** .5
    print(test_error)
def predictScores(trainFeatures, trainTargets, testFeatures, testItemIds, isRegression=False):
    logging.info("Feature preparation done, fitting model...")
    predicted_scores = []
    if isRegression:
        clf = SGDRegressor(penalty="l2", alpha=1e-4)
        print("trainFeatures rows::" + str(trainFeatures.shape[0]))
        print("trainTargets rows::" + str(len(trainTargets)))
        clf.fit(trainFeatures, trainTargets)
        logging.info("Predicting...")
        predicted_scores = clf.predict(testFeatures)
    else:
        # loss="log" / class_weight="auto" were renamed in later sklearn releases.
        clf = SGDClassifier(loss="log_loss", penalty="l2", alpha=1e-4,
                            class_weight="balanced")
        print("trainFeatures rows::" + str(trainFeatures.shape[0]))
        print("trainTargets rows::" + str(len(trainTargets)))
        clf.fit(trainFeatures, trainTargets)
        logging.info("Predicting...")
        # Probability of the positive class.
        predicted_scores = clf.predict_proba(testFeatures).T[1]

    logging.info("Write results...")
    output_file = "avito_starter_solution.csv"
    logging.info("Writing submission to %s" % output_file)
    # Item ids, ordered by predicted score (highest first).
    with open(os.path.join(dataFolder, output_file), "w") as f:
        f.write("id\n")
        for pred_score, item_id in sorted(zip(predicted_scores, testItemIds), reverse=True):
            f.write("%d\n" % item_id)
def linear_regression(features, values):
    """
    Perform linear regression given a data set with an arbitrary number of features.
    """
    clf = SGDRegressor(max_iter=20)  # n_iter was renamed max_iter in sklearn 0.19+
    clf.fit(features, values)
    intercept, params = clf.intercept_, clf.coef_
    return intercept, params
def slim_train(A, l1_reg=0.001, l2_reg=0.0001):
    """
    Computes the W matrix of SLIM.

    This link is useful to understand the parameters used:
    http://web.stanford.edu/~hastie/glmnet_matlab/intro.html

    Basically, we are using this:
        Sum( yi - B0 - xTB ) + ...
    as:
        Sum( aj - 0 - ATwj ) + ...

    Remember that we want to learn wj. If you don't understand this
    mathematical notation, I suggest you read section III of:
    http://glaros.dtc.umn.edu/gkhome/slim/overview
    """
    alpha = l1_reg + l2_reg
    l1_ratio = l1_reg / alpha
    model = SGDRegressor(
        penalty='elasticnet',
        fit_intercept=False,
        alpha=alpha,
        l1_ratio=l1_ratio,
    )

    # TODO: get dimensions in the right way
    m, n = A.shape

    # Fit each column of W separately.
    W = lil_matrix((n, n))
    for j in range(n):
        if j % 50 == 0:
            print('-> %2.2f%%' % ((j / float(n)) * 100))

        aj = A[:, j].copy()
        # We need to remove column j before training.
        A[:, j] = 0
        model.fit(A, aj.toarray().ravel())
        # We need to reinstate the matrix.
        A[:, j] = aj

        w = model.coef_
        # Remove negative values because they make no sense in our approach.
        w[w < 0] = 0
        for el in w.nonzero()[0]:
            W[(el, j)] = w[el]

    return W
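# A minimal usage sketch for slim_train above. The 4x4 user-item matrix is made
# up for illustration; lil_matrix is used because slim_train assigns to columns
# of A in place, which the lil format supports efficiently.
import numpy as np
from scipy.sparse import lil_matrix

def demo_slim_train():
    A = lil_matrix(np.array([[1, 0, 1, 0],
                             [1, 1, 0, 0],
                             [0, 1, 1, 1],
                             [1, 0, 1, 1]], dtype=float))
    W = slim_train(A)
    scores = A.dot(W)  # higher score = stronger recommendation
    print(scores.toarray())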
def linear_regression(features, values):
    """
    Perform linear regression given a data set with an arbitrary number of features.
    """
    model = SGDRegressor()
    model.fit(features, values)
    intercept = model.intercept_
    params = model.coef_
    return intercept, params
def sgd(pd, pl, qd, ql):
    # Hyperparameter distributions for a randomized search (currently unused,
    # since the RandomizedSearchCV line below is commented out).
    params = {'loss': ['squared_error', 'huber', 'epsilon_insensitive',
                       'squared_epsilon_insensitive'],
              'alpha': expon(scale=1),
              'epsilon': expon(scale=1),
              'l1_ratio': uniform(),
              'penalty': ['l2', 'l1', 'elasticnet']}
    clf = SGDRegressor()
    # clf = RandomizedSearchCV(clf, params, n_jobs=2, n_iter=10, verbose=10)
    print("Training SGD regressor")
    clf.fit(pd, pl)
    print("Score: " + str(clf.score(qd, ql)))
    return clf
def linear_regression(features, values):
    """
    Perform linear regression given a data set with an arbitrary number of features.
    """
    ###########################
    ### YOUR CODE GOES HERE ###
    ###########################
    classifier = SGDRegressor(max_iter=20)
    classifier.fit(features, values)
    intercept = classifier.intercept_
    params = classifier.coef_
    return intercept, params
def main(train_file, model_file):
    # train_x, train_y = load_sparse_trainingData_memory(train_file, 2 * get_len_vector())
    train_x, train_y = load_long_training_data_memory()
    # train_x, train_y = load_trainingData(train_file)
    logging('len of y: %d' % train_y.shape)
    logging(train_x.shape)

    # LR = LinearRegression(copy_X=False, normalize=True)
    LR = SGDRegressor(verbose=1)
    logging("training model...")
    starttime = datetime.now()
    LR.fit(train_x, train_y)
    logging("training model, elapsed time: %s" % str(datetime.now() - starttime))
    logging("saving model")
    joblib.dump(LR, model_file)
def linear_regression(features, values):
    """
    Perform linear regression given a data set with an arbitrary number of features.
    """
    y = values
    X = features
    clf = SGDRegressor()
    clf.fit(X, y)
    intercept = clf.intercept_
    params = clf.coef_
    return intercept, params
def sgd_regressor(x, y, alpha):
    kf = KFold(n_splits=3)
    scores = []
    for train_index, test_index in kf.split(x):
        X_train, X_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # Scale on the training fold only, then apply to the test fold.
        scaler = StandardScaler()
        scaler.fit(X_train)
        x_train = scaler.transform(X_train)
        x_test = scaler.transform(X_test)
        clf = SGDRegressor(loss='squared_error', alpha=alpha)
        clf.fit(x_train, y_train)
        # RMSE on the held-out fold.
        scores.append(mean_squared_error(clf.predict(x_test), y_test) ** 0.5)
    return np.mean(scores)
def linear_regression(features, values):
    """
    Perform linear regression given a data set with an arbitrary number of features.
    """
    # features = sm.add_constant(features)
    model = SGDRegressor(max_iter=30)
    # normalize_res = normalize_features(features)
    model.fit(features, values)
    ###########################
    ### YOUR CODE GOES HERE ###
    ###########################
    intercept = model.intercept_
    params = model.coef_
    return intercept, params
def build_sgd_regressor(X_test, X_train_full, y_train_full):
    # print("Building SGD regressor...")
    # Note: "modified_huber" is a classification-only loss; SGDRegressor
    # supports "huber" for robust regression, so that is used here.
    rf = SGDRegressor(loss="huber", penalty="elasticnet", max_iter=20000,
                      alpha=0.1, epsilon=0.01)
    probas_rf = rf.fit(X_train_full, y_train_full).predict(X_test)
    return probas_rf
def linear_regression_GD(features, values):
    means, std_devs, features = normalized_features(features)
    model = SGDRegressor(eta0=0.001)
    results = model.fit(features, values)
    intercept = results.intercept_
    params = results.coef_
    return intercept, params
def predictLinearRegress(attributeList, starTargetList):
    print("\nLinear Regression")
    starTargetList = np.array(starTargetList)
    Xtrain, Xtest, Ytrain, Ytest = ml.splitData(attributeList, starTargetList, 0.75)

    lr = ml.linear.linearRegress(Xtrain, Ytrain)
    yHatInitial = lr.predict(Xtest)
    print("MSE test: ", mean_squared_error(yHatInitial, Ytest))
    print("RMSE test: ", math.sqrt(mean_squared_error(yHatInitial, Ytest)))

    # Count predictions more than half a star off.
    incorrect = 0
    total = 0
    for i, value in enumerate(yHatInitial):
        if abs(yHatInitial[i] - Ytest[i]) > 0.5:
            incorrect += 1
        total += 1
    ratioIncorrect = float(incorrect) / float(total)
    print("Ratio incorrect: " + str(ratioIncorrect))

    # Prepend a column of ones (bias term) before gradient descent.
    onesCol = np.ones((len(Xtrain), 1))
    Xtrain = np.concatenate((onesCol, Xtrain), 1)
    onesCol = np.ones((len(Xtest), 1))
    Xtest = np.concatenate((onesCol, Xtest), 1)
    m, n = np.shape(Xtrain)

    clf = SGDRegressor(loss="squared_error")
    clf.fit(Xtrain, Ytrain)
    yHat = clf.predict(Xtest)
    print("MSE after GD: ", mean_squared_error(yHat, Ytest))
    print("RMSE after GD: ", math.sqrt(mean_squared_error(yHat, Ytest)))

    incorrect = 0
    total = 0
    for i, value in enumerate(yHat):
        if abs(yHat[i] - Ytest[i]) > 0.5:
            incorrect += 1
        total += 1
    ratioIncorrect = float(incorrect) / float(total)
    print("Ratio incorrect: " + str(ratioIncorrect))
class EdenRegressor(BaseEstimator, RegressorMixin):
    """Build a regressor for graphs."""

    def __init__(self, r=3, d=8, nbits=16, discrete=True,
                 normalization=True, inner_normalization=True,
                 penalty='elasticnet', loss='squared_error'):
        """construct."""
        self.set_params(r, d, nbits, discrete, normalization,
                        inner_normalization, penalty, loss)

    def set_params(self, r=3, d=8, nbits=16, discrete=True,
                   normalization=True, inner_normalization=True,
                   penalty='elasticnet', loss='squared_error'):
        """setter."""
        self.r = r
        self.d = d
        self.nbits = nbits
        self.normalization = normalization
        self.inner_normalization = inner_normalization
        self.discrete = discrete
        self.model = SGDRegressor(
            loss=loss, penalty=penalty,
            average=True, shuffle=True,
            max_iter=5, tol=None)
        self.vectorizer = Vectorizer(
            r=self.r, d=self.d,
            normalization=self.normalization,
            inner_normalization=self.inner_normalization,
            discrete=self.discrete,
            nbits=self.nbits)
        return self

    def transform(self, graphs):
        """transform."""
        x = self.vectorizer.transform(graphs)
        return x

    @timeit
    def kernel_matrix(self, graphs):
        """kernel_matrix."""
        x = self.transform(graphs)
        return metrics.pairwise.pairwise_kernels(x, metric='linear')

    def fit(self, graphs, targets, randomize=True):
        """fit."""
        x = self.transform(graphs)
        self.model = self.model.fit(x, targets)
        return self

    def predict(self, graphs):
        """predict."""
        x = self.transform(graphs)
        preds = self.model.predict(x)
        return preds

    def decision_function(self, graphs):
        """decision_function."""
        return self.predict(graphs)
def gradiantDescent(trainData, testData, trainOuts, testOuts):
    clf = SGDRegressor(loss="squared_error")
    print(clf.fit(trainData, trainOuts))
    print(clf.coef_)
    predictions = clf.predict(testData)
    print(predictions)
    misses, error = sup.crunchTestResults(predictions, testOuts, .5)
    print(1 - error)
def linear_regression(features, values):
    sgd = SGDRegressor()
    sgd.fit(features, values)  # fit(X, y): features first, then targets
    intercept = sgd.intercept_
    params = sgd.coef_  # get_params() returns hyperparameters, not coefficients
    return intercept, params
def train(self):
    X_train = np.vstack([self.lang_1_w2v[p[0]] for p in self.bilingual_mappings])
    Z_train = np.vstack([self.lang_2_w2v[p[1]] for p in self.bilingual_mappings])

    # The trick here: train each column of Z as a separate SGD problem.
    # The columns are independent, so this is equivalent to learning W jointly.
    Z_cols = [Z_train[:, i] for i in range(Z_train.shape[1])]

    # Train a model per row of W and collect the coefficients.
    trained_coef_rows = []
    for z in Z_cols:
        clf = SGDRegressor()
        clf.fit(X_train, z)
        trained_coef_rows.append(clf.coef_)

    # Stack all the rows back together to reconstruct W.
    self.W = np.vstack(trained_coef_rows)
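# A self-contained sketch of the column-wise trick used in train() above:
# fitting one SGDRegressor per output column of Z recovers the rows of a known
# linear map W. All data below is synthetic and illustrative only.
import numpy as np
from sklearn.linear_model import SGDRegressor

def demo_columnwise_sgd():
    rng = np.random.RandomState(0)
    W_true = rng.randn(3, 5)           # maps 5-dim vectors to 3-dim vectors
    X = rng.randn(1000, 5)
    Z = X @ W_true.T                   # targets: one column per output dim
    rows = []
    for i in range(Z.shape[1]):
        clf = SGDRegressor(fit_intercept=False, max_iter=1000, tol=1e-6)
        clf.fit(X, Z[:, i])
        rows.append(clf.coef_)
    W_est = np.vstack(rows)            # stack per-column fits back into W
    print(np.abs(W_est - W_true).max())  # should be small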
def sgd_text_model(x_train, y_train, x_test, x_valid, cache_name, use_cache=False):
    if use_cache:
        # pickle files must be opened in binary mode
        with open(cache_name, 'rb') as fhand:
            data_dict = pickle.load(fhand)
        return data_dict['test_pred'], data_dict['valid_pred']

    np.random.seed(seed=123)
    model = SGDRegressor(eta0=1000, fit_intercept=True, l1_ratio=0.15,
                         learning_rate='invscaling', loss='huber',
                         max_iter=200, penalty='l1', power_t=.1,
                         random_state=123, shuffle=True, verbose=0,
                         warm_start=False)
    model.fit(x_train, y_train)
    test_pred = model.predict(x_test)
    valid_pred = model.predict(x_valid)

    data_dict = {'test_pred': test_pred, 'valid_pred': valid_pred}
    with open(cache_name, 'wb') as fhand:
        pickle.dump(data_dict, fhand)
    return test_pred, valid_pred
def fit(self, train_features, train_labels, N, c_val=0.0001, tol_val=0.001):
    # Break the features into N contiguous bags.
    feat_dim = train_features.shape[1]
    feat_per_bag = feat_dim // N  # integer division: slice indices must be ints
    self.SVRs = []
    for i in range(N):
        if i < N - 1:
            cur_train_feat_bag = train_features[:, i * feat_per_bag:(i + 1) * feat_per_bag]
        else:
            cur_train_feat_bag = train_features[:, i * feat_per_bag:]
        # Train an individual regressor on this bag.
        # model = svm.SVR(C=c_val, kernel='linear', tol=tol_val)
        # model = LSVR(C=c_val, tol=tol_val)
        model = SGDR(loss='epsilon_insensitive', alpha=c_val)
        print('current training on dimensionality: ', cur_train_feat_bag.shape[1], '\n')
        model.fit(cur_train_feat_bag, train_labels)
        self.SVRs.append(model)
    return self.SVRs
def linear_regression(features, values):
    model = SGDRegressor(max_iter=1000)
    results = model.fit(features, values)
    intercept = results.intercept_
    params = results.coef_
    return intercept, params
def apply_sgd_(X_train, Y_train, alpha=0.0003, shuffle=True):
    # Heuristic from the sklearn docs: roughly 10**6 / n_samples passes.
    max_iter = int(np.ceil(10**6 / len(Y_train)))
    model = SGDRegressor(loss='squared_error', penalty='l2', alpha=alpha,
                         epsilon=0.01, fit_intercept=True, max_iter=max_iter,
                         shuffle=shuffle,
                         random_state=int(time.time() * 8192) % 8192,
                         warm_start=False, verbose=0,
                         learning_rate='invscaling')
    # model.fit_transform(X_train, Y_train)
    # model.partial_fit_transform(X_train, Y_train)
    # sample_weights = [1 / float(m) for x in Y]
    model.fit(X_train, Y_train, sample_weight=None)

    # Collect the fitted parameters: intercept first, then coefficients.
    Theta = [float(model.intercept_)]
    Theta.extend([float(x) for x in model.coef_])

    (model, Theta, J, SCORE) = performance_analysis(model, Theta, X_train, Y_train, debug=1)
    return (model, Theta, J, SCORE)
def SGD_Regression(kf, data, label, k):
    val = 0
    for train, test in kf:
        X_train, X_test = data[train, :], data[test, :]
        y_train, y_test = label[train], label[test]
        log = SGDRegressor(loss='squared_error', penalty='l2', alpha=0.0001,
                           l1_ratio=0.15, max_iter=5)
        logit = log.fit(X_train, y_train)
        y_pred = logit.predict(X_test)
        val += metrics.mean_squared_error(y_test, y_pred)
    # print("SGD_Regression, Mean Squared Error", "{0:.4f}".format(val / 3))
    return val / 3  # average over the 3 folds
def linear_regression(features, values):
    """
    Perform linear regression given a data set with an arbitrary number of features.
    """
    regressor = SGDRegressor()
    result = regressor.fit(features, values)
    intercept = result.intercept_
    params = result.coef_
    return intercept, params
def sgd_cv_id_fold(trn, params={}, tst=None, model_seed=CONST.SEED):
    if tst is not None:
        preds = tst[['Engine']].copy()

    cv_id = utils.get_cv_id(model_seed)
    trn = trn.merge(cv_id, on=['Engine'], how='left')
    assert trn.cv_id.notnull().all()

    valid_preds = pd.DataFrame({
        'preds': [np.nan] * trn.shape[0],
        'actual_RUL': trn.RUL
    })

    features = [c for c in trn.columns if c not in CONST.EX_COLS]
    scaler = preprocessing.StandardScaler()
    trn.loc[:, features] = scaler.fit_transform(trn.loc[:, features])
    if tst is not None:
        tst.loc[:, features] = scaler.transform(tst.loc[:, features])

    for i in list(range(1, utils.get_config()['nfold'] + 1)):
        print(f"CV ID = {i}")
        X_train, y_train = trn.loc[trn.cv_id != i, features], trn.loc[trn.cv_id != i, 'RUL']
        X_valid, y_valid = trn.loc[trn.cv_id == i, features], trn.loc[trn.cv_id == i, 'RUL']
        model = SGDRegressor(**params, random_state=model_seed)  # was `seed` (undefined name)
        model.fit(X_train, y_train)
        valid_preds.loc[trn.cv_id == i, 'preds'] = model.predict(X_valid)
        if tst is not None:
            preds[f'fold{i}'] = model.predict(tst[features])  # i already starts at 1

    valid_preds.dropna(inplace=True)
    if tst is None:
        print("CV MAE Score :", mean_absolute_error(valid_preds.actual_RUL, valid_preds.preds))
        return mean_absolute_error(valid_preds.actual_RUL, valid_preds.preds)
    else:
        return mean_absolute_error(valid_preds.actual_RUL, valid_preds.preds), preds
def linear_regression(features, values):
    """
    Perform linear regression given a data set with an arbitrary number of features.
    """
    ###########################
    ### YOUR CODE GOES HERE ###
    ###########################
    clf = SGDRegressor(alpha=0.1, max_iter=20)
    clf.fit(features, values)
    # print(clf.get_params())
    # print(len(features[0]), len(clf.coef_))
    # print(clf.coef_)
    # print(clf.intercept_)
    return clf.intercept_, clf.coef_
class SGDPlainNystromRegressor:
    def __init__(self, kernel: str = 'rbf', m: int = 100,
                 lambda_reg: float = 0, **kwargs):
        self.projector = PlainNystrom(kernel=kernel, m=m)
        self.lambda_reg = lambda_reg
        self.coeffs = None
        self.regressor = SGDRegressor(fit_intercept=False, **kwargs)
        self.kwargs = kwargs

    def fit(self, X: np.ndarray, y: np.array = None, **kwargs):
        # Project onto the m Nystrom landmarks, then fit a linear model there.
        k_nm = self.projector.fit_transform(X=X, y=y, **kwargs)
        self.regressor.fit(k_nm, y)
        return self

    def predict(self, X):
        projection = self.projector.transform(X=X)
        return self.regressor.predict(projection)
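# The same Nystrom-plus-linear-SGD idea, sketched with scikit-learn's built-in
# Nystroem approximator instead of the PlainNystrom projector above (which is
# project-specific). Data and hyperparameters are illustrative assumptions.
import numpy as np
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline

def demo_nystroem_sgd():
    rng = np.random.RandomState(0)
    X = rng.uniform(-3, 3, size=(500, 1))
    y = np.sin(X).ravel() + rng.normal(scale=0.1, size=500)
    model = make_pipeline(
        Nystroem(kernel='rbf', n_components=100, random_state=0),
        SGDRegressor(max_iter=1000, tol=1e-3),
    )
    model.fit(X, y)
    print(model.score(X, y))  # should beat a plain linear fit on raw X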
def fit(self, U, Y):
    self.initialize()
    # learn X
    # X = self.getX(U, Y)
    X = self.getXBatched(U, Y, TSData.batchSize)
    print("Starting to train the model...")
    # clf = ElasticNet(alpha=5, l1_ratio=0.5, max_iter=50000)
    # for x1, y1 in zip(X, Y):
    #     clf.partial_fit(x1[np.newaxis, :], y1)

    # Materialize the generator into an array.
    X = np.array([i for i in X])
    print(X.shape)
    print(Y.shape)
    clf = SGDRegressor(max_iter=100)
    clf.fit(X, np.ravel(Y))
    print(metrics.mean_absolute_error(clf.predict(X), Y))
    print(TSData().getScore(Y, clf.predict(X)))
    self.clf = clf
def linear_regression(features, values):
    """
    Perform linear regression given a data set with an arbitrary number of features.
    """
    clf = SGDRegressor(max_iter=20)
    results = clf.fit(features, values)
    intercept = results.intercept_
    params = results.coef_
    return intercept, params
def test_not_robust_regression(loss, weighting):
    reg = RobustWeightedRegressor(
        loss=loss,
        max_iter=100,
        weighting=weighting,
        k=0,
        c=1e7,
        burn_in=0,
        random_state=rng,
    )
    reg_not_rob = SGDRegressor(loss=loss, random_state=rng)

    reg.fit(X_r, y_r)
    reg_not_rob.fit(X_r, y_r)

    pred1 = reg.predict(X_r)
    pred2 = reg_not_rob.predict(X_r)

    difference = [
        np.linalg.norm(pred1[i] - pred2[i]) for i in range(len(pred1))
    ]
    assert np.mean(difference) < 1
    assert_almost_equal(reg.score(X_r, y_r), r2_score(y_r, reg.predict(X_r)))
def test_huber_and_sgd_same_results():
    # Test they should converge to same coefficients for same parameters
    X, y = make_regression_with_outliers(n_samples=10, n_features=2)

    # Fit once to find out the scale parameter. Scale down X and y by scale
    # so that the scale parameter is optimized to 1.0
    huber = HuberRegressor(fit_intercept=False, alpha=0.0, max_iter=100,
                           epsilon=1.35)
    huber.fit(X, y)
    X_scale = X / huber.scale_
    y_scale = y / huber.scale_
    huber.fit(X_scale, y_scale)
    assert_almost_equal(huber.scale_, 1.0, 3)

    sgdreg = SGDRegressor(
        alpha=0.0, loss="huber", shuffle=True, random_state=0,
        max_iter=10000, fit_intercept=False, epsilon=1.35, tol=None)
    sgdreg.fit(X_scale, y_scale)
    assert_array_almost_equal(huber.coef_, sgdreg.coef_, 1)
def perform_sgd_regression(features, values):
    clf = SGDRegressor(max_iter=20)
    clf = clf.fit(features, values)
    intercept = clf.intercept_
    params = clf.coef_
    print("intercept:")
    print(intercept)
    print("params:")
    for i in range(len(params)):
        print("%s: %f" % (features.columns.values[i], params[i]))
def linear_regression(features, values):
    """
    Perform linear regression given a data set with an arbitrary number of features.
    """
    model = SGDRegressor()
    sgd = model.fit(features, values)
    intercept = sgd.intercept_
    params = sgd.coef_
    return intercept, params
def early_stoping(X, y):
    from copy import deepcopy
    from sklearn.linear_model import SGDRegressor
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import train_test_split

    # With warm_start=True, calling fit() continues training from where it
    # stopped instead of restarting from scratch.
    sgd_reg = SGDRegressor(max_iter=1, warm_start=True, penalty=None,
                           learning_rate="constant", eta0=0.0005)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

    minimum_val_error = float("inf")
    best_epoch = None
    best_model = None
    for epoch in range(1000):
        sgd_reg.fit(X_train, y_train.ravel())  # continues where it left off
        y_val_predict = sgd_reg.predict(X_val)
        val_error = mean_squared_error(y_val, y_val_predict)
        if val_error < minimum_val_error:
            minimum_val_error = val_error
            best_epoch = epoch
            # deepcopy keeps the trained weights; clone() would reset them.
            best_model = deepcopy(sgd_reg)
    print('stopping in:', best_epoch)
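# For comparison with the manual loop in early_stoping above: SGDRegressor has
# built-in validation-based early stopping that achieves a similar effect. The
# synthetic data and hyperparameters below are assumptions for illustration.
import numpy as np
from sklearn.linear_model import SGDRegressor

def demo_builtin_early_stopping():
    rng = np.random.RandomState(42)
    X = rng.randn(500, 3)
    y = X @ np.array([1.5, -2.0, 0.5]) + rng.normal(scale=0.1, size=500)
    sgd_reg = SGDRegressor(max_iter=1000, early_stopping=True,
                           validation_fraction=0.2, n_iter_no_change=5,
                           penalty=None, learning_rate="constant", eta0=0.0005)
    sgd_reg.fit(X, y)
    print(sgd_reg.n_iter_)  # epochs actually run before stopping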
def sgd_test():
    # Load the data.
    lb = load_boston()
    # Split the data into training and test sets.
    x_train, x_test, y_train, y_test = train_test_split(lb.data, lb.target, test_size=0.25)
    print(y_train, '\n', y_test)

    # Standardize the data. Both features and target are standardized, with
    # two separate scalers: one for the multi-column features, one for the
    # single-column target.
    std_x = StandardScaler()
    x_train = std_x.fit_transform(x_train)
    x_test = std_x.transform(x_test)

    # The target scaler expects a 2-D array.
    std_y = StandardScaler()
    y_train = std_y.fit_transform(y_train.reshape(-1, 1))
    y_test = std_y.transform(y_test.reshape(-1, 1))

    # Fit by gradient descent.
    sg = SGDRegressor()
    sg.fit(x_train, y_train.ravel())  # ravel: fit expects a 1-D target
    print(sg.coef_)

    # Predict, then invert the standardization to get prices in original units.
    x_predict_res = sg.predict(x_test).reshape(-1, 1)
    stand_pre = std_y.inverse_transform(x_predict_res)
    print(stand_pre)

    # Compare the true test targets with the gradient-descent predictions.
    print('Mean absolute error:\n',
          mean_absolute_error(std_y.inverse_transform(y_test),
                              std_y.inverse_transform(x_predict_res)))
def train(training_pandas_data, test_pandas_data, label_col, feat_cols,
          alpha, l1_ratio, max_iter, tol, training_data_path, test_data_path):

    print("train: " + training_data_path)
    print("test: " + test_data_path)
    print("alpha: ", alpha)
    print("l1-ratio: ", l1_ratio)
    print("max_iter: ", max_iter)
    print("tol: ", tol)
    print("label-col: " + label_col)
    for col in feat_cols:
        print("feat-cols: " + col)

    # Split data into training labels and testing labels.
    trainingLabels = training_pandas_data[label_col].values
    trainingFeatures = training_pandas_data[feat_cols].values

    testLabels = test_pandas_data[label_col].values
    testFeatures = test_pandas_data[feat_cols].values

    # We will use an SGD model.
    en = SGDRegressor(alpha=alpha, l1_ratio=l1_ratio, warm_start=True,
                      max_iter=max_iter, tol=tol)

    # Here we train the model.
    en.fit(trainingFeatures, trainingLabels)

    # Calculating the scores of the model.
    test_rmse = mean_squared_error(testLabels, en.predict(testFeatures)) ** 0.5
    r2_score_training = en.score(trainingFeatures, trainingLabels)
    r2_score_test = en.score(testFeatures, testLabels)
    print("Test RMSE:", test_rmse)
    print("Training set score:", r2_score_training)
    print("Test set score:", r2_score_test)

    # Logging the RMSE and r2 scores.
    mlflow.log_metric("Test RMSE", test_rmse)
    mlflow.log_metric("Train R2", r2_score_training)
    mlflow.log_metric("Test R2", r2_score_test)

    # Saving the model as an artifact.
    sklearn.log_model(en, "model")
def main():
    vitals = ['LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']

    features_train = pd.read_csv('data/train_engineered_4.csv')
    labels_train = pd.read_csv('data/train_labels.csv')
    features_predict = pd.read_csv('data/test_engineered_4.csv')

    # set reduced_size to reduce batch size
    reduced_size = len(features_predict)
    # reduced_size = 800

    prediction = pd.DataFrame(features_predict['pid']).iloc[0:reduced_size, :]
    metrics_summary = pd.DataFrame(columns=vitals)
    hyperparams = pd.DataFrame(columns=vitals)

    for label in vitals:
        X_train = np.array(features_train)[0:reduced_size]
        y_train = np.array(labels_train[label])[0:reduced_size]
        X_predict = np.array(features_predict)[0:reduced_size]

        # scaling data: fit on the training set, then reuse the same
        # transformation on the prediction set (refitting would leak statistics)
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_predict = scaler.transform(X_predict)

        model = SGDRegressor(penalty='elasticnet', alpha=0.05, l1_ratio=0.1)
        print()
        print('learning : ', label)
        model.fit(X_train, y_train)

        # predict on the provided test set
        y_predicted = model.predict(X_predict)
        # y_predicted = grid.predict(X_predict)
        prediction[label] = y_predicted

    print(prediction)
    return prediction
def linear_regression(features, values):
    """
    Perform linear regression given a data set with an arbitrary number of features.
    """
    y = values
    X = features
    clf = SGDRegressor(max_iter=100)
    result = clf.fit(X, y)
    params = result.coef_
    intercept = result.intercept_
    return intercept, params
def SGD_boston():
    boston = load_boston()
    x = boston.data
    y = boston.target
    train_x, test_x, train_y, test_y = \
        train_test_split(x, y, test_size=.25)

    # Standardize: fit the scaler on the training set only, then apply it to
    # the test set (refitting on the test set would leak its statistics).
    std_s = StandardScaler()
    train_x = std_s.fit_transform(train_x)
    test_x = std_s.transform(test_x)

    sgd = SGDRegressor()
    sgd.fit(train_x, train_y)
    score = sgd.score(test_x, test_y)
    predict_y = sgd.predict(test_x)
    print(score)
    print(predict_y[:20])
    print(test_y[:20])
    # print(sgd.coef_)
    # print(sgd.intercept_)
    return None
def do_lreg_training_runs(d, cfg):
    times = np.array([])
    for i in range(cfg.num_runs):
        print('***Run #%d***' % i)
        start = time.perf_counter()
        model = SGDRegressor(
            eta0=cfg.learning_rate,  # initial learning rate
            max_iter=cfg.epochs,
            random_state=42)
        model.fit(d.x_train, d.y_train)
        elapsed = time.perf_counter() - start
        times = np.append(times, elapsed)
        trace_model_filename = f'{model_filename(cfg, i)}.joblib'
        dump(model, trace_model_filename)
        print(f"#### dump model [{trace_model_filename}]")
    print_times(times)
def test_regression():
    torch.set_default_tensor_type('torch.DoubleTensor')
    net_ctor = lambda: N.Linear(13, 1)
    loss = F.mse_loss

    # Supports fit, predict, and score
    x, y = load_boston(return_X_y=True)
    model = TorchEstimator(net_ctor, loss, opt_ctor='Adam', lr=1e-3)
    model.fit(x, y, epochs=5)
    model.predict(x)
    model.score(x, y)

    # Comparable to sklearn linear regression
    theirs = SGDRegressor(max_iter=5, eta0=1e-3)
    theirs.fit(x, y)
    h_theirs = theirs.predict(x)
    h_ours = model.predict(x)

    mse_theirs = mean_squared_error(y, h_theirs)
    mse_ours = mean_squared_error(y, h_ours)
    assert mse_ours < mse_theirs  # torch is better than sklearn by a lot
def test_mbsgd_regressor_default(datatype, nrows, column_info):
    ncols, n_info = column_info
    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, random_state=0)
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)

    cu_mbsgd_regressor = cumlMBSGRegressor()
    cu_mbsgd_regressor.fit(X_train, y_train)
    cu_pred = cu_mbsgd_regressor.predict(X_test).to_array()
    cu_r2 = r2_score(cu_pred, y_test, convert_dtype=datatype)

    if nrows < 500000:
        skl_sgd_regressor = SGDRegressor()
        skl_sgd_regressor.fit(X_train, y_train)
        skl_pred = skl_sgd_regressor.predict(X_test)
        skl_r2 = r2_score(skl_pred, y_test, convert_dtype=datatype)
        assert abs(cu_r2 - skl_r2) <= 0.02
class TestingExercise3_06(unittest.TestCase):
    def setUp(self) -> None:
        ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
        self.data = pd.read_csv(
            os.path.join(ROOT_DIR, '..', 'Datasets', 'synth_temp.csv'))

    def test_SGD(self):
        self.data = self.data.loc[self.data.Year > 1901]
        self.data_group_year = self.data.groupby(['Year']).agg({'RgnAvTemp': 'mean'})
        self.data_group_year['Year'] = self.data_group_year.index
        self.data_group_year = self.data_group_year.rename(
            columns={'RgnAvTemp': 'AvTemp'})

        # Min-max scale both axes before fitting.
        self.X_min = self.data_group_year.Year.min()
        self.X_range = self.data_group_year.Year.max() - self.data_group_year.Year.min()
        self.Y_min = self.data_group_year.AvTemp.min()
        self.Y_range = self.data_group_year.AvTemp.max() - self.data_group_year.AvTemp.min()
        self.scale_X = (self.data_group_year.Year - self.X_min) / self.X_range

        self.train_X = self.scale_X.to_numpy().ravel()
        self.train_Y = ((self.data_group_year.AvTemp - self.Y_min) / self.Y_range).to_numpy().ravel()

        np.random.seed(42)
        self.model = SGDRegressor(loss='squared_error', max_iter=100,
                                  learning_rate='constant', eta0=0.0005,
                                  tol=0.00009, penalty=None)
        self.model.fit(self.train_X.reshape((-1, 1)), self.train_Y)

        # Convert the coefficients learned on scaled data back to original units.
        self.Beta0 = (self.Y_min + self.Y_range * self.model.intercept_[0] -
                      self.Y_range * self.model.coef_[0] * self.X_min / self.X_range)
        self.Beta1 = self.Y_range * self.model.coef_[0] / self.X_range

        self.pred_X = self.data_group_year['Year']
        self.pred_Y = self.model.predict(self.train_X.reshape((-1, 1)))
        self.r2 = r2_score(self.train_Y, self.pred_Y)
        self.assertEqual(round(self.r2, 3), 0.544)
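# Sanity check (illustrative numbers only) of the de-scaling algebra used for
# Beta0/Beta1 above: if Y_s = b0 + b1 * X_s with X_s = (X - X_min) / X_range
# and Y_s = (Y - Y_min) / Y_range, then in original units
#   Y = [Y_min + Y_range*b0 - Y_range*b1*X_min/X_range] + (Y_range*b1/X_range)*X,
# which is exactly how the test recovers Beta0 and Beta1.
def check_descaling_identity():
    X_min, X_range, Y_min, Y_range = 1901.0, 114.0, 8.0, 4.0  # made-up values
    b0, b1, X = 0.3, 0.7, 1975.0
    Y_scaled = b0 + b1 * (X - X_min) / X_range
    Y_direct = Y_min + Y_range * Y_scaled
    Beta0 = Y_min + Y_range * b0 - Y_range * b1 * X_min / X_range
    Beta1 = Y_range * b1 / X_range
    assert abs((Beta0 + Beta1 * X) - Y_direct) < 1e-9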
def myLinear():
    """
    Predict house prices directly with linear regression.
    """
    # Load the data.
    lb = load_boston()
    # Split into training and test sets.
    x_train, x_test, y_train, y_test = train_test_split(lb.data, lb.target, test_size=0.25)

    # Standardize features and target with two separate scalers.
    std_x = StandardScaler()
    x_train = std_x.fit_transform(x_train)
    x_test = std_x.transform(x_test)  # reuse the training-set statistics

    # The target scaler expects a 2-D array.
    std_y = StandardScaler()
    y_train = std_y.fit_transform(y_train.reshape(-1, 1))
    y_test = std_y.transform(y_test.reshape(-1, 1))

    # Estimator 1: the normal equation.
    lr = LinearRegression()
    lr.fit(x_train, y_train)
    print(lr.coef_)  # learned weights
    # Predict test-set prices and invert the standardization.
    y_lr_predict = std_y.inverse_transform(lr.predict(x_test))
    print('Predicted price of each house in the test set: ', y_lr_predict)
    print('MSE of the normal equation:',
          mean_squared_error(std_y.inverse_transform(y_test), y_lr_predict))

    # Estimator 2: gradient descent.
    sgd = SGDRegressor()
    sgd.fit(x_train, y_train.ravel())  # ravel: fit expects a 1-D target
    print(sgd.coef_)
    # Predict and invert the standardization.
    y_sgd_predict = std_y.inverse_transform(sgd.predict(x_test).reshape(-1, 1))
    print('Predicted price of each house in the test set: ', y_sgd_predict)
    print('MSE of gradient descent:',
          mean_squared_error(std_y.inverse_transform(y_test), y_sgd_predict))
    return None
def line_lineregression():
    # Load and split the data.
    data = load_boston()
    x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.25)

    # Standardize. Unlike classification, the target is also standardized;
    # for display, obj.inverse_transform() undoes the standardization.
    std1 = StandardScaler()
    std2 = StandardScaler()
    # Standardize the features.
    x_train = std1.fit_transform(x_train)
    x_test = std1.transform(x_test)
    # Standardize the target (reshape 1-D to the 2-D layout the API requires).
    y_train = std2.fit_transform(y_train.reshape((-1, 1)))
    # y_test = std2.transform(y_test.reshape((-1, 1)))  # kept in original units below

    # Instantiate the estimators.
    lr_1 = LinearRegression()
    sgd_1 = SGDRegressor()  # default learning rate

    # 1. Normal-equation solution.
    lr_1.fit(x_train, y_train)
    print(lr_1.coef_)  # learned model parameters
    result_1 = lr_1.predict(x_test)
    result_1 = std2.inverse_transform(result_1)  # undo the standardization
    print("Normal-equation predicted prices:", result_1)
    print("Normal-equation MSE:", mean_squared_error(y_test, result_1))
    print("*" * 30)

    # 2. Gradient-descent solution.
    sgd_1.fit(x_train, y_train.ravel())  # ravel: fit expects a 1-D target
    print(sgd_1.coef_)
    result_2 = std2.inverse_transform(sgd_1.predict(x_test).reshape((-1, 1)))
    print("Gradient-descent predicted prices:", result_2)
    print("Gradient-descent MSE:", mean_squared_error(y_test, result_2))
def get_sgd(X_train, X_test, y_train, y_test):
    temp_max_itr = 100000
    dest_eta = 1e-5
    dest_tol = 1e-3
    temp_coef = 0.01
    dest_coef = temp_coef
    dest_intercept = 0.0
    best_score = -1000  # renamed from `max` to avoid shadowing the builtin
    # mode = 'w'
    # cnt = 1
    # Grid-search over coefficient and intercept initializations.
    while temp_coef <= 2.0:
        temp_intercept = 0.0
        while temp_intercept <= 50.0:
            sgd = SGDRegressor(random_state=15, max_iter=temp_max_itr,
                               eta0=dest_eta, tol=dest_tol, n_iter_no_change=6)
            sgd.fit(X_train, y_train, coef_init=temp_coef, intercept_init=temp_intercept)
            scr = sgd.score(X_test, y_test)
            # Keep the initialization that scored best so far.
            if best_score < scr:
                best_score = scr
                dest_coef = temp_coef
                dest_intercept = temp_intercept
                # if cnt > 1:
                #     mode = 'a'
                # cnt += 1
                # write_to_file(scr, dest_coef, dest_intercept, mode)
            temp_intercept += 1.0
        temp_coef += 0.1
    sgd1 = SGDRegressor(random_state=15, max_iter=temp_max_itr, eta0=dest_eta,
                        tol=dest_tol, n_iter_no_change=6)
    return sgd1, dest_coef, dest_intercept
class SGDRegressionModel(RegressionModel):
    def __init__(self, train_data):
        RegressionModel.__init__(self, train_data)
        self.model = SGDRegressor()

    def train(self, x=None, y=None):
        x = x if x is not None else self.train_x
        y = y if y is not None else self.train_y
        self.model.fit(x, y)

    def predict(self, x_in):
        return self.model.predict(x_in)

    def evaluate(self, x_in, y_out):
        return self.model.score(x_in, y_out)

    def save(self, filename):
        joblib.dump(self.model, filename)

    def load(self, filename):
        self.model = joblib.load(filename)
def test_sklreandata():
    x, y = data_xy(sklearn_regdata=True)

    # My regressor.
    lr = SGDLinear_reg(100, eta=0.01, batch_size=10)
    lr.fit(x, y)
    lr.plot_train_loss()

    # sklearn SGD.
    sgd_reg = SGDRegressor(max_iter=100,       # number of iterations
                           penalty=None,       # no regularization term
                           eta0=0.01,          # learning rate
                           early_stopping=True)
    sgd_reg.fit(x, y)

    # sklearn closed-form regression.
    lrskt = LinearRegression(fit_intercept=True)
    lrskt.fit(x, y)

    print(f'my linear loss: {lr.cost(y, lr.predict(x)):.2f}')
    print(f'sklearn loss: {lr.cost(y, lrskt.predict(x)):.2f}')
    print(f'sklearn sgd loss: {lr.cost(y, sgd_reg.predict(x)):.2f}')
def test_mbsgd_regressor(lrate, penalty, make_dataset):
    nrows, datatype, X_train, X_test, y_train, y_test = make_dataset

    cu_mbsgd_regressor = cumlMBSGRegressor(learning_rate=lrate, eta0=0.005,
                                           epochs=100, fit_intercept=True,
                                           batch_size=2, tol=0.0,
                                           penalty=penalty)
    cu_mbsgd_regressor.fit(X_train, y_train)
    cu_pred = cu_mbsgd_regressor.predict(X_test).to_array()
    cu_r2 = r2_score(cu_pred, y_test, convert_dtype=datatype)

    if nrows < 500000:
        skl_sgd_regressor = SGDRegressor(learning_rate=lrate, eta0=0.005,
                                         max_iter=100, fit_intercept=True,
                                         tol=0.0, penalty=penalty,
                                         random_state=0)
        skl_sgd_regressor.fit(X_train, y_train)
        skl_pred = skl_sgd_regressor.predict(X_test)
        skl_r2 = r2_score(skl_pred, y_test, convert_dtype=datatype)
        assert abs(cu_r2 - skl_r2) <= 0.02
def runSGD(X_train, X_test, y_train, y_test, dataname):
    all_epsilon = [0.001, 0.1, 0.5, 0.9]
    best_model = None
    max_score = 0
    for epsilon in all_epsilon:
        regressor = SGDRegressor(loss='epsilon_insensitive', epsilon=epsilon)
        regressor.fit(X_train, y_train)
        y_pred = regressor.predict(X_test)
        # plt.show()
        plt.scatter(y_test, y_pred)
        plt.plot([y_test.min(), y_test.max()], [y_pred.min(), y_pred.max()], 'r', lw=2)
        score = regressor.score(X_test, y_test)
        if score > max_score:
            max_score = score  # remember the best score, not just the model
            best_model = regressor
        plt.title('SGD - {0}\n epsilon ={1} \nScore = {2:.3f} '.format(str(dataname), epsilon, score))
        plt.xlabel('Actual ')
        plt.ylabel('Predict')
        # plt.show()
        plt.savefig('runSGD_{}_{}.png'.format(strftime("%H_%M_%S", gmtime()), epsilon))
        plt.close()
    return best_model
def regularization():
    """Plot training/validation RMSE against regularization strength."""
    alphaList = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
    xTrain = db.load('train', 'data')[:, 1:].astype(int)
    yTrain = np.squeeze(db.load('train', 'y')[:, 1:])
    xValid = db.load('valid', 'data')[:, 1:].astype(int)
    yValid = np.squeeze(db.load('valid', 'y')[:, 1:])
    eTrain = np.zeros(len(alphaList))
    eValid = np.zeros(len(alphaList))
    for i in range(len(alphaList)):
        # Average the RMSE over 5 runs for each alpha (the alpha must actually
        # be passed to the model, otherwise every iteration fits the same one).
        for _ in range(5):
            model = SGDRegressor(penalty='l2', alpha=alphaList[i],
                                 learning_rate='constant', eta0=0.006,
                                 max_iter=100)
            model.fit(xTrain, yTrain)
            eTrain[i] += np.sqrt(mean_squared_error(yTrain, model.predict(xTrain)))
            eValid[i] += np.sqrt(mean_squared_error(yValid, model.predict(xValid)))
        eTrain[i] /= 5
        eValid[i] /= 5
    plt.semilogx(alphaList, eTrain, label='Training')
    plt.semilogx(alphaList, eValid, label='Validation')
    plt.legend()
    plt.show()
def linear():
    """
    Predict house prices directly with linear regression.
    :return:
    """
    lb = load_boston()
    # Split into training and test sets.
    x_train, x_test, y_train, y_test = train_test_split(lb.data, lb.target, test_size=0.25)

    # Standardize both features and target (two separate scalers).
    std_x = StandardScaler()
    x_train = std_x.fit_transform(x_train)
    x_test = std_x.transform(x_test)

    # Target scaler: sklearn 0.19+ requires a 2-D array, hence reshape(-1, 1)
    # (-1 because the sample count is unknown; the target is a single column).
    std_y = StandardScaler()
    y_train = std_y.fit_transform(y_train.reshape(-1, 1))
    y_test = std_y.transform(y_test.reshape(-1, 1))
    print(y_train)

    # Estimator 1: normal-equation solution.
    lr = LinearRegression()
    lr.fit(x_train, y_train)
    print(lr.coef_)
    # Predict test-set prices, then convert back to the unstandardized scale.
    y_predict = std_y.inverse_transform(lr.predict(x_test))
    print("Normal-equation predicted price of each house:", y_predict)
    print("Normal-equation MSE:",
          mean_squared_error(std_y.inverse_transform(y_test), y_predict))

    # Estimator 2: gradient descent.
    sgd = SGDRegressor()
    sgd.fit(x_train, y_train.ravel())
    print(sgd.coef_)

    # Estimator 3: ridge regression (alpha is the regularization strength).
    rd = Ridge(alpha=1.0)
    rd.fit(x_train, y_train)
    print(rd.coef_)
def Linear():
    # Load the data.
    lb = load_boston()
    # Split the data into training and test sets; returns the (train, test)
    # feature arrays followed by the (train, test) target arrays.
    x_train, x_test, y_train, y_test = tts(lb.data, lb.target, test_size=0.25)
    # print(y_train, 'weqw\n', y_test)

    # Standardize features and target with two separate scalers.
    std_x = StandardScaler()
    x_train = std_x.fit_transform(x_train)
    x_test = std_x.transform(x_test)
    std_y = StandardScaler()
    # The target must be converted to a 2-D array.
    y_train = std_y.fit_transform(y_train.reshape(-1, 1))
    y_test = std_y.transform(y_test.reshape(-1, 1))

    # Normal equation.
    lr = LinearRegression()
    lr.fit(x_train, y_train)
    print("LR per-feature weights", lr.coef_)
    predict = lr.predict(x_test)
    # Invert the standardization to recover prices.
    print("LR predicted test-set house prices", std_y.inverse_transform(predict))
    # Arguments: true values, then predictions.
    print("Normal-equation MSE",
          err(std_y.inverse_transform(y_test), std_y.inverse_transform(predict)))

    # Gradient descent.
    SGD = SGDRegressor()
    SGD.fit(x_train, y_train.ravel())  # ravel: fit expects a 1-D target
    print("SGD per-feature weights", SGD.coef_)
    _predict = SGD.predict(x_test).reshape(-1, 1)
    print("SGD predicted test-set house prices", std_y.inverse_transform(_predict))
    print("Gradient-descent MSE",
          err(std_y.inverse_transform(y_test), std_y.inverse_transform(_predict)))
    return None
def linear_model2():
    # 1. Load the data.
    data = load_boston()
    # 2. Split into training and test sets.
    x_train, x_test, y_train, y_test = train_test_split(data.data, data.target,
                                                        test_size=0.2, random_state=22)
    # 3. Standardize the features (fit on train, reuse on test).
    Stan = StandardScaler()
    x_train = Stan.fit_transform(x_train)
    x_test = Stan.transform(x_test)
    # 4. Fit by gradient descent.
    linear = SGDRegressor(max_iter=1000)  # maximum number of iterations
    linear.fit(x_train, y_train)
    # 5. Evaluate.
    ss = linear.predict(x_test)
    # print('Gradient-descent predictions:', ss)
    print('Gradient-descent MSE:', mean_squared_error(y_test, ss))
    print('Gradient-descent coefficients:', linear.coef_)
def test_both_fit_and_score_contain_sample_weight(sample_weight_passed_as):
    mlflow.sklearn.autolog()

    from sklearn.linear_model import SGDRegressor

    # ensure that we use an appropriate model for this test
    assert "sample_weight" in _get_arg_names(SGDRegressor.fit)
    assert "sample_weight" in _get_arg_names(SGDRegressor.score)

    mock_obj = mock.Mock()

    def mock_score(self, X, y, sample_weight=None):  # pylint: disable=unused-argument
        mock_obj(X, y, sample_weight)
        return 0

    assert inspect.signature(SGDRegressor.score) == inspect.signature(mock_score)

    SGDRegressor.score = mock_score
    model = SGDRegressor()
    X, y = get_iris()
    sample_weight = abs(np.random.randn(len(X)))

    with mlflow.start_run() as run:
        if sample_weight_passed_as == "positional":
            model.fit(X, y, None, None, sample_weight)
        elif sample_weight_passed_as == "keyword":
            model.fit(X, y, sample_weight=sample_weight)
        mock_obj.assert_called_once_with(X, y, sample_weight)

    run_id = run.info.run_id
    params, metrics, tags, artifacts = get_run_data(run_id)
    assert params == truncate_dict(stringify_dict_values(model.get_params(deep=True)))
    assert {TRAINING_SCORE: model.score(X, y)}.items() <= metrics.items()
    assert tags == get_expected_class_tags(model)
    assert MODEL_DIR in artifacts
    assert_predict_equal(load_model_by_run_id(run_id), model, X)