def solveSingle(self, inputDF, outputDict, rho, beta_target):
    I, J, V, Y = [], [], [], []
    fd = {}  # mapping feature names to consecutive integers, starting with 0
    for i, (id, x) in enumerate(inputDF.items()):
        l = outputDict.get(id)
        for k, v in x.items():
            I.append(i)
            J.append(k)
            V.append(v)
            upd(fd, k)
        Y.append(l)
    J = [fd[k] for k in J]  # list comprehension: map() returns an iterator in Python 3
    X = sparse.coo_matrix((V, (I, J)), shape=(I[-1] + 1, len(fd)))
    # ordered list of feature names according to their integer ids in fd
    fd_reverse = [k for k, v in sorted(fd.items(), key=lambda t: t[1])]
    # y_new = y - X . beta_target
    # converting a proximal least squares problem to a ridge regression
    ZmUl = np.array([beta_target.get(k, 0) for k in fd_reverse])
    y_new = np.array(Y) - X * ZmUl
    ridge = Ridge(alpha=rho, fit_intercept=False)
    ret = ridge.fit(X, y_new)
    # ret = self.lr.fit(X, y_new)
    # raise ValueError('fd_reverse = %s \n X = %s \n J = %s \n I = %s \n V = %s \n Y = %s \n y_new = %s \n ret.coef_ = %s \n ZmUl = %s \n'
    #                  % (str(fd_reverse), str(X), str(J), str(I), str(V), str(Y), str(y_new), str(ret.coef_), str(ZmUl)))
    return dict(zip(fd_reverse, (ret.coef_ + ZmUl).tolist()))
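# A minimal self-contained sketch (all names here are illustrative, not from the
# snippet above) verifying the reduction it uses: minimizing
# ||X b - y||^2 + rho * ||b - t||^2 is a plain ridge problem in d = b - t,
# fit on the shifted target y - X t, so b_hat = ridge_coef + t.
import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.RandomState(0)
X = rng.randn(30, 4)
y = rng.randn(30)
t = rng.randn(4)          # plays the role of beta_target (the z - u vector)
rho = 2.0

ridge = Ridge(alpha=rho, fit_intercept=False)
ridge.fit(X, y - X.dot(t))
b_hat = ridge.coef_ + t

# closed form of the same objective: (X'X + rho*I)^-1 (X'y + rho*t)
b_closed = np.linalg.solve(X.T.dot(X) + rho * np.eye(4), X.T.dot(y) + rho * t)
assert np.allclose(b_hat, b_closed, atol=1e-6)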
def test_brr_like_sklearn():
    n = 10000
    d = 10
    sigma_sqr = 5
    X = np.random.randn(n, d)
    beta_true = np.random.random(d)
    y = np.dot(X, beta_true) + np.sqrt(sigma_sqr) * np.random.randn(n)
    X_tr = X[:n // 2, :]  # integer division: float slice indices raise TypeError in Python 3
    y_tr = y[:n // 2]
    X_ts = X[n // 2:, :]
    # y_ts = y[n // 2:]

    # prediction with my own bayesian ridge
    lambda_reg = 1
    brr = BayesianRidgeRegression(lambda_reg, add_ones=True, normalize_lambda=False)
    brr.fit(X_tr, y_tr)
    y_ts_brr = brr.predict(X_ts)

    # let's compare to scikit-learn's ridge regression
    rr = Ridge(lambda_reg)
    rr.fit(X_tr, y_tr)
    y_ts_rr = rr.predict(X_ts)

    assert np.mean(np.abs(y_ts_brr - y_ts_rr)) < 0.001, \
        "Predictions are different from sklearn's ridge regression."
def test_sag_regressor_computed_correctly():
    """tests if the sag regressor is computed correctly"""
    alpha = .1
    n_features = 10
    n_samples = 40
    max_iter = 50
    tol = .000001
    fit_intercept = True
    rng = np.random.RandomState(0)
    X = rng.normal(size=(n_samples, n_features))
    w = rng.normal(size=n_features)
    y = np.dot(X, w) + 2.
    step_size = get_step_size(X, alpha, fit_intercept, classification=False)

    clf1 = Ridge(fit_intercept=fit_intercept, tol=tol, solver='sag',
                 alpha=alpha * n_samples, max_iter=max_iter)
    clf2 = clone(clf1)

    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)

    spweights1, spintercept1 = sag_sparse(X, y, step_size, alpha,
                                          n_iter=max_iter, dloss=squared_dloss,
                                          fit_intercept=fit_intercept)
    spweights2, spintercept2 = sag_sparse(X, y, step_size, alpha,
                                          n_iter=max_iter, dloss=squared_dloss,
                                          sparse=True, fit_intercept=fit_intercept)

    assert_array_almost_equal(clf1.coef_.ravel(), spweights1.ravel(), decimal=3)
    assert_almost_equal(clf1.intercept_, spintercept1, decimal=1)
def ridgeReg(alpha):
    n_samples, n_features = 10, 5
    y = np.random.randn(n_samples)
    X = np.random.randn(n_samples, n_features)
    clf = Ridge(alpha)  # use the alpha argument instead of a hard-coded .001
    res = clf.fit(X, y)
    return res
def fit(self, X, Y, weights=None, context_transform=True):
    """Trains policy by weighted maximum likelihood.

    .. note:: This call changes this policy (self)

    Parameters
    ----------
    X: array-like, shape (n_samples, context_dims)
        Context vectors

    Y: array-like, shape (n_samples, weight_dims)
        Low-level policy parameter vectors

    weights: array-like, shape (n_samples,)
        Weights of individual samples (should depend on the obtained reward)
    """
    # Kernel approximation
    self.nystroem = Nystroem(
        kernel=self.kernel, gamma=self.gamma, coef0=self.coef0,
        n_components=np.minimum(X.shape[0], self.n_components),
        random_state=self.random_state,
    )
    self.X = self.nystroem.fit_transform(X)
    if self.bias:
        self.X = np.hstack((self.X, np.ones((self.X.shape[0], 1))))
    if self.normalize:
        self.X /= np.abs(self.X).sum(1)[:, None]

    # Standard ridge regression
    ridge = Ridge(alpha=self.alpha, fit_intercept=False)
    ridge.fit(self.X, Y, weights)
    self.W = ridge.coef_
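# A short sketch (synthetic data, illustrative names) of the weighted ridge fit
# used above: Ridge.fit takes sample_weight as its third argument, so passing
# `weights` positionally is the same as passing `sample_weight=weights`.
import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.RandomState(0)
X, Y, w = rng.randn(50, 3), rng.randn(50, 2), rng.rand(50)
r1 = Ridge(alpha=1.0, fit_intercept=False).fit(X, Y, w)
r2 = Ridge(alpha=1.0, fit_intercept=False).fit(X, Y, sample_weight=w)
assert np.allclose(r1.coef_, r2.coef_)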
class OrderScorer(Scorer):
    def __init__(self):
        self.classifier = Ridge(alpha=0.1)
        self.cache_filename = 'subgraph_order_scorer_reg.pickle'

    def train(self, train_instances, train_labels, update_cache=True, sample_weight=None):
        """
        Trains a scorer to score the quality of an ordering of sentences
        Loads from cache if available
        """
        self.classifier.fit(train_instances, train_labels, sample_weight=sample_weight)
        if update_cache:
            pickle.dump(self.classifier, open(self.cache_filename, 'wb'))

    def test(self, test_instances, test_labels):
        """
        Uses test set to evaluate the performance of the scorer and print it out
        """
        scores = self.classifier.predict(test_instances)
        # TODO: print report

    def load(self):
        if os.path.exists(self.cache_filename):
            self.classifier = pickle.load(open(self.cache_filename, 'rb'))
        else:
            raise Exception("No classifier exists! Must call train with update_cache=True")

    def evaluate(self, test_instance):
        """
        Applies the scoring function to a given test instance
        """
        return self.classifier.predict([test_instance])[0]
def test_regressor_matching():
    n_samples = 10
    n_features = 5

    rng = np.random.RandomState(10)
    X = rng.normal(size=(n_samples, n_features))
    true_w = rng.normal(size=n_features)
    y = X.dot(true_w)

    alpha = 1.
    n_iter = 100
    fit_intercept = True

    step_size = get_step_size(X, alpha, fit_intercept, classification=False)
    clf = Ridge(fit_intercept=fit_intercept, tol=.00000000001, solver='sag',
                alpha=alpha * n_samples, max_iter=n_iter)
    clf.fit(X, y)

    weights1, intercept1 = sag_sparse(X, y, step_size, alpha, n_iter=n_iter,
                                      dloss=squared_dloss,
                                      fit_intercept=fit_intercept)
    weights2, intercept2 = sag(X, y, step_size, alpha, n_iter=n_iter,
                               dloss=squared_dloss,
                               fit_intercept=fit_intercept)

    assert_array_almost_equal(weights1, clf.coef_, decimal=10)
    assert_array_almost_equal(intercept1, clf.intercept_, decimal=10)
    assert_array_almost_equal(weights2, clf.coef_, decimal=10)
    assert_array_almost_equal(intercept2, clf.intercept_, decimal=10)
def training(X, Y, X_test, pca='kpca', regressor='ridge', dim=50):
    # X and Y are numpy arrays
    print('Input data and label shape: ', X.shape, Y.shape)
    if pca == 'nopca':
        return simpleTraining(X, Y, X_test, regressor)
    model, P = getProjectionMatrixPCA(Y, dim) if pca == 'pca' else getProjectionMatrixKPCA(dim)
    Y_train = np.dot(Y, P) if pca == 'kpca' else np.dot(Y, P.transpose())
    regressors = []
    for i in range(dim):
        print('at regressor number: ', i)
        reg = Ridge() if regressor == 'ridge' else SVR()
        y = [x[i] for x in Y_train]
        reg.fit(X, y)
        regressors.append(reg)
    Z_pred = []
    for reg in regressors:
        Z_pred.append(reg.predict(X_test))
    print('prediction shapes:', len(Z_pred), len(Z_pred[0]))
    Z_pred = np.array(Z_pred)
    Y_pred = np.dot(P, Z_pred).transpose() if pca == 'kpca' else np.dot(Z_pred.transpose(), P)
    return model, regressors, Y_pred
class LogisticRegressionSeparator(BaseEstimator):
    def get_params(self, deep=True):
        return {}

    def fit(self, X, y):
        # let's predict which users will spend anything later
        classes = y - X[:, 0]
        classes = np.where(classes > 0.1, 1, 0)
        self.classifier = LogisticRegression(class_weight='balanced')
        self.classifier.fit(X, classes)
        results = self.classifier.predict(X)
        results = results == 1
        self.estimator = Ridge(alpha=0.05)
        self.estimator.fit(X[results], y[results])

    def predict(self, X):
        # copy so the caller's first column isn't overwritten through a view
        y = X[:, 0].reshape(X.shape[0]).copy()
        labels = (self.classifier.predict(X) == 1)
        y[labels] = self.estimator.predict(X[labels])
        return y
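# A minimal usage sketch for the separator above; the synthetic data and shapes
# are illustrative assumptions, not from the original code.
import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(200, 5)  # column 0 plays the role of "amount already spent"
y = X[:, 0] + (rng.rand(200) > 0.5) * rng.rand(200)  # some users spend more later

sep = LogisticRegressionSeparator()
sep.fit(X, y)
pred = sep.predict(X)
print(pred.shape)  # (200,)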
def train_single_model(train_data, train_labels, algo):
    """ Train the model for a single label dimension """
    if algo == 'svr_rbf':
        # SVM regression, RBF kernel
        svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
        svr_rbf.fit(train_data, train_labels)
        return svr_rbf
    if algo == 'svr_lin':
        # SVM regression, linear kernel
        svr_lin = SVR(kernel='linear')
        svr_lin.fit(train_data, train_labels)
        return svr_lin
    if algo == 'ridge':
        # Ridge regression
        clf = Ridge(alpha=0.5)
        clf.fit(train_data, train_labels)
        return clf
    # No matching algorithm
    print("unimplemented model type")
    return None
def regression_weight(self, matched_data):
    converted_data = {}
    for i, data in enumerate(matched_data):
        if i == 0:
            for key in data.keys():
                try:
                    value = float(data[key])
                    converted_data[key] = [value]
                except ValueError:
                    pass
        else:
            for key in data.keys():
                if key in converted_data:
                    converted_data[key].append(float(data[key]))
    sorted_key = sorted(converted_data.keys())
    input_key = [key for key in sorted_key if key != self.main_key.lower()]
    x = []
    for key in input_key:
        # normalization
        numpy_data = normalization(np.array(converted_data[key]))
        x.append(numpy_data)
    x = np.array(x).T
    y = normalization(np.array(converted_data[self.main_key.lower()]))
    regressor = Ridge(alpha=1.0, normalize=True)
    regressor.fit(x, y)
    sorted_result = np.array(input_key)[np.argsort(np.array(regressor.coef_))]
    sorted_result = sorted_result[::-1]
    coefficient = sorted(regressor.coef_, reverse=True)
    return [(sorted_result[i], coefficient[i]) for i in range(len(sorted_result))]
def regression_NumMosquitos(Xtr, ytr, Xte):
    from sklearn.linear_model import Ridge, RidgeCV
    # model_nm = RidgeCV(alphas=range(200, 401, 10), cv=5)
    model_nm = Ridge(alpha=340)
    model_nm = model_nm.fit(Xtr, ytr)
    results_nm = model_nm.predict(Xte)
    return results_nm
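# As the commented-out line above suggests, alpha could be chosen by
# cross-validation instead of being hard-coded at 340. A hedged sketch of that
# variant (the alpha grid is an assumption borrowed from the comment):
import numpy as np
from sklearn.linear_model import RidgeCV

def regression_NumMosquitos_cv(Xtr, ytr, Xte):
    model_nm = RidgeCV(alphas=np.arange(200, 401, 10), cv=5).fit(Xtr, ytr)
    return model_nm.predict(Xte)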
def ridgeRegression(X, y):
    print("\n### ~~~~~~~~~~~~~~~~~~~~ ###")
    print("Ridge Regression")
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myDegree = 40
    polynomialFeatures = PolynomialFeatures(degree=myDegree, include_bias=False)
    Xp = polynomialFeatures.fit_transform(X)
    myScaler = StandardScaler()
    scaled_Xp = myScaler.fit_transform(Xp)
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    ridgeRegression = Ridge(alpha=1e-11, solver="cholesky")
    ridgeRegression.fit(scaled_Xp, y)
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    dummyX = np.arange(0, 2, 0.01)
    dummyX = dummyX.reshape((dummyX.shape[0], 1))
    dummyXp = polynomialFeatures.fit_transform(dummyX)
    scaled_dummyXp = myScaler.transform(dummyXp)
    dummyY = ridgeRegression.predict(scaled_dummyXp)
    outputFILE = 'plot-ridgeRegression.png'
    fig, ax = plt.subplots()
    fig.set_size_inches(h=6.0, w=10.0)
    ax.axis([0, 2, 0, 15])
    ax.scatter(X, y, color="black", s=10.0)
    ax.plot(dummyX, dummyY, color='red', linewidth=1.5)
    # savefig takes the file name as its first positional argument (fname),
    # not a `filename=` keyword
    plt.savefig(outputFILE, bbox_inches='tight', pad_inches=0.2, dpi=600)
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return None
def forecast_future_attention(train_index, test_index, alpha):
    """Forecast future attention via train dataset index and test dataset index."""
    m, n = len(train_index), len(test_index)
    x_train_predict = attention_data[train_index, :num_train]
    x_test_predict = attention_data[test_index, :num_train]
    for i in range(num_train, age):  # range: xrange is Python 2 only
        if with_share == 1:
            x_train = np.hstack((x_train_predict, share_data[train_index, :i + 1]))
            x_test = np.hstack((x_test_predict, share_data[test_index, :i + 1]))
            norm = np.hstack((x_train[:, :i],
                              attention_data[train_index, i].reshape(m, 1),
                              share_data[train_index, :i + 1]))
        else:
            x_train = x_train_predict
            x_test = x_test_predict
            norm = np.hstack((x_train[:, :i],
                              attention_data[train_index, i].reshape(m, 1)))
        x_train_norm = x_train / np.sum(norm, axis=1)[:, None]
        y_train = np.ones(m, )

        # == == == == == == == == Training with Ridge Regression == == == == == == == == #
        predictor = Ridge(fit_intercept=False, alpha=alpha)
        predictor.fit(x_train_norm, y_train)

        # == == == == == == == == Iteratively add forecasted value to x matrix == == == == == == == == #
        predict_train_value = (predictor.predict(x_train) - np.sum(x_train, axis=1)).reshape(m, 1)
        predict_train_value[predict_train_value < 0] = 0
        x_train_predict = np.hstack((x_train_predict, predict_train_value))
        predict_test_value = (predictor.predict(x_test) - np.sum(x_test, axis=1)).reshape(n, 1)
        predict_test_value[predict_test_value < 0] = 0
        x_test_predict = np.hstack((x_test_predict, predict_test_value))
    return x_test_predict[:, num_train: age]
def ridge_regression(train_x, train_y, pred_x, review_id, v_curve=False, l_curve=False, get_model=True):
    """
    :param train_x: train
    :param train_y: text
    :param pred_x: test set to predict
    :param review_id: takes in a review id
    :param v_curve: run the model for validation curve
    :param l_curve: run the model for learning curve
    :param get_model: run the model
    :return: the predicted values, learning curve, validation curve
    """
    lin = Ridge(alpha=0.5)
    if get_model:
        print("Fitting Ridge...")
        lin.fit(train_x, np.log(train_y + 1))
        gbr_pred = np.exp(lin.predict(pred_x)) - 1
        for i in range(len(gbr_pred)):
            if gbr_pred[i] < 0:
                gbr_pred[i] = 0
        Votes = gbr_pred[:, np.newaxis]
        Id = np.array(review_id)[:, np.newaxis]
        submission_lin = np.concatenate((Id, Votes), axis=1)
        np.savetxt("submission_ridge.csv", submission_lin, header="Id,Votes",
                   delimiter=',', fmt="%s, %0.2f", comments='')
    if v_curve:
        print("Working on Validation Curves")
        plot_validation_curve(Ridge(), "Validation Curve for Ridge Regression",
                              train_x, np.log(train_y + 1.0),
                              param_name="alpha",
                              param_range=[0.1, 0.2, 0.5, 1, 10])
    if l_curve:
        print("Working on Learning Curves")
        plot_learning_curve(Ridge(), "Learning Curve for Ridge Regression",
                            train_x, np.log(train_y + 1.0))
def kfold_cv(X_train, y_train, idx, k):
    kf = StratifiedKFold(y_train, n_folds=k)
    xx = []
    count = 0
    for train_index, test_index in kf:
        count += 1
        X_train_cv, X_test_cv = X_train[train_index, :], X_train[test_index, :]
        gc.collect()
        y_train_cv, y_test_cv = y_train[train_index], y_train[test_index]
        y_pred = np.zeros(X_test_cv.shape[0])
        m = 0
        for j in range(m):  # m = 0, so this xgboost ensemble is currently disabled
            clf = xgb_classifier(eta=0.05, min_child_weight=20, col=0.5, subsample=0.7,
                                 depth=5, num_round=500, seed=j * 77, gamma=0.1)
            y_pred += clf.train_predict(X_train_cv, y_train_cv, X_test_cv, y_test=y_test_cv)
            yqq = y_pred * (1.0 / (j + 1))
            print(j, llfun(y_test_cv, yqq))
        # y_pred /= m
        clf = Ridge()  # RandomForestClassifier(n_jobs=-1, n_estimators=100, max_depth=100)
        clf.fit(X_train_cv, y_train_cv)
        y_pred = clf.predict(X_test_cv)
        print(y_pred.shape)
        xx.append(llfun(y_test_cv, y_pred))
        ypred = y_pred
        yreal = y_test_cv
        idx = idx[test_index]
        print(xx[-1])  # , y_pred.shape
        break
    print(xx, 'average:', np.mean(xx), 'std', np.std(xx))
    return ypred, yreal, idx  # np.mean(xx)
def knn_twice(k):
    knn1 = neighbors.KNeighborsRegressor(n_neighbors=k)
    knn1.fit(trainf, trainlab)
    print('here')
    tim = time.time()
    n = len(train) // 1000  # integer division for use as a range bound
    pred1 = []
    for i in range(0, n):
        pred1.extend(knn1.predict(trainf[(i * 1000):((i + 1) * 1000)]))
        print(i)
    pred1.extend(knn1.predict(trainf[67000:67946]))
    print("time: " + str(time.time() - tim))
    # knn = neighbors.KNeighborsRegressor(n_neighbors=k)
    # knn.fit(pred1, trainlab)
    ridge = Ridge(alpha=1.0)
    ridge.fit(pred1, trainlab)
    n = 10
    pred2 = []
    for i in range(0, n):
        pred2.extend(knn1.predict(testf[(i * 1000):((i + 1) * 1000)].toarray()))
        print(i)
    n = 10
    pred = []
    for i in range(0, n):
        pred.extend(ridge.predict(pred2[(i * 1000):((i + 1) * 1000)]))
        print(i)
    # RMSE:
    testlab = np.array(test.ix[:, 4:])
    err = format(np.sqrt(np.sum(np.array(np.array(pred - testlab) ** 2) /
                                (testf.shape[0] * 24.0))))
    return err
def cross_valid(X, Y, n_fold):
    clf = Ridge(alpha=1.0)
    total_mean_square = 0
    total_coef = 0
    Y_np = np.array(Y)
    n_samples, n_features = len(X), len(X[0])
    kf_Y = cross_validation.KFold(n_samples, n_fold)
    index = []
    preds = []
    truths = []
    for train_index, test_index in kf_Y:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y_np[train_index], Y_np[test_index]
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        index += test_index.tolist()
        preds += [1 if x > 0.5 else 0 for x in y_pred.tolist()]
        truths += y_test.tolist()
        # print("predict:", [1 if x > 0.5 else 0 for x in y_pred])
        # print("original:", y_test)
        total_mean_square += mean_squared_error(y_test, y_pred)
        total_coef += clf.coef_
        # print('Coefficient of the prediction (pearsonr): ', pearsonr(y_pred, y_test))
    print('All Coefficient of the prediction (pearsonr): ', pearsonr(truths, preds))
    print('Average mean squared error is: ', total_mean_square / n_fold)
    diff_count = sum([abs(truth - pred) for truth, pred in zip(truths, preds)])
    acc = 100 - 1. * diff_count / len(truths) * 100
    print('prediction accuracy is %f' % acc)
    return [total_coef, index, preds]
def __init__(self, penalty='l1', dual=None, C=None, alpha=None):
    self.l1 = True if penalty == "l1" else False
    if self.l1:
        Lasso.__init__(self, alpha=alpha)
    else:
        Ridge.__init__(self, alpha=alpha)
def impute_age():
    X, P = gfa.platform_expression("GPL96")
    model = impute.KNNImputer()
    Xi = model.fit_transform(X, axis=1)
    age = array(P["age"].tolist())
    Xm = Xi.as_matrix()
    ix = array((age >= 10) & (age <= 120)).nonzero()[0]
    np.random.shuffle(ix)
    Xm = Xm[ix, :]
    age = age[ix]

    n_train = 2000
    n_test = 500
    # clf = SVR(C=1e-5, epsilon=1)
    # clf = LinearRegression()
    clf = Ridge()
    # clf = SimpleRegressor()
    # clf = Lasso()
    clf.fit(Xm[:n_train, :], age[:n_train])
    y = age[n_train:(n_train + n_test)]
    y_hat = clf.predict(Xm[n_train:(n_train + n_test)])
    dy = y - y_hat
    bias_tr = y_hat.mean() - age.mean()
    print("\nBias (vs train):\t\t", bias_tr)
    print("Bias (vs test):\t\t\t", dy.mean())
    print("Mean error:\t\t\t", fabs(dy).mean())
    print("Mean error (bias corrected):\t", fabs(dy - bias_tr).mean())
    print("MSE:\t\t\t\t", np.power(dy, 2).mean())
def RidgeRegression(self, filename, outputFile):
    pheno, geno = self.inputParse(filename)
    for row in geno:
        if len(row) % 2 != 0:
            return "Rows are not even."
    maxGeno = max(geno)
    allGeno = list(set(maxGeno))
    encoder = [i for i in range(len(allGeno))]
    lengthGeno = len(geno)
    length = len(geno)
    lenInnerGeno = len(geno[0])
    genoMake = [0 for x in range(len(allGeno))]
    dictionary = dict(zip(allGeno, encoder))
    for i in range(length):
        for x in range(lenInnerGeno):
            geno[i][x] = dictionary[geno[i][x]]
    phenoNaN = []
    for i in range(len(pheno)):
        if pheno[i] == 'NaN':
            phenoNaN.append(i)
    phenoNaN.reverse()
    for i in phenoNaN:
        del pheno[i]
    genoMiss = []
    for i in range(len(geno)):
        if i not in phenoNaN:
            genoMiss.append(geno[i])
    pheno = [float(i) for i in pheno]
    alpha = self.alphaOptimization(genoMiss, pheno)
    clf = Ridge(alpha=alpha)
    clf.fit(genoMiss, pheno)
    predicted = clf.predict(geno)
    predicted = np.transpose(predicted)
    np.savetxt(outputFile, np.transpose(predicted))
def traverse_movies_ridge():
    LBMAP = getLBMap()
    DMAP = createEmpty()
    P_ERRORS, ERRORS = [], []
    training_data, training_response = [], []
    for i in range(len(data)):
        movie = data[i]
        m_rev = movie['revenue']
        myvector = vectorizeMovie(movie, LBMAP, DMAP)
        if i > 100:
            model = Ridge(alpha=.5)
            model.fit(training_data, training_response)
            # predict expects a 2-D array, so wrap the single sample in a list
            raw = math.fabs(model.predict([myvector])[0] - m_rev)
            ERRORS.append(raw)
            # P_ERRORS.append(round(raw / m_rev, 4))
        training_data.append(myvector)
        training_response.append(m_rev)
        DMAP = update(movie, DMAP)
    # print('all', avg_float_list(P_ERRORS))
    print('all', avg_float_list(ERRORS))
class RidgeRegressionModel(LinearLeastSquaresModel):
    def __init__(self, input_columns, output_columns, debug=False):
        self.alpha = 0.0000000001
        self.m = Ridge(alpha=self.alpha)
        super(RidgeRegressionModel, self).__init__(input_columns, output_columns, debug=debug)

    def fit(self, data):
        A = numpy.vstack([data[:, i] for i in self.input_columns]).T
        B = numpy.vstack([data[:, i] for i in self.output_columns]).T
        self.m.fit(A, B)
        return self.m.coef_  # m.intercept_

    def get_error(self, data, model):
        A = numpy.vstack([data[:, i] for i in self.input_columns]).T
        B = numpy.vstack([data[:, i] for i in self.output_columns]).T
        B_fit = scipy.dot(A, model)
        err_per_point = numpy.sum((B - B_fit) ** 2, axis=1)  # sum squared error per row
        norm = numpy.sqrt(model * model)
        assert norm.shape == (1, 1)
        regularizer = 1.0 * norm[0, 0]
        return err_per_point - regularizer
def add_weekly_overall_trends(df_shop, regressor, trend_name, coeff_name, target='pays_count'):
    biweek_max = df_shop.biweek_id.max()
    df_shop[trend_name] = np.nan
    df_shop[coeff_name] = np.nan

    for m in range(biweek_max - 1, 0, -1):
        train_idx = df_shop.biweek_id >= m
        test_idx = df_shop.biweek_id == (m - 1)

        df_train = df_shop[train_idx]
        y = df_train[target]
        not_null = ~y.isnull()
        if not_null.sum() < 7:
            continue

        x = -df_train[regressor]
        x_not_null = x[not_null].values.reshape(-1, 1)
        y = y[not_null].values
        lr = Ridge(alpha=1).fit(x_not_null, y)

        if m == biweek_max - 1:
            x = x.values.reshape(-1, 1)
            df_shop.loc[train_idx, trend_name] = lr.predict(x)
            df_shop.loc[train_idx, coeff_name] = lr.coef_[0]

        df_test = df_shop[test_idx]
        x = -df_test[regressor].values.reshape(-1, 1)
        df_shop.loc[test_idx, trend_name] = lr.predict(x)
        df_shop.loc[test_idx, coeff_name] = lr.coef_[0]
def _check_ridge_model(featureses, labels):
    """Plot ridge regression predictions"""
    for tfidf_count in FEATURES_SIZES:
        test_points = []
        for i in range(16):
            tmp = [i, 100]
            tmptmp = [0] * tfidf_count
            if tmptmp:
                tmp.extend(tmptmp)
            test_points.append(tmp)
        test_points = np.array(test_points)
        limit = tfidf_count + 2
        model = Ridge()
        model.fit(featureses[:, :limit], labels)
        predictions = model.predict(test_points)
        plt.plot(predictions, label=str(tfidf_count),
                 linestyle=next(LINECYCLER), linewidth=3)
        # plt.text(test_points[-1, 0], predictions[-1], str(tfidf_count))
    plt.legend()
    plt.xlabel('Document order')
    plt.ylabel('Time (seconds)')
    plt.savefig('ridge_predictions.pdf')
def add_overall_trend_feature(df_shop, target='pays_count'):
    biweek_max = df_shop.biweek_id.max()
    trend_name = 'trend_overall'
    coeff_name = 'trend_overall_coeff'
    df_shop[trend_name] = np.nan
    df_shop[coeff_name] = np.nan

    for m in range(biweek_max - 1, 0, -1):
        train_idx = df_shop.biweek_id >= m
        test_idx = df_shop.biweek_id == (m - 1)

        df_train = df_shop[train_idx]
        y = df_train[target]
        not_null = ~y.isnull()
        if not_null.sum() <= 7:
            continue

        x = df_train.days_from_beginning
        x_not_null = x[not_null].values.reshape(-1, 1)
        y = y[not_null].values
        lr = Ridge(alpha=1).fit(x_not_null, y)

        if m == biweek_max - 1:
            x = x.values.reshape(-1, 1)
            df_shop.loc[train_idx, trend_name] = lr.predict(x)
            df_shop.loc[train_idx, coeff_name] = lr.coef_[0]

        df_test = df_shop[test_idx]
        x = df_test.days_from_beginning.values.reshape(-1, 1)
        df_shop.loc[test_idx, trend_name] = lr.predict(x)
        df_shop.loc[test_idx, coeff_name] = lr.coef_[0]
def add_window_trend_overall_features(df_shop, target='pays_count'):
    biweek_max = df_shop.biweek_id.max()

    for biweeks_past in [2, 3, 4, 5, 6, 12, 18]:
        trend_name = 'trend_%d' % biweeks_past
        trend_coef_name = 'trend_coef_%d' % biweeks_past
        df_shop[trend_name] = np.nan
        df_shop[trend_coef_name] = np.nan

        for m in range(biweek_max, biweeks_past, -1):
            m_past = m - biweeks_past
            train_idx = (df_shop.biweek_id >= m_past) & (df_shop.biweek_id <= m)
            test_idx = df_shop.biweek_id == (m_past - 1)

            df_rolling_train = df_shop[train_idx]
            df_rolling_test = df_shop[test_idx]

            y = df_rolling_train[target]
            not_null = ~y.isnull()
            if not_null.sum() <= 7:
                continue

            x = df_rolling_train.days_from_beginning
            x_not_null = x[not_null].values.reshape(-1, 1)
            y = y[not_null].values
            lr = Ridge(alpha=1).fit(x_not_null, y)

            if m == biweek_max:
                x = x.values.reshape(-1, 1)
                df_shop.loc[train_idx, trend_name] = lr.predict(x)
                df_shop.loc[train_idx, trend_coef_name] = lr.coef_[0]

            x_val = df_rolling_test.days_from_beginning.values.reshape(-1, 1)
            df_shop.loc[test_idx, trend_name] = lr.predict(x_val)
            df_shop.loc[test_idx, trend_coef_name] = lr.coef_[0]
def ridge_regressor(df):
    """
    INPUT: Pandas dataframe
    OUTPUT: R^2 and Mean Absolute Error performance metrics, feature coefficients
    """
    y = df.pop("price").values
    X = df.values
    feature_names = df.columns
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=0)
    clf = Ridge(alpha=1.0)
    clf.fit(xtrain, ytrain)
    score = clf.score(xtest, ytest)
    feat_imps = clf.coef_
    ypredict = clf.predict(xtest)
    mae = np.mean(np.absolute(ytest - ypredict))
    mae_percent = np.mean(np.absolute(ytest - ypredict) / ytest)
    return (
        "R^2 is ", score,
        "MAE is ", mae,  # the original returned an undefined `rmse`; report the computed MAE instead
        "MAE percent is ", mae_percent,
        "Feature coefficients are ", zip(feature_names, feat_imps),
    )
def ridge_regression(data, target, alphas):
    plt.figure()
    mean_rmses = []
    kf = KFold(len(target), 10, True, None)
    for alpha0 in alphas:
        rmses = []
        clf = Ridge(alpha=alpha0, normalize=True, solver='svd')
        for train_index, test_index in kf:
            data_train, data_test = data[train_index], data[test_index]
            target_train, target_test = target[train_index], target[test_index]
            clf.fit(data_train, target_train)
            rmse = sqrt(np.mean((clf.predict(data_test) - target_test) ** 2))
            rmses.append(rmse)
        mean_rmses.append(np.mean(rmses))
        x0 = np.arange(1, 11)
        plt.plot(x0, rmses, label='alpha=' + str(alpha0), marker='o')

    lr = linear_model.LinearRegression(normalize=True)
    rmses = []
    for train_index, test_index in kf:
        data_train, data_test = data[train_index], data[test_index]
        target_train, target_test = target[train_index], target[test_index]
        lr.fit(data_train, target_train)
        rmse = sqrt(np.mean((lr.predict(data_test) - target_test) ** 2))
        rmses.append(rmse)
    mean_rmses.append(np.mean(rmses))
    x0 = np.arange(1, 11)
    plt.plot(x0, rmses, label='linear', marker='*')

    plt.title("RMSE comparison between different alpha values of Ridge regularization")
    plt.legend()
    plt.show()
    # print(mean_rmses)
    return mean_rmses
def fit_strf_ridge(input, output, lags, alpha=1.0, verbose=False):
    # convert the input into a toeplitz-like matrix
    if verbose:
        nt, nf = input.shape
        nelems = nt * nf * len(lags)
        mem = (nelems * 8.) / 1024. ** 2
        print('[fit_strf_ridge] estimated size of toeplitz matrix: %d MB' % mem)
    stime = time.time()
    A = make_toeplitz(input, lags, include_bias=False)
    etime = time.time() - stime
    if verbose:
        print('[fit_strf_ridge] Time to make Toeplitz matrix: %d seconds' % etime)

    # fit the STRF
    stime = time.time()
    # rr = Ridge(alpha=alpha, copy_X=False, fit_intercept=True)
    rr = Ridge(alpha=alpha, fit_intercept=True)
    rr.fit(A, output)
    etime = time.time() - stime
    if verbose:
        print('[fit_strf_ridge] Time to fit STRF: %d seconds' % etime)

    # reshape the STRF so that it makes sense
    nt = input.shape[0]
    nf = input.shape[1]
    d = len(lags)
    strf = np.array(rr.coef_).reshape([nf, d])
    bias = rr.intercept_

    return strf, bias
cat_cols = all_columns[kinds == 'O']
num_pipe = Pipeline([('si', SimpleImputer(strategy='mean')),
                     ('ss', StandardScaler())])
ct = ColumnTransformer([('num', num_pipe, num_cols)])
X_num_tf = ct.fit_transform(train)
#------------------------------------------------------------------------------
# cat_pipe (the imputer + one-hot pipeline for the categorical columns) is
# assumed to be defined earlier in the original script
ct = ColumnTransformer([('cat', cat_pipe, cat_cols),
                        ('num', num_pipe, num_cols)])
X = ct.fit_transform(train)
#------------------------------------------------------------------------------
ml_pipe = Pipeline([('transform', ct), ('ridge', Ridge())])
ml_pipe.fit(train, y)
ml_pipe.score(train, y)
#-------------------------------------------------------------------------------
kf = KFold(n_splits=5, shuffle=True, random_state=123)
cross_val_score(ml_pipe, train, y, cv=kf).mean()
#-------------------------------------------------------------------------------
param_grid = {
    'transform__num__si__strategy': ['mean', 'median'],
    'ridge__alpha': [.001, 0.1, 1.0, 5, 10, 50, 100, 1000],
}
gs = GridSearchCV(ml_pipe, param_grid, cv=kf, return_train_score=True)
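# A short follow-on sketch: fitting the grid search defined above and reading
# off the winning configuration (these are standard scikit-learn attributes).
gs.fit(train, y)
print(gs.best_params_)
print(gs.best_score_)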
from sklearn.impute import SimpleImputer

numeric_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0)),
           ('scaler', StandardScaler())])

categorical_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='constant', fill_value='NA')),
           ('onehot', OneHotEncoder(handle_unknown='ignore'))])

ct = ColumnTransformer(
    transformers=[('numeric', numeric_transformer, continuous),
                  ('cats', categorical_transformer, cats)])

lr_pipe = make_pipeline(ct, LinearRegression())
ridge_pipe = make_pipeline(ct, Ridge())
lasso_pipe = make_pipeline(ct, Lasso())
elastic_pipe = make_pipeline(ct, ElasticNet())

### Fitting
# Only using subset of categorical variables
X_train[cats].dtypes
X_train = X_train.loc[:, continuous + cats]
X_train.shape
X_test.shape
X_test = X_test.loc[:, continuous + cats]
svr.fit(X_train_scaled, y_train)
svr_preds = svr.predict(X_val_scaled)
print('average error: $', round(np.sqrt(mean_squared_error(y_val, svr_preds)), 2), 'RMSE')
print('average percent error:', round(MAPE(y_val, svr_preds), 2), '%')

## Bayesian Model:
from sklearn.linear_model import BayesianRidge
br_reg = BayesianRidge()
br_reg.fit(X_train_scaled, y_train)
br_preds = br_reg.predict(X_val_scaled)
print('average error: $', round(np.sqrt(mean_squared_error(y_val, br_preds)), 2), 'RMSE')
print('average percent error:', round(MAPE(y_val, br_preds), 2), '%')

## Ridge Regression:
from sklearn.linear_model import Ridge
rid_reg = Ridge()
rid_reg.fit(X_train_scaled, y_train)
rid_preds = rid_reg.predict(X_val_scaled)
print('average error: $', round(np.sqrt(mean_squared_error(y_val, rid_preds)), 2), 'RMSE')
print('average percent error:', round(MAPE(y_val, rid_preds), 2), '%')

## Elastic Net:
from sklearn.linear_model import ElasticNet
en_reg = ElasticNet()
en_reg.fit(X_train_scaled, y_train)
en_preds = en_reg.predict(X_val_scaled)
print('average error: $', round(np.sqrt(mean_squared_error(y_val, en_preds)), 2), 'RMSE')
print('average percent error:', round(MAPE(y_val, en_preds), 2), '%')

# Plot the predictions:
import matplotlib.pyplot as plt
os.makedirs('./outputs', exist_ok=True)

boston_data = datasets.load_boston()

run = Run.get_context()
client = ExplanationClient.from_run(run)

X_train, X_test, y_train, y_test = train_test_split(boston_data.data,
                                                    boston_data.target,
                                                    test_size=0.2,
                                                    random_state=0)

alpha = 0.5
# Use Ridge algorithm to create a regression model
reg = Ridge(alpha)
model = reg.fit(X_train, y_train)
preds = reg.predict(X_test)

run.log('alpha', alpha)

model_file_name = 'ridge_{0:.2f}.pkl'.format(alpha)
# save model in the outputs folder so it automatically gets uploaded
with open(model_file_name, 'wb') as file:
    joblib.dump(value=reg, filename=os.path.join('./outputs/', model_file_name))

# Explain predictions on your local machine
tabular_explainer = TabularExplainer(model, X_train, features=boston_data.feature_names)
# Import necessary modules
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from ridge_display_plot import display_plot

# Setup the array of alphas and lists to store scores
alpha_space = np.logspace(-4, 0, 50)
ridge_scores = []
ridge_scores_std = []

# Create a ridge regressor: ridge
ridge = Ridge(normalize=True)

# Compute scores over range of alphas
for alpha in alpha_space:
    # Specify the alpha value to use: ridge.alpha
    ridge.alpha = alpha
    # Perform 10-fold CV: ridge_cv_scores
    ridge_cv_scores = cross_val_score(ridge, X, y, cv=10)
    # Append the mean of ridge_cv_scores to ridge_scores
    ridge_scores.append(np.mean(ridge_cv_scores))
    # Append the std of ridge_cv_scores to ridge_scores_std
    ridge_scores_std.append(np.std(ridge_cv_scores))

# Display the plot
display_plot(ridge_scores, ridge_scores_std)
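# `display_plot` comes from an external helper module; a plausible sketch of
# what such a helper might do (an assumption, not the actual helper) is to plot
# the mean CV score against alpha with a one-standard-error band:
import matplotlib.pyplot as plt

def display_plot_sketch(cv_scores, cv_scores_std):
    fig, ax = plt.subplots()
    ax.plot(alpha_space, cv_scores)
    std_error = np.array(cv_scores_std) / np.sqrt(10)  # 10-fold CV
    ax.fill_between(alpha_space,
                    np.array(cv_scores) + std_error,
                    np.array(cv_scores) - std_error, alpha=0.2)
    ax.set_xlabel('Alpha')
    ax.set_ylabel('CV Score +/- Std Error')
    ax.set_xscale('log')
    plt.show()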
train = sample_feature[continuous_feature_names + ['price']]
train_X = train[continuous_feature_names]
train_y = train['price']

# Apply a log(x+1) transform to the target to bring it closer to a normal distribution
train_y_ln = np.log(train_y + 1)

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

LinearRegressionModel = LinearRegression(normalize=True)
LinearRegressionModel = LinearRegressionModel.fit(train_X, train_y_ln)
RidgeModel = Ridge(normalize=True)
RidgeModel = RidgeModel.fit(train_X, train_y_ln)
LassoModel = Lasso().fit(train_X, train_y_ln)

# Non-linear models
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

DecisionTreeModel = DecisionTreeRegressor().fit(train_X, train_y_ln)
RandomForestModel = RandomForestRegressor().fit(train_X, train_y_ln)
GradientBoostingModel = GradientBoostingRegressor().fit(train_X, train_y_ln)

f = open('./models/LinearRegressionModel.pkl', 'xb')
def train_model(data, ridge_args):
    reg_model = Ridge(**ridge_args)
    reg_model.fit(data["train"]["X"], data["train"]["y"])
    return reg_model
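# A minimal usage sketch for train_model; the data-dict layout is inferred from
# the function body, and the ridge_args values are illustrative only.
import numpy as np
from sklearn.linear_model import Ridge

data = {"train": {"X": np.random.rand(20, 3), "y": np.random.rand(20)}}
model = train_model(data, {"alpha": 0.5, "fit_intercept": True})
print(model.coef_)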
plt.title("Relationship between RM and Price") plt.savefig('./RMxPRICE.png', dpi=400) plt.scatter(bos.PTRATIO, bos.PRICE) plt.xlabel("Pupil-to-Teacher Ratio (PTRATIO)") plt.ylabel("Housing Price") plt.title("Relationship between PTRATIO and Price") plt.savefig('./PTRATIOxPRICE.png', dpi=400) # We drop the price from the original dataset as it is the target X = data.drop('PRICE', axis = 1) # Train model from sklearn.linear_model import Ridge from sklearn.model_selection import GridSearchCV from sklearn.metrics import mean_squared_error model = Ridge() parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]} ridge_regressor = GridSearchCV(model, parameters, scoring = 'neg_mean_squared_error', cv = 5) ridge_regressor.fit(X, data.PRICE) # Checking for the error( in this case, Mean Squared Error) MSE = mean_squared_error(data.PRICE, model.predict(X)) print MSE
def get_stocks(st):
    # Find one record of data from the mongo database
    # @TODO: YOUR CODE HERE!
    session = Session(engine)
    stocks = session.execute("select * from stocks ")
    # return render_template("index.html", listings=listings)
    # Return template and data
    resdata = [{}]
    responsedata = {'respdata': resdata}
    session.close()
    print('Hello this is test')

    df = pd.read_csv("static/data/" + st + ".csv")
    # Drop the null columns where all values are null
    df = df.dropna(axis='columns', how='all')
    # Drop the null rows
    # This is for the MinMax Linear Regression model
    print(df.head())
    df = df.dropna()
    print(df.head())

    y = df["Open"].values.reshape(-1, 1)
    diff = df['Close'] - df["Open"]
    diff_locations = []
    for i in diff:
        if (i < 0):
            diff_locations.append(0)
        else:
            diff_locations.append(1)
    df['diff'] = pd.DataFrame(diff_locations)
    # X = df[['High', 'Low', 'Close', 'Volume', 'diff']]
    X = df[['High', 'Low', 'Close', 'Volume', 'diff']]
    print(X)
    print(y)
    print(X.shape, y.shape)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    X_minmax = MinMaxScaler().fit(X_train)
    y_minmax = MinMaxScaler().fit(y_train)
    X_train_minmax = X_minmax.transform(X_train)
    X_test_minmax = X_minmax.transform(X_test)
    y_train_minmax = y_minmax.transform(y_train)
    y_test_minmax = y_minmax.transform(y_test)

    model2 = LinearRegression()
    model2.fit(X_train_minmax, y_train_minmax)
    print(f"Testing Data Score: {model2.score(X_test_minmax, y_test_minmax)}")
    minmax_predict = model2.score(X_test_minmax, y_test_minmax)
    print(minmax_predict)

    # This is standard scaler transformation
    X_scaler = StandardScaler().fit(X_train)
    y_scaler = StandardScaler().fit(y_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    y_train_scaled = y_scaler.transform(y_train)
    y_test_scaled = y_scaler.transform(y_test)

    model = LinearRegression()
    model.fit(X_train_scaled, y_train_scaled)
    predictions = model.predict(X_test_scaled)
    scallar_MSE = mean_squared_error(y_test_scaled, predictions)
    scallar_r2 = model.score(X_test_scaled, y_test_scaled)

    plt.scatter(model.predict(X_train_scaled),
                model.predict(X_train_scaled) - y_train_scaled,
                c="blue", label="Training Data")
    plt.scatter(model.predict(X_test_scaled),
                model.predict(X_test_scaled) - y_test_scaled,
                c="orange", label="Testing Data")
    # plt.legend()
    plt.hlines(y=0, xmin=y_test_scaled.min(), xmax=y_test_scaled.max())
    plt.title("Residual Plot")
    # plt.show()
    pwd = os.getcwd()
    print(pwd)
    # p = Path(os.getcwd()+"\static\images")
    plt.savefig("static/images/" + st + ".png")
    f = open("static/images/" + st + ".png")
    plt.close()
    f.close()

    # Lasso model
    ### BEGIN SOLUTION
    lasso = Lasso(alpha=.01).fit(X_train_scaled, y_train_scaled)
    lasso_predictions = lasso.predict(X_test_scaled)
    lasso_MSE = mean_squared_error(y_test_scaled, lasso_predictions)
    lasso_r2 = lasso.score(X_test_scaled, y_test_scaled)
    ### END SOLUTION
    print(f"Lasso MSE: {lasso_MSE}, R2: {lasso_r2}")

    # Ridge model
    ridgeVal = Ridge(alpha=.01).fit(X_train_scaled, y_train_scaled)
    ridge_predictions = ridgeVal.predict(X_test_scaled)
    ridge_MSE = mean_squared_error(y_test_scaled, ridge_predictions)
    ridge_r2 = ridgeVal.score(X_test_scaled, y_test_scaled)
    print(f"ridge MSE: {ridge_MSE}, R2: {ridge_r2}")

    # elasticNet
    elasticnet = ElasticNet(alpha=.01).fit(X_train_scaled, y_train_scaled)
    elasticnet_predictions = elasticnet.predict(X_test_scaled)
    elasticnet_MSE = mean_squared_error(y_test_scaled, elasticnet_predictions)
    elasticnet_r2 = elasticnet.score(X_test_scaled, y_test_scaled)
    print(f"elasticnet MSE: {elasticnet_MSE}, R2: {elasticnet_r2}")

    fig1 = plt.figure(figsize=(12, 6))
    axes1 = fig1.add_subplot(1, 2, 1)
    axes2 = fig1.add_subplot(1, 2, 2)
    axes1.set_title("Original Data")
    axes2.set_title("Scaled Data")
    maxx = X_train["High"].max()
    maxy = y_train.max()
    axes1.set_xlim(-maxx + 1, maxx + 1)
    axes1.set_ylim(-maxy + 1, maxy + 1)
    axes2.set_xlim(-2, 2)
    axes2.set_ylim(-2, 2)
    set_axes(axes1)
    set_axes(axes2)
    axes1.scatter(X_train["High"], y_train)
    axes2.scatter(X_train_scaled[:, 0], y_train_scaled[:])
    p = Path(os.getcwd() + "/static/images")
    # q = p / "axes2"+st+".png"
    # if (q.exists()):
    fig1.savefig("static/images/axes2" + st + ".png")
    f = open("static/images/axes2" + st + ".png")
    plt.close()
    f.close()
    # else:
    #     fig1.savefig("static/images/axes2"+st+".png")
    #     plt.close()

    return render_template("indexStocks.html", stocks=stocks,
                           responsedata=responsedata, init_page="initpage",
                           sel_stk=st, minmax_predict=minmax_predict,
                           scallar_MSE=scallar_MSE, scallar_r2=scallar_r2,
                           lasso_MSE=lasso_MSE, lasso_r2=lasso_r2,
                           ridge_MSE=ridge_MSE, ridge_r2=ridge_r2,
                           elasticnet_MSE=elasticnet_MSE, elasticnet_r2=elasticnet_r2)
mean_cols = all_dummy_df.mean()
all_dummy_df = all_dummy_df.fillna(mean_cols)

numeric_cols = all_df.columns[all_df.dtypes != "object"]
numeric_cols_means = all_dummy_df.loc[:, numeric_cols].mean()
numeric_col_std = all_dummy_df.loc[:, numeric_cols].std()
all_dummy_df.loc[:, numeric_cols] = (all_dummy_df.loc[:, numeric_cols] - numeric_cols_means) / numeric_col_std

dummy_train_df = all_dummy_df.loc[train_df.index]
dummy_test_df = all_dummy_df.loc[test_df.index]

from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

X_train = dummy_train_df.values
X_test = dummy_test_df.values

alphas = np.logspace(-3, 2, 50)
test_scores = []
for alpha in alphas:
    clf = Ridge(alpha)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10,
                                          scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))

import matplotlib.pyplot as plt
# %matplotlib inline
# print(np.array(test_scores).shape)
plt.plot(alphas, test_scores)
plt.title("Alpha vs Error")
plt.show()

from sklearn.ensemble import RandomForestRegressor

max_features = [.1, .3, .5, .7, .9, .99]
test_scores = []
for max_feat in max_features:
    clf = RandomForestRegressor(n_estimators=200, max_features=max_feat)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=5,
                                          scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
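# A small follow-on sketch: reading the best max_features off the CV curve just
# computed (the scores are RMSEs, so lower is better); the same argmin pattern
# applies to the alpha sweep above.
best_max_feat = max_features[np.argmin(test_scores)]
print("best max_features:", best_max_feat)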
from sklearn.model_selection import GridSearchCV

if __name__ == "__main__":
    # read with pandas
    data = pd.read_csv('10.Advertising.csv')  # TV, Radio, Newspaper, Sales
    # x = data[['TV', 'Radio', 'Newspaper']]
    x = data[['TV', 'Radio']]
    y = data['Sales']
    # print(x)
    # print(y)

    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2017)
    # model = Lasso()
    model = Ridge()  # Ridge seems to give a slightly smaller MSE
    alpha_can = np.logspace(-3, 2, 10)
    lasso_model = GridSearchCV(model, param_grid={'alpha': alpha_can}, cv=5)
    lasso_model.fit(x_train, y_train)
    print('best hyperparameters:', lasso_model.best_params_)

    y_hat = lasso_model.predict(np.array(x_test))
    print('best model score:', lasso_model.score(x_test, y_test))
    mse = np.average((y_hat - np.array(y_test)) ** 2)  # Mean Squared Error
    rmse = np.sqrt(mse)  # Root Mean Squared Error
    print(mse, rmse)

    # t = np.arange(len(x_test))
    # mpl.rcParams['font.sans-serif'] = [u'simHei']
    # mpl.rcParams['axes.unicode_minus'] = False
    # plt.plot(t, y_test, 'r-', linewidth=2, label='ground truth')
def upload_get_stocks(st):
    # Find one record of data from the mongo database
    # @TODO: YOUR CODE HERE!
    # cr = csv.reader(open("https://query1.finance.yahoo.com/v7/finance/download/"+st+"?period1=1454112000&period2=1611964800&interval=1d&events=history&includeAdjustedClose=true", "rb"))
    # data = pd.read_csv('https://example.com/passkey=wedsmdjsjmdd')
    # df = pd.read_csv("static/data/"+st+".csv")
    # with open("static/data/"+st+".csv", "wt") as fp:
    #     writer = csv.writer(fp)
    #     # writer.writerow(["your", "header", "foo"])  # write header
    #     writer.writerows(data)
    # dateval = datetime.date.strtime("%D")
    # print(dateval)
    session = Session(engine)
    stock = session.execute("select * from stocks where symbol='" + st + "'")
    # return render_template("index.html", listings=listings)
    # Return template and data
    if (stock.rowcount == 0):
        data = pd.read_csv(
            "https://query1.finance.yahoo.com/v7/finance/download/" + st +
            "?period1=1454112000&period2=1611964800&interval=1d&events=history&includeAdjustedClose=true",
            sep=',')
        data.to_csv("static/data/" + st + ".csv", index=False, header=True)
        print(data)
        session.execute("INSERT INTO stocks VALUES ('" + st + "', '" + st + " Corp')")
        session.execute("commit")
    stocks = session.execute("select * from stocks")
    resdata = [{}]
    responsedata = {'respdata': resdata}
    session.close()
    print('Hello this is test')

    data = pd.read_csv("static/data/" + st + ".csv")
    df = data
    # Drop the null columns where all values are null
    df = df.dropna(axis='columns', how='all')
    # Drop the null rows
    # This is for the MinMax Linear Regression model
    print(df.head())
    df = df.dropna()
    print(df.head())

    y = df["Open"].values.reshape(-1, 1)
    diff = df['Close'] - df["Open"]
    diff_locations = []
    for i in diff:
        if (i < 0):
            diff_locations.append(0)
        else:
            diff_locations.append(1)
    df['diff'] = pd.DataFrame(diff_locations)
    # X = df[['High', 'Low', 'Close', 'Volume', 'diff']]
    X = df[['High', 'Low', 'Close', 'Volume', 'diff']]
    print(X)
    print(y)
    print(X.shape, y.shape)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    X_minmax = MinMaxScaler().fit(X_train)
    y_minmax = MinMaxScaler().fit(y_train)
    X_train_minmax = X_minmax.transform(X_train)
    X_test_minmax = X_minmax.transform(X_test)
    y_train_minmax = y_minmax.transform(y_train)
    y_test_minmax = y_minmax.transform(y_test)

    model2 = LinearRegression()
    model2.fit(X_train_minmax, y_train_minmax)
    print(f"Testing Data Score: {model2.score(X_test_minmax, y_test_minmax)}")
    minmax_predict = model2.score(X_test_minmax, y_test_minmax)
    print(minmax_predict)

    # This is standard scaler transformation
    X_scaler = StandardScaler().fit(X_train)
    y_scaler = StandardScaler().fit(y_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    y_train_scaled = y_scaler.transform(y_train)
    y_test_scaled = y_scaler.transform(y_test)

    model = LinearRegression()
    model.fit(X_train_scaled, y_train_scaled)
    predictions = model.predict(X_test_scaled)
    scallar_MSE = mean_squared_error(y_test_scaled, predictions)
    scallar_r2 = model.score(X_test_scaled, y_test_scaled)

    plt.scatter(model.predict(X_train_scaled),
                model.predict(X_train_scaled) - y_train_scaled,
                c="blue", label="Training Data")
    plt.scatter(model.predict(X_test_scaled),
                model.predict(X_test_scaled) - y_test_scaled,
                c="orange", label="Testing Data")
    # plt.legend()
    plt.hlines(y=0, xmin=y_test_scaled.min(), xmax=y_test_scaled.max())
    plt.title("Residual Plot")
    # plt.show()
    pwd = os.getcwd()
    print(pwd)
    # p = Path(os.getcwd()+"\static\images")
    plt.savefig("static/images/" + st + ".png")
    f = open("static/images/" + st + ".png")
    plt.close()
    f.close()

    # Lasso model
    ### BEGIN SOLUTION
    lasso = Lasso(alpha=.01).fit(X_train_scaled, y_train_scaled)
    lasso_predictions = lasso.predict(X_test_scaled)
    lasso_MSE = mean_squared_error(y_test_scaled, lasso_predictions)
    lasso_r2 = lasso.score(X_test_scaled, y_test_scaled)
    ### END SOLUTION
    print(f"Lasso MSE: {lasso_MSE}, R2: {lasso_r2}")

    # Ridge model
    ridgeVal = Ridge(alpha=.01).fit(X_train_scaled, y_train_scaled)
    ridge_predictions = ridgeVal.predict(X_test_scaled)
    ridge_MSE = mean_squared_error(y_test_scaled, ridge_predictions)
    ridge_r2 = ridgeVal.score(X_test_scaled, y_test_scaled)
    print(f"ridge MSE: {ridge_MSE}, R2: {ridge_r2}")

    # elasticNet
    elasticnet = ElasticNet(alpha=.01).fit(X_train_scaled, y_train_scaled)
    elasticnet_predictions = elasticnet.predict(X_test_scaled)
    elasticnet_MSE = mean_squared_error(y_test_scaled, elasticnet_predictions)
    elasticnet_r2 = elasticnet.score(X_test_scaled, y_test_scaled)
    print(f"elasticnet MSE: {elasticnet_MSE}, R2: {elasticnet_r2}")

    fig1 = plt.figure(figsize=(12, 6))
    axes1 = fig1.add_subplot(1, 2, 1)
    axes2 = fig1.add_subplot(1, 2, 2)
    axes1.set_title("Original Data")
    axes2.set_title("Scaled Data")
    maxx = X_train["High"].max()
    maxy = y_train.max()
    axes1.set_xlim(-maxx + 1, maxx + 1)
    axes1.set_ylim(-maxy + 1, maxy + 1)
    axes2.set_xlim(-2, 2)
    axes2.set_ylim(-2, 2)
    set_axes(axes1)
    set_axes(axes2)
    axes1.scatter(X_train["High"], y_train)
    axes2.scatter(X_train_scaled[:, 0], y_train_scaled[:])
    p = Path(os.getcwd() + "/static/images")
    # q = p / "axes2"+st+".png"
    # if (q.exists()):
    fig1.savefig("static/images/axes2" + st + ".png")
    f = open("static/images/axes2" + st + ".png")
    plt.close()
    f.close()
    # else:
    #     fig1.savefig("static/images/axes2"+st+".png")
    #     plt.close()

    return render_template("indexStocks.html", stocks=stocks,
                           responsedata=responsedata, init_page="initpage",
                           sel_stk=st, minmax_predict=minmax_predict,
                           scallar_MSE=scallar_MSE, scallar_r2=scallar_r2,
                           lasso_MSE=lasso_MSE, lasso_r2=lasso_r2,
                           ridge_MSE=ridge_MSE, ridge_r2=ridge_r2,
                           elasticnet_MSE=elasticnet_MSE, elasticnet_r2=elasticnet_r2)
'''
Variable encoding:
    class labels: LabelEncoder
    unordered categorical features: OneHotEncoder
    ordered discrete features: OrdinalEncoder
'''
titanic = titanic[['Sex', 'Age', 'Embarked', 'Pclass', 'Survived']]
titanic.head()

enc = OneHotEncoder(categories='auto')
oneHot_ret = enc.fit_transform(titanic[['Sex', 'Embarked']]).toarray()
print(enc.get_feature_names())
titanic_new = pd.concat(
    [titanic, pd.DataFrame(oneHot_ret, columns=enc.get_feature_names())],
    axis='columns').drop(['Sex', 'Embarked'], axis='columns')
titanic_new

x = pd.DataFrame(np.random.uniform(1, 20, 20).reshape((-1, 2)), columns=['x', 'y'])
bin = Binarizer(threshold=10)
x_bin = bin.fit_transform(x)

kbd = KBinsDiscretizer(n_bins=5, encode='onehot-dense', strategy='kmeans')
x_kbd = kbd.fit_transform(x)
print(x_kbd)
kbd.n_bins_
kbd.bin_edges_

from sklearn.linear_model import Ridge, Lasso, ElasticNet
Ridge()
            break
        model_list.append(model_n)
        model_collection = np.vstack((model_collection, X_tr[:, model_n]))

    print(model_list)
    print(cur_mn)
    print(len(model_list))

    # choose top 12 models
    model_list2 = model_list[0:12]
    test_fin = test[model_list2]
    train_fin = train[model_list2]

    # select model for stacking
    clf = Ridge(alpha=3.0)
    clf.fit(train_fin, y)
    pred1 = clf.predict(test_fin)
    pred1[pred1 < 1.] = 1.
    pred1[pred1 > 3.] = 3.

    # saved results
    pd.DataFrame({
        "id": id_test,
        "relevance": pred1
    }).to_csv(MODELS_DIR + "/submissions_ensemble_n_models_from_m_11_04_2016.csv",
              index=False)

    # X_new = train_fin
    # import statsmodels.api as sm
f.write(text_id + "," + top_imgs_str + "\n") if img_id in list(top_imgs): count += 1 print("count", count) train = pickle.load(open('train.pkl', 'rb')) test = pickle.load(open('test.pkl', 'rb')) print(train['captions'][0]) print(train['tags'][0]) # print(train['image_2048'].shape) print("=== fit model ===") ridge = Ridge() t = time.time() ridge.fit(train['image_2048'], train['captions']) print(time.time() - t) print("=== test training data ===") predict_caps = ridge.predict(train['image_2048']) print(predict_caps.shape) ranking(train['captions'], predict_caps, train['image_id'], 'training_test_answer.csv') print("=== test testing data ===") predict_caps = ridge.predict(test['image_2048']) print(predict_caps.shape) #ranking(predict_image_features, test['image_2048'], test['image_id'], 'answer.csv') ranking(test['captions'], predict_caps, test['image_id'], 'answer.csv')
def baggingMyLasso(trainX, trainY, train_prediction_start, testX, testY, test_prediction_start,
                   look_ahead, bag_size=47, Nestimators=50, samp_size=0.95, market=1, every=0,
                   lookbackMethod=1, bagModels_times=50, RandomLasso=False,
                   modelCombination='lassoLasso', otherFeats=False, top_ret_Feats=6, topFeats=3):
    '''
    :param modelCombination: only used when RandomLasso is True;
        one of 'lassoLasso', 'lassoRidge', 'LassoSimpleLinear'
    :return:
    '''
    # If RandomLasso=True: we choose variables that appear in at least 60% of the
    # models, use 47 of the estimators to do prediction, and repeat 50 times.
    # Computing the best alpha for lasso:
    # alphas = np.linspace(0.005, 1, 1000)
    model = LassoCV(cv=10, eps=0.0001, fit_intercept=True, normalize=False,
                    random_state=None).fit(trainX, trainY)
    # note: the original used an undefined `lookahead`; the parameter is `look_ahead`
    print('Best Lasso alpha for lookahead %d: ' % look_ahead, model.alpha_)
    if RandomLasso:
        svr = BaggingRegressor(Lasso(alpha=model.alpha_, fit_intercept=True,
                                     normalize=False, random_state=None),
                               n_estimators=200, max_samples=0.95, bootstrap=False,
                               random_state=None, n_jobs=-1)
        svr = svr.fit(trainX, trainY)
        ###################################
        coef_matrix = np.zeros((Nestimators, trainX.shape[1]))
        for i in range(Nestimators):
            coef_matrix[i, :] = \
                [1 if svr.estimators_[i].coef_[j] != 0 else 0 for j in range(trainX.shape[1])]
        coef_freq = np.sum(coef_matrix, axis=0) / Nestimators
        # num_coefs = trainX.shape[1] + 1
        # plt.bar(np.arange(1, num_coefs), coef_freq, color='skyblue')
        # plt.ylabel('Freq selected')
        # plt.xlabel('Feature')
        # plt.ylim(0, 1.3)
        # plt.text(2, 1.1, 'lookahead %d' % look_ahead)
        # plt.xlim(0, num_coefs)
        # plt.show()
        if not otherFeats:
            ret_feat_inds = [i for i in range(len(coef_freq)) if coef_freq[i] >= 0.60]
            print('Number of Features with more than 60 percent frequency: ', len(ret_feat_inds))
            if len(ret_feat_inds) >= top_ret_Feats:
                trainX = trainX[:, ret_feat_inds]
                testX = testX[:, ret_feat_inds]
            else:
                print('No variable meets the frequency requirement! choosing the %d most frequent!' % topFeats)
                order = coef_freq.argsort()
                trainX = trainX[:, order[-top_ret_Feats:]]
                testX = testX[:, order[-top_ret_Feats:]]
        else:
            # `lookback` is assumed to be defined at module level in the original script
            ret_feat_freqs = coef_freq[:lookback]
            otherFeat_freqs = coef_freq[lookback:]
            o_F_inds = np.arange(lookback, trainX.shape[1], 1)
            ret_feat_inds = np.array([i for i in range(len(ret_feat_freqs))
                                      if ret_feat_freqs[i] >= 0.60])
            other_feat_inds = np.array([i for i in range(lookback, len(coef_freq), 1)
                                        if coef_freq[i] >= 0.60])
            print('Number of return Features with more than 60 percent frequency: ', len(ret_feat_inds))
            print('Number of Other Features with more than 60 percent frequency: ', len(other_feat_inds))
            if len(ret_feat_inds) < top_ret_Feats:
                ret_order = ret_feat_freqs.argsort()
                ret_order = ret_order[-top_ret_Feats:]
            else:
                ret_order = ret_feat_inds
            print(ret_order, 'return feat indices')
            if len(other_feat_inds) < topFeats:
                otherFeat_order = otherFeat_freqs.argsort()
                o_F_inds = o_F_inds[otherFeat_order]
                otherFeat_order = o_F_inds[-topFeats:]
            else:
                otherFeat_order = other_feat_inds
            print(otherFeat_order, 'other feat indices')
            ret_order = list(ret_order)
            ret_order.extend(list(otherFeat_order))
            trainX = trainX[:, ret_order]
            testX = testX[:, ret_order]
        print(trainX.shape, testX.shape)

    if modelCombination == 'lassoLasso':
        # alphas = np.linspace(0, 1, 200)
        model = LassoCV(cv=10, eps=0.0001, fit_intercept=True, normalize=False,
                        random_state=None).fit(trainX, trainY)
        print('Best alpha for lookahead %d: after feature Selection!' % look_ahead, model.alpha_)
        svr = BaggingRegressor(Lasso(alpha=model.alpha_, fit_intercept=True,
                                     normalize=False, random_state=None),
                               n_estimators=50, max_samples=0.95, bootstrap=False,
                               random_state=None, n_jobs=-1)
    elif modelCombination == 'LassoSimpleLinear':
        svr = BaggingRegressor(LinearRegression(fit_intercept=True, normalize=False),
                               n_estimators=50, max_samples=0.95, bootstrap=False,
                               random_state=None, n_jobs=-1)
    elif modelCombination == 'LassoRidge':
        alphas = np.linspace(0.0001, 10, 200)
        # cv=None: a form of leave-one-out CV!
        model_CV = RidgeCV(cv=None, alphas=alphas, fit_intercept=True,
                           normalize=False).fit(trainX, trainY)
        print('Best Ridge alpha for lookahead %d: after feature Selection - Lasso Ridge!'
              % look_ahead, model_CV.alpha_)
        svr = BaggingRegressor(Ridge(alpha=model_CV.alpha_, normalize=False, fit_intercept=True),
                               n_estimators=Nestimators, max_samples=samp_size, bootstrap=False,
                               random_state=None, n_jobs=-1)
    else:
        svr = BaggingRegressor(Lasso(alpha=model.alpha_, fit_intercept=True,
                                     normalize=False, random_state=None),
                               n_estimators=50, max_samples=0.95, bootstrap=False,
                               random_state=None, n_jobs=-1)

    svr = svr.fit(trainX, trainY)
    ##
    coef_matrix = np.zeros((Nestimators, trainX.shape[1]))
    for i in range(Nestimators):
        coef_matrix[i, :] = [1 if svr.estimators_[i].coef_[j] != 0 else 0
                             for j in range(trainX.shape[1])]
    coef_freq = np.sum(coef_matrix, axis=0) / Nestimators
    num_coefs = trainX.shape[1] + 1
    # plt.bar(np.arange(1, num_coefs), coef_freq, color='skyblue')
    # plt.ylabel('Freq selected')
    # plt.xlabel('Feature')
    # plt.ylim(0, 1.3)
    # plt.text(2, 1.1, 'lookahead %d' % look_ahead)
    # plt.xlim(0, num_coefs)
    # plt.show()
    ###
    colnamesRaw = ['dtStart']
    cln = [i for i in range(1, Nestimators * 2 + 3, 1)]
    colnamesRaw.extend(cln)
    colnamesBagged = ['dtStart']
    cln1 = [i for i in range(1, bagModels_times * 2 + 3, 1)]
    colnamesBagged.extend(cln1)

    # (date, trainY, true_lab, pred_labs...)
    trainRs = np.zeros((trainX.shape[0], bagModels_times * 2 + 3))
    trainRs_raw = np.zeros((trainX.shape[0], Nestimators * 2 + 3))
    trainRs[:, 0] = train_prediction_start
    trainRs_raw[:, 0] = train_prediction_start
    trainRs[:, 1] = trainY
    trainRs_raw[:, 1] = trainY
    trainRs[:, 2] = [1 if trainY[i] > 0 else 0 for i in range(len(trainY))]
    trainRs_raw[:, 2] = [1 if trainY[i] > 0 else 0 for i in range(len(trainY))]
    #
    testRs = np.zeros((testX.shape[0], bagModels_times * 2 + 3))
    testRs_raw = np.zeros((testX.shape[0], Nestimators * 2 + 3))
    testRs[:, 0] = test_prediction_start
    testRs_raw[:, 0] = test_prediction_start
    testRs[:, 1] = testY
    testRs_raw[:, 1] = testY
    testRs[:, 2] = [1 if testY[i] > 0 else 0 for i in range(len(testY))]
    testRs_raw[:, 2] = [1 if testY[i] > 0 else 0 for i in range(len(testY))]

    for i in range(Nestimators):
        trainRs_raw[:, i + 3] = svr.estimators_[i].predict(trainX)
        testRs_raw[:, i + 3] = svr.estimators_[i].predict(testX)
        trainRs_raw[:, i + Nestimators + 3] = \
            [1 if trainRs_raw[j, i + 3] > 0 else 0 for j in range(len(trainRs_raw[:, i + 3]))]
        testRs_raw[:, i + Nestimators + 3] = \
            [1 if testRs_raw[j, i + 3] > 0 else 0 for j in range(len(testRs_raw[:, i + 3]))]

    # aggregating results!
    model_inds = [j for j in range(3, Nestimators + 3)]
    # print(model_inds)
    for i in range(bagModels_times):
        index_modelstoUse = np.random.choice(model_inds, bag_size, replace=False)
        tmp_train = trainRs_raw[:, index_modelstoUse]
        tmp_test = testRs_raw[:, index_modelstoUse]
        trainRs[:, i + 3] = np.sum(tmp_train, axis=1)
        testRs[:, i + 3] = np.sum(tmp_test, axis=1)
        trainRs[:, i + bagModels_times + 3] = \
            [1 if trainRs[j, i + 3] > 0 else 0 for j in range(len(trainRs[:, i + 3]))]
        testRs[:, i + bagModels_times + 3] = \
            [1 if testRs[j, i + 3] > 0 else 0 for j in range(len(testRs[:, i + 3]))]

    trainRs = pd.DataFrame(trainRs, columns=colnamesBagged)
    trainRs_raw = pd.DataFrame(trainRs_raw, columns=colnamesRaw)
    testRs = pd.DataFrame(testRs, columns=colnamesBagged)
    testRs_raw = pd.DataFrame(testRs_raw, columns=colnamesRaw)

    if every == 0:
        trainRs.to_csv('%d_%d_%d_Lasso_train.csv' % (market, lookbackMethod, look_ahead), index=False)
        trainRs_raw.to_csv('%d_%d_%d_Lasso_train_Raw.csv' % (market, lookbackMethod, look_ahead), index=False)
        testRs.to_csv('%d_%d_%d_Lasso_test.csv' % (market, lookbackMethod, look_ahead), index=False)
        testRs_raw.to_csv('%d_%d_%d_Lasso_test_Raw.csv' % (market, lookbackMethod, look_ahead), index=False)
    else:
        trainRs.to_csv('%d_%d_%d_every_%d_Lasso_train.csv' % (market, lookbackMethod, look_ahead, every), index=False)
        trainRs_raw.to_csv('%d_%d_%d_every_%d_Lasso_train_Raw.csv' % (market, lookbackMethod, look_ahead, every), index=False)
        testRs.to_csv('%d_%d_%d_every_%d_Lasso_test.csv' % (market, lookbackMethod, look_ahead, every), index=False)
        testRs_raw.to_csv('%d_%d_%d_every_%d_Lasso_test_Raw.csv' % (market, lookbackMethod, look_ahead, every), index=False)

    return coef_matrix, coef_freq
# --------------
from sklearn.linear_model import Lasso

# Code starts here
lasso = Lasso()
lasso.fit(X_train, y_train)
lasso_pred = lasso.predict(X_test)
r2_lasso = r2_score(y_test, lasso_pred)
print("r2_lasso", r2_lasso)

# --------------
from sklearn.linear_model import Ridge

# Code starts here
ridge = Ridge()
ridge.fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)
r2_ridge = r2_score(y_test, ridge_pred)
print("r2_ridge", r2_ridge)
# Code ends here

# --------------
from sklearn.model_selection import cross_val_score

# Code starts here
regressor = LinearRegression()
score = cross_val_score(regressor, X_train, y_train, cv=10)
def construct_model(self, param, ifbest_params=0):
    model_name = self.model_name
    if model_name == "randomforest":
        int_params = ['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf']
        for item in int_params:
            param[item] = int(param[item])
        self.log.add(param, 1)
        model = RandomForestRegressor(
            n_estimators=param['n_estimators'],
            max_depth=param['max_depth'],
            min_samples_split=param['min_samples_split'],
            min_samples_leaf=param['min_samples_leaf'])
    elif model_name == 'gbregressor':
        int_params = ['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf']
        for item in int_params:
            param[item] = int(param[item])
        self.log.add(param, 1)
        model = GradientBoostingRegressor(
            learning_rate=param['learning_rate'],
            n_estimators=param['n_estimators'],
            max_depth=param['max_depth'],
            subsample=param['subsample'],
            min_samples_split=param['min_samples_split'],
            min_samples_leaf=param['min_samples_leaf'])
    elif model_name == 'xgbregressor':
        int_params = ['max_depth', 'num_round']
        for item in int_params:
            param[item] = int(param[item])
        self.log.add(param, 1)
        model = XGBRegressor(n_estimators=param['num_round'],
                             objective=param_xgb_space['objective'],
                             learning_rate=param['eta'],
                             gamma=param['gamma'],
                             min_child_weight=param['min_child_weight'],
                             max_depth=param['max_depth'],
                             subsample=param['subsample'],
                             colsample_bytree=param['colsample_bytree'],
                             seed=param_xgb_space['seed'],
                             nthread=param_xgb_space['nthread'])
    elif model_name == 'lasso':
        model = Lasso(alpha=param['alpha'],
                      random_state=param_lasso_space['random_state'])
    elif model_name == 'ridge':
        model = Ridge(alpha=param['alpha'],
                      random_state=param_ridge_space['random_state'])
    elif model_name == 'svr':
        if ifbest_params == 0:
            model = SVR(C=param['C'], gamma=param['gamma'], degree=param['degree'],
                        epsilon=param['epsilon'], kernel=param['kernel'])
        else:
            if param['kernel'] == 0:
                cur_kernel = 'rbf'
            else:
                assert param['kernel'] == 1
                cur_kernel = 'poly'
            model = SVR(C=param['C'], gamma=param['gamma'], degree=param['degree'],
                        epsilon=param['epsilon'], kernel=cur_kernel)
    return model
def main():
    boston = load_boston()
    X, y = shuffle(boston.data, boston.target, random_state=13)
    X = X.astype(np.float32)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

    # Note that the hyperparameter spaces are defined here, during the pipeline
    # definition, but they could also be set within the classes at their
    # definition (when using custom classes), or defined after declaring the
    # pipeline using a flat or nested dict.
    p = Pipeline([
        AddFeatures([
            SKLearnWrapper(
                PCA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})),
            SKLearnWrapper(
                FastICA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})),
        ]),
        ModelStacking(
            [
                SKLearnWrapper(
                    GradientBoostingRegressor(),
                    HyperparameterSpace({
                        "n_estimators": RandInt(50, 600),
                        "max_depth": RandInt(1, 10),
                        "learning_rate": LogUniform(0.07, 0.7)
                    })),
                SKLearnWrapper(
                    KMeans(),
                    HyperparameterSpace({"n_clusters": RandInt(5, 10)})),
            ],
            joiner=NumpyTranspose(),
            judge=SKLearnWrapper(
                Ridge(),
                HyperparameterSpace({
                    "alpha": LogUniform(0.7, 1.4),
                    "fit_intercept": Boolean()
                })),
        )
    ])

    print("Meta-fitting on train:")
    p = p.meta_fit(X_train,
                   y_train,
                   metastep=RandomSearch(
                       n_iter=10,
                       higher_score_is_better=True,
                       validation_technique=KFoldCrossValidation(
                           scoring_function=r2_score, k_fold=10)))
    # Here is an alternative way to do it, more "pipeliney":
    # p = RandomSearch(
    #     p,
    #     n_iter=15,
    #     higher_score_is_better=True,
    #     validation_technique=KFoldCrossValidation(scoring_function=r2_score, k_fold=3)
    # ).fit(X_train, y_train)

    print("")
    print("Transforming train and test:")
    y_train_predicted = p.transform(X_train)
    y_test_predicted = p.transform(X_test)

    print("")
    print("Evaluating transformed train:")
    # r2_score expects (y_true, y_pred), in that order.
    score_transform = r2_score(y_train, y_train_predicted)
    print('R2 regression score:', score_transform)

    print("")
    print("Evaluating transformed test:")
    score_test = r2_score(y_test, y_test_predicted)
    print('R2 regression score:', score_test)
# Grid Search for Algorithm Tuning import numpy as np from sklearn import datasets from sklearn.linear_model import Ridge from sklearn.model_selection import GridSearchCV # load the diabetes datasets dataset = datasets.load_diabetes() # prepare a range of alpha values to test alphas = np.array([1, 0.1, 0.01, 0.001, 0.0001, 0]) # create and fit a ridge regression model, testing each alpha model = Ridge() grid = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas)) grid.fit(dataset.data, dataset.target) print(grid) # summarize the results of the grid search print(grid.best_score_) print(grid.best_estimator_.alpha)
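When only alpha is tuned, RidgeCV is a more direct alternative to GridSearchCV; a minimal sketch (my addition, reusing the `dataset` loaded above and dropping alpha=0, since an unregularized ridge is just least squares):

from sklearn.linear_model import RidgeCV

# Efficient leave-one-out CV over a strictly positive alpha grid.
ridge_cv = RidgeCV(alphas=[1, 0.1, 0.01, 0.001, 0.0001])
ridge_cv.fit(dataset.data, dataset.target)
print(ridge_cv.alpha_)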
from sklearn.linear_model import LinearRegression regressor = LinearRegression() regressor.fit(X_train, y_train) # # Ridge Regression # In[41]: #ridge regression from sklearn.linear_model import Ridge from sklearn.model_selection import GridSearchCV # In[44]: ridge = Ridge() parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-2, 1, 5, 10, 20, 30, 35, 40]} ridge_regressor = GridSearchCV(ridge, parameters, scoring='neg_mean_squared_error', cv=5) ridge_regressor.fit(X_train, y_train) # In[45]: print(ridge_regressor.best_params_) print(ridge_regressor.best_score_) # In[46]: predictions = ridge_regressor.predict(X_test)
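Because the scorer above is neg_mean_squared_error, best_score_ is a negative MSE; a hedged one-liner (my addition) recovers an RMSE-scale number from it:

import numpy as np

# best_score_ is the cross-validated (negative) MSE of the best alpha.
rmse_cv = np.sqrt(-ridge_regressor.best_score_)
print("CV RMSE of best alpha:", rmse_cv)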
def LinearRegression(data1, y):
    """Despite its name, this runs 5-fold CV over Ridge alphas and reports RMSE."""
    X_train, X_test, y_train, y_test = train_test_split(
        data1, y, test_size=0.2, random_state=Hcurstate)
    X_train_new = X_train.reset_index(drop=True)
    y_train_new = y_train.reset_index(drop=True)
    X_train_new = X_train_new.values
    y_train_new = y_train_new.values

    k = 5
    # shuffle=True is required for random_state to have any effect in KFold.
    kf = KFold(n_splits=k, shuffle=True, random_state=Hcurstate)
    val_arr = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]
    avgsc_lst, avgsc_train_lst, avgsc_hld_lst = [], [], []
    avgsc, avgsc_train, avgsc_hld = 0, 0, 0

    for train_index, test_index in kf.split(X_train_new):
        X_train_cur, X_test_cur = X_train_new[train_index], X_train_new[
            test_index]
        y_train_cur, y_test_cur = y_train_new[train_index], y_train_new[
            test_index]
        X_train_train, X_val, y_train_train, y_val = train_test_split(
            X_train_cur, y_train_cur, test_size=0.25, random_state=Hcurstate)

        # Pick the alpha with the lowest validation RMSE.
        bestPerformingModel = Ridge(alpha=1.0, random_state=Hcurstate)
        bestscore = float('inf')
        for val in val_arr:
            clf = Ridge(alpha=val, random_state=Hcurstate)
            clf = clf.fit(X_train_train, y_train_train)
            y_pred = clf.predict(X_val)
            sc = sqrt(mean_squared_error(y_pred, y_val))
            if bestscore > sc:
                bestscore = sc
                bestPerformingModel = clf

        # RMSE of the selected model on the fold's train and test splits,
        # and on the held-out test set.
        y_pred = bestPerformingModel.predict(X_train_cur)
        bscr_train = sqrt(mean_squared_error(y_pred, y_train_cur))
        y_pred = bestPerformingModel.predict(X_test_cur)
        bscr = sqrt(mean_squared_error(y_pred, y_test_cur))
        y_pred = bestPerformingModel.predict(X_test)
        bscr_hld = sqrt(mean_squared_error(y_pred, y_test))

        avgsc_train_lst.append(bscr_train)
        avgsc_lst.append(bscr)
        avgsc_hld_lst.append(bscr_hld)
        avgsc_train = avgsc_train + bscr_train
        avgsc = avgsc + bscr
        avgsc_hld = avgsc_hld + bscr_hld

    print('5-fold Train, Validation, and Test loss:')
    print(avgsc_train_lst)
    print(avgsc_lst)
    print(avgsc_hld_lst)
    print('Avg Train, Validation, and Test loss:')
    print(avgsc_train / k)
    print(avgsc / k)
    print(avgsc_hld / k)
    return avgsc_train_lst, avgsc_lst, avgsc_hld_lst
preprocessor = make_column_transformer(
    (OneHotEncoder(drop='if_binary'), categorical_columns),
    remainder='passthrough')

##############################################################################
# To describe the dataset as a linear model we use a ridge regressor with
# very small regularization, modeling the logarithm of the WAGE.

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.compose import TransformedTargetRegressor

model = make_pipeline(
    preprocessor,
    TransformedTargetRegressor(regressor=Ridge(alpha=1e-10),
                               func=np.log10,
                               inverse_func=sp.special.exp10))

##############################################################################
# Processing the dataset
# ----------------------
#
# First, we fit the model.

_ = model.fit(X_train, y_train)

##############################################################################
# Then we check the performance of the computed model by plotting its
# predictions on the test set and computing, for example, the median absolute
# error of the model.
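The evaluation code itself is not shown above; a minimal hedged sketch (assuming `X_test` and `y_test` come from the same split as `X_train`/`y_train`):

from sklearn.metrics import median_absolute_error

# Median absolute error of the fitted pipeline on the held-out set.
y_pred = model.predict(X_test)
print("Median absolute error:", median_absolute_error(y_test, y_pred))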
def main():
    np.random.seed(42)
    x = np.arange(0, np.pi, 0.01)
    err = np.random.randn(len(x))
    y = np.sin(x) + (x / 3)**2 + 0.1 * err
    #y = np.sin(x) + 0.1 * err
    #X = np.matrix(np.column_stack([x ** i for i in range(2, 5)] + [np.sin(x)]))
    #X = np.matrix(np.column_stack([x ** i for i in range(2, 6)]))
    X = np.matrix(np.column_stack([phi(j / 10, 0.1, x) for j in range(31)]))
    Xb = sm.add_constant(X)
    Y = np.matrix(y).T
    X_s = standardize(X)

    # ==============================================================================
    #  OLS, Ridge, Lasso
    # ==============================================================================
    ols = OLSEstimator(Xb, Y)
    ols.estimate()
    ols.test()

    ridge = Ridge(alpha=10).fit(X, Y)
    y_ridge = ridge.predict(X)

    lasso = Lasso(alpha=1e-2).fit(X, Y)
    y_lasso = lasso.predict(X)

    # ==============================================================================
    #  PCR
    # ==============================================================================
    u, d, vt = np.linalg.svd(X_s, full_matrices=False)
    print(d)
    v = vt.T

    M = 31
    p = v.shape[1]
    print(p)

    z = []
    theta = []
    s = np.zeros((Y.shape[0], 1))
    Y_c = center(Y)
    # Project onto each principal direction, regress Y on the scores, and
    # accumulate the rank-M reconstruction.
    for i in range(M):
        z_m = X_s * v[:, i]  # N x 1
        z.append(z_m)
        theta_m = (z_m.T * Y_c / (z_m.T * z_m))[0, 0]
        theta.append(theta_m)
        s += theta_m * z_m

    y_pcr = s + Y.mean(axis=0)

    # ==============================================================================
    #  Plot
    # ==============================================================================
    plt.rc('text', usetex=True)
    plt.rc('font', family='serif')

    # Prepare Plot
    plt.figure(figsize=(10, 6), dpi=300)
    plt.title(r"Gaussian OLS, Ridge, Lasso \& PCR", fontsize=16)
    plt.xlabel(r'$x$', fontsize=14)
    plt.ylabel(r'$y$', fontsize=14)

    # Plot with Legends
    plt.scatter(x, y, color="blue", alpha=0.1, label=r'Data')
    plt.plot(x, np.asarray(ols.y_hat).ravel(), color='r', alpha=0.7, label=r'OLS')
    plt.plot(x, y_ridge, color='g', alpha=0.7, label=r'Ridge')
    plt.plot(x, y_lasso, color='purple', alpha=0.7, label=r'Lasso')
    plt.plot(x, y_pcr, color='black', label=r'PCR(' + str(M) + ')')

    # Other options
    plt.legend(fontsize=12)
    plt.savefig("pcr_gaussian_" + str(M) + ".png", dpi=300)
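As a cross-check (my addition, not part of the original script), PCR with all M components can be reproduced with a scikit-learn pipeline, assuming StandardScaler matches the script's standardize helper:

from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# PCR = standardize, project onto the leading principal components, then OLS.
pcr = make_pipeline(StandardScaler(), PCA(n_components=31), LinearRegression())
pcr.fit(np.asarray(X), y)  # np.asarray: sklearn does not accept np.matrix
y_pcr_check = pcr.predict(np.asarray(X))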
coefficient_picks("linear_regr") regression_results(y_test, y_pred, "linear_regr") df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}).apply(np.exp) print("Linear", df) model = sm.OLS(y_test, X_test).fit() print(model.summary()) # model = sm.OLS(y_train, X_train[:]).fit() # MSEs = cross_val_score(linear_regr, X_train, y_train, scoring='neg_mean_squared_error', cv=5) # ------------------------Ridge------------------------------------------------------ ridge = Ridge() parameters = {'alpha': [1e-14, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]} ridge_regr = Ridge(alpha=0.001, normalize=True, tol=1) ridge_regr.fit(X_train, y_train) y_pred = ridge_regr.predict(X_test) coefficient_picks("ridge_regr") regression_results(y_test, y_pred, "ridge_regr") df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}).apply(np.exp) print("Ridge ", df) #ridge_regr = GridSearchCV(ridge, parameters, scoring = 'mean_squared_error', cv=5) #print("Ridge best params = ", ridge_regr.best_params_) #print("Ridge best score = ", ridge_regr.best_score_)
rmse = np.sqrt((((y_hat - y_true)**2).sum() / len(y_true)))
rmse, mse

plt.scatter(y_true, y_hat, s=10)
plt.xlabel("Prices: $Y_i$")
plt.ylabel(r"Predicted prices: $\hat{Y}_i$")
plt.title("Prices vs. predicted prices")

# Linear Regression with Ridge & Lasso regression
# 1. Import the Lasso and Ridge modules
from sklearn.linear_model import Lasso, Ridge

# Split into X_train, X_test, y_train, y_test: test size 33%, random state 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Ridge regression with an intercept and alpha = 0.5
ridge = Ridge(fit_intercept=True, alpha=0.5)

# Fit on the training data.
ridge.fit(X_train, y_train)

# Predict with ridge and store the result in y_hat.
y_hat = ridge.predict(X_test)
y_true = y_test

# Store the MSE using sklearn's built-in function.
mse = sklearn.metrics.mean_squared_error(y_hat, y_true)

# Compute the RMSE directly from its formula.
rmse = np.sqrt((((y_hat - y_true)**2).sum() / len(y_true)))

# Display
rmse, mse

# Scatter plot of true vs. predicted prices, with axis labels and a title.
plt.scatter(y_true, y_hat, s=10)
plt.xlabel("Prices: $Y_i$")
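As an aside (assuming scikit-learn >= 0.22, where the flag was introduced), the manual RMSE formula above has a built-in equivalent:

# Equivalent to the manual formula: RMSE via the squared=False flag.
rmse = sklearn.metrics.mean_squared_error(y_true, y_hat, squared=False)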
def ridge(X, y, alpha=10):
    """Fit and return a Ridge regressor with the given regularization strength."""
    return Ridge(alpha=alpha).fit(X, y)
def ridge_model():
    from sklearn.linear_model import Ridge
    parameter = ' Ridge_model '
    model = Ridge(alpha=0.01)  # alternative alpha: 0.5
    return model, parameter
# In[8]:
from sklearn.model_selection import GridSearchCV

# Lasso requires strictly positive alphas; search a positive grid.
alphas = np.linspace(1e-4, 1, 200)
lasso_grid = GridSearchCV(Lasso(), param_grid={'alpha': alphas}, cv=3)
lasso_grid.fit(X_train, y)
best_lasso = lasso_grid.best_estimator_
print(lasso_grid.best_params_)

# In[9]:
ridge_alphas = np.linspace(1, 10, 200)
ridge_grid = GridSearchCV(Ridge(), param_grid={'alpha': ridge_alphas}, cv=3)
ridge_grid.fit(X_train, y)
best_ridge = ridge_grid.best_estimator_
print(ridge_grid.best_params_)

# In[10]:
def get_price_est_grid(model, X_train, X_test, y_train, alpha=0.1):
    # alpha is unused here; the model arrives already configured.
    price_model = model
    price_model.fit(X_train, y_train)
    y_hat = np.expm1(price_model.predict(X_test))
    return pd.DataFrame(y_hat, range(1461, 1461 + 1459), ['SalePrice'])
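A hedged usage example of the helper above (the output file name is my assumption; best_ridge and the 1461-based index come from the cells above):

# Kaggle-style submission from the tuned ridge model.
submission = get_price_est_grid(best_ridge, X_train, X_test, y)
submission.to_csv('ridge_submission.csv', index_label='Id')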
4) Median Income has by far the strongest correlation with the target
variable. Intuitively, we may feel it is safe to assume the relationship

Higher Median Income => Higher Median House Price

and our intuition may indeed be correct, but it is important to remember the
old statistical saying that "correlation does NOT imply causation".

5) Average Rooms also shows some degree of correlation with the target
variable (potential multicollinearity; point 2 of the list), and Latitude has
a higher degree of correlation than Longitude.

These are all relationships we may want to think about and analyse, both
statistically and intuitively.
"""

# Creating the basic regression models for which grid search and cross
# validation will be used; a sketch of the search itself follows below.
linearReg = LinearRegression()
lassoReg = Lasso()
ridgeReg = Ridge()
elasticReg = ElasticNet()

parameters = {'alpha': np.linspace(0.1, 10, 50)}
paramElastic = {
    'alpha': np.linspace(0.1, 10, 50),
    'l1_ratio': np.linspace(0.01, 1, 10)
}

n_folds = 10
r2_scores = []
reg_names = ["Linear", "Lasso", "Ridge", "Elastic", "Catboost"]
results = {}

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=24)
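A minimal hedged sketch of the cross-validated search these objects set up (the loop itself is my assumption; only linearReg, lassoReg, ridgeReg, elasticReg, parameters, paramElastic, n_folds, r2_scores, and the split above come from the original):

from sklearn.model_selection import GridSearchCV, cross_val_score

# Linear regression has no alpha to tune; score it with plain CV.
lin_scores = cross_val_score(linearReg, X_train, y_train, cv=n_folds, scoring='r2')
r2_scores.append(lin_scores.mean())

# Grid-search the regularized models over their parameter grids.
for reg, grid in [(lassoReg, parameters), (ridgeReg, parameters),
                  (elasticReg, paramElastic)]:
    search = GridSearchCV(reg, grid, cv=n_folds, scoring='r2')
    search.fit(X_train, y_train)
    r2_scores.append(search.score(X_test, y_test))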