def lassoRegression(X, y):
    print("\n### ~~~~~~~~~~~~~~~~~~~~ ###")
    print("Lasso Regression")
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myDegree = 40
    polynomialFeatures = PolynomialFeatures(degree=myDegree, include_bias=False)
    Xp = polynomialFeatures.fit_transform(X)
    myScaler = StandardScaler()
    scaled_Xp = myScaler.fit_transform(Xp)
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    lassoRegression = Lasso(alpha=1e-7)
    lassoRegression.fit(scaled_Xp, y)
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    dummyX = np.arange(0, 2, 0.01)
    dummyX = dummyX.reshape((dummyX.shape[0], 1))
    # transform (not fit_transform): reuse the expansion fitted on the training data
    dummyXp = polynomialFeatures.transform(dummyX)
    scaled_dummyXp = myScaler.transform(dummyXp)
    dummyY = lassoRegression.predict(scaled_dummyXp)
    outputFILE = 'plot-lassoRegression.png'
    fig, ax = plt.subplots()
    fig.set_size_inches(h=6.0, w=10.0)
    ax.axis([0, 2, 0, 15])
    ax.scatter(X, y, color="black", s=10.0)
    ax.plot(dummyX, dummyY, color='red', linewidth=1.5)
    # savefig takes the file name as its first positional argument
    plt.savefig(outputFILE, bbox_inches='tight', pad_inches=0.2, dpi=600)
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return None
def pred_pH(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    pH_lassoed_vars = lass_varselect(train, all_vars, 'pH', .00000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1200)
    univ_selector.fit(train[all_vars], train['pH'])
    pvals = univ_selector.get_support()
    chosen = []
    for x in range(0, len(all_vars)):
        if pH_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    lass_only = []
    for x in range(0, len(all_vars)):
        if pH_lassoed_vars[x]:
            lass_only.append(all_vars[x])
    # random forest (DataFrame.ix was removed from pandas; use .loc)
    neigh = RandomForestRegressor(n_estimators=100)
    neigh.fit(train.loc[:, chosen], train['pH'])
    for dset in data:
        dset['pH_for_prds'] = neigh.predict(dset.loc[:, chosen])
    # lasso
    lass = Lasso(alpha=.000000275, positive=True)
    lass.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_las_prds'] = lass.predict(dset[all_vars])
    # ridge
    pH_ridge = RidgeCV(np.array([.6]), normalize=True)
    pH_ridge.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_rdg_prds'] = pH_ridge.predict(dset[all_vars])
    # combination (the forest predictions are listed twice, doubling their weight)
    models = ['pH_rdg_prds', 'pH_las_prds', 'pH_for_prds', 'pH_for_prds']
    name = 'pH_prds' + str(loop)
    write_preds(models, name, train, val, test, 'pH')
def fit(self, sklearn_alpha=None, **lasso_args):
    """
    Fit the lasso using `Lasso` from `sklearn`.
    This sets the attribute `soln` and
    forms the constraints necessary for post-selection inference
    by calling `form_constraints()`.

    Parameters
    ----------
    sklearn_alpha : float
        Lagrange parameter, in the normalization set by `sklearn`.

    lasso_args : keyword args
        Passed to `sklearn.linear_model.Lasso`_

    Returns
    -------
    soln : np.float
        Solution to lasso with `sklearn_alpha=self.lagrange`.
    """
    # fit Lasso using scikit-learn
    clf = Lasso(alpha=self.lagrange, fit_intercept=False)
    clf.fit(self.X, self.y, **lasso_args)
    self._soln = beta = clf.coef_
    if not np.all(beta == 0):
        self.form_constraints()
    else:
        self.active = []
    return self._soln
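# A minimal standalone sketch (not part of the class above) of the normalization
# the docstring refers to: sklearn's Lasso minimizes
#     (1 / (2 * n_samples)) * ||y - X b||^2 + alpha * ||b||_1,
# so a Lagrange parameter `lam` for the un-normalized objective corresponds to
# alpha = lam / n_samples. All names and data below are illustrative assumptions.
import numpy as np
from sklearn.linear_model import Lasso

rng = np.random.RandomState(0)
n, p = 100, 20
X = rng.randn(n, p)
y = X[:, 0] - 2 * X[:, 1] + 0.1 * rng.randn(n)

lam = 5.0                                  # Lagrange parameter, un-normalized
clf = Lasso(alpha=lam / n, fit_intercept=False)
clf.fit(X, y)

def objective(beta):
    # un-normalized lasso objective corresponding to lam
    return 0.5 * np.sum((y - X @ beta) ** 2) + lam * np.sum(np.abs(beta))

# the fitted coefficients should (approximately) beat the all-zeros solution
assert objective(clf.coef_) <= objective(np.zeros(p))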
def lasso_regression(features, solutions, verbose=0):
    columns = solutions.columns

    clf = Lasso(alpha=1e-4, max_iter=5000)

    print('Training Model... ')
    clf.fit(features, solutions)

    feature_coeff = clf.coef_
    features_importances = np.zeros((169, 3))
    for idx in range(3):
        features_importance = np.reshape(feature_coeff[idx, :], (169, 8))
        features_importance = np.max(features_importance, axis=1)
        features_importances[:, idx] = features_importance

    features_importance_max = np.max(features_importances, axis=1)
    features_importance_max = np.reshape(features_importance_max, (13, 13))
    plt.pcolor(features_importance_max)
    plt.title("Feature importance for HoG")
    plt.colorbar()
    plt.xticks(np.arange(0.5, 13.5), range(1, 14))
    plt.yticks(np.arange(0.5, 13.5), range(1, 14))
    plt.axis([0, 13, 0, 13])
    plt.show()

    print('Done Training')
    return (clf, columns)
def RunLASSOScikit(q):
    # `self` and `options` come from the enclosing method's scope (this is a
    # nested function in the original benchmark code)
    totalTimer = Timer()

    # Load input dataset.
    Log.Info("Loading dataset", self.verbose)
    inputData = np.genfromtxt(self.dataset[0], delimiter=',')
    responsesData = np.genfromtxt(self.dataset[1], delimiter=',')

    # Get all the parameters.
    lambda1 = re.search(r"-l (\d+)", options)
    lambda1 = 0.0 if not lambda1 else int(lambda1.group(1))

    try:
        with totalTimer:
            # Perform LASSO.
            # (note: lambda1 is parsed but never passed to Lasso; the model
            # runs with the default alpha)
            model = Lasso()
            model.fit(inputData, responsesData)
            out = model.coef_
    except Exception as e:
        q.put(-1)
        return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
def lasso(self, training, target, feature_index_list):
    clf = Lasso(self.alpha, fit_intercept=False)
    clf.fit(training, target)
    # scatter the sub-model coefficients back into a full-length vector
    coef = np.zeros(self.n_features)
    for index, feature_index in enumerate(feature_index_list):
        coef[feature_index] = clf.coef_[index]
    return coef
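# Hedged usage sketch for the scatter-back pattern above: fit a Lasso on a
# column subset and place its coefficients at the right positions of a
# full-length coefficient vector. All names and data here are illustrative.
import numpy as np
from sklearn.linear_model import Lasso

rng = np.random.RandomState(0)
X_full = rng.randn(50, 10)
y = X_full[:, 2] + 0.01 * rng.randn(50)

feature_index_list = [2, 5, 7]              # subset of columns actually used
clf = Lasso(alpha=0.01, fit_intercept=False)
clf.fit(X_full[:, feature_index_list], y)

coef = np.zeros(X_full.shape[1])
coef[feature_index_list] = clf.coef_        # same scatter as in `lasso` above
print(coef)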
def reg_skl_lasso(param, data):
    [X_tr, X_cv, y_class_tr, y_class_cv, y_reg_tr, y_reg_cv] = data
    lasso = Lasso(alpha=param["alpha"], normalize=True)
    lasso.fit(X_tr, y_reg_tr)
    pred = lasso.predict(X_cv)
    RMSEScore = getscoreRMSE(y_reg_cv, pred)
    return RMSEScore, pred
class SparseSelector(BaseEstimator):
    """
    Sparse L1-based feature selection. Parameters are passed on to
    sklearn.linear_model.Lasso, which actually does the work.
    """

    def __init__(self, alpha=1.0, fit_intercept=True, normalize=False):
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.normalize = normalize
        self.lasso = None

    def fit(self, X, y):
        self.lasso = Lasso(alpha=self.alpha,
                           fit_intercept=self.fit_intercept,
                           normalize=self.normalize)
        self.lasso.fit(X, y)
        return self

    def transform(self, X):
        # keep only the columns with a nonzero lasso coefficient
        cols = np.nonzero(self.lasso.sparse_coef_)[1]
        if sp.sparse.issparse(X):
            return X.tocsc()[:, cols]
        else:
            return X[:, cols]

    def fit_transform(self, X, y):
        self.fit(X, y)
        return self.transform(X)
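# Hedged usage sketch for SparseSelector: select features with an L1 model,
# then feed the reduced matrix to a downstream estimator. Data is synthetic,
# and this assumes an older scikit-learn in which Lasso still accepts the
# `normalize` argument the class above forwards.
import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.RandomState(0)
X = rng.randn(200, 30)
y = X[:, 0] - 3 * X[:, 1] + 0.1 * rng.randn(200)

selector = SparseSelector(alpha=0.05)
X_sel = selector.fit_transform(X, y)   # keeps only columns with nonzero lasso weight
print(X_sel.shape)

Ridge().fit(X_sel, y)                  # any downstream model works on the reduced X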
def traverse_movies_lasso():
    LBMAP = getLBMap()
    DMAP = createEmpty()
    P_ERRORS, ERRORS = [], []
    training_data, training_response = [], []
    for i in range(len(data)):
        movie = data[i]
        m_rev = movie['revenue']
        myvector = vectorizeMovie(movie, LBMAP, DMAP)
        if i > 3695:
            model = Lasso(alpha=.05)
            model.fit(training_data, training_response)
            # predict expects a 2-D array, so wrap the single sample in a list
            raw = math.fabs(model.predict([myvector])[0] - m_rev)
            ERRORS.append(raw)
            #P_ERRORS.append(round(raw/m_rev, 4))
        training_data.append(myvector)
        training_response.append(m_rev)
        DMAP = update(movie, DMAP)
    #print('all', avg_float_list(P_ERRORS))
    print('all', avg_float_list(ERRORS))
def precision_recall_samples(X, y):
    pr_lasso = precision_recall(support.T[-1], lasso_coefs(X, y))

    stability = stability_selection(X, y, pi=None)
    estimated = []
    for st in np.unique(stability):
        estimated.append(stability > st - 1.e-12)
    pr_ss = precision_recall(support.T[-1], estimated)

    n_samples, n_features = X.shape
    alpha_max = np.max(np.dot(y, X)) / n_samples
    alpha = .1 * alpha_max
    clf = Lasso(alpha=alpha)
    abs_coef = np.abs(clf.fit(X, y).coef_)
    estimated = []
    for th in np.unique(abs_coef):
        estimated.append(abs_coef > th - 1.e-12)
    pr_pt = precision_recall(support.T[-1], estimated)

    clf = BootstrapLasso(alpha=alpha, n_bootstraps=n_bootstraps)
    abs_coef = np.abs(clf.fit(X, y).coef_)
    estimated = []
    for th in np.unique(abs_coef):
        estimated.append(abs_coef > th - 1.e-12)
    pr_bpt = precision_recall(support.T[-1], estimated)

    return pr_lasso, pr_ss, pr_pt, pr_bpt
def lassoreg(a):
    print("Doing lasso regression")
    clf2 = Lasso(alpha=a)
    clf2.fit(base_X, base_Y)
    print("Score = %f" % clf2.score(base_X, base_Y))
    clf2_pred = clf2.predict(X_test)
    write_to_file("lasso.csv", clf2_pred)
def train(self, x, y, param_names, random_search=100, **kwargs):
    start = time.time()
    scaled_x = self._set_and_preprocess(x=x, param_names=param_names)

    # Check that each input is between 0 and 1
    self._check_scaling(scaled_x=scaled_x)

    if self._debug:
        print("Shape of training data: ", scaled_x.shape)
        print("Param names: ", self._used_param_names)
        print("First training sample\n", scaled_x[0])
        print("Encode: ", self._encode)

    # Do a random search (pass the caller's budget instead of a hard-coded 100)
    alpha = self._random_search(random_iter=random_search, x=scaled_x, y=y)

    # Now train model
    # (precompute='auto' is no longer accepted by sklearn's Lasso; use a bool)
    lasso = Lasso(alpha=alpha, fit_intercept=True, normalize=False,
                  precompute=True, copy_X=True, max_iter=1000, tol=0.0001,
                  warm_start=False, positive=False)
    lasso.fit(scaled_x, y)
    self._model = lasso

    duration = time.time() - start
    self._training_finished = True
    return duration
def trainModel(x, y, degree=1):
    """Self-designed explicit method to train the model using linear regression."""
    #poly = PolynomialFeatures(degree)
    #z = poly.fit_transform(x)
    #return np.dot(np.linalg.pinv(z), y)
    clf = Lasso(alpha=.5)
    clf.fit(x, y)
    return clf
def classify(self):
    """Perform classification"""
    clf = Lasso(max_iter=10000000)
    #parameters = {'alpha':[0.001,0.005,0.01,0.05,0.1,0.5,1,5.0,10.0]}
    #clf = GridSearchCV(lasso, parameters, scoring='roc_auc')
    clf.fit(self._ClassifyDriver__traindata, self._ClassifyDriver__trainlabels)
    self._ClassifyDriver__y = clf.predict(self._ClassifyDriver__testdata)
def lasso(data, targets):
    """
    Returns a Lasso linear model for predictions with alpha 0.1.
    Takes the data and the associated targets as arguments.
    """
    model = Lasso(alpha=0.1)
    model.fit(data, targets)
    return model
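# Hedged usage sketch for the helper above, on synthetic data; only numpy and
# the function itself are needed here (Lasso is imported where it is defined).
import numpy as np

rng = np.random.RandomState(0)
data = rng.randn(100, 5)
targets = data @ np.array([1.0, 0.0, -2.0, 0.0, 0.5]) + 0.1 * rng.randn(100)

model = lasso(data, targets)
print(model.coef_)             # zeros indicate features dropped by the L1 penalty
print(model.predict(data[:3]))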
def varselect_w_lass(all_vars_list, selected_vars, alpha_val):
    lass = Lasso(alpha=alpha_val, positive=True, max_iter=100000, tol=.0001)
    lass.fit(np.array(fire_train_TRAIN_smp[all_vars_list]),
             np.array(fire_train_TRAIN_smp.target))
    # note: starting the range at 1 skips the first variable's coefficient;
    # range(0, ...) would consider every variable
    for x in range(1, len(all_vars_list)):
        if lass.coef_[x] > .00000001:
            selected_vars.append(all_vars_list[x])
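# A roughly equivalent, more idiomatic variant of the selection above using
# sklearn's SelectFromModel; this is a sketch, not the original author's code,
# and varselect_w_sfm / its arguments are illustrative names.
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso

def varselect_w_sfm(X, y, all_vars_list, alpha_val):
    lass = Lasso(alpha=alpha_val, positive=True, max_iter=100000, tol=1e-4)
    # keep features whose |coef| exceeds the same tiny threshold as above
    selector = SelectFromModel(lass, threshold=1e-8)
    selector.fit(X, y)
    mask = selector.get_support()
    return [v for v, keep in zip(all_vars_list, mask) if keep]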
def weight_analysis(verbose=0, stack_option='s'):
    logging.info('starting ensemble weight analysis')
    stack = STACK if stack_option == 's' else MODELS
    pool = multiprocessing.Pool(processes=4)
    drivers = settings.DRIVER_IDS  #[:1000]
    CUTOFF = -1
    results = pool.map(
        compute_weights,
        map(lambda x: (x, verbose, stack_option), drivers)
    )
    predictions = {}
    for i, get_data, model, _ in stack:
        predictions[i] = np.array(list(itertools.chain(*[r[1][i] for r in results])))
    testY = list(itertools.chain(*[r[2] for r in results]))
    # (func_name is Python 2; __name__ is the Python 3 equivalent)
    model_names = [
        ('%s.%s.%s' % (get_data.__name__, model.__name__, i), i)
        for i, get_data, model, repeat in stack
    ]
    model_names.sort(key=lambda x: x[0])
    keys = [x[1] for x in model_names]
    model_names = [x[0] for x in model_names]
    # alpha=0 disables the penalty; positive=True keeps the ensemble weights non-negative
    lasso = Lasso(alpha=0.0, positive=True)
    trainX = []
    for row_id in range(len(testY)):
        train_row = [predictions[i][row_id] for i in keys]
        trainX.append(train_row)
    a, b = trainX[:CUTOFF], trainX[CUTOFF:]
    c, d = testY[:CUTOFF], testY[CUTOFF:]
    lasso.fit(a, c)
    pred = lasso.predict(b)
    pred_train = lasso.predict(a)
    #logging.info('auc: %s' % util.compute_auc(d, pred))
    logging.info('coefficients:')
    weights = {}
    for i, name in enumerate(model_names):
        logging.info('%s: %.3f' % (model_names[i], lasso.coef_[i]))
        weights[keys[i]] = lasso.coef_[i]
    logging.info('individual scores:')
    for i, key in enumerate(keys):
        logging.info('%s: %.3f' % (
            model_names[i], util.compute_auc(testY, predictions[key])
        ))
    logging.info('weights dictionary: %s' % weights)
    # and again in the end, so you don't have to scroll
    logging.info('------------')
    #logging.info('auc: %s' % util.compute_auc(d, pred))
    logging.info('auc train: %s' % util.compute_auc(c, pred_train))
def comparaison_ridge_lasso(X, Y):
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=random.seed())
    # note: random.seed() returns None, so random_state is effectively unset
    clf_lasso = Lasso(selection='random', random_state=random.seed())
    clf_ridge = Ridge()
    clf_lasso.fit(X_train, Y_train)
    clf_ridge.fit(X_train, Y_train)
    score_lasso = clf_lasso.score(X_test, Y_test)
    score_ridge = clf_ridge.score(X_test, Y_test)
    print("Lasso accuracy={:3.2f}% \nRidge accuracy={:3.2f}%\n".format(
        score_lasso * 100, score_ridge * 100))
def trainModel_phase2(x, y, degree=1):
    """Self-designed explicit method to train the model using linear regression."""
    #poly = PolynomialFeatures(degree)
    #z = poly.fit_transform(x)
    #return np.dot(np.linalg.pinv(z), y)
    #clf = BernoulliRBM()
    #clf = LinearRegression()
    clf = Lasso(alpha=.5)
    clf.fit(x.reshape(-1, 1), y)
    return clf
class Linear():
    def __init__(self, type='Ridge', alpha=3, C=1.0, nu=0.2, limit=None,
                 epsilon=0.1):
        self.limit = limit
        if type == 'Ridge':
            self.model = Ridge(alpha=alpha)
        elif type == 'SVR':
            self.model = SVR(kernel='linear', C=C, epsilon=epsilon)
        elif type == 'NuSVR':
            self.model = NuSVR(C=C, nu=nu, kernel='linear')
        elif type == 'Lasso':
            self.model = Lasso(alpha=alpha)

    @staticmethod
    def get_cal(m):
        # get qualitative (categorical) features
        # watch out, as indices depend on the feature vector!
        return np.hstack((m[:, :23], m[:, 24:37], m[:, 38:52])) + 1

    @staticmethod
    def get_cant(m):
        # get quantitative features
        # watch out, as indices depend on the feature vector!
        return np.hstack((m[:, 23:24], m[:, 37:38], m[:, 52:]))

    def fit(self, train_X, train_Y):
        # no fitting done here, just saving data
        if self.limit:
            if len(train_X) > self.limit:
                train_X = train_X[-self.limit:]
                train_Y = train_Y[-self.limit:]
        self.train_X = np.array(train_X)
        self.train_Y = np.array(train_Y)

    def predict(self, test_X):
        # fitting done here; not efficient in the long term
        test_X = np.array(test_X)
        enc = OneHotEncoder()
        scal = MinMaxScaler()
        data = np.vstack((self.train_X, test_X))
        enc.fit(self.get_cal(data))
        scal.fit(self.get_cant(data))
        new_train_X1 = enc.transform(self.get_cal(self.train_X))
        new_train_X2 = scal.transform(self.get_cant(self.train_X))
        new_train_X = scipy.sparse.hstack((new_train_X1, new_train_X2))
        new_test_X1 = enc.transform(self.get_cal(test_X))
        new_test_X2 = scal.transform(self.get_cant(test_X))
        new_test_X = scipy.sparse.hstack((new_test_X1, new_test_X2))
        self.model.fit(new_train_X, self.train_Y)
        R = self.model.predict(new_test_X)
        return R
def test_lasso_regression():
    datafile_viper = '../data_viper/viper.pkl'
    viper = loadfile(datafile_viper)

    from sklearn.linear_model import Lasso
    model = Lasso(alpha=1e-3)
    model.fit(viper.train_feat, viper.train_y)

    y_pred = model.predict(viper.test_feat)
    print('testing error {}'.format(abs_error(y_pred, viper.test_y)))
def main(folds=5):
    print("folds: ", folds)

    # read in data, parse into training and target sets
    print("\n ------------------Load file --------------- \n")
    train = np.loadtxt(sys.argv[1]).T
    min_max_scaler = preprocessing.MinMaxScaler()
    train = min_max_scaler.fit_transform(train)

    # test data set
    xtest = train[100:112, :]
    train = train[0:100, :]
    print("Size of read data: ", train.shape)
    #train = imputation_missingValue(train)
    print("After Standardization:")
    print(train)

    target = np.loadtxt(sys.argv[2]).T
    ytest = target[100:112, :]
    target = target[0:100, :]
    print("Size of read data: ", target.shape)

    al = 0.3
    rf = Lasso(alpha=al)

    # Simple K-Fold cross validation.
    cv = cross_validation.KFold(len(train), folds)

    # iterate through the training and test cross validation segments and
    # run the classifier on each one, aggregating the results into a list
    results = []
    i = 0
    min_MSE = sys.maxsize  # sys.maxint no longer exists in Python 3
    best_train = -1
    best_test = -1
    for traincv, testcv in cv:
        start = timeit.default_timer()
        i += 1
        print(i, "epoch")
        rf.fit(train[traincv], target[traincv])
        prediction = rf.predict(train[testcv])
        MSE = mean_squared_error(target[testcv], prediction)
        print("MSE: ", MSE, " for ", i)
        if min_MSE > MSE:
            best_train = traincv
            best_test = testcv
            min_MSE = MSE
        results.append(MSE)
        stop = timeit.default_timer()
        print("Program running time: ", stop - start)

    # print out the mean of the cross-validated results
    print("Results: " + str(np.array(results).mean()), "for folds: ", folds)
    print("Results for independent data: ",
          mean_squared_error(
              rf.fit(train[best_train], target[best_train]).predict(xtest),
              ytest))
    print("R squared:")
    print("alpha:", al)
def fit_predict_model(l1_penalty):
    RSS = np.zeros((len(l1_penalty)))
    num_nonzero_coeff = np.zeros((len(l1_penalty)))
    idx = 0
    for l1_penalty_choice in l1_penalty:
        model = Lasso(alpha=l1_penalty_choice, normalize=True)
        model.fit(training[all_features], training['price'])
        predicted_price = model.predict(validation[all_features])
        RSS[idx] = np.sum((predicted_price - validation['price'])**2)
        num_nonzero_coeff[idx] = (np.count_nonzero(model.coef_)
                                  + np.count_nonzero(model.intercept_))
        idx += 1
    return (RSS, num_nonzero_coeff, model)
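# Hedged usage sketch: sweep a penalty grid with the function above and pick
# the alpha with the lowest validation RSS. The `training`/`validation` frames
# and `all_features` are assumed to exist as in the surrounding script, and
# the grid values are illustrative.
import numpy as np

l1_penalty = np.logspace(1, 7, num=13)
RSS, num_nonzero_coeff, _ = fit_predict_model(l1_penalty)
best_idx = np.argmin(RSS)
print(l1_penalty[best_idx], num_nonzero_coeff[best_idx])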
def lasso_regression(data, predictors, alpha):
    # Fit the model (max_iter must be an integer)
    lassoreg = Lasso(alpha=alpha, normalize=True, max_iter=100000)
    lassoreg.fit(data[predictors], data['TransformedLife'])
    y_pred = lassoreg.predict(data[predictors])

    # Return the result in pre-defined format
    rss = sum((y_pred - data['TransformedLife'])**2)
    ret = [rss]
    ret.extend([lassoreg.intercept_])
    ret.extend(lassoreg.coef_)
    return ret
def lasso_regression(alpha):
    # Fit the model (max_iter must be an integer)
    lassoreg = Lasso(alpha=alpha, normalize=True, max_iter=100000)
    lassoreg.fit(A_x, A_y)
    y_pred = lassoreg.predict(A_x)

    # Return the result in pre-defined format
    rss = sum((y_pred - A_y)**2)
    ret = [rss]
    ret.extend([lassoreg.intercept_])
    ret.extend(lassoreg.coef_)
    return ret
def basispursuit(y, F, penalty=0.1):
    """
    Solves basic (vanilla) basis pursuit using scikit-learn.
    """
    clf = Lasso(alpha=penalty, fit_intercept=False)
    clf.fit(F, y)
    xhat = clf.coef_

    # reconstruct
    yhat = F.dot(xhat)

    return xhat, yhat
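# Hedged usage sketch for basispursuit: recover a sparse vector from a random
# dictionary. Purely synthetic data; the support indices and penalty value
# are illustrative.
import numpy as np

rng = np.random.RandomState(0)
n, p = 80, 40
F = rng.randn(n, p)
x_true = np.zeros(p)
x_true[[3, 17, 25]] = [1.5, -2.0, 0.7]
y = F @ x_true + 0.01 * rng.randn(n)

xhat, yhat = basispursuit(y, F, penalty=0.01)
print(np.flatnonzero(np.abs(xhat) > 1e-3))   # indices of the recovered support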
def fringeremoval(img_list, ref_list, mask='all', method='svd'):
    nimgs = len(img_list)
    nimgsR = len(ref_list)
    xdim = img_list[0].shape[0]
    ydim = img_list[0].shape[1]

    if mask == 'all':
        bgmask = np.ones([ydim, xdim])  # around 2% OD reduction with no mask
    else:
        bgmask = mask
    k = (bgmask == 1).flatten('F')  # Fortran order (old numpy allowed flatten(1))

    # needs to be >float32 since float16 doesn't work with linalg
    R = np.dstack(ref_list).reshape((xdim * ydim, nimgsR)).astype(np.float32)
    A = np.dstack(img_list).reshape((xdim * ydim, nimgs)).astype(np.float32)

    # Timings: for 50 ref images lasso is twice as slow
    # lasso 1.00
    # svd   0.54
    # lu    0.54

    optref_list = []
    for j in range(A.shape[1]):
        if method == 'svd':
            b = R[k, :].T.dot(A[k, j])
            Binv = pinv(R[k, :].T.dot(R[k, :]))  # svd through pinv
            c = Binv.dot(b)
            # can also try linalg.svd()
        elif method == 'lu':
            b = R[k, :].T.dot(A[k, j])
            p, L, U = lu(R[k, :].T.dot(R[k, :]))
            c = solve(U, solve(L, p.T.dot(b)))
        elif method == 'lasso':
            lasso = Lasso(alpha=0.01)
            # fit on the masked pixels of the current image, consistent with
            # the other branches (the original fit R against all of A)
            lasso.fit(R[k, :], A[k, j])
            c = lasso.coef_
        else:
            raise Exception('Invalid method.')

        optref_list.append(np.reshape(R.dot(c), (xdim, ydim)))

    return optref_list
def test_lasso_vs_graph_net():
    # Test for one of the extreme cases of Graph-Net: that is, with
    # l1_ratio = 1 (pure Lasso), we compare Graph-Net's performance with
    # scikit-learn's lasso
    lasso = Lasso(max_iter=100, tol=1e-8, normalize=False)
    graph_net = BaseSpaceNet(mask=mask, alphas=1. * X_.shape[0],
                             l1_ratios=1, is_classif=False,
                             penalty="graph-net", max_iter=100)
    lasso.fit(X_, y)
    graph_net.fit(X, y)
    lasso_perf = (0.5 / y.size * extmath.norm(np.dot(X_, lasso.coef_) - y) ** 2
                  + np.sum(np.abs(lasso.coef_)))
    graph_net_perf = 0.5 * ((graph_net.predict(X) - y) ** 2).mean()
    np.testing.assert_almost_equal(graph_net_perf, lasso_perf, decimal=3)
def linearReg():
    sl = Lasso(alpha=0.2)
    sl.fit(features_array, values_array)
    predict_val = sl.predict(features_array)
    print(sl.coef_)
    print(sl.score(features_array, values_array))
    fig = plt.figure()
    ax = plt.subplot(111)
    ax.bar(range(0, features.shape[1]), sl.coef_)
    plt.show()
def comparaison_moindres_carres(X, Y):
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=random.seed())
    clf_lasso = Lasso(selection='random', random_state=random.seed())
    clf_ridge = Ridge()
    clf_reg_lin = LinearRegression(n_jobs=-1)
    clf_lasso.fit(X_train, Y_train)
    clf_ridge.fit(X_train, Y_train)
    clf_reg_lin.fit(X_train, Y_train)
    Y_lasso = clf_lasso.predict(X_test)
    Y_ridge = clf_ridge.predict(X_test)
    Y_reg_lin = clf_reg_lin.predict(X_test)
    err_lasso = mean_squared_error(Y_test, Y_lasso)
    err_ridge = mean_squared_error(Y_test, Y_ridge)
    err_reg_lin = mean_squared_error(Y_test, Y_reg_lin)
    print("Lasso error={:1.2f}\nRidge error={:1.2f}\n"
          "Linear regression error={:1.2f}\n".format(
              err_lasso, err_ridge, err_reg_lin))
import statsmodels.api as sm

X_sm = X = sm.add_constant(X)
model = sm.OLS(y, X_sm)
model.fit().summary()

from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import cross_val_score

lm = LinearRegression()
lm.fit(X_train, y_train)
print(np.mean(cross_val_score(lm, X_train, y_train,
                              scoring='neg_mean_absolute_error', cv=3)))

# lasso regression
lm_l = Lasso(alpha=.13)
lm_l.fit(X_train, y_train)
print(np.mean(cross_val_score(lm_l, X_train, y_train,
                              scoring='neg_mean_absolute_error', cv=3)))

# sweep alpha from 0.01 to 0.99 and record the cross-validated error
alpha = []
error = []
for i in range(1, 100):
    alpha.append(i / 100)
    lnl = Lasso(alpha=(i / 100))
    error.append(np.mean(cross_val_score(lnl, X_train, y_train,
                                         scoring='neg_mean_absolute_error', cv=3)))

# plt.plot(alpha, error)
# plt.show()

err = tuple(zip(alpha, error))
df_err = pd.DataFrame(err, columns=['alpha', 'error'])
b1 = 1 / RR.intercept_
a1 = -1 * b1 * RR.coef_  # fixed: the original referenced an undefined `b`

# print out the conservation equation of Fractal Dimension and Volatility
print("Conservation Law Generated by Ridge Regression Model")
print(a1[0], "FD + ", b1, "V = 1")
# 0.5052215007413694 FD + 20.979416541132068 V = 1

# ########################## Lasso Regression ##############################
print("\n")
print("-----------------Lasso Regression ----------------")
print("\n")

# use sklearn.linear_model Lasso to model the Fractal Dimension and Volatility
La = Lasso()
# fit the Lasso model of Fractal Dimension and Volatility
Lareg = La.fit(x, y)
# store the coefficient of the fitted model
Lw1 = La.coef_
# store the intercept of the fitted model
Lw0 = La.intercept_
print("Lasso Coef: ", Lw1[0])
print("Lasso Intercept: ", Lw0)
# Lasso Coef: -0.0
# Lasso Intercept: 0.012102210503849494

# use the predict_list function to get the Lasso prediction of volatility
V_LS_predict = predict_list(fd_list, Lw1[0], Lw0)
print("Lasso Regression R_square: ", r2_score(v, V_LS_predict))
# R Square: 0.0
ridge_reg_mape = (np.abs((BL_LT_predicted - BL_LT_labels_test)
                         / BL_LT_labels_test).mean(axis=0))
# print("ridge_reg_mape: " + str(ridge_reg_mape))
ridge_reg_rmsle = np.sqrt(mean_squared_log_error(BL_LT_labels_test, BL_LT_predicted))
# print(ridge_reg_rmsle)

####################################################################################################################
#                                                      Lasso                                                       #
####################################################################################################################
lasso_reg = Lasso(alpha=0.1, normalize=True)
lasso_reg.fit(BL_LT_prepared_train, BL_LT_labels_train)
BL_LT_predicted = lasso_reg.predict(BL_LT_prepared_test)
lasso_reg_mse = mean_squared_error(BL_LT_labels_test, BL_LT_predicted)
lasso_reg_rmse = np.sqrt(lasso_reg_mse)
# print(lasso_reg_rmse)
lasso_reg_mae = mean_absolute_error(BL_LT_labels_test, BL_LT_predicted)
# print(lasso_reg_mae)
lasso_reg_mape = (np.abs((BL_LT_predicted - BL_LT_labels_test)
                         / BL_LT_labels_test).mean(axis=0))
# print("lasso_reg_mape: " + str(lasso_reg_mape))
lasso_reg_rmsle = np.sqrt(mean_squared_log_error(BL_LT_labels_test, BL_LT_predicted))
for k in range(0, nbits):
    for cohort in range(1, icohort + 1):
        val = (C[k][cohort - 1] - 0.5 * nreportspercohort[cohort - 1] * f) / (1 - f)
        if val < 0:
            val = 0
        Y[k][cohort - 1] = val

print(Y)
print(len(ind))

Y = Y.reshape(nbits * icohort, 1)
sparse_lasso = Lasso(alpha=1, fit_intercept=False)
sparse_lasso.fit(X, Y)

#print('---candidates---')
#print(candidates)
print('---client---')
print(client)
print('---')

words = candidates[field]
coefs = sparse_lasso.coef_
# print(coefs)
# for i in range(0, coefs.shape[0]):
#     if coefs[i] > 0:
#         print(words.iloc[i])
# print('strings selected by lasso: ')
pos_client_selec = candidates[field][coefs > 0.0001]
print(pos_client_selec)
from sklearn.linear_model import Lasso

def plot():
    plt.figure(figsize=(8, 4))
    plt.subplot(121)
    plot_model(Lasso, polynomial=False, alphas=(0, 0.1, 1), random_state=42)
    plt.ylabel("$y$", rotation=0, fontsize=18)
    plt.subplot(122)
    plot_model(Lasso, polynomial=True, alphas=(0, 10**-7, 1), tol=1, random_state=42)
    plt.show()

plot()

from sklearn.linear_model import Lasso

lasso_reg = Lasso(alpha=0.1)
lasso_reg.fit(X, y)
lasso_reg.predict([[1.5]])

from sklearn.linear_model import ElasticNet

elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42)
elastic_net.fit(X, y)
elastic_net.predict([[1.5]])

np.random.seed(42)
m = 100
X = 6 * np.random.rand(m, 1) - 3
y = 2 + X + 0.5 * X**2 + np.random.randn(m, 1)

X_train, X_val, y_train, y_val = train_test_split(
    X[:50], y[:50].ravel(), test_size=0.5, random_state=10)

poly_scaler = Pipeline([
print("cross validation time:",time2-time1) explained_variance_score = cross_val_score(lasso, X, y,cv=3,scoring='explained_variance') r2 = cross_val_score(lasso, X, y, cv=3, scoring='r2') mean_squared_error = cross_val_score(lasso, X, y, cv=3, scoring='neg_mean_squared_error') print ("EVS_CV:",explained_variance_score.mean()) print ("r2_CV:",r2.mean()) print ("MSE_CV:",mean_squared_error.mean()) """ Test/Evaluation """ time3 = time.clock() X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=3) lasso.fit(X_train, y_train) y_pred= lasso.predict(X_test) time4 = time.clock() print("testing time:",time4-time3) print ("EVS_test:", metrics.explained_variance_score(y_test, y_pred)) print ("R2_test", metrics.r2_score(y_test, y_pred)) print ("MSE_test:", metrics.mean_squared_error(y_test, y_pred)) print ("The weights are:",lasso.coef_) """ Visualization """ fig, ax = plt.subplots()
def dFF_martian(data, rm_window, signal='Gcamp', reference='Isosbestic',
                rm_nans='fill', lambda_=100, porder=1, itermax=15):
    """
    Calculates dF/F for a given signal and reference. This method is adapted
    from Martianova, Aronson, & Proulx, "Multi-Fiber Photometry to Record
    Neural Activity in Freely Moving Animals", 2019, JOVE.

    inputs
    args:
        data: pandas dataframe containing columns for signal and reference
        rm_window: int representing window for calculating running mean
            (i.e. sample freq * time window)
    kwargs:
        signal: string containing column name for signal
        reference: string containing column name for reference
        rm_nans: string indicating how NaNs should be handled after rolling
            running_mean ('fill' or 'clip')
        lambda_: int for lambda_ value in airPLS (larger values result in
            smoother baseline estimation)
        porder: int for porder in airPLS
        itermax: int for maximum number of iterations for airPLS

    returns
        data: pandas dataframe containing original data, new columns with
            intermediate calculations, and dFF_signal
    """
    import numpy as np
    import pandas as pd
    from ._steps import z_score, scale_Isos, calc_dF_F
    from ._smooth import running_mean
    from ._baseline_correction import WhittakerSmooth, airPLS
    from sklearn.linear_model import Lasso

    # Calculate running mean
    data['rm_%s' % signal] = running_mean(data[signal], rm_window)
    data['rm_%s' % reference] = running_mean(data[reference], rm_window)

    # Deal with NaN values according to the rm_nans specification
    if rm_nans != 'clip' and rm_nans != 'fill':
        rm_nans = 'fill'
        print('Invalid input for rm_nans, defaulting to "fill"')
    if rm_nans == 'clip':
        data = data[pd.notnull(data['rm_%s' % signal])].copy()
    if rm_nans == 'fill':
        data = data.fillna(method='bfill')

    # Calculate baseline using airPLS and subtract it from each trace
    data['blc_%s' % reference] = data['rm_%s' % reference] - airPLS(
        data['rm_%s' % reference], lambda_=lambda_, porder=porder, itermax=itermax)
    data['blc_%s' % signal] = data['rm_%s' % signal] - airPLS(
        data['rm_%s' % signal], lambda_=lambda_, porder=porder, itermax=itermax)

    # Calculate z-scores for each trace
    data['z_%s' % reference] = z_score(data['blc_%s' % reference])
    data['z_%s' % signal] = z_score(data['blc_%s' % signal])

    # Fit a robust non-negative linear regression to reference and signal,
    # then scale the reference
    lin = Lasso(alpha=0.0001, precompute=True, max_iter=1000,
                positive=True, random_state=9999, selection='random')
    lin.fit(np.array(data['z_%s' % reference]).reshape(-1, 1),
            np.array(data['z_%s' % signal]).reshape(-1, 1))
    z_reference_fitted = lin.predict(np.array(data['z_%s' % reference]).reshape(-1, 1))
    data['scaled_%s' % reference] = list(z_reference_fitted)

    # Calculate dF/F as z_signal - scaled_reference
    data['dFF_%s' % signal] = (data['z_%s' % signal]
                               - data['scaled_%s' % reference])

    # Return dataframe with calculations in new columns
    return data
def test_coef_shape_not_zero():
    est_no_intercept = Lasso(fit_intercept=False)
    est_no_intercept.fit(np.c_[np.ones(3)], np.ones(3))
    assert est_no_intercept.coef_.shape == (1,)
X_m = X_m.dropna(axis=1)
# mutation_names = X_m.columns
X = X_m
coef_names = X.columns
X.to_csv('./data_outputs/Lasso_only_mut/X_' + inhibitors_list[drug_num] + '.csv')

drug_response = drug_response.loc[combined_ids]
Y = drug_response.sort_index()
Y.to_csv('./data_outputs/Lasso_only_mut/Y_' + inhibitors_list[drug_num] + '.csv')

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

lasso = Lasso()
lasso.fit(x_train, y_train)
train_score = lasso.score(x_train, y_train)
test_score = lasso.score(x_test, y_test)

lasso_01 = Lasso(alpha=0.1, max_iter=1000000)  # max_iter must be an integer
lasso_01.fit(x_train, y_train)
train_score_01 = lasso_01.score(x_train, y_train)
test_score_01 = lasso_01.score(x_test, y_test)

lasso_001 = Lasso(alpha=0.01, max_iter=1000000)
lasso_001.fit(x_train, y_train)
train_score_001 = lasso_001.score(x_train, y_train)
test_score_001 = lasso_001.score(x_test, y_test)

file = open("./regression_outputs/Lasso_only_mut/" + inhibitors_list[drug_num] + ".txt", 'w+')
print("Lasso: alpha = 1", file=file)
print("train score: " + str(train_score), file=file)
train_rmse = np.sqrt(1 / X_train.shape[0]
                     * np.squeeze(np.dot((trainings - y_train).T, (trainings - y_train))))
test_rmse = np.sqrt(1 / X_test.shape[0]
                    * np.squeeze(np.dot((predictions - y_test).T, (predictions - y_test))))
print("Training RMSE is: %f" % train_rmse)
print("Testing RMSE is: %f" % test_rmse)
df_rmse['KNN'] = [train_rmse, test_rmse]

# build Lasso regression model
# training
reg_lasso = Lasso(alpha=0.1)
reg_lasso.fit(X_train, y_train)

# testing
trainings = reg_lasso.predict(X_train).reshape(-1, 1)
predictions = reg_lasso.predict(X_test).reshape(-1, 1)

# combine all predictions
all_pred = np.concatenate((trainings, predictions), axis=0)

# transform to dataframe for plotting
df_lasso = pd.DataFrame(all_pred, columns=['Lasso ' + df.columns[-2]], index=df.index)
df_lasso[df.columns[-2]] = y

# plot results and add train/test split timing line
df = pd.DataFrame({'actual': y_test, 'pred': y_pred})
print(df)
print(pd.DataFrame(boston_rr.coef_))

# errors
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

# -------------------------------------
print("LASSO")
# --- LASSO --- #
boston_l = Lasso()
boston_l.fit(X_train, y_train)
print("Coefficients: ", boston_l.coef_)
print("Intercept: ", boston_l.intercept_)

# R^2 for train and test set
print('R2 for train: ', boston_l.score(X_train, y_train))
print('R2 for test: ', boston_l.score(X_test, y_test))

# lasso - prediction
y_pred = boston_l.predict(X_test)
df = pd.DataFrame({'actual': y_test, 'pred': y_pred})
print(df)
print(pd.DataFrame(boston_l.coef_))

# errors
print(model_scores)

# best alpha index for lasso
print(np.argmax(model_scores))

# plug this index into the list of alphas to get the best alpha to use
print(alpha_space[np.argmax(model_scores)])  # the best alpha to use in the model

from sklearn.linear_model import Lasso

# use the alpha previously found
alpha_user = 0.0001
lasso_model = Lasso(alpha=alpha_user, normalize=True)
lasso_model.fit(X_train, y_train)
lasso_pred = lasso_model.predict(X_test)

# prediction is between 0 and 1 -- round to the nearest integer to predict
# whether the song is a hit
rounded_lasso = np.round(lasso_pred)
print("Lasso Model Accuracy:", metrics.accuracy_score(y_test, rounded_lasso))
print("\nLasso Model Coefficients:", lasso_model.coef_)

cols = list(X.columns.values)
lasso_importance = pd.DataFrame(lasso_model.coef_, index=cols).nlargest(3, [0])
print("\nLargest Lasso coefficients:\n", lasso_importance)

"""The Lasso model score is similar to the linear regression model -- very low
and not a good fit for the data. Its coefficients show that the first feature,
danceability, is the most important feature. This is not surprising, as we are
analyzing music from the 1970s.

# **Performance**
# get top-level correlation matrix of countries with each other once
corr_mat = light_train.transpose().corr()

# iterate over 'countries' (some are not actually countries, but aggregates)
for country in light_train.index:
    # do LASSO selection with alpha value
    tmp = light_train.drop(country)

    # narrow LASSO alpha setting for precision via bisection on log10(alpha)
    lasso_countries = []
    lasso_fit = None
    log10_alpha = 1  # assumed initial guess; the original fragment never initializes it
    log10_amin = 0
    log10_amax = None
    for i in range(max_alpha_iter):
        lasso_fit = Lasso(alpha=10**log10_alpha, fit_intercept=False)
        lasso_fit.fit(tmp.transpose(), light_train.loc[country])
        # check result
        nz = sum(lasso_fit.coef_ != 0)
        if nz < n_nonzero:
            # too sparse: alpha is too large, bisect downward
            log10_amax = log10_alpha
            log10_alpha = (log10_amax + log10_amin) / 2
        elif nz > n_nonzero:
            # too dense: alpha is too small, grow or bisect upward
            log10_amin = log10_alpha
            if log10_amax is None:
                log10_alpha *= 2
            else:
                log10_alpha = (log10_amax + log10_amin) / 2
        else:
            break
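# Hedged alternative to the bisection above: sklearn's lars_path can return the
# whole lasso path at once, so the point with the desired support size can be
# read off directly instead of searching over alpha. This is a sketch under the
# same no-intercept assumption; coefs_with_n_nonzero is an illustrative name.
import numpy as np
from sklearn.linear_model import lars_path

def coefs_with_n_nonzero(X, y, n_nonzero):
    # coefs has shape (n_features, n_alphas); columns trace the path
    alphas, _, coefs = lars_path(X, y, method='lasso')
    for j in range(coefs.shape[1]):
        if np.sum(coefs[:, j] != 0) >= n_nonzero:
            return alphas[j], coefs[:, j]
    return alphas[-1], coefs[:, -1]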
def Lassos(test_data_list, train_data_list, K_fold_size):
    rmse_list = []
    r_squared_train_list = []
    r_squared_list = []
    coeff_list = []
    F_value = []
    p = []
    #res_list=[]
    pre = []
    act = []
    res = []
    res_list = []
    mse1 = []
    m = []
    k_list = []
    for i in range(0, K_fold_size):
        test_data = test_data_list[i]
        train_data = train_data_list[i]
        y_test = test_data["Commercial-rate"]
        y_train = train_data["Commercial-rate"]
        y_train = y_train.values
        y_test = y_test.values
        test_data = test_data.drop(["Commercial-rate", "Intercept"], axis=1)
        test = test_data.values
        k_list.append(test)
        #print(test_data.shape)
        train_data = train_data.drop(["Commercial-rate", "Intercept"], axis=1)
        train = train_data.values
        r, c = test_data.shape
        reg = Lasso(alpha=10)
        reg = reg.fit(train, y_train)
        y_train_fitted = reg.predict(train)
        r_squared_train = reg.score(train, y_train)
        y_fitted = reg.predict(test)
        r_squared = reg.score(test, y_test)
        mse = metrics.mean_squared_error(y_test, y_fitted)
        mse1.append(mse)
        rmse = math.sqrt(mse)
        mse_train = metrics.mean_squared_error(y_train, y_train_fitted)
        rmse_train = math.sqrt(mse_train)
        r_squared_list.append(r_squared)
        rmse_list.append(rmse)
        means = np.mean(y_test)
        ss_tot = 0  # renamed from `sum`, which shadowed the builtin
        # inner loops renamed to j so they don't shadow the fold index i
        for j in range(0, len(y_test)):
            res_list.append(y_test[j] - y_fitted[j])
        act.append(y_test)
        pre.append(y_fitted)
        res.append(res_list)
        for j in range(0, len(y_test)):
            ss_tot += (y_test[j] - means)**2
        MSR = ss_tot / c
        F = MSR / mse
        F_value.append(F)
        p.append(f.pdf(F, c, r - c))
        k = reg.coef_
        l = reg.intercept_
        m.append(l)
        coeff_list.append(k)
        #r_s=metrics.r2_score(y_test,y_fitted)
        #print(r_squared,mse,rmse,r_squared_train,rmse_train)
        #print(k)
        #print(k1)
    return (m, coeff_list, rmse_list, r_squared_list, F_value, p, mse1,
            res, pre, act, k_list)
def LassoPrediction(X_train, X_test, Y_train):
    lasso = Lasso(alpha=0.1, normalize=True, max_iter=100000)  # max_iter must be an int
    lasso.fit(X_train, Y_train)
    return lasso
print(Xtrain.shape)
print(Xtest.shape)
"""
Output:
(354, 13)
(152, 13)
"""

## Build the lasso model with alpha
model_lasso = Lasso(alpha=1)
model_lasso.fit(Xtrain, ytrain)
pred_train_lasso = model_lasso.predict(Xtrain)
pred_test_lasso = model_lasso.predict(Xtest)

## Evaluate the lasso model
print(np.sqrt(mean_squared_error(ytrain, pred_train_lasso)))
print(r2_score(ytrain, pred_train_lasso))
print(np.sqrt(mean_squared_error(ytest, pred_test_lasso)))
print(r2_score(ytest, pred_test_lasso))
"""
Output:
4.887113841773082
0.6657249068677625
# (continuation of an XGBRegressor(...) constructor begun earlier in the script)
                   reg_alpha=0.9,
                   reg_lambda=0.6,
                   subsample=0.2,
                   seed=42,
                   silent=1)

regr.fit(train_df_munged, label_df)

y_pred = regr.predict(train_df_munged)
y_test = label_df
print("XGBoost score on training set: ", rmse(y_test, y_pred))

y_pred_xgb = regr.predict(test_df_munged)

best_alpha = 0.00099
regr = Lasso(alpha=best_alpha, max_iter=50000)
regr.fit(train_df_munged, label_df)

y_pred = regr.predict(train_df_munged)
y_test = label_df
print("Lasso score on training set: ", rmse(y_test, y_pred))

y_pred_lasso = regr.predict(test_df_munged)

# average the two model predictions and undo the log transform of the target
y_pred = (y_pred_xgb + y_pred_lasso) / 2
y_pred = np.exp(y_pred)

pred_df = pd.DataFrame(y_pred, index=test_df["Id"], columns=["SalePrice"])
pred_df.to_csv('output.csv', header=True, index_label='Id')
# loop over the rows with missing values (the original fragment starts
# mid-loop; this header is an assumed reconstruction)
for i in range(len(data_miss)):
    elm = data_miss[i]
    j = index[i]
    if elm[1] == '男':
        miss_nan_x += [[elm[2]] + list(elm[left1:(right1 + 1)]) + list(elm[left:(right + 1)])]
        index_nan.append(j)
    elif elm[1] == '女':
        miss_nv_x += [[elm[2]] + list(elm[left1:(right1 + 1)]) + list(elm[left:(right + 1)])]
        index_nv.append(j)

# standardize with the per-group training statistics
miss_nan_x = (np.array(miss_nan_x) - mean_nan) / std_nan
miss_nv_x = (np.array(miss_nv_x) - mean_nv) / std_nv

# train one model per group and clip negative predictions to zero
model_nan = Lasso(max_iter=10000, alpha=0.01)
model_nan.fit(nomiss_nan_x, nomiss_nan_y)
pred_nan = model_nan.predict(miss_nan_x)
pred_nan[np.argwhere(pred_nan < 0)] = 0

model_nv = Lasso(max_iter=10000, alpha=0.01)
model_nv.fit(nomiss_nv_x, nomiss_nv_y)
pred_nv = model_nv.predict(miss_nv_x)
pred_nv[np.argwhere(pred_nv < 0)] = 0

# update data in sqlite3
col_name = '乙肝核心抗体'
for i in range(len(pred_nan)):
    query = "update train set %s=%f where id=%d" % (col_name, pred_nan[i], index_nan[i])
    curs.execute(query)
for i in range(len(pred_nv)):
    query = "update train set %s=%f where id=%d" % (col_name, pred_nv[i],
# add noise (the original passed (n_samples,) as the mean; `size=` is intended)
y += 0.01 * np.random.normal(size=n_samples)

# Split data in train set and test set
n_samples = int(X.shape[0] / 2)
print(n_samples)
X_train, y_train = X[:n_samples], y[:n_samples]
X_test, y_test = X[n_samples:], y[n_samples:]

###############################################################################
# Lasso
from sklearn.linear_model import Lasso

alpha = 0.1
lasso = Lasso(alpha=alpha)

y_pred_lasso = lasso.fit(X_train, y_train).predict(X_test)
r2_score_lasso = r2_score(y_test, y_pred_lasso)
print(lasso)
print("r^2 on test data : %f" % r2_score_lasso)

###############################################################################
# ElasticNet
from sklearn.linear_model import ElasticNet

enet = ElasticNet(alpha=alpha, l1_ratio=0.7)

y_pred_enet = enet.fit(X_train, y_train).predict(X_test)
r2_score_enet = r2_score(y_test, y_pred_enet)
print(enet)
print("r^2 on test data : %f" % r2_score_enet)
def rmse(y_test, y_pred):
    return np.sqrt(mean_squared_error(y_test, y_pred))

# run prediction on the training set to get an idea of how well it does
y_pred = regr.predict(train_new)
y_test = label_df
y_pred_xgb = y_pred
print("XGBoost score on training set: ", rmse(y_test, y_pred))
# XGBoost score on training set: 0.037633322832013358

from sklearn.linear_model import Lasso

# found this best alpha through cross-validation
best_alpha = 0.00099
regr = Lasso(alpha=best_alpha, max_iter=50000)
regr.fit(train_new, label_df)

# run prediction on the training set to get a rough idea of how well it does
y_pred = regr.predict(train_new)
y_pred_lasso = y_pred
y_test = label_df
print("Lasso score on training set: ", rmse(y_test, y_pred))
# Lasso score on training set: 0.10175440647797629

# simple average of the two models, then undo the log transform of the target
y_pred = (y_pred_xgb + y_pred_lasso) / 2
y_pred = np.exp(y_pred)

pred_df = pd.DataFrame(y_pred, index=test["Id"], columns=["SalePrice"])
pred_df.to_csv('sample_submission.csv', header=True, index_label='Id')
# Linear Regression
clfreg = LinearRegression(n_jobs=1)
clfreg.fit(X_train, y_train)
y_pred = clfreg.predict(X_test)
confidencereg = clfreg.score(X_test, y_test)

# Ridge Regression
rr = Ridge(alpha=0.01)
rr.fit(X_train, y_train)
y_pred_ridge = rr.predict(X_test)
confidenceridge = rr.score(X_test, y_test)

# Lasso Regression
ls = Lasso()
ls.fit(X_train, y_train)
y_pred_lasso = ls.predict(X_test)
confidencelasso = ls.score(X_test, y_test)

# plotting actual vs. predicted values for linear regression
import matplotlib.pyplot as plt
plt.plot(y_test[:100])
plt.plot(y_pred[:100])
plt.legend(['Actual', 'Linear Predicted'], loc='upper right')
plt.show()

# plotting actual vs. predicted values for ridge regression
import matplotlib.pyplot as plt
plt.plot(y_test[:100])
plt.plot(y_pred_ridge[:100])
# (continuation of a DataFrame construction begun earlier in the script)
                  index=df.index, columns=df.columns)

from fancyimpute import KNN

knns = {}
for kind in ['dragon', 'mordred']:
    knns[kind] = KNN(k=5)
    df = dfs[kind + '_good']
    imputed = knns[kind].fit_transform(df.values)
    dfs[kind + '_imputed'] = pd.DataFrame(imputed, index=df.index, columns=df.columns)

from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.1)
lasso.fit(dfs['dragon_imputed'].values, dfs['mordred_imputed'].values)

predicted = lasso.predict(dfs['dragon_imputed'])
observed = dfs['mordred_imputed']
rs = np.zeros(observed.shape[1])
for i, col in enumerate(observed):
    rs[i] = np.corrcoef(observed[col], predicted[:, i])[0, 1]

# %matplotlib inline
import matplotlib.pyplot as plt

plt.plot(sorted(rs))
plt.plot(np.linspace(0, 1, len(lasso.coef_.ravel())),
         sorted(np.abs(lasso.coef_.ravel()))[::-1])
plt.xscale('log')
plt.xlabel('Quantile rank (Top X% of coefficients)')
parameters = {
    'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 0.4, 1, 5, 10, 20]
}
lasso_regressor = GridSearchCV(lasso, parameters,
                               scoring='neg_mean_squared_error', cv=5)
lasso_regressor.fit(Xs, ys)
print(lasso_regressor.best_params_)
print(lasso_regressor.best_score_)

# In[177]:

lasso = Lasso(alpha=20, normalize=False)
# Fit the regressor to the data
lasso.fit(Xs, ys)

# (these lines score the earlier ridge model; apparently kept for comparison)
y_pred = ridge.predict(X_test)
# Compute and print R^2 and RMSE
print("R^2: {}".format(ridge.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error: {}".format(rmse))

# In[178]:

y_pred = lasso.predict(X_test)
# Compute and print R^2 and RMSE
print("R^2: {}".format(lasso.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error: {}".format(rmse))
features[:, 1:features.shape[1]] = f_r
tf_r = robust.transform(test_features[:, 1:features.shape[1]])
test_features[:, 1:features.shape[1]] = tf_r

enc = preprocessing.OneHotEncoder(categorical_features=[0])
enc.fit(features)
fitted = enc.transform(features).toarray()
features = fitted
test_features = enc.transform(test_features).toarray()

min_error = 1
min_idx = 1

labels = labels * 1.32

lasso = Lasso(max_iter=3000, normalize=True)
lasso.fit(features, labels)
guesses = lasso.predict(test_features)
np.savetxt("lasso_guesses.txt", guesses, '%9.2f', newline="\n")

guesses = np.array([guesses]).T
diff = np.subtract(guesses, test_labels)
diff = np.absolute(diff)
diff = np.divide(diff, test_labels)
np.savetxt("lasso_diff.txt", diff, '%9.2f', newline="\n")
avg_error = np.mean(diff)
print("lasso regression, error: %s" % avg_error)

fig, ax = plt.subplots()
y = test_labels
ax.scatter(y, guesses)
best_lasso = np.inf

def get_lasso(pred, actual, coef, lambda_):
    lasso_val = np.sum([elem**2 for elem in pred - actual]) + np.sum(
        [lambda_ * np.abs(B) for B in coef])
    return lasso_val

for lambda_ in lambdas:
    lasso_reg = Lasso(normalize=True, alpha=lambda_, fit_intercept=False)
    lasso_reg.fit(train_X, train_Y)
    coef = lasso_reg.coef_

    y_pred_lass = lasso_reg.predict(train_X)
    lasso_val = get_lasso(y_pred_lass, train_Y, coef, lambda_)
    lasso_train.append(lasso_val)

    y_pred_lass = lasso_reg.predict(test_X)
    lasso_val = get_lasso(y_pred_lass, test_Y, coef, lambda_)
    lasso_test.append(lasso_val)
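# Note on scaling (a sketch, not from the original script): sklearn's Lasso
# minimizes (1/(2n)) * RSS + alpha * ||coef||_1, whereas get_lasso above uses
# plain RSS + lambda * ||coef||_1, so the two objectives agree only up to the
# 1/(2n) factor on the residual term. To evaluate exactly what sklearn optimizes:
import numpy as np

def sklearn_lasso_objective(pred, actual, coef, alpha):
    # (1 / (2 * n_samples)) * RSS + alpha * L1-norm of the coefficients
    n = len(actual)
    return np.sum((pred - actual) ** 2) / (2 * n) + alpha * np.sum(np.abs(coef))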
#     print(cf[0], cf[1])
# (tail of a commented-out block from earlier in the script)

predict_away = lm_away.predict(X_away_test)
print(np.mean(cross_val_score(lm_away, X_away_train, y_away_train,
                              scoring='neg_mean_absolute_error', cv=3)))

# Lasso Regression
print("\nLasso")
print("Home")
lm_lasso_home = Lasso(alpha=0.1)
lm_lasso_home.fit(X_home_train, y_home_train)
print(np.mean(cross_val_score(lm_lasso_home, X_home_train, y_home_train,
                              scoring='neg_mean_absolute_error', cv=3)))

print("\nAway")
lm_lasso_away = Lasso(alpha=0.1)
lm_lasso_away.fit(X_away_train, y_away_train)
print(np.mean(cross_val_score(lm_lasso_away, X_away_train,
ridge_reg.predict(x)

# using Stochastic Gradient Descent
from sklearn.linear_model import SGDRegressor
ridge_sgd = SGDRegressor(penalty="l2")  # indicates adding 1/2 * L2 norm of the weight vector

# 2. LASSO
# Least Absolute Shrinkage and Selection Operator Regression
# L1 norm of the weight vector
# it eliminates the weights of the least important features (sets them to zero),
# so it automatically performs feature selection and outputs a sparse model
from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha=0.1)
lasso_reg.fit(x, y)
lasso_sgd = SGDRegressor(penalty="l1")

# 3. Elastic Net (see the sketch below)
# the regularization term is a mixture of the Lasso and Ridge regularization terms
# the mixture parameter is r:
#   r = 0 => Ridge
#   r = 1 => Lasso

## General guideline:
# never use plain linear regression alone
# Ridge is a good starting point with slight regularization
# if only a handful of features are useful, use Lasso or Elastic Net
# Elastic Net is preferred under multicollinearity or when p > n in the training set
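# The notes above describe Elastic Net without code; a minimal sketch mirroring
# the Ridge/Lasso examples, with an assumed mixing ratio r = 0.5 and reusing
# the surrounding script's x, y:
from sklearn.linear_model import ElasticNet, SGDRegressor

elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)  # l1_ratio is the mixture r
elastic_net.fit(x, y)

# SGD equivalent: a combined L1/L2 penalty with the same mixing ratio
elastic_sgd = SGDRegressor(penalty="elasticnet", l1_ratio=0.5)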
# =============================================================================
# LASSO Regression

# Fit LASSO regression model over a range of different alphas and plot cv-R2
lasso_alpha_space = np.logspace(-4, 0, 50)
lasso_scores = []
lasso_scores_std = []

lasso = Lasso()
for alpha in lasso_alpha_space:
    lasso.alpha = alpha
    lasso_cv_scores = cross_val_score(lasso, X, y, cv=10)
    lasso_scores.append(np.mean(lasso_cv_scores))
    lasso_scores_std.append(np.std(lasso_cv_scores))

display_plot(lasso_scores, lasso_scores_std)

lasso.fit(X_train, y_train).coef_
lasso_y_train_pred = lasso.predict(X_train)
lasso_y_test_pred = lasso.predict(X_test)
lasso.score(X_test, y_test)

# Plot residual vs. predicted values to diagnose the regression model
plt.scatter(lasso_y_train_pred, lasso_y_train_pred - y_train,
            c='steelblue', marker='o', edgecolor='white', label='Training data')
plt.scatter(lasso_y_test_pred, lasso_y_test_pred - y_test,
            c='limegreen', marker='s', edgecolor='white', label='Test data')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.legend(loc='upper left')
plt.suptitle('LASSO Regression Diagnostic')
reg = Lasso(alpha=1)
#reg = LinearRegression()

change_flag = np.zeros(len(vals))
for i in range(windowLength, len(vals)):
    # fit a line to the trailing window and predict the next point
    y = vals[i - windowLength:i].reshape(-1, 1)
    X = np.array(list(range(len(y)))).reshape(-1, 1)
    y_train = y[:-1]
    X_train = X[:-1]
    y_test = y[-1]
    X_test = X[-1, :].reshape(1, -1)

    reg = reg.fit(X_train, y_train)
    reg.coef_
    reg.intercept_
    y_hat = reg.predict(X_test)
    df100.iloc[i, -1] = y_hat

    # flag the point if its residual is large relative to the training residuals
    res_train = y_train - reg.predict(X_train).reshape(-1, 1)
    res_test = y_test - y_hat
    flagIdx = res_test > 2 * np.std(y_train)
    change_flag[i] = res_test > 3 * np.std(res_train)
    # plt.plot(X, y, 'r-', X_test, y_hat, '*', linewidth=2)
print("Mean score: %10.5f" % (mean_score / float(num_of_run))) print("Mean RMSE train: %10.5f" % (mean_rms_train / float(num_of_run))) print("Mean MAE train: %10.5f" % (mean_mae_train / float(num_of_run))) print("Mean MaxAE train: %10.5f" % (mean_maxae_train / float(num_of_run))) print("Mean rP train: %10.5f" % (mean_rp_train / float(num_of_run))) print("Mean score train: %10.5f" % (mean_score_train / float(num_of_run))) X_train, X_test, y_train, y_test = train_test_split( \ features_array, labels, test_size=util.LEAVEPERC) for a in [0.001, 0.01, 0.1, 1.0]: regressor = Lasso(alpha=a, max_iter=10e5) regressor.fit(X_train, y_train) train_score = regressor.score(X_train, y_train) test_score = regressor.score(X_test, y_test) coeff_used = np.sum(regressor.coef_ != 0) print("Lasso using alpha %10.5f " % (a)) print(" score train %10.5f " % (train_score)) print(" score test %10.5f " % (test_score)) print(" number of features used ", coeff_used) for cidx in range(len(regressor.coef_)): if regressor.coef_[cidx] != 0.0: print(" ", cidx + 1, " => ", featuresselected[cidx])