def cv_train_lasso_lars_with_sparse_refit(x_train, y_train, pval_cutoff=0.001, do_sparse_refit=True):
    """Fit a cross-validated LassoLars, optionally refitting at a sparser alpha.

    Starting from the CV-optimal alpha, walk toward sparser alphas until the
    CV MSE distribution differs significantly (t-test p < pval_cutoff) from
    the optimum, then refit a plain LassoLars at the last non-significant
    alpha. Returns either the CV model or the sparse refit.
    """
    model = LassoLarsCV(n_jobs=-1, cv=min(x_train.shape[0], 10))
    model.fit(x_train, y_train)
    best_alpha_idx = int(np.argwhere(model.alpha_ == model.cv_alphas_))
    if not do_sparse_refit:
        return model
    # Default to the sparsest solution; stop earlier at the first alpha whose
    # MSE differs significantly from the CV optimum.
    chosen_idx = len(model.cv_alphas_) - 1
    for candidate in range(best_alpha_idx + 1, len(model.cv_alphas_)):
        pval = ttest_ind(model.mse_path_[best_alpha_idx], model.mse_path_[candidate]).pvalue
        if pval < pval_cutoff:
            chosen_idx = candidate - 1
            break
    refit = LassoLars(alpha=model.cv_alphas_[chosen_idx])
    refit.fit(x_train, y_train)
    return refit
def Lasso_fit(alpha, x, y):
    """Fit LassoLars(alpha) without intercept and report the support.

    Fixes: removed the redundant `solver.alpha = alpha` (the constructor
    already sets it) and count nonzeros with np.count_nonzero instead of
    summing a boolean array with builtin sum.

    Returns:
        (idxs, c_cal): boolean mask of nonzero coefficients, and its count.
    """
    solver = LassoLars(alpha=alpha, fit_intercept=False, max_iter=3000)
    solver.fit(x, y)
    idxs = solver.coef_ != 0.
    c_cal = np.count_nonzero(idxs)
    return idxs, c_cal
def RunLARSScikit(q):
    """Time a LassoLars fit on CSV datasets and push the elapsed time onto q.

    Fixes: the option regexes are now raw strings ("\\d" is an invalid escape
    in a plain string literal and warns on modern Python).

    NOTE(review): `self`, `options`, `Log` and `Timer` are not defined in this
    scope — this function appears lifted from a method; confirm the enclosing
    context supplies them.

    Returns the elapsed time, or -1 (also put on q) on any failure.
    """
    totalTimer = Timer()
    # Load input dataset.
    Log.Info("Loading dataset", self.verbose)
    inputData = np.genfromtxt(self.dataset[0], delimiter=',')
    responsesData = np.genfromtxt(self.dataset[1], delimiter=',')
    # Parse command-line style options with defaults.
    lambda1 = re.search(r"-l (\d+)", options)
    lambda1 = 1.0 if not lambda1 else float(lambda1.group(1))
    max_iter1 = re.search(r"--max_iter (\d+)", options)
    max_iter1 = 500 if not max_iter1 else int(max_iter1.group(1))
    eps1 = re.search(r"--eps (\d+)", options)
    eps1 = np.finfo(float).eps if not eps1 else float(eps1.group(1))
    try:
        with totalTimer:
            # Perform LARS.
            model = LassoLars(alpha=lambda1, max_iter=max_iter1, eps=eps1)
            model.fit(inputData, responsesData)
            out = model.coef_
    except Exception as e:
        q.put(-1)
        return -1
    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
def LassoLars_score(X, y, **l1_parameters):
    """
    Score predictor based on `scikit-learn`_ LassoLars regression.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions; rows are
            experimental conditions, columns are transcription factors.
        y (pandas.Series): Target gene expression vector; rows are
            experimental conditions.
        **l1_parameters: Named parameters forwarded to sklearn LassoLars.

    Returns:
        numpy.array: co-regulation scores. Element i is the absolute LassoLars
        coefficient for the relationship between the target gene and
        transcription factor i.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3), index =["c1","c2","c3","c4","c5"], columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = LassoLars_score(tfs,tg, alpha=0.01)
        >>> scores
        array([0.12179406, 0.92205553, 0.15503451])
    """
    model = LassoLars(**l1_parameters)
    model.fit(X, y)
    return np.abs(model.coef_)
def RunLARSScikit(q):
    """Time a LassoLars fit on CSV datasets and push the elapsed time onto q.

    Fixes: raw string for the regex (invalid "\\d" escape in a plain literal)
    and the lambda option is parsed as float — alpha is a continuous
    parameter and the sibling runner in this file converts it with float();
    int() silently truncated.

    NOTE(review): `self`, `options`, `Log` and `Timer` are not defined in this
    scope — confirm the enclosing context supplies them.

    Returns the elapsed time, or -1 (also put on q) on any failure.
    """
    totalTimer = Timer()
    # Load input dataset.
    Log.Info("Loading dataset", self.verbose)
    inputData = np.genfromtxt(self.dataset[0], delimiter=',')
    responsesData = np.genfromtxt(self.dataset[1], delimiter=',')
    try:
        with totalTimer:
            # Get all the parameters.
            lambda1 = re.search(r"-l (\d+)", options)
            lambda1 = 0.0 if not lambda1 else float(lambda1.group(1))
            # Perform LARS.
            model = LassoLars(alpha=lambda1)
            model.fit(inputData, responsesData)
            out = model.coef_
    except Exception as e:
        q.put(-1)
        return -1
    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
def lasso_subproblem(self, Xt):
    '''
    Step 4 of the algorithm: sparse coding with LARS.

    INPUTS:
    - self: supplies self.components (the dictionary), self.alpha and
      self.verbose
    - Xt: data array to encode against self.components

    OUTPUT:
    - coef: coefficients as a transposed numpy matrix

    NOTE: Python 2 code (print statements).
    '''
    print "inside lasso"
    # 4: Sparse coding with LARS
    from sklearn.linear_model import LassoLars
    lars = LassoLars(alpha=self.alpha, verbose=False)
    # Debug fixtures kept from development:
    # self.components = np.matrix([[8,2,3,4],[1,6,1,99]])
    # Xt = np.matrix([[3,1],[6,7]])
    # Xt[1,1] = 9999
    lars.fit(self.components, Xt)
    coef = lars.coef_
    # print coef
    # Transpose so coefficients are column vectors.
    coef = (np.asmatrix(coef)).T
    # Dimension control
    if self.verbose > 20:
        print "coef shape :", coef.shape
    return coef
class LassoLarsPrim(primitive):
    """Pipeline primitive wrapping scikit-learn's LassoLars regressor."""

    def __init__(self, random_state=0):
        super(LassoLarsPrim, self).__init__(name='LassoLars')
        self.hyperparams = []
        self.type = 'Regressor'
        self.description = "LassoLars is a lasso model implemented using the LARS algorithm, and unlike the implementation based on coordinate descent, this yields the exact solution, which is piecewise linear as a function of the norm of its coefficients."
        self.hyperparams_run = {'default': True}
        self.random_state = random_state
        self.model = LassoLars(alpha=0.1)
        self.accept_type = 'c_r'

    def can_accept(self, data):
        # Delegate to the shared regression acceptance check.
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        # This primitive is always applicable.
        return True

    def fit(self, data):
        prepared = handle_data(data)
        self.model.fit(prepared['X'], prepared['Y'])

    def produce(self, data):
        output = handle_data(data)
        preds = self.model.predict(output['X'])
        output['predictions'] = preds
        output['X'] = pd.DataFrame(preds, columns=[self.name + "Pred"])
        return {0: output}
def OnceTest(dataMat, labelMat):
    """Compare several linear models on a fixed 100/100 train/test split,
    printing the sum of squared errors for each.

    Fix: the LassoLars error was computed from clf4's (ElasticNet)
    predictions; it now uses clf5's own predictions.
    """
    clf1 = LinearRegression()
    clf1.fit(dataMat[0:99], labelMat[0:99])
    labelTest1 = clf1.predict(dataMat[100:199])
    print('default LinearRegression', ((labelTest1 - labelMat[100:199])**2).sum())
    clf2 = Ridge(alpha=1, max_iter=100, tol=0.001)
    clf2.fit(dataMat[0:99], labelMat[0:99])
    labelTest2 = clf2.predict(dataMat[100:199])
    print('Ridge alhpa=1 max_iter=100 tol=0.001', ((labelTest2 - labelMat[100:199])**2).sum())
    clf3 = Lasso(alpha=1, max_iter=100, tol=0.001)
    clf3.fit(dataMat[0:99], labelMat[0:99])
    labelTest3 = clf3.predict(dataMat[100:199])
    print('Lasso alhpa=1 max_iter=100 tol=0.001', ((labelTest3 - labelMat[100:199])**2).sum())
    clf4 = ElasticNet(alpha=1, l1_ratio=0.5, max_iter=100, tol=1e-4)
    clf4.fit(dataMat[0:99], labelMat[0:99])
    labelTest4 = clf4.predict(dataMat[100:199])
    print('ElasticNet alhpa=1 max_iter=100 tol=0.001', ((labelTest4 - labelMat[100:199])**2).sum())
    clf5 = LassoLars(alpha=1, max_iter=100)
    clf5.fit(dataMat[0:99], labelMat[0:99])
    labelTest5 = clf5.predict(dataMat[100:199])  # was clf4.predict — wrong model
    print('LassoLars alhpa=1 max_iter=100', ((labelTest5 - labelMat[100:199])**2).sum())
def RunLARSScikit():
    """Time a LassoLars fit on CSV datasets; return elapsed seconds or -1.

    NOTE: `self`, `options`, `Log` and `Timer` come from the enclosing scope.
    Unknown option names are fatal.
    """
    totalTimer = Timer()
    # Load input dataset.
    Log.Info("Loading dataset", self.verbose)
    inputData = np.genfromtxt(self.dataset[0], delimiter=',')
    responsesData = np.genfromtxt(self.dataset[1], delimiter=',')
    # Translate benchmark option names into LassoLars keyword arguments.
    opts = {}
    for opt_name, kw, cast in (("lambda1", "alpha", float),
                               ("max_iterations", "max_iter", int),
                               ("epsilon", "eps", float)):
        if opt_name in options:
            opts[kw] = cast(options.pop(opt_name))
    if len(options) > 0:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")
    try:
        with totalTimer:
            # Perform LARS.
            model = LassoLars(**opts)
            model.fit(inputData, responsesData)
            out = model.coef_
    except Exception as e:
        return -1
    return totalTimer.ElapsedTime()
def LassoRegression(X_train, X_test, y_train, y_test):
    # Fit LassoLars(alpha=0.1) on the training split and return predictions
    # for X_test. (Python 2 code: print statements; X_* / y_* are pandas
    # objects — `.values.tolist()` is used for fitting.)
    # NOTE(review): y_test is unused.
    regr = LassoLars(alpha=0.1)
    # Debug output: number of features and number of training rows.
    print len(X_train.values.tolist()[0])
    print len(X_train.values.tolist())
    regr.fit(X_train.values.tolist(), y_train.values.tolist())
    predictions = regr.predict(X_test)
    return predictions
def dataPreprocess():
    """
    Description: feature compression using the least-angle-regression Lasso
        (LassoLars). Reads data/data1.csv, fits LassoLars(alpha=4) of columns
        0:13 against 'y', keeps the columns with nonzero coefficients plus the
        target column, appends a 'year' column (1994-2013), and writes
        tmp/newData.csv.
    Params: none
    Return: none (writes the output CSV)
    Author: HY
    Modify: 2019/6/21 16:37
    """
    inputFile = 'data/data1.csv'
    outputFile = 'tmp/newData.csv'
    data = pd.read_csv(inputFile)
    model=LassoLars(alpha=4,max_iter=1000)
    model.fit(data.iloc[:,0:13],data['y'])
    coefs=model.coef_
    print(coefs)
    # Alternative coordinate-descent Lasso kept for reference:
    # model = Lasso(alpha=1.0,max_iter=1000000,tol=0.00000001)
    # model.fit(data.iloc[:, 0:13], data['y'])
    # coefs=model.coef_
    # print(coefs)
    newColumns=[]
    # Keep only the features whose Lasso coefficient survived shrinkage.
    for index,column in enumerate(data.columns[0:13]):
        if coefs[index]!=0:
            newColumns.append(column)
    newColumns.append(data.columns[13])
    # NOTE(review): the original comment claimed a Copy() is used to avoid
    # chained-assignment issues, but no explicit .copy() is taken here —
    # pd.DataFrame(...) copies by default; confirm intent.
    newData=pd.DataFrame(data[newColumns])
    newData['year']=list(range(1994,2014,1))
    newData.to_csv(outputFile,index=False)
def explain_node(self, node_idx, x, edge_index, **kwargs):
    """Explain a node's prediction by fitting a positive LassoLars between
    per-feature input kernels and the output kernel over the node's subgraph.

    Returns the nonnegative coefficient vector (one weight per feature).
    NOTE(review): this resembles an HSIC-Lasso style attribution — confirm
    against the surrounding class before relying on that interpretation.
    """
    probas = self.__init_predict__(x, edge_index, **kwargs)
    # Restrict to the subgraph around node_idx.
    x, probas, _, _, _, _ = self.__subgraph__(node_idx, x, probas, edge_index, **kwargs)
    x = x.detach().cpu().numpy()  # (n, d)
    y = probas.detach().cpu().numpy()  # (n, classes)
    n, d = x.shape
    # Per-feature kernel over inputs; reduced kernel over outputs.
    K = self.__compute_kernel__(x, reduce=False)  # (n, n, d)
    L = self.__compute_kernel__(y, reduce=True)  # (n, n, 1)
    # Centered (Gram) versions.
    K_bar = self.__compute_gram_matrix__(K)  # (n, n, d)
    L_bar = self.__compute_gram_matrix__(L)  # (n, n, 1)
    # Flatten so each node pair (i, j) becomes one regression sample.
    K_bar = K_bar.reshape(n**2, d)  # (n ** 2, d)
    L_bar = L_bar.reshape(n**2, )  # (n ** 2,)
    # Positive lasso; scaling both sides by n does not change the argmin.
    solver = LassoLars(self.rho, fit_intercept=False, normalize=False, positive=True)
    solver.fit(K_bar * n, L_bar * n)
    return solver.coef_
def lasso_lars(X_tr, y_tr, X_v, y_v, X_te, y_te, **kwargs):
    '''
    Fit a LassoLars model on the training split and return RMSE on the
    train, validate, and test splits. Keyword arguments pass through to
    LassoLars. Prints the train RMSE.
    '''
    model = LassoLars(**kwargs)
    model.fit(X_tr, y_tr)

    def _split_rmse(features, actual):
        # RMSE of the fitted model on one split.
        return sqrt(mean_squared_error(actual, model.predict(features)))

    rmse_train = _split_rmse(X_tr, y_tr)
    rmse_validate = _split_rmse(X_v, y_v)
    rmse_test = _split_rmse(X_te, y_te)
    print('RMSE for LASSO + LARS \n')
    print('On train data:\n', round(rmse_train, 6), '\n')
    return rmse_train, rmse_validate, rmse_test
def online_dict_learning(X, lmda, D_0, T, k_cluster, eps, _NF=200):
    '''
    Online dictionary learning (algo 1 in the paper).

    D_0: R^(m * k) initial dictionary
    X: R^(n * m) data matrix (rows are samples)
    lmda: LassoLars sparsity penalty; T: number of iterations
    Returns the learned dictionary D_t.

    Fixes: corrected the "Dcitionary" typo in the completion message; the
    rank-1 accumulations use np.outer (numerically identical to the
    reshape/matmul pair they replace).
    '''
    n_dim, m_dim = X.shape
    A_t = np.zeros((k_cluster, k_cluster))
    B_t = np.zeros((m_dim, k_cluster))
    D_t = D_0
    t_start = time.time()
    for t in range(T):
        # Draw one random sample and sparse-code it against the dictionary.
        sample_idx = np.random.randint(0, n_dim)
        x_sample = X[sample_idx, :]
        lars_lasso = LassoLars(alpha=lmda)
        lars_lasso.fit(D_t, x_sample)
        alpha_t = lars_lasso.coef_
        # Accumulate sufficient statistics, then update the dictionary.
        A_t += np.outer(alpha_t, alpha_t)
        B_t += np.outer(x_sample, alpha_t)
        D_t = dict_update(D_t, A_t, B_t, eps=eps, _NF=_NF)
    print('Dictionary update done! Time elapse {:.04f}s'.format(time.time() - t_start))
    return D_t
def LARS_EN(Y, X, reg_param, reg_param1):
    '''
    Elastic-net coefficients computed via the LARS lasso on an augmented
    dataset.

    Takes:
    - Y: p x 1 target variable
    - X: n x p dataset
    - reg_param: regularization parameter for the l2-norm
    - reg_param1: regularization parameter for the l1-norm

    Returns:
    - beta: coefficient vector (elastic-net scale).
    '''
    n_features = X.shape[1]
    # Augment with sqrt(reg_param) * I rows so the naive elastic net becomes
    # an ordinary lasso problem on the scaled data.
    scale = np.power(1 + reg_param, -0.5)
    X = scale * np.vstack((X, np.sqrt(reg_param) * np.identity(n_features)))
    Y = np.vstack((Y, np.zeros(shape=(n_features, 1))))
    gamma = reg_param1 / np.sqrt(1 + reg_param)
    # Center X (no scaling).
    X = StandardScaler(with_std=False).fit_transform(X)
    # Solve the lasso with the LARS (Efron 2004) algorithm.
    lasso = LassoLars(alpha=gamma, fit_intercept=False, max_iter=1000)
    lasso.fit(X, Y)
    # Map the lasso coefficients back to elastic-net coefficients.
    return lasso.coef_ / np.sqrt(1 + reg_param)
def predict_LarsLasso(X, y, train, test, alpha=0.1):
    """Fit LassoLars on the rows indexed by `train`, predict rows in `test`."""
    model = LassoLars(alpha)
    model.fit(X.iloc[train], y.iloc[train])
    return model.predict(X.iloc[test])
def Lasso(x_train, y_train, x_test, y_test):
    """Fit a default LassoLars and print test-set MSE and R^2 scores."""
    model = LassoLars()
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    print("mse_score: " + str(mse(y_test, predictions)))
    print("r2_score: " + str(r2(y_test, predictions)))
def select(self, X, y, weight, alpha=0.01):
    """Pick up to self.n_features feature indices from the LassoLars path.

    Walks the regularization path and takes the last step whose active set
    fits within the feature budget; if no feature survives, retries with a
    100x smaller alpha.

    Fix: the recursive fallback omitted the required `weight` argument,
    which raised a TypeError whenever it was triggered.
    NOTE(review): `weight` is otherwise unused here — confirm with callers.
    """
    lars = LassoLars(normalize=False, alpha=alpha)
    lars.fit(X, y)
    # Last path index with at most n_features active coefficients.
    path_idx = np.argwhere((lars.coef_path_ != 0).sum(axis=0) <= self.n_features)[-1, 0]
    coef = lars.coef_path_[:, path_idx]
    f_indices = np.argwhere(coef != 0).T[0]
    if len(f_indices) == 0:
        # Nothing selected: relax the penalty and retry.
        f_indices = self.select(X, y, weight, alpha=alpha * 0.01)
    return f_indices
def LassoLarsTest(dataMat, labelMat):
    """Compare LassoLars and LassoLarsCV on a fixed 100/100 split, printing
    the sum of squared prediction errors for each."""
    train_X, train_y = dataMat[0:99], labelMat[0:99]
    test_X, test_y = dataMat[100:199], labelMat[100:199]
    plain = LassoLars(alpha=1, max_iter=100)
    plain.fit(train_X, train_y)
    print('LassoLars ', ((plain.predict(test_X) - test_y)**2).sum())
    tuned = LassoLarsCV(max_n_alphas=10, max_iter=100)
    tuned.fit(train_X, train_y)
    print('LassoLarsCV', ((tuned.predict(test_X) - test_y)**2).sum())
def scaledlasso(self, X, y, intercept, lam0=None, sigma=None):
    # Scaled-lasso style estimator: alternates between a noise-level estimate
    # and a lasso fit along the LARS path, returning (hbeta, sigmahat).
    # NOTE(review): `intercept` is unused; several lines below look broken —
    # flagged inline. Confirm against the original R scalreg implementation.
    n, p = X.shape
    # Choose the default penalty-level rule by problem size.
    if lam0 == None:
        if p > pow(10, 6):
            lam0 = 'univ'
        else:
            lam0 = 'quantile'
    if lam0 == 'univ' or lam0 == 'universal':
        # Universal penalty level sqrt(2 log10(p) / n).
        lam0 = np.sqrt(2 * np.log10(p) / n)
    if lam0 == 'quantile':
        # Fixed-point iteration for the quantile-based penalty level.
        L = 0.1
        Lold = 0
        while (np.abs(L - Lold) > 0.001):
            k = (L**4 + 2 * L**2)
            Lold = L
            # NOTE(review): np.min(k/p, 0.99) passes 0.99 as the *axis*
            # argument — almost certainly min(k/p, 0.99) was intended.
            L = -norm.ppf(np.min(k/p,0.99))
            L = (L + Lold) / 2
        if (p == 1):
            L = 0.5
        lam0 = np.sqrt(2 / n) * L
    # Iterate noise-level / penalty until convergence (or 100 iterations).
    sigmaint = 0.1
    sigmanew = 5
    flag = 0
    objlasso = LassoLars(fit_intercept=False,eps=0.001,fit_path=True)
    objlasso.fit(X,y)
    while abs(sigmaint - sigmanew) > 0.0001 and flag <= 100:
        flag = flag + 1
        sigmaint = np.copy(sigmanew)
        lam = lam0 * sigmaint
        # NOTE(review): `s` is a scalar here, yet it is indexed like an array
        # below — this branch looks like an incomplete port; verify.
        s = lam * n
        lams = objlasso.alphas_
        s[np.where(s>np.max(lams))[0]]=np.max(lams)
        s[np.where(s<0)[0]]=0
        sfrac = (s-s[0])/(s[p-1]-s[0])
        s = (s-s[0])/(s[p-1]-s[0])
        # NOTE(review): coef_ is the end-of-path solution; `sfrac`/`s` are
        # computed but never used to interpolate along the path.
        hbeta = objlasso.coef_
        hy = np.dot(X,hbeta)
        sigmanew = np.sqrt(np.mean(np.square(y - hy)))
    sigmahat = sigmanew
    hlam = lam
    if sigma == None:
        # Degrees-of-freedom corrected noise estimate.
        sigmahat = np.sqrt(np.sum(np.square(y - hy)) / (n - np.sum(hbeta != 0)))
    return hbeta, sigmahat
class in_lassoLars(regression):
    """LassoLars regression wrapper configured from self.param."""

    def trainAlgo(self):
        # Build the estimator from the stored hyper-parameters and fit it.
        params = self.param
        self.model = LassoLars(
            alpha=params['alpha'],
            normalize=params['normalize'],
            fit_intercept=params['fit_intercept'],
            max_iter=params['max_iter'],
            positive=params['positive'],
        )
        self.model.fit(self.inputData['X'], self.outputData['Y'])

    def predictAlgo(self):
        # Predict on the stored input data.
        self.result['Y'] = self.model.predict(self.inputData['X'])
def metric(self):
    """Time a LassoLars fit on self.data and return {"runtime": seconds}."""
    timer = Timer()
    with timer:
        model = LassoLars(**self.build_opts)
        model.fit(self.data[0], self.data[1])
        out = model.coef_
    return {"runtime": timer.ElapsedTime()}
def adaptiveLasso():
    '''
    Variable selection model (originally labelled "Adaptive-Lasso").
    NOTE(review): despite the name and original comment, the code fits a
    plain LassoLars — confirm whether an AdaptiveLasso estimator was
    intended.
    :return: None (prints the fitted coefficients)
    '''
    inputfile = 'data/data1.csv'
    data = pd.read_csv(inputfile)
    # Original note said AdaptiveLasso requires a newer scikit-learn;
    # LassoLars is imported here instead.
    from sklearn.linear_model import LassoLars
    model = LassoLars()
    model.fit(data.iloc[:, 0:13], data['y'])
    print(model.coef_)
def lasso_lars(x_scaleddf, target):
    '''
    Fit LassoLars(alpha=1) on the scaled features and return the RMSE of its
    in-sample predictions.
    '''
    model = LassoLars(alpha=1)
    model.fit(x_scaleddf, target)
    predictions = model.predict(x_scaleddf)
    return sqrt(mean_squared_error(target, predictions))
def lasso_lars_test(x_scaleddf, target, X_test, y_test):
    '''
    Fit LassoLars(alpha=1) on the training data and return
    (test MAE, fitted model, test predictions).
    '''
    model = LassoLars(alpha=1)
    model.fit(x_scaleddf, target)
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    return mae, model, predictions
def fit_model_11(self, toWrite=False):
    """Train LassoLars(alpha=1) on each CV split, print the log-loss per
    split, and optionally pickle the last fitted model.

    Fixes: the pickle file is opened in binary mode ('wb' — pickle output is
    bytes and text mode breaks on Python 3) and via a context manager so the
    handle is closed even if dump raises.
    """
    model = LassoLars(alpha=1, max_iter=5000)
    for data in self.cv_data:
        X_train, X_test, Y_train, Y_test = data
        model.fit(X_train, Y_train)
        pred = model.predict(X_test)
        print("Model 11 score %f" % (logloss(Y_test, pred),))
        if toWrite:
            with open('model11/model.pkl', 'wb') as f2:
                pickle.dump(model, f2)
class _LassoLarsImpl:
    """Thin adapter delegating fit/predict to the wrapped Op estimator."""

    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        # Forward y only when supplied, matching estimators fit on X alone.
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)
def _lassolars(*, train, test, x_predict=None, metrics, alpha=1.0, fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, eps=2.220446049250313e-16, copy_X=True, fit_path=True, positive=False, jitter=None, random_state=None):
    """For more info visit :
        https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoLars.html#sklearn.linear_model.LassoLars
    """
    model = LassoLars(alpha=alpha, fit_intercept=fit_intercept,
                      verbose=verbose, normalize=normalize,
                      precompute=precompute, max_iter=max_iter, eps=eps,
                      copy_X=copy_X, fit_path=fit_path, positive=positive,
                      jitter=jitter, random_state=random_state)
    model.fit(train[0], train[1])
    model_name = 'LassoLars'
    y_hat = model.predict(test[0])
    # Score with the requested metric.
    if metrics == 'mse':
        accuracy = _mse(test[1], y_hat)
    elif metrics == 'rmse':
        accuracy = _rmse(test[1], y_hat)
    elif metrics == 'mae':
        accuracy = _mae(test[1], y_hat)
    if x_predict is None:
        return (model_name, accuracy, None)
    return (model_name, accuracy, model.predict(x_predict))
def lasso_subproblem(self, Xt, comp):
    # Step 4: sparse coding with LARS — encode Xt against the dictionary
    # `comp` and return the coefficients as a transposed numpy matrix.
    # NOTE: Python 2 code (print statements).
    print "inside lasso"
    # 4: Sparse coding with LARS
    lars = LassoLars(alpha=self.alpha, verbose=False)
    lars.fit(comp, Xt)
    coef = lars.coef_
    # print coef
    # Transpose so coefficients are column vectors.
    coef = (np.asmatrix(coef)).T
    # Dimension control
    if self.verbose > 20:
        print "coef shape :", coef.shape
    return coef
def my_online_dict_learning(X, lmda, D_0, T, k_cluster, t_lower_bound, eps, _NF=200):
    '''
    algo 1 in the paper, repeated until at least t_lower_bound seconds of
    *measured* update time (sampling + coding + dictionary update, excluding
    objective evaluation) have accumulated.

    D_0: R^(m * k) initial dictionary
    X: R^(n * m) data matrix (rows are samples)

    Returns (D_t, error_list_omf) where error_list_omf is a list of
    (cumulative update time, objective value) pairs recorded before each
    iteration's update.
    '''
    n_dim, m_dim = X.shape
    A_t = np.zeros((k_cluster, k_cluster))
    B_t = np.zeros((m_dim, k_cluster))
    D_t = D_0
    t_end = time.time()
    t_start = time.time()
    # Cumulative time spent only in the timed section of each iteration.
    t_cur = 0
    error_list_omf = []
    # print(lmda, _NF, eps)
    while t_end - t_start < t_lower_bound:
        for t in range(T):
            # Record the current objective before this iteration's update.
            error_t = eval_g_hat_with_DnX(X, D_t.T, n_dim, m_dim)
            error_list_omf.append((t_cur, error_t))
            t1 = time.time()
            # Draw one random sample and sparse-code it against D_t.
            sample_idx = np.random.randint(0, n_dim)
            x_sample = X[sample_idx, :]
            lars_lasso = LassoLars(alpha=lmda)
            lars_lasso.fit(D_t, x_sample)
            alpha_t = lars_lasso.coef_
            # Accumulate the sufficient statistics and update the dictionary.
            A_t += np.matmul(alpha_t.reshape(k_cluster, 1), alpha_t.reshape(1, k_cluster))
            B_t += np.matmul(x_sample.reshape(m_dim, 1), alpha_t.reshape(1, k_cluster))
            D_t = dict_update(D_t, A_t, B_t, eps=eps, _NF=_NF)
            t2 = time.time()
            t_cur += (t2 - t1)
        # Advance the clock by measured work time only.
        t_end = t_start + t_cur
    return D_t, error_list_omf
def linear_regressor(x, target, causes):
    """
    Regression and prediction using a lasso
    :param x: data
    :param target: target - effect
    :param causes: causes of the causal mechanism
    :return: regenerated data with the fitted model
    """
    if len(causes) == 0:
        # No parents: regress on pure Gaussian noise instead of x.
        x = np.random.normal(size=(target.shape[0], 1))
    # NOTE(review): the original comment said "no regularization", but
    # alpha=1.0 applies substantial L1 shrinkage; alpha=0 would be the
    # unpenalized fit — confirm which was intended.
    lasso = LassoLars(alpha=1.)
    lasso.fit(x, target)
    return lasso.predict(x)
def lassolarsdimension(data, label):
    """Feature selection via a CV-tuned LassoLars.

    Picks alpha with 5-fold LassoLarsCV, refits a plain LassoLars at that
    alpha, and keeps the columns with nonzero coefficients.
    Returns (reduced data, boolean column mask).
    """
    # Tune alpha by cross-validation, then build a LassoLars at that alpha.
    cv_model = LassoLarsCV(cv=5, max_iter=400).fit(data, label)
    fitted = LassoLars(alpha=cv_model.alpha_).fit(data, label)
    mask = fitted.coef_ != 0
    return data[:, mask], mask
def get_clustering_assignment_2(X, D_centroids, k_cluster, lmda, numIter=1000):
    """Encode each sample against the centroid dictionary with LassoLars,
    then k-means cluster the resulting weight vectors.

    Returns the k-means cluster labels, one per row of X.
    NOTE: `lmda` is accepted but not used (the coder runs with alpha=0).
    """
    n_dim, m_dim = X.shape
    centroid_mat = np.reshape(D_centroids, (m_dim, k_cluster))
    weight_mat = np.zeros((n_dim, k_cluster))
    for row in range(n_dim):
        # alpha=0 yields an unpenalized LARS fit of the sample on centroids.
        coder = LassoLars(alpha=0, max_iter=500)
        coder.fit(centroid_mat, X[row, :])
        weight_mat[row, :] = coder.coef_
    km = KMeans(n_clusters=k_cluster, max_iter=numIter)
    km.fit(weight_mat)
    return km.labels_
def Lars_Lasso(kf, data, label, k):
    """Average MSE of LassoLars(alpha=.1) over the folds in kf.

    Fix: the total was divided by a hard-coded 3 regardless of how many
    folds kf actually yields; the folds are now counted. An empty kf still
    returns 0.0 (as the old `0/3` did).
    NOTE(review): parameter `k` is unused — confirm with callers.
    """
    val = 0.0
    n_folds = 0
    for train, test in kf:
        X_train, X_test = data[train, :], data[test, :]
        y_train, y_test = label[train], label[test]
        model = LassoLars(alpha=.1)
        fitted = model.fit(X_train, y_train)
        y_pred = fitted.predict(X_test)
        val += metrics.mean_squared_error(y_test, y_pred)
        n_folds += 1
    return val / n_folds if n_folds else 0.0
def lasso_sklearn(dict, target, gamma):
    """
    Computes Lasso optimization

    :param dict: dictionnary
    :type dict: np.array
    :param target: image
    :type target: np.array
    :param gamma: regularization factor
    :type gamma: float
    :rtype: np.array
    """
    num_samples = target.shape[1]
    dic_size = dict.shape[1]
    # Scale the penalty by the number of samples, matching the objective.
    per_sample_gamma = gamma / num_samples
    solver = LassoLars(alpha=per_sample_gamma, fit_intercept=False, normalize=False, fit_path=False)
    solver.fit(dict, target)
    return solver.coef_.reshape(dic_size, num_samples)
# LassoLars Regression
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LassoLars

# Load the diabetes dataset and fit a LARS-based lasso with alpha=0.1.
dataset = datasets.load_diabetes()
model = LassoLars(alpha=0.1)
model.fit(dataset.data, dataset.target)
print(model)

# Predict on the training data and summarize the fit.
expected = dataset.target
predicted = model.predict(dataset.data)
mse = np.mean((predicted - expected) ** 2)
print(mse)
print(model.score(dataset.data, dataset.target))
def ProcessData(df,vect1,vect2,builder):
    # Vectorize the free-text columns with the fitted transformers and return
    # (target vector, sparse design matrix).
    # NOTE(review): `builder` is unused — the patsy design-matrix path is
    # commented out below; confirm whether it should be restored.
    descriptionmatrix = vect1.transform([str(x) for x in df['titledescription'].values])
    locationmatrix = vect2.transform([str(x) for x in df['locationfull'].values])
    # x = build_design_matrices([builder], df, return_type='dataframe', NA_action=NAAction(on_NA='drop', NA_types=[]))
    y = df['SalaryNormalized'].values
    #x_combo = np.hstack([np.asarray(x[0]),descriptionmatrix.toarray(),locationmatrix.toarray()])
    x_combo = np.hstack([descriptionmatrix.toarray(),locationmatrix.toarray()])
    return (np.asarray(y), sparse.coo_matrix(x_combo))

# Script body: train Lasso and LassoLars(alpha=2) on train.csv, then report
# RMSE and R^2 against solution.csv. (Python 2 print statements.)
train = PreProcess(pd.read_csv('train.csv'))
(vect1,vect2,builder) = InitializeTransformers(train)
(y, x) = ProcessData(train, vect1, vect2,builder)
(y_test, x_test) = ProcessData(PreProcess(pd.read_csv('solution.csv')),vect1,vect2,builder)
lasso = Lasso()
lasso.fit(x,y)
y_pred = lasso.predict(x_test)
lassolars = LassoLars(alpha=2)
# LassoLars needs a dense array; the coo_matrix is densified here.
lassolars.fit(x.toarray(),y)
lars_pred = lassolars.predict(x_test)
print np.sqrt(mean_squared_error(y_test, y_pred))
print r2_score(y_test,y_pred)
print np.sqrt(mean_squared_error(y_test,lars_pred))
print r2_score(y_test,lars_pred)