def create_model_LARS(state_matrix, transcription_factors):
    regulators = {}
    for i in range(len(transcription_factors)):
        # Training set for the target gene
        X = []
        y = []
        for j in range(1, len(state_matrix)):
            X.append(state_matrix[j - 1].tolist())
            y.append(state_matrix[j][i] - state_matrix[j - 1][i])
        # Initialise the LARS model
        lars = Lars()
        # Fit the training data
        lars.fit(X, y)
        # Extract the important features corresponding to a particular gene
        coefficients = lars.coef_
        # Add to the dictionary
        regulators[transcription_factors[i]] = coefficients
    return regulators
def fit(self, X, y):
    assert y is not None, f'y:{y}'
    k = X.shape[1]
    self.k_ = k
    if self.max_k is None:
        if self.k_share is None:
            self.max_k = 500
        else:
            self.max_k = int(k * self.k_share)
    if self.selector is None:
        self.selector = 'Lars'
    if self.selector == 'Lars':
        selector = Lars(fit_intercept=True, normalize=True,
                        n_nonzero_coefs=self.max_k)
    elif self.selector == 'elastic-net':
        selector = ElasticNet(fit_intercept=True, selection='random',
                              tol=0.001, max_iter=5000, warm_start=True,
                              random_state=0)
    else:
        selector = self.selector
    selector.fit(X, y)
    # Keep the columns whose coefficients are (numerically) non-zero
    self.col_select_ = np.arange(k)[np.abs(selector.coef_) > 0.0001]
    if self.col_select_.size < 1:
        self.col_select_ = np.arange(1)
    return self
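# The fit() above keeps the columns whose LARS coefficients exceed a small
# threshold. A minimal standalone sketch of that same selection idea on
# synthetic data (everything below is illustrative, not the surrounding class):
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Lars

X_demo, y_demo = make_regression(n_samples=80, n_features=40, n_informative=6,
                                 noise=1.0, random_state=0)
sel = Lars(n_nonzero_coefs=10).fit(X_demo, y_demo)
col_select = np.arange(X_demo.shape[1])[np.abs(sel.coef_) > 0.0001]
print(col_select)  # indices of the columns the LARS fit would retain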
def LarsRegressorGS(X_train, X_test, y_train, y_test):
    reg = Lars()
    grid_values = {'n_nonzero_coefs': list(range(100, 500, 100))}
    grid_reg = GridSearchCV(
        reg,
        param_grid=grid_values,
        scoring=['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2'],
        refit='r2',
        n_jobs=-1,
        cv=2,
        verbose=100)
    grid_reg.fit(X_train, y_train)
    reg = grid_reg.best_estimator_
    reg.fit(X_train, y_train)

    y_pred = reg.predict(X_test)
    printMetrics(y_true=y_test, y_pred=y_pred)
    val_metrics = getMetrics(y_true=y_test, y_pred=y_pred)

    y_pred = reg.predict(X=X_train)
    metrics = getMetrics(y_true=y_train, y_pred=y_pred)
    printMetrics(y_true=y_train, y_pred=y_pred)

    best_params: dict = grid_reg.best_params_
    saveBestParams(nameOfModel="LarsRegressorGS", best_params=best_params)
    logSave(nameOfModel="LarsRegressorGS", reg=reg, metrics=metrics,
            val_metrics=val_metrics)
def runLarsRegressor(self):
    lm = Lars(fit_intercept=True, normalize=True)
    print("Lars Regressor\n")
    lm.fit(self.m_X_train, self.m_y_train)
    predictY = lm.predict(self.m_X_test)
    score = lm.score(self.m_X_test, self.m_y_test)
    predictTrainY = lm.predict(self.m_X_train)
    self.displayPredictPlot(predictY)
    self.displayResidualPlot(predictY, predictTrainY)
    self.dispalyModelResult(lm, predictY, score)
def run(self, X, y=None):
    """Fits the filter.

    Parameters
    ----------
    X : numpy array, shape (n_samples, n_features)
        The training input samples.
    y : numpy array, optional
        The target values (ignored).

    Returns
    -------
    W : array-like, shape (n_features, k)
        Feature weight matrix.

    Examples
    --------
    >>> from ITMO_FS.filters.sparse import MCFS
    >>> from sklearn.datasets import make_classification
    >>> import numpy as np
    >>> dataset = make_classification(n_samples=100, n_features=20,
    ...                               n_informative=4, n_redundant=0,
    ...                               shuffle=False)
    >>> data, target = np.array(dataset[0]), np.array(dataset[1])
    >>> model = MCFS(d=5, k=2, scheme='heat')
    >>> weights = model.run(data, target)
    >>> print(model.feature_ranking(weights))
    """
    n_samples, n_features = X.shape
    # Symmetrised k-nearest-neighbour graph
    graph = NearestNeighbors(n_neighbors=self.p + 1,
                             algorithm='ball_tree').fit(X).kneighbors_graph(X).toarray()
    graph = graph + graph.T
    indices = [[(i, j) for j in range(n_samples)] for i in range(n_samples)]
    func = np.vectorize(
        lambda xy: graph[xy[0]][xy[1]] * self.scheme(X[xy[0]], X[xy[1]]),
        signature='(1)->()')
    W = func(indices)
    # Graph Laplacian and generalized eigenproblem L y = lambda D y
    D = np.diag(W.sum(axis=0))
    L = D - W
    eigvals, Y = eigh(type=1, a=L, b=D, eigvals=(0, self.k - 1))
    # One sparse (LARS) regression per spectral embedding dimension
    weights = np.zeros((n_features, self.k))
    for i in range(self.k):
        clf = Lars(n_nonzero_coefs=self.d)
        clf.fit(X, Y[:, i])
        weights[:, i] = clf.coef_
    return weights
class _LarsImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)
def LarsRegressor(X_train, X_test, y_train, y_test):
    reg = Lars()
    reg.fit(X_train, y_train)

    y_pred = reg.predict(X_test)
    printMetrics(y_true=y_test, y_pred=y_pred)
    val_metrics = getMetrics(y_true=y_test, y_pred=y_pred)

    y_pred = reg.predict(X=X_train)
    metrics = getMetrics(y_true=y_train, y_pred=y_pred)
    printMetrics(y_true=y_train, y_pred=y_pred)

    logSave(nameOfModel="LarsRegressor", reg=reg, metrics=metrics,
            val_metrics=val_metrics)
def perform_LARS(normalized_matrix, genes):
    # Number of genes
    no_genes = len(genes)
    # Dictionary of top regulators for each gene
    regulators = {}
    for i in range(no_genes):
        # Current gene for which the top regulators are being found
        current_y = normalized_matrix[:, i]
        # Copy the matrix and remove the current feature
        temp_matrix = normalized_matrix.copy()
        temp_matrix = np.delete(temp_matrix, i, axis=1)
        # Fit Least Angle Regression and extract the coefficients
        lars = Lars()
        lars.fit(temp_matrix, current_y)
        coeff_values = lars.coef_
        # Copy the gene list and drop the current gene to keep indexes aligned
        gene_copy = list(genes)
        gene_copy.remove(genes[i])
        # Stability selection gives an effective ranking of the top regulators
        rank_dict_score = stability_selection(temp_matrix, genes, 2000, current_y, gene_copy)
        # Top regulators
        top_regulators = find_top_regulators(rank_dict_score)
        regulators[genes[i]] = top_regulators
    return regulators
def create_model_LARS(state_matrix, transcription_factors):
    regulators = {}
    for i in range(len(transcription_factors)):
        # Create the training set
        X = []
        y = []
        for j in range(1, len(state_matrix)):
            # Input: the expression levels at the previous step
            X.append(state_matrix[j - 1].tolist())
            # Output: the change in expression of gene i between steps
            y.append(state_matrix[j][i] - state_matrix[j - 1][i])
        # Copy the list of transcription factors and drop the current one
        tf_copy = list(transcription_factors)
        del tf_copy[i]
        # Drop the corresponding column from the training set
        # (deleting by index avoids removing the wrong entry when
        #  expression values happen to repeat within a row)
        for expression in X:
            del expression[i]
        # Feature selection using Least Angle Regression
        lars = Lars()
        lars.fit(X, y)
        # Important features corresponding to the current gene
        coefficients = lars.coef_
        # Regulators for the network
        regulators[transcription_factors[i]] = coefficients
    return regulators
def main():
    from_root = "~/Documents/School/ComputerScience/ahcompsci/Scikit-Learning-StanleyWei/scikit-utkproject/dataset/fiftytwo"
    path = "dataset/whitemensmall/"
    dirs = os.listdir(path)
    main_df = add_images_from_dirs(dirs, path)
    # Split the dataframe itself so images and labels stay together;
    # the code below needs train_df/test_df rather than separate image/gender splits
    train_df, test_df = train_test_split(main_df)
    # train_df = train_df.loc[train_df['ethnicity'] == "0"]
    # test_df = test_df.loc[test_df['ethnicity'] == "0"]
    train_x = flatten_image_df(train_df.loc[:, "image"])
    test_x = flatten_image_df(test_df.loc[:, "image"])
    clf = Lars()
    # train_x = np.array(train_df.loc[:, "image"])
    # x_train = train_x.flatten().reshape(len(train_df), -1)
    clf.fit(train_x, train_df.loc[:, "age"].to_numpy())
    coefficients = clf.coef_
    # print(coefficients)
    coefficients_array = np.array(coefficients).reshape(len(train_df.image[0]), -1)
    # print(coefficients_array)
    # heatmap = plt.imshow(coefficients_array, cmap="hot", interpolation="nearest")
    # Absolute coefficient values for the heatmap
    coefficients_abs = np.abs(coefficients)
    coefficients_array_abs = np.array(coefficients_abs).reshape(len(train_df.image[0]), -1)
    heatmap = plt.imshow(coefficients_array_abs, cmap="hot", interpolation="nearest")
    # heatmap_extremes = plt.imshow(coefficients_array_abs, vmax=0.025, cmap="hot", interpolation="nearest")
    plt.colorbar(heatmap)
    # plt.colorbar(heatmap_extremes)
    plt.show()
# LARS Regression
import numpy as np
from sklearn import datasets
from sklearn.linear_model import Lars

# load the diabetes dataset
dataset = datasets.load_diabetes()
# fit a LARS model to the data
model = Lars()
model.fit(dataset.data, dataset.target)
print(model)
# make predictions
expected = dataset.target
predicted = model.predict(dataset.data)
# summarize the fit of the model
mse = np.mean((predicted - expected) ** 2)
print(mse)
print(model.score(dataset.data, dataset.target))
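# The MSE above is computed on the training data and is therefore optimistic.
# A hedged variation (same diabetes data, standard scikit-learn utilities; the
# 25% split ratio is an arbitrary illustrative choice) that scores on a
# held-out set instead:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.25, random_state=0)
model = Lars()
model.fit(X_train, y_train)
print(mean_squared_error(y_test, model.predict(X_test)))  # held-out MSE
print(model.score(X_test, y_test))                        # held-out R^2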
cdg = CDG.CollinearDataGenerator(p=20, sparsity=.8)
X = cdg.getX(n)
p = X.shape[1]
y = cdg.getY(X)
print(cdg.gamma)

val_size = int(0.1 * X.shape[0])
X_val = X[0:val_size, :]
y_val = y[0:val_size, :]
X_train = X[val_size:, :]
y_train = y[val_size:, :]

lars = Lars(n_nonzero_coefs=2)
lars.fit(X, y)
# print(lars.coef_)

alphas, order, coefs = lars_path(X, y.T[0], verbose=True)
# print(alphas)
print(order)

magnitudes = sorted(enumerate(coefs[:, -1]), key=lambda x: x[1])
magnitudes = [idx for idx, _ in magnitudes]
print(magnitudes)
# print(coefs)

quantities = coefs[:, -1]
quantities = np.array([quantities[i] for i in order])
# print(quantities)
total = sum(abs(quantities))
# print(total)
# cumulative sum of absolute coefficient magnitudes along the LARS path
cumsum = np.cumsum(np.abs(quantities))
def larsLearn(kap):
    lars = Lars(n_nonzero_coefs=kap, fit_intercept=False)
    lars.fit(X_train, y_train)
    return lars
model = sm.OLS(housing['labels'], housing['data'])
results = model.fit()
print(results.summary())

# Part B
preds_train = lin.predict(housing['data'])
preds_test = lin.predict(housing['testdata'])
ave_sq_loss_train = ((housing['labels'] - preds_train) ** 2).sum() / len(housing['data'][:, 1])
ave_sq_loss_test = ((housing['testlabels'] - preds_test) ** 2).sum() / len(housing['testdata'][:, 1])
print(ave_sq_loss_train)
print(ave_sq_loss_test)

# Part C
housing['data'] = housing['data'][:, 1:14]
housing['testdata'] = housing['testdata'][:, 1:14]
from sklearn.linear_model import Lars
reduced = Lars(fit_intercept=True, n_nonzero_coefs=3)
reduced.fit(housing['data'], housing['labels'])
print(reduced.intercept_)
print(reduced.coef_)
for i in range(result_row):
    sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i])
rank_result['Elastic_pca'] = sumsum / float(result_row)
rs_score['Elastic_pca'] = r2_score(y_test, y)

ElasticModel = ElasticNetCV()
ElasticModel.fit(X_train_std, y_train)
y = ElasticModel.predict(X_test_std)
[result_row] = y.shape
sumsum = 0
# print(y)
for i in range(result_row):
    sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i])
rank_result['Elastic_std'] = sumsum / float(result_row)
rs_score['Elastic_std'] = r2_score(y_test, y)

LarsModel = Lars()
LarsModel.fit(X_train_pca, y_train)
y = LarsModel.predict(X_test_pca)
[result_row] = y.shape
sumsum = 0
# print(y)
for i in range(result_row):
    sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i])
rank_result['Lars_pca'] = sumsum / float(result_row)
rs_score['Lars_pca'] = r2_score(y_test, y)

LarsModel = Lars()
LarsModel.fit(X_train_std, y_train)
y = LarsModel.predict(X_test_std)
[result_row] = y.shape
sumsum = 0
# print(y)
for i in range(result_row):
def _sparse_encode(X, dictionary, gram, cov=None, algorithm='lasso_lars', regularization=None, copy_cov=True, init=None, max_iter=1000): """Generic sparse coding Each column of the result is the solution to a Lasso problem. Parameters ---------- X: array of shape (n_samples, n_features) Data matrix. dictionary: array of shape (n_components, n_features) The dictionary matrix against which to solve the sparse coding of the data. Some of the algorithms assume normalized rows. gram: None | array, shape=(n_components, n_components) Precomputed Gram matrix, dictionary * dictionary' gram can be None if method is 'threshold'. cov: array, shape=(n_components, n_samples) Precomputed covariance, dictionary * X' algorithm: {'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'} lars: uses the least angle regression method (linear_model.lars_path) lasso_lars: uses Lars to compute the Lasso solution lasso_cd: uses the coordinate descent method to compute the Lasso solution (linear_model.Lasso). lasso_lars will be faster if the estimated components are sparse. omp: uses orthogonal matching pursuit to estimate the sparse solution threshold: squashes to zero all coefficients less than regularization from the projection dictionary * data' regularization : int | float The regularization parameter. It corresponds to alpha when algorithm is 'lasso_lars', 'lasso_cd' or 'threshold'. Otherwise it corresponds to n_nonzero_coefs. init: array of shape (n_samples, n_components) Initialization value of the sparse code. Only used if `algorithm='lasso_cd'`. max_iter: int, 1000 by default Maximum number of iterations to perform if `algorithm='lasso_cd'`. copy_cov: boolean, optional Whether to copy the precomputed covariance matrix; if False, it may be overwritten. Returns ------- code: array of shape (n_components, n_features) The sparse codes See also -------- sklearn.linear_model.lars_path sklearn.linear_model.orthogonal_mp sklearn.linear_model.Lasso SparseCoder """ if X.ndim == 1: X = X[:, np.newaxis] n_samples, n_features = X.shape if cov is None and algorithm != 'lasso_cd': # overwriting cov is safe copy_cov = False cov = np.dot(dictionary, X.T) if algorithm == 'lasso_admm': alpha = float(regularization) / n_features # account for scaling try: err_mgt = np.seterr(all='ignore') code, dictionary = lasso_admm(X.T, dictionary.T, gamma=alpha, gram=gram, cov=cov, max_iter=max_iter) new_code = code.T finally: np.seterr(**err_mgt) elif algorithm == 'lasso_lars': alpha = float(regularization) / n_features # account for scaling try: err_mgt = np.seterr(all='ignore') lasso_lars = LassoLars(alpha=alpha, fit_intercept=False, verbose=False, normalize=False, precompute=gram, fit_path=False) lasso_lars.fit(dictionary.T, X.T, Xy=cov) new_code = lasso_lars.coef_ finally: np.seterr(**err_mgt) elif algorithm == 'lasso_cd': alpha = float(regularization) / n_features # account for scaling clf = Lasso(alpha=alpha, fit_intercept=False, precompute=gram, max_iter=max_iter, warm_start=True) clf.coef_ = init clf.fit(dictionary.T, X.T) new_code = clf.coef_ elif algorithm == 'lars': try: err_mgt = np.seterr(all='ignore') lars = Lars(fit_intercept=False, verbose=False, normalize=False, precompute=gram, n_nonzero_coefs=int(regularization), fit_path=False) lars.fit(dictionary.T, X.T, Xy=cov) new_code = lars.coef_ finally: np.seterr(**err_mgt) elif algorithm == 'threshold': new_code = ((np.sign(cov) * np.maximum(np.abs(cov) - regularization, 0)).T) elif algorithm == 'omp': new_code = orthogonal_mp_gram(gram, cov, regularization, None, row_norms(X, 
                                                      squared=True),
                                        copy_Xy=copy_cov).T
    else:
        raise ValueError('Sparse coding method must be "lasso_admm", '
                         '"lasso_lars", "lasso_cd", "lars", "threshold" '
                         'or "omp", got %s.' % algorithm)
    return new_code
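# The helper above appears to be adapted from scikit-learn's dictionary-learning
# sparse coder. As a hedged, self-contained illustration of the 'lars' and
# 'lasso_lars' branches it describes, the public SparseCoder API can be used
# instead; the dictionary below is random and row-normalized purely for
# demonstration.
import numpy as np
from sklearn.decomposition import SparseCoder

rng = np.random.RandomState(0)
D = rng.randn(15, 30)                          # (n_components, n_features)
D /= np.linalg.norm(D, axis=1, keepdims=True)  # LARS/OMP assume normalized rows
X_demo = rng.randn(5, 30)

code_lars = SparseCoder(D, transform_algorithm='lars',
                        transform_n_nonzero_coefs=3).transform(X_demo)
code_lasso = SparseCoder(D, transform_algorithm='lasso_lars',
                         transform_alpha=0.1).transform(X_demo)
print(np.count_nonzero(code_lars, axis=1))   # at most 3 atoms per sample
print(np.count_nonzero(code_lasso, axis=1))  # sparsity controlled by alpha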
print(new_reg_data.shape)
# (200, 11)

# Taking a more fundamental approach to regularization with LARS.
# Least-angle regression (LARS) is a regression technique that is well suited
# for high-dimensional problems, that is, p >> n, where p denotes the columns
# or features and n is the number of samples.
from sklearn.datasets import make_regression
reg_data, reg_target = make_regression(n_samples=200, n_features=500,
                                       n_informative=10, noise=2)
from sklearn.linear_model import Lars
lars = Lars(n_nonzero_coefs=10)
lars.fit(reg_data, reg_target)
print(np.sum(lars.coef_ != 0))
# 10

train_n = 100
lars_12 = Lars(n_nonzero_coefs=12)
lars_12.fit(reg_data[:train_n], reg_target[:train_n])
lars_500 = Lars()  # n_nonzero_coefs is 500 by default
lars_500.fit(reg_data[:train_n], reg_target[:train_n])

# Now, to see how well each model fits the unseen data, do the following:
np.mean(np.power(reg_target[train_n:] - lars_12.predict(reg_data[train_n:]), 2))
# 31.527714163321001
np.mean(np.power(reg_target[train_n:] - lars_500.predict(reg_data[train_n:]), 2))
# 9.6198147535136237e+30

from sklearn.linear_model import LarsCV
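# The LarsCV import above is never used in the snippet. A plausible
# continuation (a sketch reusing reg_data, reg_target, train_n and np already
# defined above) lets cross-validation pick the stopping point instead of
# hand-comparing 12 vs 500 non-zero coefficients:
lcv = LarsCV(cv=5)
lcv.fit(reg_data[:train_n], reg_target[:train_n])
print(np.sum(lcv.coef_ != 0))  # number of retained features
print(np.mean(np.power(reg_target[train_n:] - lcv.predict(reg_data[train_n:]), 2)))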
print('R2 score:', score)
print('MAE:', mean_absolute_error(testing_labels, preds), '\n')

# PCA + Elastic Net
elasticnet = ElasticNet(l1_ratio=0.5)
elasticnet.fit(reduced_training_features, training_labels)
preds = elasticnet.predict(reduced_testing_features)
score = elasticnet.score(reduced_testing_features, testing_labels)
print('PCA + ElasticNet Results:')
print('R2 score:', score)
print('MAE:', mean_absolute_error(testing_labels, preds))

# Least-Angle Regression (LARS)
from sklearn.linear_model import Lars
lars = Lars()
lars.fit(training_features, training_labels)
preds = lars.predict(testing_features)
score = lars.score(testing_features, testing_labels)
print('LARS Results:')
print('R2 score:', score)
print('MAE:', mean_absolute_error(testing_labels, preds), '\n')

# PCA + LARS
lars = Lars()
lars.fit(reduced_training_features, training_labels)
preds = lars.predict(reduced_testing_features)
score = lars.score(reduced_testing_features, testing_labels)
print('PCA + LARS Results:')
print('R2 score:', score)
print('MAE:', mean_absolute_error(testing_labels, preds))
    # ('ppru', 'ppr_submission_user.csv', 'ppr_fitted_user.csv'),
    # ('pprg', 'ppr_submission_global.csv', 'ppr_fitted_global.csv'),
]
fitted = pd.DataFrame(index=review_data.index)
submission = pd.DataFrame(index=review_data_final.index)
for name, sub_name, fit_name in blend_inputs:
    f_df = pd.read_csv(os.path.join('..', fit_name))
    f_df.index = review_data.index
    fitted[name] = f_df['stars']
    s_df = pd.read_csv(os.path.join('..', sub_name))
    s_df.index = review_data_final.index
    submission[name] = s_df['stars']

gbr = GradientBoostingRegressor(max_depth=3, verbose=2)
gbr.fit(fitted, review_data['stars'])
pred = gbr.predict(submission)
pd.DataFrame({'review_id': submission.index,
              'stars': np.maximum(1, np.minimum(5, pred))}).to_csv('../gbr_submission.csv', index=False)

lar = Lars(fit_intercept=True, verbose=2, normalize=True, fit_path=True)
lar.fit(fitted, review_data['stars'])
pred = lar.predict(submission)
pd.DataFrame({'review_id': submission.index,
              'stars': np.maximum(1, np.minimum(5, pred))}).to_csv('../lar_submission.csv', index=False)

ridge = Ridge()
ridge.fit(fitted, review_data['stars'])
pred = ridge.predict(submission)
pd.DataFrame({'review_id': submission.index,
              'stars': np.maximum(1, np.minimum(5, pred))}).to_csv('../ridge_submission.csv', index=False)

## TODO: blend based on size of rating neighborhood
y_train = ml_outs.loc[train_index]
x_test = ml.loc[test_index]
y_test = ml_outs.loc[test_index]

# Scale
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Implement the model (each reassignment below replaces the previous estimator)
linreg = Lars()          # better
linreg = LarsCV()        # one better
linreg = LassoLarsCV()   # same
linreg = LinearRegression()
linreg.fit(x_train, y_train)
predictions = linreg.predict(x_test)

# Plot predictions and y_test
plt.figure()
plt.plot(predictions, label='Predictions')
plt.plot(pd.Series(predictions).rolling(5).mean(), label='rolling predictions')
plt.plot(y_test.values, label='Shifted Currencies (y_test values)', color='grey')
plt.plot(cu.loc[test_index, currency].values, label='UNSHIFTED')
plt.legend()
plt.show()

# Print score and summary
def _fit(self, X, y): """ Fits the filter. Parameters ---------- X : array-like, shape (n_samples, n_features) The training input samples. y : array-like The target values (ignored). Returns ---------- None """ if self.scheme == '0-1': scheme = self.__scheme_01 elif self.scheme == 'heat': scheme = self.__scheme_heat elif self.scheme == 'dot': scheme = self.__scheme_dot else: getLogger(__name__).error( "scheme should be either '0-1', 'heat' or 'dot'; %s passed", self.scheme) raise KeyError( "scheme should be either '0-1', 'heat' or 'dot'; %s passed" % self.scheme) n_samples = X.shape[0] if self.k > n_samples: getLogger(__name__).error( "Cannot find %d clusters with n_samples = %d", self.k, n_samples) raise ValueError("Cannot find %d clusters with n_samples = %d" % (self.k, n_samples)) if self.p >= n_samples: getLogger(__name__).error( "Cannot select %d nearest neighbors with n_samples = %d", self.p, n_samples) raise ValueError( "Cannot select %d nearest neighbors with n_samples = %d" % (self.p, n_samples)) if self.full_graph: graph = np.ones((n_samples, n_samples)) else: graph = NearestNeighbors( n_neighbors=self.p, algorithm='ball_tree').fit(X).kneighbors_graph().toarray() graph = np.minimum(1, graph + graph.T) getLogger(__name__).info("Nearest neighbors graph: %s", graph) W = graph * pairwise_distances(X, metric=lambda x, y: scheme(x, y)) getLogger(__name__).info("W: %s", W) D = np.diag(W.sum(axis=0)) getLogger(__name__).info("D: %s", D) L = D - W getLogger(__name__).info("L: %s", L) eigvals, Y = eigh(type=1, a=L, b=D, subset_by_index=[1, self.k]) getLogger(__name__).info("Eigenvalues: %s, classes: %s", eigvals, Y) weights = np.zeros((self.n_features_, self.k)) for i in range(self.k): clf = Lars(n_nonzero_coefs=self.n_features) clf.fit(X, Y[:, i]) weights[:, i] = np.abs(clf.coef_) getLogger(__name__).info("Weights for eigenvalue %d: %s", i, weights[:, i]) self.feature_scores_ = weights.max(axis=1) getLogger(__name__).info("Feature scores: %s", self.feature_scores_) ranking = np.argsort(self.feature_scores_)[::-1] self.selected_features_ = ranking[:self.n_features]
def stability_selection(expression_matrix, genes, R, y, gene_copy):
    # Final score for each of the transcription factors
    score = []
    # Coefficients for each iteration
    coefficients = []
    # Run the selection algorithm R/2 times
    for i in range(R // 2):
        # Indexes for randomly splitting the features into equal halves
        indices = list(range(len(genes) - 1))
        # Randomly shuffle the indices
        random.shuffle(indices)
        # Split into two parts
        first_half = indices[:len(genes) // 2]
        second_half = indices[len(genes) // 2:]
        # First half of the expression matrix
        extract_first_half = expression_matrix[:, first_half]
        # Second half of the expression matrix
        extract_second_half = expression_matrix[:, second_half]
        # Randomly perturb the data by multiplying the expression of candidate
        # TFs by a number drawn from (alpha, 1), where alpha is in (0, 1)
        alpha = 0.19
        perturbation = random.uniform(alpha, 1)
        perturbed_first_half = extract_first_half * perturbation
        perturbed_second_half = extract_second_half * perturbation
        # Run LARS on each half to get the scores
        coeff = Lars()
        # Fit the first half of the split
        coeff.fit(perturbed_first_half, y)
        result_first_half = coeff.coef_
        # Fit the second half of the split
        coeff.fit(perturbed_second_half, y)
        result_second_half = coeff.coef_
        # Build a single score array indexed by the original feature positions
        temp_dict = {}
        for j in range(len(first_half)):
            temp_dict[first_half[j]] = result_first_half[j]
        for j in range(len(second_half)):
            temp_dict[second_half[j]] = result_second_half[j]
        # Append the values for this iteration to the main coefficient list
        coeff_list = list(temp_dict.values())
        coefficients.append(coeff_list)
    # Ranks for each regulator gene
    ranks = get_ranks(coefficients, gene_copy)
    return ranks
Low time complexity; can quickly be adapted into lasso.
Drawbacks: because the model iterates on the residuals, it is sensitive to noise.
'''
rg = Lars(fit_intercept=True, verbose=False, normalize=True, precompute='auto',
          n_nonzero_coefs=500, eps=2.2204460492503131e-16, copy_X=True,
          fit_path=True, positive=False)
rg.fit(X_train, Y_train)
Y_pre = rg.predict(X_test)
rg.score(X_test, Y_test)
rg.coef_
rg.intercept_
'''
fit_intercept    whether to fit an intercept
verbose          verbosity level
normalize        whether to normalize the features
precompute       whether to use a precomputed Gram matrix to speed things up
n_nonzero_coefs  target number of non-zero coefficients
eps              numerical precision used in some internal computations
copy_X           whether to copy X rather than overwrite it inside the model
fit_path         whether to store the full coefficient path (rarely needed here)
positive         force the coefficients to be positive?
'''
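# A minimal sketch (on synthetic data, everything below is illustrative) of the
# parameter that matters most in practice, n_nonzero_coefs: it caps how many
# features end up with a non-zero coefficient.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Lars

X_syn, y_syn = make_regression(n_samples=100, n_features=50, n_informative=5,
                               noise=1.0, random_state=0)
for k in (2, 5, 20):
    m = Lars(n_nonzero_coefs=k).fit(X_syn, y_syn)
    print(k, np.count_nonzero(m.coef_))  # at most k non-zero coefficients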
class LarsClass:
    """
    Name : Lars
    Attribute : None
    Method : predict, predict_by_cv, save_model
    """

    def __init__(self):
        # Algorithm name
        self._name = 'lars'
        # Base path
        self._f_path = os.path.abspath(
            os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir))
        # Suppress warning messages
        warnings.filterwarnings('ignore')
        # Load the raw data
        data = pd.read_csv(self._f_path + "/regression/resource/regression_sample.csv",
                           sep=",", encoding="utf-8")
        # Masks for splitting into training and test data
        self._x = (data["year"] <= 2017)
        self._y = (data["year"] >= 2018)
        # Training data
        self._x_train, self._y_train = self.preprocessing(data[self._x])
        # Test data
        self._x_test, self._y_test = self.preprocessing(data[self._y])
        # Declare the model
        self._model = Lars(normalize=False)
        # Train the model
        self._model.fit(self._x_train, self._y_train)

    # Data preprocessing
    def preprocessing(self, data):
        # Features
        x = []
        # Labels
        y = []
        # Window size (7 days)
        base_interval = 7
        # Temperatures
        temps = list(data["temperature"])
        for i in range(len(temps)):
            if i < base_interval:
                continue
            y.append(temps[i])
            xa = []
            for p in range(base_interval):
                d = i + p - base_interval
                xa.append(temps[d])
            x.append(xa)
        return x, y

    # Plain prediction
    def predict(self, save_img=False, show_chart=False):
        # Predict
        y_pred = self._model.predict(self._x_test)
        # Score
        score = r2_score(self._y_test, y_pred)
        # Report
        if hasattr(self._model, 'coef_') and hasattr(self._model, 'intercept_'):
            print(f'Coef = {self._model.coef_}')
            print(f'intercept = {self._model.intercept_}')
            print(f'Score = {score}')
        # Optionally save the chart image
        if save_img:
            self.save_chart_image(y_pred, show_chart)
        # Predictions & score
        return [list(y_pred), score]

    # Cross-validated prediction
    def predict_by_cv(self):
        # For regression, implement cross validation as the project requires
        return False

    # GridSearchCV prediction
    def predict_by_gs(self):
        pass

    # Save or refresh the model
    def save_model(self, renew=False):
        if not renew:
            # First save
            joblib.dump(self._model, self._f_path + f'/model/{self._name}_rg.pkl')
        else:
            # Replace the existing model
            if os.path.isfile(self._f_path + f'/model/{self._name}_rg.pkl'):
                os.rename(self._f_path + f'/model/{self._name}_rg.pkl',
                          self._f_path + f'/model/{str(self._name) + str(time.time())}_rg.pkl')
            joblib.dump(self._model, self._f_path + f'/model/{self._name}_rg.pkl')

    # Save the regression chart
    def save_chart_image(self, data, show_chart):
        # Figure size
        plt.figure(figsize=(15, 10), dpi=100)
        # Labels
        plt.plot(self._y_test, c='r')
        # Predicted values
        plt.plot(data, c='b')
        # Save as an image
        plt.savefig('./chart_images/tenki-kion-lr.png')
        # Show the chart (optional)
        if show_chart:
            plt.show()

    def __del__(self):
        del self._x_train, self._x_test, self._y_train, self._y_test, self._x, self._y, self._model
def TIGRESS(X, y, nsplit=100, nstepsLARS=5, alpha=0.4, scoring="area"): """ TIGRESS score predictor based on stability selection. Args: X (pandas.DataFrame): Transcriptor factor gene expressions where rows are experimental conditions and columns are transcription factors y (pandas.Series): Target gene expression vector where rows are experimental conditions nsplit (int): number of splits applied, i.e., randomization tests, the highest the best nstepsLARS (int): number of steps of LARS algorithm, i.e., number of non zero coefficients to keep (Lars parameter) alpha: Noise multiplier coefficient, Each transcription factor expression is multiplied by a random variable $\in [\alpha,1]$ scoring (str): option used to score each possible link only "area" and "max" options are available Returns: numpy.array: co-regulation scores The i-th element of the score array represents the score assigned by the sklearn randomizedlasso stability selection to the regulatory relationship between the target gene and transcription factor i. Examples: >>> import pandas as pd >>> import numpy as np >>> np.random.seed(0) >>> tfs = pd.DataFrame(np.random.randn(5,3), index =["c1","c2","c3","c4","c5"], columns=["tf1","tf2","tf3"]) >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"]) >>> scores = TIGRESS(tfs,tg) >>> scores array([349. , 312.875, 588.125]) """ n,p = X.shape halfsize = int(n/2) if nstepsLARS > p: nstepsLARS = p-1 freq = np.zeros((p, nstepsLARS)) i = 0 while i < nsplit: # Randomly reweight each variable (TF expression) random_perturbation = np.random.uniform(low=alpha, high=1.0, size=p) X *= random_perturbation # Randomly split the sample in two sets X_1,X_2,y_1,y_2 = train_test_split(X,y,test_size=halfsize, shuffle=True) for X_i,y_i in [[X_1, y_1],[X_2,y_2]]: if y_i.std() > 0: # run LARS on each subsample and collect variables are selected lars = Lars(normalize=False, n_nonzero_coefs=nstepsLARS) lars.fit(X_i,y_i) # collect the presence of the coefficients along the path path = lars.coef_path_ if path.shape[1] < nstepsLARS+1: path_add = np.tile(path[:,-1],(nstepsLARS+1 - path.shape[1],1)).T path = np.hstack((path,path_add)) freq += np.abs(np.sign(path[:,1:])) i += 1 X /= random_perturbation # normalize frequence in [0,1] to get stability curves freq /= 2*halfsize if (scoring=="area"): score = np.cumsum(freq,axis=1)/np.arange(1,nstepsLARS+1,1) if (scoring=="max"): score = np.maximum.accumulate(freq,axis=1) return(score[:,nstepsLARS - 1])
def main(): feature_vectors = [] movie_features = [] features_train = [] data = None with open("movie-data/ratings-train.csv") as f: data = f.readlines() data = data[1:] for val in data: line = val[:-1] line = line.split(",") line = [float(val) for val in line] feature_vectors.append(line) feature_vectors = np.array(feature_vectors) ratings_train = feature_vectors[:, 2] movie_ids = feature_vectors[:, 1] data = None with open("movie-data/movie-features.csv") as f: data = f.readlines() data = data[1:] for val in data: line = val[:-1] line = line.split(",") line = [float(val) for val in line] movie_features.append(line) movie_features = np.array(movie_features) error = [] error_lars = [] error_lasso = [] for i in range(671): person_features = feature_vectors[(feature_vectors[:,0] - 1) == i] for k in range(len(person_features[:, 2])): person_features[:,2][k] = (person_features[:,2][k] - np.mean(person_features[:, 2]))/(np.std(person_features[:, 2])) MOVIE_IDS = person_features[:, 1] features_train = movie_features[np.array(MOVIE_IDS, int) - 1] features_train[:, 0] = 1.0 for p in range(1, features_train.shape[1]): features_train[:, p] = (features_train[:, p] - np.mean(features_train[:, p]))/(np.std(features_train[:, p]) + 10**-8) lasso = Lasso(alpha = 0.01, normalize=True, fit_intercept = True) alphas, coeff, _ = lasso_path(features_train, person_features[:, 2], 5e-3, positive = True, fit_intercept = False) alphas = -np.log10(alphas) colors = cycle(['b', 'r', 'g', 'c', 'k']) plt.xlabel('alphas') plt.ylabel('coeff') k = 0 for val, c in zip(coeff, colors): print(k) k = k + 1 print(np.array(val).shape) print(np.array(alphas).shape) plt.plot(alphas, val, c=c) plt.show() pred = lasso.fit(features_train, person_features[:, 2]).predict(features_train) error_lasso.append(np.mean((pred - person_features[:, 2])**2)) clf = Ridge(alpha = 0.1, normalize=True, fit_intercept = True) reg = Lars(n_nonzero_coefs = 5) pred = clf.fit(features_train, person_features[:, 2]).predict(features_train) error.append(np.mean((pred - person_features[:, 2])**2)) pred = reg.fit(features_train, person_features[:, 2]).predict(features_train) #pred[pred < 0] = 0 error_lars.append(np.mean((pred - person_features[:, 2])**2)) print(np.mean((pred - person_features[:, 2])**2)) #print(features_train) #print(person_features) plt.figure(1) #plt.title('least angles regression') plt.xlabel('users') plt.ylabel('error') line_up, = plt.plot(error_lars, label='least angles regression') #plt.figure(2) #plt.xlabel('users') #plt.ylabel('error') #plt.title('ridge regression') line_down, = plt.plot(error, label = 'ridge regression') #plt.figure(3) #plt.xlabel('users') #plt.ylabel('error') #plt.title('lasso regression') line_hoz, = plt.plot(error_lasso, label = 'lasso regression') plt.legend(handles=[line_up, line_down, line_hoz]) plt.show() print("lar error: " + str(np.mean(error_lars))) print("ridge error: " + str(np.mean(error))) print("lasso error: " + str(np.mean(error_lasso))) #print(movie_ids) #print(np.array(movie_features[:, 0], int) == movie_ids) features_train = movie_features[np.array(movie_ids, int) - 1] features_train[:, 0] = 1.0 #for i in range(1, features_train.shape[1]): # features_train[:, i] = (features_train[:, i] - np.mean(features_train[:, i]))/(np.std(features_train[:, i])) features_train = (features_train - np.mean(features_train))/np.std(features_train) clf = Ridge(alpha = 0.1, normalize=True, fit_intercept = True) pred = clf.fit(features_train, ratings_train).predict(features_train) reg = Lars(n_nonzero_coefs = np.inf) 
# pred = reg.fit(features_train, ratings_train).predict(features_train)
error = (pred - ratings_train) ** 2
plt.plot(error)
plt.show()
# LARS Regression
# The Least Angle Regression (LARS) method is a computationally efficient
# algorithm for fitting a regression model. It is useful for high-dimensional
# data and is commonly used in conjunction with regularization (such as LASSO).
import numpy as np
from sklearn import datasets
from sklearn.linear_model import Lars

# load the diabetes dataset
dataset = datasets.load_diabetes()
# fit a LARS model to the data
model = Lars()
model.fit(dataset.data, dataset.target)
print(model)
# make predictions
expected = dataset.target
predicted = model.predict(dataset.data)
# summarize the fit of the model
mse = np.mean((predicted - expected) ** 2)
print(mse)
print(model.score(dataset.data, dataset.target))
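# As a hedged follow-up to the note above about combining LARS with LASSO
# regularization: scikit-learn exposes that combination as LassoLars. This
# sketch reuses the diabetes data and np loaded above; alpha=0.1 is an
# arbitrary illustrative value.
from sklearn.linear_model import LassoLars

lasso_lars = LassoLars(alpha=0.1)
lasso_lars.fit(dataset.data, dataset.target)
print(np.count_nonzero(lasso_lars.coef_))  # LASSO-style sparsity
print(lasso_lars.score(dataset.data, dataset.target))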
# LarsCV: fit_intercept, verbose, normalize, cv
from sklearn.linear_model import LarsCV, Lars
from sklearn.datasets import make_regression
import matplotlib.pyplot as plt

X, y = make_regression(n_samples=200, noise=4.0, random_state=0)
reg = LarsCV(cv=5).fit(X, y)
reg.score(X, y)
reg.alpha_
pred = reg.predict(X[:, ])
plt.scatter(X[:, 0], y, color='black')
plt.scatter(X[:, 0], pred, color='red')
plt.show()

reg2 = Lars().fit(X, y)
reg2.score(X, y)
reg2.alphas_  # plain Lars exposes the path of alphas, not a single alpha_
pred = reg2.predict(X[:, ])

#%% LassoLars: alpha, fit_intercept, normalize
# LassoLarsCV: alpha, fit_intercept, normalize, cv
from sklearn import linear_model
reg = linear_model.LassoLars(alpha=0.01)
reg.fit([[-1, 1], [0, 0], [1, 1]], [-1, 0, -1])
print(reg.coef_)
reg2 = linear_model.LassoLarsCV()
reg2.fit([[-1, 1], [0, 0], [1, 1]], [-1, 0, -1])
# In[3]:
from sklearn.linear_model import Lasso  # AdaptiveLasso is not available in scikit-learn
# LASSO regression performs variable selection and complexity control while
# fitting a generalized linear model, so it can be used for modelling and
# prediction whether the response is continuous, binary or multi-class.
# "Variable selection" here means that instead of putting every variable into
# the model, variables are included selectively to obtain better performance.
model = Lasso(alpha=0.1)
model.fit(data[['x1', 'x2', 'x3', 'x4', 'x5', 'x7']], data['y'])  # data.iloc[:, 0:13]
print(model.coef_)       # coefficients of each feature
print(model.intercept_)

# In[4]:
from sklearn.linear_model import Lars  # least angle regression
model1 = Lars(n_nonzero_coefs=7)
model1.fit(data.iloc[:, 0:13], data['y'])
print(model1.coef_)      # coefficients of each feature

# In[5]:
# Determine the most suitable alpha
from sklearn.linear_model import LarsCV  # cross-validated least angle regression
model1 = LarsCV()
model1.fit(data.iloc[:, 0:13], data['y'])
print(model1.coef_)      # coefficients of each feature
print(model1.alpha_)

# In[6]:
from sklearn.linear_model import LassoCV  # cross-validated lasso
model1 = LassoCV()
def fit(self): # 1. construct a placeholder called 'qhat_k_container' for the list of all q_hat^k (defined in Algorithm 2) of each subsample qhat_k_container = list() # 2. estimate q_hat^k (for the solution path) on each subsample and save them as elements of the placeholder for j in range(self.n_repeat): # a. randomly choose a subset of sample points (whose index is 'index_subsample') that is used to generate a subsample in each repeat index_subsample = np.random.choice(self.train_size, self.subsample_size, replace=False) # b. based on 'index_subsample', take the corresponding observations of X out and save them as the subample X_subsample = self.X_so[index_subsample] # c. based on 'index_subsample', take the corresponding observations of Y out and save them as the subample y_subsample = self.y_so[index_subsample] # d. scikit-learn requires 'y_subsample' to be an one-dimension array y_subsample.shape = (y_subsample.shape[0], ) # e. given a subsample, compute q_hat^k (the solution path) using lars # e(1). call the class 'Lars' trial_1 = Lars(n_nonzero_coefs=min(X_subsample.shape[1] + 1, X_subsample.shape[0] + 1)) # e(2). fit lars on the subsample trial_1.fit(X_subsample, y_subsample) # e(3). save the active set of lars (indices of variables select by lars) as 'active'. active = trial_1.active_ # f. The active set of lars is ranked based on the chronology of variable inclusion at different stages of lars. For example [2,1,3] means x_2 is included at stage 1, x_1 is included at stage 2 and x_3 is included at stage 3. Based on the active set of lars, we compute q_hat^k (defined as 'qhat_k' in code) as defined in Algorithm 2 # f(1). we generate 'qhat_k' as an array of zeros; qhat_k = np.zeros((1, self.n_dim)) # f(2). we compute the i-th value of q_hat^k for the corresponding variable based on Algorithm 2; replace i-th term in 'qhat_k' with the value we just compute for i in active: qhat_k[0, i] = 1 - \ (np.where(np.array(active) == i)[0][0]) / (self.n_dim) # f(3). we append the result into 'qhat_k_container' as one element of the list qhat_k_container.append(qhat_k) # 3. if self.lasso == True, we compute CV-lars-lasso and CV-cd on the original sample X and Y (not on the subsample) if (self.lasso == True): # a(1). call the class for CV-lars-lasso (called LassoLarsCV in Scikit-learn) # a(2). we set the number of folds in CV as 10 trial_2 = LassoLarsCV(cv=10) # b. change y into one-dimensional array (required by Scikit-learn) yy = self.y yy.shape = (self.sample_size, ) # c. fit CV-lars-lasso on X and Y trial_2.fit(self.X, yy) # d. save 'la_list' as the number of variables in the active set of CV-lars-lasso la_list = len(trial_2.active_) # e. save 'la_vari_list' as the active set of CV-lars-lasso la_vari_list = trial_2.active_ # f. call the class for CV-cd (called LassoCV in Scikit-learn) # f(1). we set the number of folds in CV as 10 # f(2). for reproduction, we fix the random seed of training-validation split in CV (random_state=0) trial_3 = LassoCV(cv=10, random_state=0) # g. fit cv-cd on X and Y trial_3.fit(self.X, yy) # h. save 'cd_list' as the number of variables in the active set of CV-cd cd_list = np.count_nonzero(trial_3.coef_) # i. save 'cd_vari_list' as the active set of CV-cd cd_vari_list = np.nonzero(trial_3.coef_)[0] # 4. compute q_hat and Q(c) (defined in Algorithm 2) # a(1). we transform the list of all q_hat^k ('qhat_k_container') into a matrix ('qhat_k_container_matrix') # a(2). row of the matrix: the q_hat^k on a given subsample for all variables # a(3). 
colum of the matrix: the corresponding value of q_hat^k for a given variable on all subsamples qhat_k_container_matrix = np.concatenate(qhat_k_container, axis=0) # b. compute the the value of qhat for each variable (qhat defined in Algorithm 2 of the paper) qhat_value = np.mean(qhat_k_container_matrix, axis=0) # c. set 'Qc_list' as the container of Q(c) for all value of c Qc_list = list() # d. set 'c_seq' as the sequence of c for the grid search of c* in solar c_seq = np.arange(max(qhat_value), 0.1, self.step_size) # e. generate Q(c) for each value of c for j in c_seq: # e(1). define 'container' as the placeholder of Q(c) when c == j; container = list() for i in range(self.X.shape[1]): # e(2). include all variables into 'container' if their corresponding values in q-hat are larger or equal to j; if (qhat_value[i] >= j): container.append(i) # e(3). append 'container' (Q(c) when c == j) into 'Qc_list' (the container of Q(c) for all value of c); Qc_list.append(container) # 5. compute the test error of each value of c # we use grid search on test set to choose c*; # for each value of c in the grid search, train a OLS of Y_so on the variables of Q(c) in X_so (Y_so and X_so defined at the begining); # a. container for test errors test_error = list() # b. compute the test error of each Q(c) on test set # b(0). set i as the indices of all variables in Q(c) for a given value of c; for i in Qc_list: # b(1). call the LinearRegression class; OLS_1 = LinearRegression() # b(2). compute OLS of Y_so on the variables in Q(c) in X_so; OLS_1.fit(self.X_so[:, i], self.y_so) # b(3). compute the L2 prediction error of OLS on test set (X_test, y_test); s1 = costs_com(self.X_test[:, i], self.y_test, OLS_1) loss_test_1, _ = s1.L2() # b(4). save the L2 error as the test error of Q(c) for each value of c; append it into the container of test errors; test_error.append(loss_test_1) # 6. tuning c via grid search # 6(a). transform 'test_error' from a list into an array; test_error = np.asarray(test_error) # 6(b). save the location of minimum of 'test_error' as 'min_loc_val'; min_loc_val = np.where(test_error == min(test_error))[0] # 6(c). save the correpsonding value of c (c*) as 'opt_c'; opt_c = c_seq[min_loc_val] # 6(d). find Q(c*) and save it as 'Q_opt_c'; Q_opt_c = Qc_list[max(min_loc_val)] # 7. Regression of Y onto the selected variables ( Q(c*) ) in X # 7(a). call the LinearRegression class; OLS_2 = LinearRegression() # 7(b). fit OLS of Y on the variables of Q(c*) in X; OLS_2.fit(self.X[:, Qc_list[max(min_loc_val)]], self.y) # 7(c). set 'solar_coef' (an array of zeros) as the placeholder of solar regression coefficents solar_coef = np.zeros([self.n_dim, 1]) # 7(d). put the estimated regression coefficents into their corresponding place of 'solar_coef' solar_coef[Q_opt_c, 0] = OLS_2.coef_ # 8. define la_list, la_vari_list as empty list if self.lasso != True (if we don't want to compute cv-lars-lasso and cv-cd) if (self.lasso != True): la_list = [] la_vari_list = [] cd_list = [] cd_vari_list = [] return solar_coef, opt_c, test_error, Qc_list, la_list, la_vari_list, cd_list, cd_vari_list
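# A minimal numeric sketch of the q_hat^k construction described in the
# comments above (step f), independent of the surrounding class: given a
# hypothetical LARS active set [2, 1, 3] over n_dim = 5 variables, the variable
# included at stage s receives 1 - (s - 1)/n_dim, and unselected variables get 0.
import numpy as np

n_dim = 5
active = [2, 1, 3]               # hypothetical LARS inclusion order
qhat_k = np.zeros(n_dim)
for rank, var in enumerate(active):
    qhat_k[var] = 1 - rank / n_dim
print(qhat_k)                    # [0.  0.8 1.  0.6 0. ]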
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
# Author: Quan Pan <*****@*****.**>
# License: MIT License
# Create: 2016-12-02

from sklearn.linear_model import Lars

# X = [[0., 0.], [1., 1.], [10., 10.]]
X = [[0.0], [1.0], [10.0]]
y = [0.0, 1.0, 10.0]
# x_preb = [[5., 5.], [-10., -10.]]
x_preb = [[5.], [-10.]]

clf = Lars(n_nonzero_coefs=1)
clf.fit(X, y)
print(clf.coef_)
y_pred = clf.predict(x_preb)
print(y_pred)
def Lar_regr(features, labels):
    from sklearn.linear_model import Lars
    model = Lars()
    model.fit(features, labels)
    pred = model.predict(features)
    AsGraph(labels, pred)
#!/usr/bin/env python
'''
Input variables:
  - X_TRAIN: path of a numpy array with x.
  - Y_TRAIN: path of a numpy array with y.
  - C: number of features to select.

Output files:
  - features_lars.npy: numpy array with the 0-based index of the selected features.
'''
import numpy as np
from sklearn.linear_model import Lars
from sklearn.feature_selection import SelectFromModel

x_train = np.load('${X_TRAIN}')
y_train = np.load('${Y_TRAIN}')

clf = Lars(n_nonzero_coefs=${C})
clf.fit(x_train, y_train)

sfm = SelectFromModel(clf, prefit=True)
features = np.where(sfm.get_support())[0]

np.save('features_lars.npy', features)
lasso_gamma = np.array([[0. if abs(x) < 1e-100 else 1. for x in lasso.coef_]]).T
# P = lambda X: lasso.predict(X)
lasso_predictor = PredictorWrapper.PredictorWrapper(lasso_beta, lasso_gamma, lasso.predict)
dill.dump(lasso_predictor, open('%sLASSO.p' % logDir, 'wb'))
with open(logFile, 'a') as f:
    f.write('Lasso c: %15.10f alpha: %15.10f\n' % (1. / (2. * X_tr.shape[0]), optLam))

##############
## LARS_SET ##
##############
kappa = [2, 4, 10]
for k in kappa:
    lars = Lars(n_nonzero_coefs=k, fit_intercept=False)
    lars.fit(X_tr, y_tr)
    lars_beta = np.array([lars.coef_]).T
    lars_gamma = np.zeros((X_tr.shape[1], 1))
    lars_gamma[lars.active_] = 1.
    lars_predictor = PredictorWrapper.PredictorWrapper(lars_beta, lars_gamma, lars.predict)
    dill.dump(lars_predictor, open('%sLARS_%02d.p' % (logDir, k), 'wb'))

##############
## LARS_OPT ##
##############
larsKappas = np.linspace(0, 40, 41, dtype=int)

def larsEval(learned):
    learned_yhat = np.array([learned.predict(X_val)]).T
    learned_mse = sum((y_val - learned_yhat) ** 2)[0]
    return learned_mse
from sklearn.linear_model import Lars
from sklearn.linear_model import LarsCV

# Load data
reg_data, reg_target = make_regression(n_samples=200, n_features=500,
                                       n_informative=10, noise=2)

# 1 Fitting with LARS --------------------------------------------------------

# Create the estimator
lars = Lars(n_nonzero_coefs=10)

# Fit
lars.fit(reg_data, reg_target)

# Number of non-zero coefficients
np.sum(lars.coef_ != 0)

# 2 Comparing LARS models ----------------------------------------------------

# <Point>
# - Hold out half of the data and fit LARS models on the rest

# Variable definition
# --- number of training samples
train_n = 100

# Create the estimator and fit
# --- with the number of non-zero coefficients set to 12
def sparse_encode(X, dictionary, algorithm='mp', fit_tol=None, P_cum=None, l0_sparseness=10, C=0., do_sym=True, verbose=0): """Generic sparse coding Each column of the result is the solution to a sparse coding problem. Parameters ---------- X : array of shape (n_samples, n_pixels) Data matrix. dictionary : array of shape (n_dictionary, n_pixels) The dictionary matrix against which to solve the sparse coding of the data. Some of the algorithms assume normalized rows. algorithm : {'mp', 'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'} mp : Matching Pursuit lars: uses the least angle regression method (linear_model.lars_path) lasso_lars: uses Lars to compute the Lasso solution lasso_cd: uses the coordinate descent method to compute the Lasso solution (linear_model.Lasso). lasso_lars will be faster if the estimated dictionary are sparse. omp: uses orthogonal matching pursuit to estimate the sparse solution threshold: squashes to zero all coefficients less than regularization from the projection dictionary * data' max_iter : int, 1000 by default Maximum number of iterations to perform if `algorithm='lasso_cd'`. verbose : int Controls the verbosity; the higher, the more messages. Defaults to 0. Returns ------- code : array of shape (n_samples, n_dictionary) The sparse codes """ if X.ndim == 1: X = X[:, np.newaxis] #n_samples, n_pixels = X.shape if algorithm == 'lasso_lars': alpha = float(regularization) / n_pixels # account for scaling from sklearn.linear_model import LassoLars # Not passing in verbose=max(0, verbose-1) because Lars.fit already # corrects the verbosity level. cov = np.dot(dictionary, X.T) lasso_lars = LassoLars(alpha=fit_tol, fit_intercept=False, verbose=verbose, normalize=False, precompute=None, fit_path=False) lasso_lars.fit(dictionary.T, X.T, Xy=cov) sparse_code = lasso_lars.coef_.T elif algorithm == 'lasso_cd': alpha = float(regularization) / n_pixels # account for scaling # TODO: Make verbosity argument for Lasso? # sklearn.linear_model.coordinate_descent.enet_path has a verbosity # argument that we could pass in from Lasso. from sklearn.linear_model import Lasso clf = Lasso(alpha=fit_tol, fit_intercept=False, normalize=False, precompute=None, max_iter=max_iter, warm_start=True) if init is not None: clf.coef_ = init clf.fit(dictionary.T, X.T, check_input=check_input) sparse_code = clf.coef_.T elif algorithm == 'lars': # Not passing in verbose=max(0, verbose-1) because Lars.fit already # corrects the verbosity level. from sklearn.linear_model import Lars cov = np.dot(dictionary, X.T) lars = Lars(fit_intercept=False, verbose=verbose, normalize=False, precompute=None, n_nonzero_coefs=l0_sparseness, fit_path=False) lars.fit(dictionary.T, X.T, Xy=cov) sparse_code = lars.coef_.T elif algorithm == 'threshold': cov = np.dot(dictionary, X.T) sparse_code = ((np.sign(cov) * np.maximum(np.abs(cov) - regularization, 0))).T elif algorithm == 'omp': # TODO: Should verbose argument be passed to this? 
from sklearn.linear_model import orthogonal_mp_gram from sklearn.utils.extmath import row_norms cov = np.dot(dictionary, X.T) gram = np.dot(dictionary, dictionary.T) sparse_code = orthogonal_mp_gram(Gram=gram, Xy=cov, n_nonzero_coefs=l0_sparseness, tol=None, norms_squared=row_norms(X, squared=True), copy_Xy=False).T elif algorithm == 'mp': sparse_code = mp(X, dictionary, l0_sparseness=l0_sparseness, fit_tol=fit_tol, P_cum=P_cum, C=C, do_sym=do_sym, verbose=verbose) else: raise ValueError( 'Sparse coding method must be "mp", "lasso_lars" ' '"lasso_cd", "lasso", "threshold" or "omp", got %s.' % algorithm) return sparse_code
## Ridge regression
ridge = Ridge(alpha=0.8)
ridge.fit(train_X, train_y)
predictions = ridge.predict(test_X)
print('MAE is ', mean_absolute_error(np.expm1(predictions), np.expm1(test_y)))

## Lasso regression
lasso = Lasso(alpha=0.9)
lasso.fit(train_X, train_y)
predictions = lasso.predict(test_X)
print('MAE is ', mean_absolute_error(np.expm1(predictions), np.expm1(test_y)))

## Least angle regression
lars = Lars(n_nonzero_coefs=100)
lars.fit(train_X, train_y)
predictions = lars.predict(test_X)
print('MAE is ', mean_absolute_error(np.expm1(predictions), np.expm1(test_y)))

## Linear regression
lr = LinearRegression()
lr.fit(train_X, train_y)
predictions = lr.predict(test_X)
print('MAE is ', mean_absolute_error(np.expm1(predictions), np.expm1(test_y)))

## Decision tree regression
dtr = DecisionTreeRegressor(criterion='mae', max_depth=5, min_samples_split=4,
                            max_features='sqrt', min_samples_leaf=2)